nishan-chatterjee committed
Commit 7ea9682
Parent(s): 6aa9d30
Add model and tokenizer files with Git LFS
Browse files
- .gitattributes +5 -0
- README.md +49 -0
- config.json +3 -0
- images/persuasion_techniques_hierarchy_graph.png +0 -0
- inference.py +87 -0
- model.safetensors +3 -0
- requirements.txt +7 -0
- sentencepiece.bpe.model +3 -0
- special_tokens_map.json +3 -0
- tokenizer.json +3 -0
- tokenizer_config.json +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+config.json filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
+special_tokens_map.json filter=lfs diff=lfs merge=lfs -text
+sentencepiece.bpe.model filter=lfs diff=lfs merge=lfs -text
+tokenizer_config.json filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -1,3 +1,52 @@
 ---
+title: multilingual-persuasion-detection-from-text
+app_file: inference.py
+pinned: false
 license: gpl-3.0
+language:
+- multilingual
+tags:
+- mbart-50
+- text-classification
+- multi-label-classification
+- persuasion-detection
+- meme-analysis
+- social-media-analysis
+- propaganda-detection
+- hierarchical-classification
+- multilingual
+pipeline_tag: text-classification
+inference: True
 ---
+
+# Multilingual Persuasion Detection in Memes
+
+Given only the textual content of a meme, the goal is to identify which of the 20 persuasion techniques, organized in a hierarchy, it uses. Selecting only an ancestor node of a technique earns partial credit. This is a hierarchical multi-label classification problem based on [SemEval 2024 Task 4, Subtask 1: "Multilingual Detection of Persuasion Techniques in Memes"](https://propaganda.math.unipd.it/semeval2024task4/index.html).
+
+### Hierarchy
+<img src="images/persuasion_techniques_hierarchy_graph.png" width="622" height="350">
+
+### Input Example
+- **Input:** "I HATE TRUMP\n\nMOST TERRORIST DO"
+- **Outputs:**
+  - Child-only label list: ['Name calling/Labeling', 'Loaded Language']
+  - Complete hierarchical label list: ['Ethos', 'Ad Hominem', 'Name calling/Labeling', 'Pathos', 'Loaded Language']
+
+## Training Hyperparameters
+- Base model: "facebook/mbart-large-50-many-to-many-mmt"
+- Learning rate: 5e-05
+- Max length: 256
+- Batch size: 64
+- Epochs: 3
+- Seed: 42
+
+## Model Statistics
+The model obtained the following metrics on the development set as of March 31st, 2024:
+- Hierarchical F1: 63.58%
+- Hierarchical precision: 58.3%
+- Hierarchical recall: 69.9%
+
+## Licensing
+The model is available under the GNU General Public License v3.0 (GPL-3.0), which allows free use, modification, and distribution under the same license. However, the model is strictly for research purposes and must not be used for malicious activities, including but not limited to manipulation, targeted harassment, hate speech, deception, and discrimination.
+
+The dataset is available on the [competition website](https://propaganda.math.unipd.it/semeval2024task4/). Users must accept an online agreement before downloading and using the data; the agreement stipulates that the data is for research purposes only and may not be redistributed or used for malicious purposes as outlined above.
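The hierarchical scores in the README reward ancestor overlap: both the predicted and the gold label sets are expanded with all of their ancestors before computing set precision and recall, which is why predicting only an ancestor of the correct technique earns partial credit. Below is a minimal sketch of that computation, assuming the `networkx` hierarchy `G` built in inference.py; the helper names are illustrative, not part of this repo.

```python
import networkx as nx

def ancestor_closure(labels, G):
    # Expand a label set with every ancestor in the hierarchy (ROOT excluded).
    closed = set(labels)
    for label in labels:
        closed |= nx.ancestors(G, label)
    closed.discard("ROOT")
    return closed

def hierarchical_prf1(predicted, gold, G):
    # Set precision/recall/F1 over the ancestor-closed label sets.
    p, g = ancestor_closure(predicted, G), ancestor_closure(gold, G)
    if not p or not g:
        return 0.0, 0.0, 0.0
    overlap = len(p & g)
    precision = overlap / len(p)
    recall = overlap / len(g)
    f1 = 2 * precision * recall / (precision + recall) if overlap else 0.0
    return precision, recall, f1

# Predicting only the ancestor 'Ad Hominem' when the gold leaf is 'Smears'
# still scores: both closures share {'Ethos', 'Ad Hominem'}.
```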
config.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:64a5e70a64d1755934c8946a9e79282de7761e3e04bb48599969bd4fdcea884b
+size 2574
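Note that config.json is tracked with Git LFS, so the diff above shows the three-line LFS pointer rather than the JSON itself (the real file is 2,574 bytes). A pointer is just whitespace-separated key/value lines; a purely illustrative parsing sketch (the function name is ours):

```python
def parse_lfs_pointer(text: str) -> dict:
    # Each pointer line is "<key> <value>"; split on the first space.
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:64a5e70a64d1755934c8946a9e79282de7761e3e04bb48599969bd4fdcea884b
size 2574"""
print(parse_lfs_pointer(pointer)["size"])  # -> 2574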
images/persuasion_techniques_hierarchy_graph.png
ADDED
inference.py
ADDED
@@ -0,0 +1,87 @@
+import torch
+import numpy as np
+import networkx as nx
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+
+def _make_logits_consistent(x, R):
+    c_out = x.unsqueeze(1) + 10  # shift logits positive so masked (zero) entries never win the max
+    c_out = c_out.expand(len(x), R.shape[1], R.shape[1])
+    R_batch = R.expand(len(x), R.shape[1], R.shape[1]).to(x.device)
+    final_out, _ = torch.max(R_batch * c_out, dim=2)  # each node takes the max logit over itself and its descendants
+    return final_out - 10  # undo the shift
+
+def initialize_model():
+    model_dir = "."
+    G = nx.DiGraph()  # persuasion-technique hierarchy rooted at ROOT
+    edges = [
+        ("ROOT", "Logos"),
+        ("Logos", "Repetition"), ("Logos", "Obfuscation, Intentional vagueness, Confusion"), ("Logos", "Reasoning"), ("Logos", "Justification"),
+        ("Justification", "Slogans"), ("Justification", "Bandwagon"), ("Justification", "Appeal to authority"), ("Justification", "Flag-waving"), ("Justification", "Appeal to fear/prejudice"),
+        ("Reasoning", "Simplification"),
+        ("Simplification", "Causal Oversimplification"), ("Simplification", "Black-and-white Fallacy/Dictatorship"), ("Simplification", "Thought-terminating cliché"),
+        ("Reasoning", "Distraction"),
+        ("Distraction", "Misrepresentation of Someone's Position (Straw Man)"), ("Distraction", "Presenting Irrelevant Data (Red Herring)"), ("Distraction", "Whataboutism"),
+        ("ROOT", "Ethos"),
+        ("Ethos", "Appeal to authority"), ("Ethos", "Glittering generalities (Virtue)"), ("Ethos", "Bandwagon"), ("Ethos", "Ad Hominem"), ("Ethos", "Transfer"),
+        ("Ad Hominem", "Doubt"), ("Ad Hominem", "Name calling/Labeling"), ("Ad Hominem", "Smears"), ("Ad Hominem", "Reductio ad hitlerum"), ("Ad Hominem", "Whataboutism"),
+        ("ROOT", "Pathos"),
+        ("Pathos", "Exaggeration/Minimisation"), ("Pathos", "Loaded Language"), ("Pathos", "Appeal to (Strong) Emotions"), ("Pathos", "Appeal to fear/prejudice"), ("Pathos", "Flag-waving"), ("Pathos", "Transfer")
+    ]
+    G.add_edges_from(edges)
+
+    tokenizer = AutoTokenizer.from_pretrained(model_dir)
+    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model.to(device)
+
+    A = nx.to_numpy_array(G).transpose()  # transposed adjacency: edges now point child -> parent
+    R = np.zeros(A.shape)
+    np.fill_diagonal(R, 1)
+    g = nx.DiGraph(A)
+    for i in range(len(A)):
+        descendants = list(nx.descendants(g, i))  # descendants in the reversed graph are ancestors in G
+        if descendants:
+            R[i, descendants] = 1
+    R = torch.tensor(R).transpose(1, 0).unsqueeze(0)  # R[0, a, d] = 1 iff a is d or an ancestor of d
+
+    return tokenizer, model, R, G, device
+
+def predict_persuasion_labels(text, tokenizer, model, R, G, device):
+    encoding = tokenizer.encode_plus(
+        text,
+        add_special_tokens=True,
+        max_length=128,
+        return_token_type_ids=False,
+        padding="max_length",
+        truncation=True,
+        return_attention_mask=True,
+        return_tensors="pt",
+    )
+
+    with torch.no_grad():
+        outputs = model(
+            input_ids=encoding["input_ids"].to(device),
+            attention_mask=encoding["attention_mask"].to(device),
+        )
+    logits = _make_logits_consistent(outputs.logits, R)
+    logits[:, 0] = -1.0  # never predict the ROOT placeholder
+    logits = logits > 0.0  # threshold at 0, i.e. sigmoid probability 0.5
+    complete_predicted_hierarchy = np.array(G.nodes)[logits[0].cpu().nonzero()].flatten().tolist()
+
+    child_only_labels = []
+    for label in complete_predicted_hierarchy:
+        if not list(G.successors(label)):  # keep leaves only
+            child_only_labels.append(label)
+
+    return complete_predicted_hierarchy, child_only_labels
+
+tokenizer, model, R, G, device = initialize_model()
+
+def inference(text):
+    return predict_persuasion_labels(text, tokenizer, model, R, G, device)
+
+if __name__ == "__main__":
+    # ask the user for input
+    text = input("Enter the text: ")
+    print(inference(text))
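Because inference.py initializes the model at import time from `model_dir = "."`, it can be imported and called directly once the repository files are in the working directory. A minimal usage sketch; the outputs shown are the README example, and actual output depends on the loaded weights:

```python
from inference import inference

hierarchy, leaves = inference("I HATE TRUMP\n\nMOST TERRORIST DO")
print(hierarchy)  # e.g. ['Ethos', 'Ad Hominem', 'Name calling/Labeling', 'Pathos', 'Loaded Language']
print(leaves)     # e.g. ['Name calling/Labeling', 'Loaded Language']
```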
model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f22069b784f1dbe6dcdcfd85c6941a4330a81cdfd5e3d4996246fd6a500a877c
+size 2447904292
requirements.txt
ADDED
@@ -0,0 +1,7 @@
+torch
+numpy
+networkx
+transformers
+tqdm
+sentencepiece
+protobuf
sentencepiece.bpe.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
+size 5069051
special_tokens_map.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1daf98a95c01a96007f2ab65fed7b31641e363f2de94f82e06d948b7855ed21d
+size 992
tokenizer.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:86f983b6563a9468794455498914bda0eaf9a60e5c9cd5a21669a24a625e490d
+size 17109921
tokenizer_config.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9aff9ce0b78ebc744dba8ac8ebf44d7e675d581e97971edb35b1203926d9586
+size 10924