nishan-chatterjee committed
Commit 7ea9682
Parent(s): 6aa9d30
Add model and tokenizer files with Git LFS
Browse files
- .gitattributes +5 -0
- README.md +49 -0
- config.json +3 -0
- images/persuasion_techniques_hierarchy_graph.png +0 -0
- inference.py +87 -0
- model.safetensors +3 -0
- requirements.txt +7 -0
- sentencepiece.bpe.model +3 -0
- special_tokens_map.json +3 -0
- tokenizer.json +3 -0
- tokenizer_config.json +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+config.json filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
+special_tokens_map.json filter=lfs diff=lfs merge=lfs -text
+sentencepiece.bpe.model filter=lfs diff=lfs merge=lfs -text
+tokenizer_config.json filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -1,3 +1,52 @@
 ---
+title: multilingual-persuasion-detection-from-text
+app_file: inference.py
+pinned: false
 license: gpl-3.0
+language:
+- multilingual
+tags:
+- mbart-50
+- text-classification
+- multi-label-classification
+- persuasion-detection
+- meme-analysis
+- social-media-analysis
+- propaganda-detection
+- hierarchical-classification
+- multilingual
+pipeline_tag: text-classification
+inference: True
 ---
+
+# Multilingual Persuasion Detection in Memes
+
+Given only the textual content of a meme, the goal is to identify which of the 20 persuasion techniques, organized in a hierarchy, it uses. Selecting only an ancestor node of a technique earns partial credit. This is a hierarchical multi-label classification problem based on [SemEval 2024 Task 4, Subtask 1: "Multilingual Detection of Persuasion Techniques in Memes"](https://propaganda.math.unipd.it/semeval2024task4/index.html).
+
+### Hierarchy
+<img src="images/persuasion_techniques_hierarchy_graph.png" width="622" height="350">
+
+### Input Example
+- **Input:** "I HATE TRUMP\n\nMOST TERRORIST DO"
+- **Outputs:**
+  - Child-only label list: ['Name calling/Labeling', 'Loaded Language']
+  - Complete hierarchical label list: ['Ethos', 'Ad Hominem', 'Name calling/Labeling', 'Pathos', 'Loaded Language']
+
+## Training Hyperparameters
+- Base model: "facebook/mbart-large-50-many-to-many-mmt"
+- Learning rate: 5e-05
+- Max length: 256
+- Batch size: 64
+- Epochs: 3
+- Seed: 42
+
+## Model Statistics
+The model obtained the following metrics on the development set as of March 31st, 2024:
+- Hierarchical F1: 63.58%
+- Hierarchical precision: 58.3%
+- Hierarchical recall: 69.9%
+
+## Licensing
+The model is available under the GNU General Public License v3.0 (GPL-3.0), which allows free use, modification, and distribution under the same license. However, the model is strictly for research purposes and must not be used for malicious activities, including but not limited to manipulation, targeted harassment, hate speech, deception, and discrimination.
+
+The dataset is available on the [competition website](https://propaganda.math.unipd.it/semeval2024task4/). Users must accept an online agreement before downloading and using the data; the agreement stipulates that the data is for research purposes only and may not be redistributed or used for malicious purposes as outlined above.
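The hierarchical scores in the README reward ancestor overlap: both the predicted and the gold label sets are expanded with all of their ancestors before computing set precision and recall, which is why predicting only an ancestor of the correct technique earns partial credit. Below is a minimal sketch of that computation, assuming the `networkx` hierarchy `G` built in inference.py; the helper names are illustrative, not part of this repo.

```python
import networkx as nx

def ancestor_closure(labels, G):
    # Expand a label set with every ancestor in the hierarchy (ROOT excluded).
    closed = set(labels)
    for label in labels:
        closed |= nx.ancestors(G, label)
    closed.discard("ROOT")
    return closed

def hierarchical_prf1(predicted, gold, G):
    # Set precision/recall/F1 over the ancestor-closed label sets.
    p, g = ancestor_closure(predicted, G), ancestor_closure(gold, G)
    if not p or not g:
        return 0.0, 0.0, 0.0
    overlap = len(p & g)
    precision = overlap / len(p)
    recall = overlap / len(g)
    f1 = 2 * precision * recall / (precision + recall) if overlap else 0.0
    return precision, recall, f1

# Predicting only the ancestor 'Ad Hominem' when the gold leaf is 'Smears'
# still scores: both closures share {'Ethos', 'Ad Hominem'}.
```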
config.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:64a5e70a64d1755934c8946a9e79282de7761e3e04bb48599969bd4fdcea884b
+size 2574
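Note that config.json is tracked with Git LFS, so the diff above shows the three-line LFS pointer rather than the JSON itself (the real file is 2,574 bytes). A pointer is just whitespace-separated key/value lines; a purely illustrative parsing sketch (the function name is ours):

```python
def parse_lfs_pointer(text: str) -> dict:
    # Each pointer line is "<key> <value>"; split on the first space.
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:64a5e70a64d1755934c8946a9e79282de7761e3e04bb48599969bd4fdcea884b
size 2574"""
print(parse_lfs_pointer(pointer)["size"])  # -> 2574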
images/persuasion_techniques_hierarchy_graph.png
ADDED
inference.py
ADDED
@@ -0,0 +1,87 @@
+import torch
+import numpy as np
+import networkx as nx
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+
+def _make_logits_consistent(x, R):
+    c_out = x.unsqueeze(1) + 10  # shift logits positive so masked (zero) entries never win the max
+    c_out = c_out.expand(len(x), R.shape[1], R.shape[1])
+    R_batch = R.expand(len(x), R.shape[1], R.shape[1]).to(x.device)
+    final_out, _ = torch.max(R_batch * c_out, dim=2)  # each node takes the max logit over itself and its descendants
+    return final_out - 10  # undo the shift
+
+def initialize_model():
+    model_dir = "."
+    G = nx.DiGraph()  # persuasion-technique hierarchy rooted at ROOT
+    edges = [
+        ("ROOT", "Logos"),
+        ("Logos", "Repetition"), ("Logos", "Obfuscation, Intentional vagueness, Confusion"), ("Logos", "Reasoning"), ("Logos", "Justification"),
+        ("Justification", "Slogans"), ("Justification", "Bandwagon"), ("Justification", "Appeal to authority"), ("Justification", "Flag-waving"), ("Justification", "Appeal to fear/prejudice"),
+        ("Reasoning", "Simplification"),
+        ("Simplification", "Causal Oversimplification"), ("Simplification", "Black-and-white Fallacy/Dictatorship"), ("Simplification", "Thought-terminating cliché"),
+        ("Reasoning", "Distraction"),
+        ("Distraction", "Misrepresentation of Someone's Position (Straw Man)"), ("Distraction", "Presenting Irrelevant Data (Red Herring)"), ("Distraction", "Whataboutism"),
+        ("ROOT", "Ethos"),
+        ("Ethos", "Appeal to authority"), ("Ethos", "Glittering generalities (Virtue)"), ("Ethos", "Bandwagon"), ("Ethos", "Ad Hominem"), ("Ethos", "Transfer"),
+        ("Ad Hominem", "Doubt"), ("Ad Hominem", "Name calling/Labeling"), ("Ad Hominem", "Smears"), ("Ad Hominem", "Reductio ad hitlerum"), ("Ad Hominem", "Whataboutism"),
+        ("ROOT", "Pathos"),
+        ("Pathos", "Exaggeration/Minimisation"), ("Pathos", "Loaded Language"), ("Pathos", "Appeal to (Strong) Emotions"), ("Pathos", "Appeal to fear/prejudice"), ("Pathos", "Flag-waving"), ("Pathos", "Transfer")
+    ]
+    G.add_edges_from(edges)
+
+    tokenizer = AutoTokenizer.from_pretrained(model_dir)
+    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model.to(device)
+
+    A = nx.to_numpy_array(G).transpose()  # transposed adjacency: edges now point child -> parent
+    R = np.zeros(A.shape)
+    np.fill_diagonal(R, 1)
+    g = nx.DiGraph(A)
+    for i in range(len(A)):
+        descendants = list(nx.descendants(g, i))  # descendants in the reversed graph are ancestors in G
+        if descendants:
+            R[i, descendants] = 1
+    R = torch.tensor(R).transpose(1, 0).unsqueeze(0)  # R[0, a, d] = 1 iff a is d or an ancestor of d
+
+    return tokenizer, model, R, G, device
+
+def predict_persuasion_labels(text, tokenizer, model, R, G, device):
+    encoding = tokenizer.encode_plus(
+        text,
+        add_special_tokens=True,
+        max_length=128,
+        return_token_type_ids=False,
+        padding="max_length",
+        truncation=True,
+        return_attention_mask=True,
+        return_tensors="pt",
+    )
+
+    with torch.no_grad():
+        outputs = model(
+            input_ids=encoding["input_ids"].to(device),
+            attention_mask=encoding["attention_mask"].to(device),
+        )
+    logits = _make_logits_consistent(outputs.logits, R)
+    logits[:, 0] = -1.0  # never predict the ROOT placeholder
+    logits = logits > 0.0  # threshold at 0, i.e. sigmoid probability 0.5
+    complete_predicted_hierarchy = np.array(G.nodes)[logits[0].cpu().nonzero()].flatten().tolist()
+
+    child_only_labels = []
+    for label in complete_predicted_hierarchy:
+        if not list(G.successors(label)):  # keep leaves only
+            child_only_labels.append(label)
+
+    return complete_predicted_hierarchy, child_only_labels
+
+tokenizer, model, R, G, device = initialize_model()
+
+def inference(text):
+    return predict_persuasion_labels(text, tokenizer, model, R, G, device)
+
+if __name__ == "__main__":
+    # ask the user for input
+    text = input("Enter the text: ")
+    print(inference(text))
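Because inference.py initializes the model at import time from `model_dir = "."`, it can be imported and called directly once the repository files are in the working directory. A minimal usage sketch; the outputs shown are the README example, and actual output depends on the loaded weights:

```python
from inference import inference

hierarchy, leaves = inference("I HATE TRUMP\n\nMOST TERRORIST DO")
print(hierarchy)  # e.g. ['Ethos', 'Ad Hominem', 'Name calling/Labeling', 'Pathos', 'Loaded Language']
print(leaves)     # e.g. ['Name calling/Labeling', 'Loaded Language']
```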
model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f22069b784f1dbe6dcdcfd85c6941a4330a81cdfd5e3d4996246fd6a500a877c
+size 2447904292
requirements.txt
ADDED
@@ -0,0 +1,7 @@
+torch
+numpy
+networkx
+transformers
+tqdm
+sentencepiece
+protobuf
sentencepiece.bpe.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
+size 5069051
special_tokens_map.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1daf98a95c01a96007f2ab65fed7b31641e363f2de94f82e06d948b7855ed21d
+size 992
tokenizer.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:86f983b6563a9468794455498914bda0eaf9a60e5c9cd5a21669a24a625e490d
+size 17109921
tokenizer_config.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9aff9ce0b78ebc744dba8ac8ebf44d7e675d581e97971edb35b1203926d9586
+size 10924