cointegrated
committed on
Commit
•
2fa46f3
1
Parent(s):
4f6879b
Update README.md
Browse files
README.md
CHANGED
@@ -46,4 +46,27 @@ set | ROC AUC
|
|
46 |
detox | 0.857112
|
47 |
paraphraser | 0.858465
|
48 |
rupaws_qqp | 0.859195
|
49 |
-
rupaws_wiki | 0.906121
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
detox | 0.857112
|
47 |
paraphraser | 0.858465
|
48 |
rupaws_qqp | 0.859195
|
49 |
+
rupaws_wiki | 0.906121
|
50 |
+
|
51 |
+
Example usage:
|
52 |
+
|
53 |
+
```Python
|
54 |
+
import torch
|
55 |
+
from transformers import AutoModelForSequenceClassification, AutoTokenizer
|
56 |
+
|
57 |
+
model = AutoModelForSequenceClassification.from_pretrained('SkolkovoInstitute/ruRoberta-large-paraphrase-v1')
|
58 |
+
tokenizer = AutoTokenizer.from_pretrained('SkolkovoInstitute/ruRoberta-large-paraphrase-v1')
|
59 |
+
|
60 |
+
def get_similarity(text1, text2):
|
61 |
+
""" Predict the probability that two Russian sentences are paraphrases of each other. """
|
62 |
+
with torch.inference_mode():
|
63 |
+
batch = tokenizer(
|
64 |
+
text1, text2,
|
65 |
+
truncation=True, max_length=model.config.max_position_embeddings, return_tensors='pt',
|
66 |
+
).to(model.device)
|
67 |
+
proba = torch.softmax(model(**batch).logits, -1)
|
68 |
+
return proba[0][1].item()
|
69 |
+
|
70 |
+
print(get_similarity('Я тебя люблю', 'Ты мне нравишься')) # 0.9798
|
71 |
+
print(get_similarity('Я тебя люблю', 'Я тебя ненавижу')) # 0.0008
|
72 |
+
```
|