llm-blender
/

PairRM-hf

Text Generation

Inference Endpoints

Model card Files Files and versions Community

Dongfu Jiang committed on Jan 8

Commit

342b809

•

1 Parent(s): 8841a8a

Update README.md

Files changed (1) hide show

README.md +7 -5

README.md CHANGED Viewed

@@ -38,15 +38,17 @@ cand2_prefix = "<|candidate2|>"
 inputs = ["hello!", "I love you!"]
 candidates_A = ["hi!", "I hate you!"]
 candidates_B = ["f**k off!", "I love you, too!"]
-def tokenize_pair(sources:List[str], candidate1s:List[str], candidate2s:List[str]):
     ids = []
     assert len(sources) == len(candidate1s) == len(candidate2s)
     for i in range(len(sources)):
-        source_ids = tokenizer.encode(source_prefix + sources[i])
-        candidate1_ids = tokenizer.encode(cand1_prefix + candidate1s[i])
-        candidate2_ids = tokenizer.encode(cand2_prefix + candidate2s[i])
         ids.append(source_ids + candidate1_ids + candidate2_ids)
-    encodings = tokenizer.pad({"input_ids": ids}, return_tensors="pt")
     return encodings
 encodings = tokenize_pair(inputs, candidates_A, candidates_B)

 inputs = ["hello!", "I love you!"]
 candidates_A = ["hi!", "I hate you!"]
 candidates_B = ["f**k off!", "I love you, too!"]
+def tokenize_pair(sources:List[str], candidate1s:List[str], candidate2s:List[str], source_max_length=1224, candidate_max_length=412):
     ids = []
     assert len(sources) == len(candidate1s) == len(candidate2s)
+    max_length = source_max_length + 2 * candidate_max_length
     for i in range(len(sources)):
+        source_ids = tokenizer.encode(source_prefix + sources[i], max_length=source_max_length, truncation=True)
+        candidate_max_length = (max_length - len(source_ids)) // 2
+        candidate1_ids = tokenizer.encode(cand1_prefix + candidate1s[i], max_length=candidate_max_length, truncation=True)
+        candidate2_ids = tokenizer.encode(cand2_prefix + candidate2s[i], max_length=candidate_max_length, truncation=True)
         ids.append(source_ids + candidate1_ids + candidate2_ids)
+    encodings = tokenizer.pad({"input_ids": ids}, return_tensors="pt", padding=True, max_length=max_length)
     return encodings
 encodings = tokenize_pair(inputs, candidates_A, candidates_B)