real-time-tokenizer / test_transformers.py
bayartsogt's picture
trust_remote_code=True
0afb4f9
raw
history blame contribute delete
477 Bytes
from transformers import AutoTokenizer

# Smoke test for the Mongolian BERT tokenizer: encode a sample sentence and
# print the raw token ids, each id's individual decoding, and the full
# tokenizer() call output (ids + attention mask etc.).
#
# NOTE(review): the surrounding commit message suggests this repo may require
# trust_remote_code=True to load its custom tokenizer code — confirm against
# the model card before relying on this call as written.
tokenizer = AutoTokenizer.from_pretrained('tugstugi/bert-large-mongolian-cased', use_fast=False)

test_input = "Мөнгөө тушаачихсаныхаа дараа мэдэгдээрэй"
print("input:", test_input)

# Encode once and reuse: the original invoked tokenizer.encode() twice,
# doing the same tokenization work a second time for no benefit.
token_ids = tokenizer.encode(test_input)
print("tokenizer.encode()", token_ids)
print("tokenizer decode", [(tokenizer.decode(token_id), token_id) for token_id in token_ids])
print("tokenizer()", tokenizer(test_input))