""" 封装 sentencepiece.SentencePieceProcessor,以便符合transformers中的tokenizer标准 | |
## reference | |
## usage | |
- grok | |
""" | |
import sentencepiece as spm
from transformers import PreTrainedTokenizer


class SPTokenizerWrapper(PreTrainedTokenizer):
    """
    ## impl in PreTrainedTokenizer
    - convert_ids_to_tokens
    """
    def __init__(self, vocab_file):
        self.vocab_file = vocab_file
        self.sp_model = spm.SentencePieceProcessor(model_file=self.vocab_file)
        # Call super().__init__() only after sp_model exists: the base
        # constructor queries the vocab during initialization.
        super().__init__()
    @property
    def vocab_size(self):
        """Returns vocab size."""
        # Must be a property: get_vocab() below uses self.vocab_size as an attribute.
        return self.sp_model.get_piece_size()

    def get_vocab(self):
        """Returns the vocab as a dict mapping token (str) -> id (int)."""
        return {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
    def _convert_token_to_id(self, token):
        """Converts a token (str) to an id using the vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        return self.sp_model.IdToPiece(index)
    # convert_ids_to_tokens(self, ids, skip_special_tokens=False) is already
    # implemented in PreTrainedTokenizer, so it is not overridden here.
    def encode(self, *args, **kwargs):
        # Drop transformers-specific kwargs that SentencePiece's Encode() does not accept.
        kwargs.pop("add_special_tokens", None)
        kwargs.pop("allowed_special", None)
        return self.sp_model.Encode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        kwargs.pop("skip_special_tokens", None)
        return self.sp_model.Decode(*args, **kwargs)
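
# Usage sketch, assuming a trained SentencePiece model file is available.
# The path "tokenizer.model" and the sample text are illustrative, not from
# the original source:
#
#     tokenizer = SPTokenizerWrapper("tokenizer.model")
#     ids = tokenizer.encode("Hello world")          # list[int] via sp_model.Encode
#     text = tokenizer.decode(ids)                   # str via sp_model.Decode
#     tokens = tokenizer.convert_ids_to_tokens(ids)  # inherited from PreTrainedTokenizer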