""" 封装 sentencepiece.SentencePieceProcessor,以便符合transformers中的tokenizer标准
## reference
## usage
- grok
"""
import sentencepiece as spm
from transformers import PreTrainedTokenizer


class SPTokenizerWrapper(PreTrainedTokenizer):
    """
    ## implemented in PreTrainedTokenizer
    - convert_ids_to_tokens
    """

    def __init__(self, vocab_file):
        self.vocab_file = vocab_file
        # Load the SentencePiece model before calling super().__init__():
        # the PreTrainedTokenizer constructor queries the vocab (get_vocab /
        # vocab_size) in recent transformers versions.
        self.sp_model = spm.SentencePieceProcessor(self.vocab_file)
        super().__init__()

    @property
    def vocab_size(self):
        """Returns vocab size"""
        return self.sp_model.get_piece_size()

    def get_vocab(self):
        """Returns vocab as a dict"""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        return vocab

    def _convert_token_to_id(self, token):
        """Converts a token (str) into an id using the vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) into a token (str) using the vocab."""
        return self.sp_model.IdToPiece(index)

    # convert_ids_to_tokens(self, ids, skip_special_tokens=False) is implemented in PreTrainedTokenizer

    def encode(self, *args, **kwargs):
        # Drop transformers-specific kwargs that sentencepiece's Encode does not accept.
        kwargs.pop("add_special_tokens", None)
        kwargs.pop("allowed_special", None)
        return self.sp_model.Encode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        # Drop the transformers-specific kwarg that sentencepiece's Decode does not accept.
        kwargs.pop("skip_special_tokens", None)
        return self.sp_model.Decode(*args, **kwargs)

    # PreTrainedTokenizer.convert_ids_to_tokens
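

# A minimal usage sketch of the wrapper. The model path "tokenizer.model" is
# hypothetical (not part of the original file): substitute any trained
# SentencePiece model file.
if __name__ == "__main__":
    tokenizer = SPTokenizerWrapper("tokenizer.model")
    print(tokenizer.vocab_size)

    ids = tokenizer.encode("hello world")        # forwarded to sp_model.Encode
    print(ids)
    print(tokenizer.decode(ids))                 # forwarded to sp_model.Decode
    print(tokenizer.convert_ids_to_tokens(ids))  # inherited from PreTrainedTokenizer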