Problems with tokenizer
#3
by
do-me
- opened
I tried to run this model on my CUDA GPU, but I get the following error when running the sample code. Do you have any idea why?
from transformers import AutoTokenizer, AutoModelWithLMHead, TranslationPipeline
pipeline = TranslationPipeline(
model=AutoModelWithLMHead.from_pretrained("SEBIS/legal_t5_small_trans_it_en_small_finetuned"),
tokenizer=AutoTokenizer.from_pretrained(pretrained_model_name_or_path = "SEBIS/legal_t5_small_trans_it_en", do_lower_case=False,
skip_special_tokens=True),
device=0
)
it_text = "Supplenti presenti al momento della votazione finale"
pipeline([it_text], max_length=512)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
/tmp/ipykernel_2819/3211885403.py in <module>
3 pipeline = TranslationPipeline(
4 model=AutoModelWithLMHead.from_pretrained("SEBIS/legal_t5_small_trans_it_en_small_finetuned"),
----> 5 tokenizer=AutoTokenizer.from_pretrained(pretrained_model_name_or_path = "SEBIS/legal_t5_small_trans_it_en", do_lower_case=False,
6 skip_special_tokens=True),
7 device=0
~/.local/lib/python3.8/site-packages/transformers/models/auto/tokenization_auto.py in from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
657 tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)]
658 if tokenizer_class_fast and (use_fast or tokenizer_class_py is None):
--> 659 return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
660 else:
661 if tokenizer_class_py is not None:
~/.local/lib/python3.8/site-packages/transformers/tokenization_utils_base.py in from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs)
1799 logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}")
1800
-> 1801 return cls._from_pretrained(
1802 resolved_vocab_files,
1803 pretrained_model_name_or_path,
~/.local/lib/python3.8/site-packages/transformers/tokenization_utils_base.py in _from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, use_auth_token, cache_dir, local_files_only, _commit_hash, *init_inputs, **kwargs)
1954 # Instantiate tokenizer.
1955 try:
-> 1956 tokenizer = cls(*init_inputs, **init_kwargs)
1957 except OSError:
1958 raise OSError(
~/.local/lib/python3.8/site-packages/transformers/models/t5/tokenization_t5_fast.py in __init__(self, vocab_file, tokenizer_file, eos_token, unk_token, pad_token, extra_ids, additional_special_tokens, **kwargs)
131 )
132
--> 133 super().__init__(
134 vocab_file,
135 tokenizer_file=tokenizer_file,
~/.local/lib/python3.8/site-packages/transformers/tokenization_utils_fast.py in __init__(self, *args, **kwargs)
112 elif slow_tokenizer is not None:
113 # We need to convert a slow tokenizer to build the backend
--> 114 fast_tokenizer = convert_slow_tokenizer(slow_tokenizer)
115 elif self.slow_tokenizer_class is not None:
116 # We need to create and convert a slow tokenizer to build the backend
~/.local/lib/python3.8/site-packages/transformers/convert_slow_tokenizer.py in convert_slow_tokenizer(transformer_tokenizer)
1160 converter_class = SLOW_TO_FAST_CONVERTERS[tokenizer_class_name]
1161
-> 1162 return converter_class(transformer_tokenizer).converted()
~/.local/lib/python3.8/site-packages/transformers/convert_slow_tokenizer.py in __init__(self, *args)
436 super().__init__(*args)
437
--> 438 from .utils import sentencepiece_model_pb2 as model_pb2
439
440 m = model_pb2.ModelProto()
~/.local/lib/python3.8/site-packages/transformers/utils/sentencepiece_model_pb2.py in <module>
90 create_key=_descriptor._internal_create_key,
91 values=[
---> 92 _descriptor.EnumValueDescriptor(
93 name="UNIGRAM",
94 index=0,
~/.local/lib/python3.8/site-packages/google/protobuf/descriptor.py in __new__(cls, name, index, number, type, options, serialized_options, create_key)
794 type=None, # pylint: disable=redefined-builtin
795 options=None, serialized_options=None, create_key=None):
--> 796 _message.Message._CheckCalledFromGeneratedFile()
797 # There is no way we can build a complete EnumValueDescriptor with the
798 # given parameters (the name of the Enum is not known, for example).
TypeError: Descriptors cannot not be created directly.
If this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.
If you cannot immediately regenerate your protos, some other possible workarounds are:
1. Downgrade the protobuf package to 3.20.x or lower.
2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower).
More information: https://developers.google.com/protocol-buffers/docs/news/2022-05-06#python-updates