RuntimeError: Internal: could not parse ModelProto
#1
by deepdml - opened
transformers version: 5.5.3
from transformers import MarianMTModel, MarianTokenizer
model_name = "Helsinki-NLP/opus-mt_tiny_cat-spa"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
tok = tokenizer("El concepte prové de la Xina, on la flor del cirerer era la més apreciada.", return_tensors="pt").input_ids
output = model.generate(tok)[0]
tokenizer.decode(output, skip_special_tokens=True)
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
/tmp/ipykernel_11981/2099727806.py in <cell line: 0>()
1 from transformers import MarianMTModel, MarianTokenizer
2 model_name = "Helsinki-NLP/opus-mt_tiny_cat-spa"
----> 3 tokenizer = MarianTokenizer.from_pretrained(model_name)
4 model = MarianMTModel.from_pretrained(model_name)
5 tok = tokenizer("El concepte prové de la Xina, on la flor del cirerer era la més apreciada.", return_tensors="pt").input_ids
6 frames
/usr/local/lib/python3.12/dist-packages/transformers/tokenization_utils_base.py in from_pretrained(cls, pretrained_model_name_or_path, cache_dir, force_download, local_files_only, token, revision, trust_remote_code, *init_inputs, **kwargs)
1727 continue
1728
-> 1729 return cls._from_pretrained(
1730 resolved_vocab_files,
1731 pretrained_model_name_or_path,
/usr/local/lib/python3.12/dist-packages/transformers/tokenization_utils_base.py in _from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, token, cache_dir, local_files_only, _commit_hash, _is_local, trust_remote_code, *init_inputs, **kwargs)
1929 ) from e
1930 else:
-> 1931 raise e
1932 except OSError:
1933 raise OSError(
/usr/local/lib/python3.12/dist-packages/transformers/tokenization_utils_base.py in _from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, token, cache_dir, local_files_only, _commit_hash, _is_local, trust_remote_code, *init_inputs, **kwargs)
1916
1917 try:
-> 1918 tokenizer = cls(*init_inputs, **init_kwargs)
1919 except import_protobuf_decode_error():
1920 raise RuntimeError(
/usr/local/lib/python3.12/dist-packages/transformers/models/marian/tokenization_marian.py in __init__(self, source_spm, target_spm, vocab, target_vocab_file, source_lang, target_lang, unk_token, eos_token, pad_token, model_max_length, sp_model_kwargs, separate_vocabs, **kwargs)
144 # load SentencePiece model for pre-processing
145 self.spm_source = load_spm(source_spm, self.sp_model_kwargs)
--> 146 self.spm_target = load_spm(target_spm, self.sp_model_kwargs)
147 self.current_spm = self.spm_source
148 self.current_encoder = self.encoder
/usr/local/lib/python3.12/dist-packages/transformers/models/marian/tokenization_marian.py in load_spm(path, sp_model_kwargs)
409 def load_spm(path: str, sp_model_kwargs: dict[str, Any]) -> sentencepiece.SentencePieceProcessor:
410 spm = sentencepiece.SentencePieceProcessor(**sp_model_kwargs)
--> 411 spm.Load(path)
412 return spm
413
/usr/local/lib/python3.12/dist-packages/sentencepiece/__init__.py in Load(self, model_file, model_proto)
959 if model_proto:
960 return self.LoadFromSerializedProto(model_proto)
--> 961 return self.LoadFromFile(model_file)
962
963
/usr/local/lib/python3.12/dist-packages/sentencepiece/__init__.py in LoadFromFile(self, arg)
314
315 def LoadFromFile(self, arg):
--> 316 return _sentencepiece.SentencePieceProcessor_LoadFromFile(self, arg)
317
318 def _EncodeAsIds(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece):
RuntimeError: Internal: could not parse ModelProto from /root/.cache/huggingface/hub/models--Helsinki-NLP--opus-mt_tiny_cat-spa/snapshots/cbf518e0e6d9a977514e179dd018a668d3d76fe3/target.spm
The same code works in the same environment with "Helsinki-NLP/opus-mt_tiny_fra-eng":
from transformers import MarianMTModel, MarianTokenizer
model_name = "Helsinki-NLP/opus-mt_tiny_fra-eng"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
tok = tokenizer("Les efforts visant à trouver le lieu de l’accident sont restreints par des intempéries et le terrain accidenté.", return_tensors="pt").input_ids
output = model.generate(tok)[0]
tokenizer.decode(output, skip_special_tokens=True)
>>> Efforts to find the scene of the accident are restricted by weather and the rugged terrain.