RuntimeError: Internal: could not parse ModelProto
#1
by deepdml - opened
transformers version: 5.5.3
from transformers import MarianMTModel, MarianTokenizer
model_name = "Helsinki-NLP/opus-mt_tiny_cat-spa"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
tok = tokenizer("El concepte prové de la Xina, on la flor del cirerer era la més apreciada.", return_tensors="pt").input_ids
output = model.generate(tok)[0]
tokenizer.decode(output, skip_special_tokens=True)
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
/tmp/ipykernel_11981/2099727806.py in <cell line: 0>()
1 from transformers import MarianMTModel, MarianTokenizer
2 model_name = "Helsinki-NLP/opus-mt_tiny_cat-spa"
----> 3 tokenizer = MarianTokenizer.from_pretrained(model_name)
4 model = MarianMTModel.from_pretrained(model_name)
5 tok = tokenizer("El concepte prové de la Xina, on la flor del cirerer era la més apreciada.", return_tensors="pt").input_ids
6 frames
/usr/local/lib/python3.12/dist-packages/transformers/tokenization_utils_base.py in from_pretrained(cls, pretrained_model_name_or_path, cache_dir, force_download, local_files_only, token, revision, trust_remote_code, *init_inputs, **kwargs)
1727 continue
1728
-> 1729 return cls._from_pretrained(
1730 resolved_vocab_files,
1731 pretrained_model_name_or_path,
/usr/local/lib/python3.12/dist-packages/transformers/tokenization_utils_base.py in _from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, token, cache_dir, local_files_only, _commit_hash, _is_local, trust_remote_code, *init_inputs, **kwargs)
1929 ) from e
1930 else:
-> 1931 raise e
1932 except OSError:
1933 raise OSError(
/usr/local/lib/python3.12/dist-packages/transformers/tokenization_utils_base.py in _from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, token, cache_dir, local_files_only, _commit_hash, _is_local, trust_remote_code, *init_inputs, **kwargs)
1916
1917 try:
-> 1918 tokenizer = cls(*init_inputs, **init_kwargs)
1919 except import_protobuf_decode_error():
1920 raise RuntimeError(
/usr/local/lib/python3.12/dist-packages/transformers/models/marian/tokenization_marian.py in __init__(self, source_spm, target_spm, vocab, target_vocab_file, source_lang, target_lang, unk_token, eos_token, pad_token, model_max_length, sp_model_kwargs, separate_vocabs, **kwargs)
144 # load SentencePiece model for pre-processing
145 self.spm_source = load_spm(source_spm, self.sp_model_kwargs)
--> 146 self.spm_target = load_spm(target_spm, self.sp_model_kwargs)
147 self.current_spm = self.spm_source
148 self.current_encoder = self.encoder
/usr/local/lib/python3.12/dist-packages/transformers/models/marian/tokenization_marian.py in load_spm(path, sp_model_kwargs)
409 def load_spm(path: str, sp_model_kwargs: dict[str, Any]) -> sentencepiece.SentencePieceProcessor:
410 spm = sentencepiece.SentencePieceProcessor(**sp_model_kwargs)
--> 411 spm.Load(path)
412 return spm
413
/usr/local/lib/python3.12/dist-packages/sentencepiece/__init__.py in Load(self, model_file, model_proto)
959 if model_proto:
960 return self.LoadFromSerializedProto(model_proto)
--> 961 return self.LoadFromFile(model_file)
962
963
/usr/local/lib/python3.12/dist-packages/sentencepiece/__init__.py in LoadFromFile(self, arg)
314
315 def LoadFromFile(self, arg):
--> 316 return _sentencepiece.SentencePieceProcessor_LoadFromFile(self, arg)
317
318 def _EncodeAsIds(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece):
RuntimeError: Internal: could not parse ModelProto from /root/.cache/huggingface/hub/models--Helsinki-NLP--opus-mt_tiny_cat-spa/snapshots/cbf518e0e6d9a977514e179dd018a668d3d76fe3/target.spm
The same code works in the same environment with "Helsinki-NLP/opus-mt_tiny_fra-eng":
from transformers import MarianMTModel, MarianTokenizer
model_name = "Helsinki-NLP/opus-mt_tiny_fra-eng"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
tok = tokenizer("Les efforts visant à trouver le lieu de l’accident sont restreints par des intempéries et le terrain accidenté.", return_tensors="pt").input_ids
output = model.generate(tok)[0]
tokenizer.decode(output, skip_special_tokens=True)
>>> Efforts to find the scene of the accident are restricted by weather and the rugged terrain.