Translation
LiteRT
Safetensors
Spanish
Catalan
marian

RuntimeError: Internal: could not parse ModelProto

#1
by deepdml - opened

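transformers version: 5.5.3. Loading the tokenizer with the following code raises the RuntimeError shown in the traceback below: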

from transformers import MarianMTModel, MarianTokenizer
model_name = "Helsinki-NLP/opus-mt_tiny_cat-spa"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
tok = tokenizer("El concepte prové de la Xina, on la flor del cirerer era la més apreciada.", return_tensors="pt").input_ids
output = model.generate(tok)[0]
tokenizer.decode(output, skip_special_tokens=True)
---------------------------------------------------------------------------

RuntimeError                              Traceback (most recent call last)

/tmp/ipykernel_11981/2099727806.py in <cell line: 0>()
      1 from transformers import MarianMTModel, MarianTokenizer
      2 model_name = "Helsinki-NLP/opus-mt_tiny_cat-spa"
----> 3 tokenizer = MarianTokenizer.from_pretrained(model_name)
      4 model = MarianMTModel.from_pretrained(model_name)
      5 tok = tokenizer("El concepte prové de la Xina, on la flor del cirerer era la més apreciada.", return_tensors="pt").input_ids

6 frames

/usr/local/lib/python3.12/dist-packages/transformers/tokenization_utils_base.py in from_pretrained(cls, pretrained_model_name_or_path, cache_dir, force_download, local_files_only, token, revision, trust_remote_code, *init_inputs, **kwargs)
   1727                 continue
   1728 
-> 1729         return cls._from_pretrained(
   1730             resolved_vocab_files,
   1731             pretrained_model_name_or_path,

/usr/local/lib/python3.12/dist-packages/transformers/tokenization_utils_base.py in _from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, token, cache_dir, local_files_only, _commit_hash, _is_local, trust_remote_code, *init_inputs, **kwargs)
   1929                 ) from e
   1930             else:
-> 1931                 raise e
   1932         except OSError:
   1933             raise OSError(

/usr/local/lib/python3.12/dist-packages/transformers/tokenization_utils_base.py in _from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, token, cache_dir, local_files_only, _commit_hash, _is_local, trust_remote_code, *init_inputs, **kwargs)
   1916 
   1917         try:
-> 1918             tokenizer = cls(*init_inputs, **init_kwargs)
   1919         except import_protobuf_decode_error():
   1920             raise RuntimeError(

/usr/local/lib/python3.12/dist-packages/transformers/models/marian/tokenization_marian.py in __init__(self, source_spm, target_spm, vocab, target_vocab_file, source_lang, target_lang, unk_token, eos_token, pad_token, model_max_length, sp_model_kwargs, separate_vocabs, **kwargs)
    144         # load SentencePiece model for pre-processing
    145         self.spm_source = load_spm(source_spm, self.sp_model_kwargs)
--> 146         self.spm_target = load_spm(target_spm, self.sp_model_kwargs)
    147         self.current_spm = self.spm_source
    148         self.current_encoder = self.encoder

/usr/local/lib/python3.12/dist-packages/transformers/models/marian/tokenization_marian.py in load_spm(path, sp_model_kwargs)
    409 def load_spm(path: str, sp_model_kwargs: dict[str, Any]) -> sentencepiece.SentencePieceProcessor:
    410     spm = sentencepiece.SentencePieceProcessor(**sp_model_kwargs)
--> 411     spm.Load(path)
    412     return spm
    413 

/usr/local/lib/python3.12/dist-packages/sentencepiece/__init__.py in Load(self, model_file, model_proto)
    959       if model_proto:
    960         return self.LoadFromSerializedProto(model_proto)
--> 961       return self.LoadFromFile(model_file)
    962 
    963 

/usr/local/lib/python3.12/dist-packages/sentencepiece/__init__.py in LoadFromFile(self, arg)
    314 
    315     def LoadFromFile(self, arg):
--> 316         return _sentencepiece.SentencePieceProcessor_LoadFromFile(self, arg)
    317 
    318     def _EncodeAsIds(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece):

RuntimeError: Internal: could not parse ModelProto from /root/.cache/huggingface/hub/models--Helsinki-NLP--opus-mt_tiny_cat-spa/snapshots/cbf518e0e6d9a977514e179dd018a668d3d76fe3/target.spm

In the same environment, the equivalent code runs without error for "Helsinki-NLP/opus-mt_tiny_fra-eng":

from transformers import MarianMTModel, MarianTokenizer
model_name = "Helsinki-NLP/opus-mt_tiny_fra-eng"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
tok = tokenizer("Les efforts visant à trouver le lieu de l’accident sont restreints par des intempéries et le terrain accidenté.", return_tensors="pt").input_ids
output = model.generate(tok)[0]
tokenizer.decode(output, skip_special_tokens=True)
>>> Efforts to find the scene of the accident are restricted by weather and the rugged terrain.

Sign up or log in to comment