sneakyfree committed on
Commit
c3f201b
·
verified ·
1 Parent(s): faf40e0

upstream-archive byte-perfect snapshot of Helsinki-NLP/opus-mt-eo-caenes (ADR-039 Phase D)

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ source.spm filter=lfs diff=lfs merge=lfs -text
37
+ target.spm filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - eo
4
+ - en
5
+ - es
6
+ - ca
7
+ tags:
8
+ - translation
9
+ - machine-translation
10
+ - marian
11
+ - opus-mt
12
+ - multilingual
13
+ license: cc-by-4.0
14
+ pipeline_tag: translation
15
+ metrics:
16
+ - bleu
17
+ - chrf
18
+ ---
19
+
20
+ # Esperanto -> Catalan, English, Spanish MT Model
21
+
22
+ ## Model description
23
+
24
+ This repository contains a **multilingual MarianMT** model for **Esperanto → (English, Spanish, Catalan)** translation using language tags.
25
+
26
+ ## Usage
27
+
28
+ Load the model and translate with `transformers` as follows:
29
+
30
+ ```python
31
+ from transformers import MarianMTModel, MarianTokenizer
32
+ import torch
33
+
34
+ model_name = "Helsinki-NLP/opus-mt-eo-caenes"
35
+
36
+ device = "cuda" if torch.cuda.is_available() else "cpu"
37
+ model = MarianMTModel.from_pretrained(model_name).to(device)
38
+ tokenizer = MarianTokenizer.from_pretrained(model_name)
39
+
40
+ source_texts = [
41
+ ">>spa<< Saluton, kiel vi fartas?",
42
+ ">>eng<< Saluton, kiel vi fartas?",
43
+ ">>cat<< Saluton, kiel vi fartas?"
44
+ ]
45
+
46
+ inputs = tokenizer(source_texts, return_tensors="pt", padding=True, truncation=True)
47
+ inputs = {k: v.to(device) for k, v in inputs.items()}
48
+
49
+ translated_ids = model.generate(inputs["input_ids"])
50
+ translated_texts = tokenizer.batch_decode(translated_ids, skip_special_tokens=True)
51
+
52
+ for src, tgt in zip(source_texts, translated_texts):
53
+ print(f"Source: {src} => Translated: {tgt}")
54
+ ```
55
+
56
+ ### Supported target languages (via tags)
57
+
58
+ You control the target language by prefixing the source sentence with one of the following tags:
59
+
60
+ * `>>eng<<` → English
61
+ * `>>spa<<` → Spanish
62
+ * `>>cat<<` → Catalan
63
+
64
+ ## Training data
65
+
66
+ The model was trained using **Tatoeba** parallel data, with **FLORES-200** used as the development set.
67
+
68
+ Training sentence-pair counts:
69
+
70
+ * **ca-eo**: 672,931
71
+ * **es-eo**: 4,677,945
72
+ * **eo-en**: 5,000,000
73
+
74
+ ## Evaluation on FLORES
75
+
76
+ | Language Pair | BLEU | ChrF++ |
77
+ | ------------- | ----: | ----: |
78
+ | epo-spa | 19.98 | 49.11 |
79
+ | epo-cat | 28.35 | 55.42 |
80
+ | epo-eng | 37.47 | 63.09 |
added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<s>": 32001
3
+ }
config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.0,
3
+ "activation_function": "relu",
4
+ "architectures": [
5
+ "MarianMTModel"
6
+ ],
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 0,
9
+ "d_model": 512,
10
+ "decoder_attention_heads": 8,
11
+ "decoder_ffn_dim": 2048,
12
+ "decoder_layerdrop": 0.0,
13
+ "decoder_layers": 6,
14
+ "decoder_start_token_id": 32000,
15
+ "decoder_vocab_size": 32001,
16
+ "dropout": 0.1,
17
+ "dtype": "float16",
18
+ "encoder_attention_heads": 8,
19
+ "encoder_ffn_dim": 2048,
20
+ "encoder_layerdrop": 0.0,
21
+ "encoder_layers": 6,
22
+ "eos_token_id": 0,
23
+ "forced_eos_token_id": 0,
24
+ "init_std": 0.02,
25
+ "is_encoder_decoder": true,
26
+ "max_length": null,
27
+ "max_position_embeddings": 512,
28
+ "model_type": "marian",
29
+ "normalize_embedding": false,
30
+ "num_beams": null,
31
+ "num_hidden_layers": 6,
32
+ "pad_token_id": 32000,
33
+ "scale_embedding": true,
34
+ "share_encoder_decoder_embeddings": true,
35
+ "static_position_embeddings": true,
36
+ "transformers_version": "4.57.3",
37
+ "use_cache": true,
38
+ "vocab_size": 32001
39
+ }
generation_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bad_words_ids": [
4
+ [
5
+ 32000
6
+ ]
7
+ ],
8
+ "bos_token_id": 0,
9
+ "decoder_start_token_id": 32000,
10
+ "eos_token_id": 0,
11
+ "forced_eos_token_id": 0,
12
+ "max_length": 512,
13
+ "num_beams": 8,
14
+ "pad_token_id": 32000,
15
+ "transformers_version": "4.57.3"
16
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38b4b78e7bb9206834d231ae41785165b17f8deae57a5c286f1b80b982a9d7fa
3
+ size 153908130
source.spm ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d561cdf0fc7ad693c1bf1fe21732c6434650623ec69dc712aceb36483587914d
3
+ size 805644
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<pad>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
target.spm ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d561cdf0fc7ad693c1bf1fe21732c6434650623ec69dc712aceb36483587914d
3
+ size 805644
test.cat.out ADDED
The diff for this file is too large to render. See raw diff
 
test.eng.out ADDED
The diff for this file is too large to render. See raw diff
 
test.spa.out ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "</s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<unk>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "32000": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "32001": {
28
+ "content": "<s>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ }
35
+ },
36
+ "bos_token": "<s>",
37
+ "clean_up_tokenization_spaces": false,
38
+ "eos_token": "</s>",
39
+ "extra_special_tokens": {},
40
+ "model_max_length": 512,
41
+ "pad_token": "<pad>",
42
+ "separate_vocabs": false,
43
+ "source_lang": null,
44
+ "sp_model_kwargs": {},
45
+ "target_lang": null,
46
+ "tokenizer_class": "MarianTokenizer",
47
+ "unk_token": "<unk>"
48
+ }
train.log ADDED
The diff for this file is too large to render. See raw diff
 
vocab.json ADDED
The diff for this file is too large to render. See raw diff