LH-Tech-AI committed on
Commit
d467b07
·
verified ·
1 Parent(s): 3e79a89

Upload 11 files

Browse files
benchmarks.md ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ | Tasks |Version|Filter|n-shot| Metric | | Value | |Stderr|
2
+ |------------------------------------------------------------|------:|------|-----:|---------------|---|-------:|---|------|
3
+ |arc_easy | 1|none | 0|acc |↑ | 0.3439|Β± |0.0097|
4
+ | | |none | 0|acc_norm |↑ | 0.3346|Β± |0.0097|
5
+ |blimp | 2|none | |acc |↑ | 0.6349|Β± |0.0016|
6
+ | - blimp_adjunct_island | 1|none | 0|acc |↑ | 0.7990|Β± |0.0127|
7
+ | - blimp_anaphor_gender_agreement | 1|none | 0|acc |↑ | 0.3290|Β± |0.0149|
8
+ | - blimp_anaphor_number_agreement | 1|none | 0|acc |↑ | 0.6330|Β± |0.0152|
9
+ | - blimp_animate_subject_passive | 1|none | 0|acc |↑ | 0.6550|Β± |0.0150|
10
+ | - blimp_animate_subject_trans | 1|none | 0|acc |↑ | 0.8180|Β± |0.0122|
11
+ | - blimp_causative | 1|none | 0|acc |↑ | 0.4900|Β± |0.0158|
12
+ | - blimp_complex_NP_island | 1|none | 0|acc |↑ | 0.5300|Β± |0.0158|
13
+ | - blimp_coordinate_structure_constraint_complex_left_branch| 1|none | 0|acc |↑ | 0.2990|Β± |0.0145|
14
+ | - blimp_coordinate_structure_constraint_object_extraction | 1|none | 0|acc |↑ | 0.7550|Β± |0.0136|
15
+ | - blimp_determiner_noun_agreement_1 | 1|none | 0|acc |↑ | 0.7910|Β± |0.0129|
16
+ | - blimp_determiner_noun_agreement_2 | 1|none | 0|acc |↑ | 0.8640|Β± |0.0108|
17
+ | - blimp_determiner_noun_agreement_irregular_1 | 1|none | 0|acc |↑ | 0.7020|Β± |0.0145|
18
+ | - blimp_determiner_noun_agreement_irregular_2 | 1|none | 0|acc |↑ | 0.8460|Β± |0.0114|
19
+ | - blimp_determiner_noun_agreement_with_adj_2 | 1|none | 0|acc |↑ | 0.7370|Β± |0.0139|
20
+ | - blimp_determiner_noun_agreement_with_adj_irregular_1 | 1|none | 0|acc |↑ | 0.5780|Β± |0.0156|
21
+ | - blimp_determiner_noun_agreement_with_adj_irregular_2 | 1|none | 0|acc |↑ | 0.7300|Β± |0.0140|
22
+ | - blimp_determiner_noun_agreement_with_adjective_1 | 1|none | 0|acc |↑ | 0.7060|Β± |0.0144|
23
+ | - blimp_distractor_agreement_relational_noun | 1|none | 0|acc |↑ | 0.2630|Β± |0.0139|
24
+ | - blimp_distractor_agreement_relative_clause | 1|none | 0|acc |↑ | 0.2060|Β± |0.0128|
25
+ | - blimp_drop_argument | 1|none | 0|acc |↑ | 0.7110|Β± |0.0143|
26
+ | - blimp_ellipsis_n_bar_1 | 1|none | 0|acc |↑ | 0.5800|Β± |0.0156|
27
+ | - blimp_ellipsis_n_bar_2 | 1|none | 0|acc |↑ | 0.7490|Β± |0.0137|
28
+ | - blimp_existential_there_object_raising | 1|none | 0|acc |↑ | 0.7470|Β± |0.0138|
29
+ | - blimp_existential_there_quantifiers_1 | 1|none | 0|acc |↑ | 0.8450|Β± |0.0115|
30
+ | - blimp_existential_there_quantifiers_2 | 1|none | 0|acc |↑ | 0.2720|Β± |0.0141|
31
+ | - blimp_existential_there_subject_raising | 1|none | 0|acc |↑ | 0.6560|Β± |0.0150|
32
+ | - blimp_expletive_it_object_raising | 1|none | 0|acc |↑ | 0.6820|Β± |0.0147|
33
+ | - blimp_inchoative | 1|none | 0|acc |↑ | 0.4210|Β± |0.0156|
34
+ | - blimp_intransitive | 1|none | 0|acc |↑ | 0.5750|Β± |0.0156|
35
+ | - blimp_irregular_past_participle_adjectives | 1|none | 0|acc |↑ | 0.9240|Β± |0.0084|
36
+ | - blimp_irregular_past_participle_verbs | 1|none | 0|acc |↑ | 0.6800|Β± |0.0148|
37
+ | - blimp_irregular_plural_subject_verb_agreement_1 | 1|none | 0|acc |↑ | 0.7100|Β± |0.0144|
38
+ | - blimp_irregular_plural_subject_verb_agreement_2 | 1|none | 0|acc |↑ | 0.8520|Β± |0.0112|
39
+ | - blimp_left_branch_island_echo_question | 1|none | 0|acc |↑ | 0.8390|Β± |0.0116|
40
+ | - blimp_left_branch_island_simple_question | 1|none | 0|acc |↑ | 0.3810|Β± |0.0154|
41
+ | - blimp_matrix_question_npi_licensor_present | 1|none | 0|acc |↑ | 0.0060|Β± |0.0024|
42
+ | - blimp_npi_present_1 | 1|none | 0|acc |↑ | 0.5420|Β± |0.0158|
43
+ | - blimp_npi_present_2 | 1|none | 0|acc |↑ | 0.5250|Β± |0.0158|
44
+ | - blimp_only_npi_licensor_present | 1|none | 0|acc |↑ | 0.3710|Β± |0.0153|
45
+ | - blimp_only_npi_scope | 1|none | 0|acc |↑ | 0.4090|Β± |0.0156|
46
+ | - blimp_passive_1 | 1|none | 0|acc |↑ | 0.7980|Β± |0.0127|
47
+ | - blimp_passive_2 | 1|none | 0|acc |↑ | 0.7770|Β± |0.0132|
48
+ | - blimp_principle_A_c_command | 1|none | 0|acc |↑ | 0.6410|Β± |0.0152|
49
+ | - blimp_principle_A_case_1 | 1|none | 0|acc |↑ | 1.0000|Β± | 0|
50
+ | - blimp_principle_A_case_2 | 1|none | 0|acc |↑ | 0.7200|Β± |0.0142|
51
+ | - blimp_principle_A_domain_1 | 1|none | 0|acc |↑ | 0.7350|Β± |0.0140|
52
+ | - blimp_principle_A_domain_2 | 1|none | 0|acc |↑ | 0.6190|Β± |0.0154|
53
+ | - blimp_principle_A_domain_3 | 1|none | 0|acc |↑ | 0.5460|Β± |0.0158|
54
+ | - blimp_principle_A_reconstruction | 1|none | 0|acc |↑ | 0.4780|Β± |0.0158|
55
+ | - blimp_regular_plural_subject_verb_agreement_1 | 1|none | 0|acc |↑ | 0.7920|Β± |0.0128|
56
+ | - blimp_regular_plural_subject_verb_agreement_2 | 1|none | 0|acc |↑ | 0.7970|Β± |0.0127|
57
+ | - blimp_sentential_negation_npi_licensor_present | 1|none | 0|acc |↑ | 1.0000|Β± | 0|
58
+ | - blimp_sentential_negation_npi_scope | 1|none | 0|acc |↑ | 0.6700|Β± |0.0149|
59
+ | - blimp_sentential_subject_island | 1|none | 0|acc |↑ | 0.4350|Β± |0.0157|
60
+ | - blimp_superlative_quantifiers_1 | 1|none | 0|acc |↑ | 0.9270|Β± |0.0082|
61
+ | - blimp_superlative_quantifiers_2 | 1|none | 0|acc |↑ | 0.5460|Β± |0.0158|
62
+ | - blimp_tough_vs_raising_1 | 1|none | 0|acc |↑ | 0.4410|Β± |0.0157|
63
+ | - blimp_tough_vs_raising_2 | 1|none | 0|acc |↑ | 0.6850|Β± |0.0147|
64
+ | - blimp_transitive | 1|none | 0|acc |↑ | 0.7490|Β± |0.0137|
65
+ | - blimp_wh_island | 1|none | 0|acc |↑ | 0.4360|Β± |0.0157|
66
+ | - blimp_wh_questions_object_gap | 1|none | 0|acc |↑ | 0.5880|Β± |0.0156|
67
+ | - blimp_wh_questions_subject_gap | 1|none | 0|acc |↑ | 0.8800|Β± |0.0103|
68
+ | - blimp_wh_questions_subject_gap_long_distance | 1|none | 0|acc |↑ | 0.9460|Β± |0.0072|
69
+ | - blimp_wh_vs_that_no_gap | 1|none | 0|acc |↑ | 0.9810|Β± |0.0043|
70
+ | - blimp_wh_vs_that_no_gap_long_distance | 1|none | 0|acc |↑ | 0.9880|Β± |0.0034|
71
+ | - blimp_wh_vs_that_with_gap | 1|none | 0|acc |↑ | 0.1180|Β± |0.0102|
72
+ | - blimp_wh_vs_that_with_gap_long_distance | 1|none | 0|acc |↑ | 0.0340|Β± |0.0057|
73
+ |wikitext | 2|none | 0|bits_per_byte |↓ | 1.4123|Β± | N/A|
74
+ | | |none | 0|byte_perplexity|↓ | 2.6617|Β± | N/A|
75
+ | | |none | 0|word_perplexity|↓ |187.7215|Β± | N/A|
76
+
77
+ |Groups|Version|Filter|n-shot|Metric| |Value | |Stderr|
78
+ |------|------:|------|------|------|---|-----:|---|-----:|
79
+ |blimp | 2|none | |acc |↑ |0.6349|± |0.0016|
config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 0,
8
+ "dtype": "float32",
9
+ "eos_token_id": 2,
10
+ "head_dim": 48,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 192,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 768,
15
+ "max_position_embeddings": 1024,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 4,
19
+ "num_hidden_layers": 8,
20
+ "num_key_value_heads": 4,
21
+ "pad_token_id": 1,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-06,
24
+ "rope_parameters": {
25
+ "rope_theta": 500000,
26
+ "rope_type": "default"
27
+ },
28
+ "tie_word_embeddings": true,
29
+ "transformers_version": "5.8.1",
30
+ "use_cache": false,
31
+ "vocab_size": 16384
32
+ }
generation_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "eos_token_id": 2,
5
+ "output_attentions": false,
6
+ "output_hidden_states": false,
7
+ "pad_token_id": 1,
8
+ "transformers_version": "5.8.1",
9
+ "use_cache": true
10
+ }
inference.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Minimal sampling demo: loads the tokenizer and model from `model_path`
# and prints one sampled continuation of a fixed prompt.
print("[*] Loading libraries...")
import torch
from transformers import LlamaForCausalLM, PreTrainedTokenizerFast

model_path = "./Supra-Mini-v5-8M-FINAL"

print("[*] Loading tokenizer...")
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_path)

print("[*] Loading model...")
model = LlamaForCausalLM.from_pretrained(model_path)
model.eval()  # disable dropout etc. for generation

# Alternative prompts: "Artificial intelligence is" | "Once upon a time,"
#
# The prompt deliberately does NOT end with a space. The tokenizer is a
# byte-level BPE whose word tokens carry their own leading space, so a
# trailing space is encoded as a separate lone-space token and the model is
# forced to continue with a token that has no leading space. The published
# samples generated with trailing-space prompts show the effect: they start
# with ".. ..." or "_______." instead of a word.
prompt = "The main concept of physics is"
print(f"[*] Prompt: {prompt!r}")

inputs = tokenizer(prompt, return_tensors="pt")

with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=256,
        # Sampling settings: nucleus + top-k sampling at low temperature,
        # with a repetition penalty.
        do_sample=True,
        temperature=0.5,
        top_p=0.9,
        top_k=25,
        repetition_penalty=1.3,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

# outputs[0] holds the prompt tokens followed by the generated continuation.
print("[*] Output:", tokenizer.decode(outputs[0], skip_special_tokens=True))
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7d1deeaccf5b8fad50dfd4506c92980f1bff6d612290982d2be6bfc68bd32a0
3
+ size 31478384
samples_BASE.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [*] Prompt: 'Once upon a time, '
2
+ [*] Output: Once upon a time, .. ...
3
+ - "The word 'pot' is not used in the past.") (A.D., A.D.)
4
+ - "It's like a thing of course that we're going to think about it. The phrase "these" means what you see and how much are there?").
5
+ - "You have no idea where I'm so, but if my son has any thought or understanding of his name, he will be able to understand him by saying something more than one day. This way they can tell us when you've got me at home. That is why I want your child to know which words are most important for them: "If you get this language from another person, then you'll find yourself in the same place as you read it," says Mike McNamara, who was born with an English friend, Jennifer Batharinee, who had been diagnosed with dementia during her lifetime. He said he would learn how to say things such as "a lot of things," and "you don't really need to do anything else." It may seem simple because he didn't feel good before he went out and asked whether he could make sense of it. But he wanted to take advantage of the fact
6
+
7
+ [*] Prompt: 'Artificial intelligence is '
8
+ [*] Output: Artificial intelligence is _______.
9
+ I was a scientist at the University of Cambridge in London, and I had been looking for ways to get them back into the first time. It’s not just about learning how to use it but also about the world around us.
10
+ The study has led to more than 150 studies showing that people who are able to learn new things like computers, laptops, and smartphones can be found in many different types of computer systems. The researchers have demonstrated that this type of technology works well with a wide range of computing applications. They believe they need to know what they want to do, such as a laptop or iPad, which means you might find themselves doing everything on your own. This could mean we would take up to 3 times an hour if we didn't see any problems, including anything from something else. So now there is no idea how much data will go through so far. We don’t understand why these algorithms aren’t actually working on their devices. And if you think about it, then here are some of the most important things we can do to help make our decisions.
11
+ What does it say? What happens when someone comes across the internet? Why doesn’t they really feel like the Internet? How should you connect
12
+
13
+ [*] Prompt: 'The main concept of physics is '
14
+ [*] Output: The main concept of physics is _______.
15
+ This principle, which means that the universe has no energy; it should be a matter of time in its own right and therefore to be able to achieve its maximum potential. It must have been used as an alternative for quantum mechanics but not only in terms of power but also in the form of electrical systems such as magnetic fields or electric currents. The theory would include some processes like this: "We are using mechanical machines." In fact, we need to work with other types of computers, including those who do so by their computer. This is because they can actually use them to generate electricity from our devices. Thus, if you want to learn how to manufacture electronic circuits, then you will see a very simple solution - the way you think about what happens when you're trying to build up on your device.
16
+ So, why does that sound? Because there's something new than a few different things, and I'm sure that you don't know exactly where it was. But, just let me know if there were many ways to create the same thing. You might say that it could make sense to you how much one is doing at all, even though it isn't going to happen! So, if you'd ask yourself to write down the problem
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "bos_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "model_max_length": 1000000000000000019884624838656,
6
+ "pad_token": "<pad>",
7
+ "tokenizer_class": "TokenizersBackend",
8
+ "unk_token": "<unk>"
9
+ }
train.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ © SupraLabs 2026 - Official pretraining code for Supra Mini v5 8M
3
+ """
4
+
5
+ import os
6
+ os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True"
7
+ os.environ["CUDA_VISIBLE_DEVICES"] = "0"
8
+
9
+ print("[*] Loading libraries...")
10
+ import torch
11
+ import math
12
+ import numpy as np
13
+ from datasets import load_dataset
14
+ from tokenizers import ByteLevelBPETokenizer
15
+ from transformers import (
16
+ LlamaConfig,
17
+ LlamaForCausalLM,
18
+ PreTrainedTokenizerFast,
19
+ Trainer,
20
+ TrainingArguments,
21
+ )
22
+ from torch.utils.data import Dataset
23
+ from tqdm import tqdm
24
+
25
# --- Data pipeline settings -------------------------------------------------
TOKEN_BIN = "./tokens.bin"        # flat uint16 token file built by build_token_bin
TARGET_TOKENS = 5_000_000_000     # number of tokens to collect for pretraining
SEQ_LEN = 1024                    # tokens per training chunk
BATCH_TEXTS = 1000                # documents tokenized per encode_batch call
FLUSH_EVERY = 1_000_000           # buffered tokens before writing to the memmap

print("[*] Loading tokenizer...")
# Raw byte-level BPE tokenizer, loaded from a previously trained vocab/merges pair.
fast_tokenizer = ByteLevelBPETokenizer(
    "./custom_llama_tokenizer-vocab.json",
    "./custom_llama_tokenizer-merges.txt"
)
# Transformers wrapper that is saved next to the model at the end of training.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=fast_tokenizer,
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
    # Without this the saved tokenizer_config.json carries the "unbounded"
    # sentinel (~1e30) as model_max_length, so truncation=True never truncates
    # to the 1024-position context the model is configured with.
    model_max_length=SEQ_LEN,
)
43
+
44
+
45
def build_token_bin(fast_tokenizer, path=TOKEN_BIN, target_tokens=TARGET_TOKENS):
    """Stream FineWeb-Edu, tokenize it and write the ids to a flat uint16 file.

    The output at ``path`` is a raw array of ``target_tokens`` ``np.uint16``
    values (2 bytes per token). Documents are concatenated back to back in the
    order the streaming dataset yields them.

    Args:
        fast_tokenizer: object exposing ``encode_batch(list[str])`` whose
            results have an ``.ids`` list. Ids must fit in uint16.
        path: destination file. If it already exists with at least
            ``target_tokens * 2`` bytes it is reused and nothing is rebuilt.
        target_tokens: number of tokens to write.

    Returns:
        None.

    Notes:
        ``np.memmap(mode="w+")`` creates the file at its full final size
        immediately. The file is therefore built under ``path + ".partial"``
        and only renamed to ``path`` once tokenization has finished, so an
        interrupted run can never leave a mostly-zero file that passes the
        size-based reuse check above.
    """
    if os.path.exists(path) and os.path.getsize(path) >= target_tokens * 2:
        print(f"[=] Reusing existing token file: {path}")
        return

    print(f"[*] Streaming + tokenizing {target_tokens:,} tokens → {path}")
    tmp_path = path + ".partial"
    mm = np.memmap(tmp_path, dtype=np.uint16, mode="w+", shape=(target_tokens,))

    dataset = load_dataset(
        "HuggingFaceFW/fineweb-edu", "sample-10BT",
        split="train", streaming=True
    )

    written = 0   # tokens already copied into the memmap
    buf = []      # token ids waiting to be copied
    texts = []    # documents waiting to be tokenized
    pbar = tqdm(total=target_tokens, desc="[*] Gathering tokens", unit="tok")

    def flush_buf():
        """Copy buffered ids into the memmap; return True once the target is reached."""
        nonlocal written
        if not buf:
            return False
        # Never write past the end of the file; surplus ids are simply dropped.
        n = min(len(buf), target_tokens - written)
        mm[written:written + n] = np.asarray(buf[:n], dtype=np.uint16)
        written += n
        pbar.update(n)
        del buf[:n]
        return written >= target_tokens

    def encode_into_buf(batch):
        """Tokenize ``batch`` into ``buf``, flushing when large; True when the file is full."""
        for enc in fast_tokenizer.encode_batch(batch):
            buf.extend(enc.ids)
            if len(buf) >= FLUSH_EVERY and flush_buf():
                return True
        return False

    try:
        for example in dataset:
            texts.append(example["text"])
            if len(texts) >= BATCH_TEXTS:
                full = encode_into_buf(texts)
                texts.clear()
                if full:
                    # Stop streaming as soon as the target is met instead of
                    # iterating over the rest of the dataset.
                    break

        # Dataset exhausted before the target: tokenize the last partial batch.
        if written < target_tokens and texts:
            encode_into_buf(texts)
        if written < target_tokens:
            flush_buf()
    finally:
        pbar.close()

    mm.flush()
    del mm

    if written < target_tokens:
        # The tail of the file was never written and reads back as id 0.
        print(f"[!] Dataset exhausted after {written:,} tokens; the remaining "
              f"{target_tokens - written:,} positions are zero-filled")

    # Publish the finished file atomically under its final name.
    os.replace(tmp_path, path)
    print(f"[+] Wrote {written:,} tokens to {path} "
          f"({os.path.getsize(path)/1e6:.1f} MB)")
97
+
98
+
99
class MemmapDataset(Dataset):
    """Fixed-length token chunks read from a flat uint16 token file.

    Chunk ``i`` covers tokens ``[i * seq_len, (i + 1) * seq_len)`` of the file.
    Tokens beyond the last complete chunk are ignored.

    Args:
        path: path to the raw ``np.uint16`` token file.
        total_tokens: number of tokens in the file to expose.
        seq_len: tokens per chunk.
    """

    def __init__(self, path, total_tokens, seq_len=SEQ_LEN):
        self.path = path
        self.seq_len = seq_len
        self.n_chunks = total_tokens // seq_len
        # Opened lazily on first access rather than in __init__
        # (original note: multiprocessing-safe).
        self._data = None

    @property
    def data(self):
        """Read-only memmap over the first ``n_chunks * seq_len`` tokens."""
        if self._data is None:
            self._data = np.memmap(
                self.path, dtype=np.uint16, mode="r",
                shape=(self.n_chunks * self.seq_len,)
            )
        return self._data

    def __len__(self):
        return self.n_chunks

    def __getitem__(self, idx):
        """Return chunk ``idx`` as ``{"input_ids", "labels"}`` int64 tensors.

        ``labels`` is a copy of ``input_ids``.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
                Without this, an out-of-range index yields a short or empty
                tensor and plain iteration over the dataset never terminates.
        """
        if idx < 0:
            idx += self.n_chunks
        if not 0 <= idx < self.n_chunks:
            raise IndexError(
                f"chunk index {idx} out of range for {self.n_chunks} chunks"
            )
        s = idx * self.seq_len
        # Widen uint16 -> int64 (this also copies the slice out of the memmap).
        arr = np.asarray(self.data[s:s + self.seq_len], dtype=np.int64)
        ids = torch.from_numpy(arr)
        return {"input_ids": ids, "labels": ids.clone()}
123
+
124
+
125
def collate_fn(batch):
    """Stack per-example tensors into batched ``input_ids`` and ``labels``.

    Args:
        batch: list of dicts, each holding an ``"input_ids"`` and a
            ``"labels"`` tensor of equal shape across the batch.

    Returns:
        Dict with the same two keys; each value is the examples' tensors
        stacked along a new leading dimension.
    """
    return {
        key: torch.stack([example[key] for example in batch])
        for key in ("input_ids", "labels")
    }
129
+
130
+
131
# --- Data -------------------------------------------------------------------
print(f"[*] Preparing {TARGET_TOKENS:,} tokens (streaming, memmap-backed)...")
build_token_bin(fast_tokenizer, TOKEN_BIN, TARGET_TOKENS)
dataset = MemmapDataset(TOKEN_BIN, TARGET_TOKENS, seq_len=SEQ_LEN)
print(f"[+] Dataset ready: {len(dataset):,} chunks of {SEQ_LEN} tokens")

# --- Model ------------------------------------------------------------------
print("[*] Setting up model...")
config = LlamaConfig(
    vocab_size=16384,
    hidden_size=192,
    intermediate_size=768,
    num_hidden_layers=8,
    num_attention_heads=4,
    num_key_value_heads=4,
    max_position_embeddings=1024,
    rope_theta=500000,
    tie_word_embeddings=True,
    pad_token_id=tokenizer.pad_token_id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
model = LlamaForCausalLM(config)
print(f"[*] Model parameters: {model.num_parameters():,}")

# --- Training arguments -----------------------------------------------------
print("[*] Defining training arguments...")
num_epochs = 2
per_device_batch = 64
grad_accum = 16
warmup_fraction = 0.05

# `warmup_ratio` is reported as deprecated by transformers, so the warmup
# length is passed as an explicit step count instead. The script pins a single
# visible GPU (CUDA_VISIBLE_DEVICES="0"), so one optimizer step consumes
# per_device_batch * grad_accum chunks.
steps_per_epoch = math.ceil(len(dataset) / (per_device_batch * grad_accum))
warmup_steps = math.ceil(warmup_fraction * steps_per_epoch * num_epochs)

# os.cpu_count() may return None when the count cannot be determined.
num_workers = (os.cpu_count() or 2) // 2

training_args = TrainingArguments(
    output_dir="./Supra-Mini-v5-8M",
    num_train_epochs=num_epochs,
    per_device_train_batch_size=per_device_batch,
    gradient_accumulation_steps=grad_accum,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    weight_decay=0.01,
    fp16=False,
    bf16=True,
    push_to_hub=False,
    report_to="none",
    dataloader_num_workers=num_workers,
    dataloader_pin_memory=True,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_steps=warmup_steps,
    max_grad_norm=1.0,
    torch_compile=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=collate_fn,
)

# --- Train and export -------------------------------------------------------
print("[*] Starting training...")
trainer.train()
# Model and tokenizer go to the same directory so it can be loaded directly
# with from_pretrained().
trainer.save_model("./Supra-Mini-v5-8M-FINAL")
tokenizer.save_pretrained("./Supra-Mini-v5-8M-FINAL")
print("[*] Training finished.")
training.log ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [*] Loading libraries...
2
+ [*] Loading tokenizer...
3
+ [*] Preparing 5,000,000,000 tokens (streaming, memmap-backed)...
4
+ [=] Reusing existing token file: ./tokens.bin
5
+ [+] Dataset ready: 4,882,812 chunks of 1024 tokens
6
+ [*] Setting up model...
7
+ [*] Model parameters: 7,867,584
8
+ [*] Defining training arguments...
9
+ [transformers] warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
10
+ [*] Starting training...
11
+ 0%| | 0/9538 [00:00<?, ?it/s]W0515 19:05:07.214000 41242 torch/_inductor/utils.py:1731] [0/0] Not enough SMs to use max_autotune_gemm mode
12
+ {'loss': '9.401', 'grad_norm': '1.098', 'learning_rate': '4.151e-05', 'epoch': '0.02097'}
13
+ {'loss': '8.45', 'grad_norm': '1.012', 'learning_rate': '8.344e-05', 'epoch': '0.04194'}
14
+ {'loss': '7.463', 'grad_norm': '1.007', 'learning_rate': '0.0001254', 'epoch': '0.06291'}
15
+ {'loss': '6.763', 'grad_norm': '1.135', 'learning_rate': '0.0001673', 'epoch': '0.08389'}
16
+ {'loss': '6.296', 'grad_norm': '0.8444', 'learning_rate': '0.0002', 'epoch': '0.1049'}
17
+ Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 45.64it/s]
18
+ {'loss': '5.963', 'grad_norm': '1.129', 'learning_rate': '0.0001999', 'epoch': '0.1258'}
19
+ {'loss': '5.732', 'grad_norm': '1.432', 'learning_rate': '0.0001997', 'epoch': '0.1468'}
20
+ {'loss': '5.555', 'grad_norm': '1.714', 'learning_rate': '0.0001994', 'epoch': '0.1678'}
21
+ {'loss': '5.407', 'grad_norm': '1.082', 'learning_rate': '0.0001989', 'epoch': '0.1887'}
22
+ {'loss': '5.281', 'grad_norm': '1.087', 'learning_rate': '0.0001984', 'epoch': '0.2097'}
23
+ Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 51.79it/s]
24
+ {'loss': '5.176', 'grad_norm': '1.031', 'learning_rate': '0.0001977', 'epoch': '0.2307'}
25
+ {'loss': '5.081', 'grad_norm': '1.037', 'learning_rate': '0.0001969', 'epoch': '0.2517'}
26
+ {'loss': '4.997', 'grad_norm': '1.259', 'learning_rate': '0.000196', 'epoch': '0.2726'}
27
+ {'loss': '4.919', 'grad_norm': '1.149', 'learning_rate': '0.0001949', 'epoch': '0.2936'}
28
+ {'loss': '4.848', 'grad_norm': '1.25', 'learning_rate': '0.0001938', 'epoch': '0.3146'}
29
+ Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 53.17it/s]
30
+ {'loss': '4.782', 'grad_norm': '1.465', 'learning_rate': '0.0001925', 'epoch': '0.3355'}
31
+ {'loss': '4.717', 'grad_norm': '1.792', 'learning_rate': '0.0001912', 'epoch': '0.3565'}
32
+ {'loss': '4.656', 'grad_norm': '1.379', 'learning_rate': '0.0001897', 'epoch': '0.3775'}
33
+ {'loss': '4.598', 'grad_norm': '1.669', 'learning_rate': '0.0001881', 'epoch': '0.3985'}
34
+ {'loss': '4.55', 'grad_norm': '1.305', 'learning_rate': '0.0001864', 'epoch': '0.4194'}
35
+ Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 53.48it/s]
36
+ {'loss': '4.508', 'grad_norm': '1.443', 'learning_rate': '0.0001846', 'epoch': '0.4404'}
37
+ {'loss': '4.47', 'grad_norm': '1.677', 'learning_rate': '0.0001827', 'epoch': '0.4614'}
38
+ {'loss': '4.437', 'grad_norm': '1.25', 'learning_rate': '0.0001807', 'epoch': '0.4823'}
39
+ {'loss': '4.403', 'grad_norm': '1.595', 'learning_rate': '0.0001786', 'epoch': '0.5033'}
40
+ {'loss': '4.375', 'grad_norm': '1.593', 'learning_rate': '0.0001764', 'epoch': '0.5243'}
41
+ Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 53.04it/s]
42
+ {'loss': '4.35', 'grad_norm': '1.411', 'learning_rate': '0.0001741', 'epoch': '0.5453'}
43
+ {'loss': '4.328', 'grad_norm': '2.014', 'learning_rate': '0.0001718', 'epoch': '0.5662'}
44
+ {'loss': '4.303', 'grad_norm': '1.523', 'learning_rate': '0.0001693', 'epoch': '0.5872'}
45
+ {'loss': '4.285', 'grad_norm': '1.343', 'learning_rate': '0.0001668', 'epoch': '0.6082'}
46
+ {'loss': '4.264', 'grad_norm': '1.376', 'learning_rate': '0.0001641', 'epoch': '0.6291'}
47
+ Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 53.00it/s]
48
+ {'loss': '4.247', 'grad_norm': '1.62', 'learning_rate': '0.0001614', 'epoch': '0.6501'}
49
+ {'loss': '4.232', 'grad_norm': '1.243', 'learning_rate': '0.0001587', 'epoch': '0.6711'}
50
+ {'loss': '4.214', 'grad_norm': '1.226', 'learning_rate': '0.0001558', 'epoch': '0.6921'}
51
+ {'loss': '4.199', 'grad_norm': '1.243', 'learning_rate': '0.0001529', 'epoch': '0.713'}
52
+ {'loss': '4.186', 'grad_norm': '1.813', 'learning_rate': '0.0001499', 'epoch': '0.734'}
53
+ Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 52.44it/s]
54
+ {'loss': '4.171', 'grad_norm': '1.488', 'learning_rate': '0.0001469', 'epoch': '0.755'}
55
+ {'loss': '4.158', 'grad_norm': '1.535', 'learning_rate': '0.0001438', 'epoch': '0.7759'}
56
+ {'loss': '4.148', 'grad_norm': '1.208', 'learning_rate': '0.0001407', 'epoch': '0.7969'}
57
+ {'loss': '4.136', 'grad_norm': '1.366', 'learning_rate': '0.0001375', 'epoch': '0.8179'}
58
+ {'loss': '4.125', 'grad_norm': '1.289', 'learning_rate': '0.0001343', 'epoch': '0.8389'}
59
+ Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 52.97it/s]
60
+ {'loss': '4.115', 'grad_norm': '1.306', 'learning_rate': '0.000131', 'epoch': '0.8598'}
61
+ {'loss': '4.104', 'grad_norm': '1.121', 'learning_rate': '0.0001277', 'epoch': '0.8808'}
62
+ {'loss': '4.096', 'grad_norm': '1.608', 'learning_rate': '0.0001243', 'epoch': '0.9018'}
63
+ {'loss': '4.089', 'grad_norm': '1.192', 'learning_rate': '0.0001209', 'epoch': '0.9227'}
64
+ {'loss': '4.078', 'grad_norm': '1.291', 'learning_rate': '0.0001175', 'epoch': '0.9437'}
65
+ Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 53.15it/s]
66
+ {'loss': '4.073', 'grad_norm': '1.054', 'learning_rate': '0.0001141', 'epoch': '0.9647'}
67
+ {'loss': '4.066', 'grad_norm': '1.141', 'learning_rate': '0.0001107', 'epoch': '0.9857'}
68
+ {'loss': '4.057', 'grad_norm': '1.703', 'learning_rate': '0.0001072', 'epoch': '1.007'}
69
+ {'loss': '4.051', 'grad_norm': '1.104', 'learning_rate': '0.0001038', 'epoch': '1.027'}
70
+ {'loss': '4.042', 'grad_norm': '1.058', 'learning_rate': '0.0001003', 'epoch': '1.048'}
71
+ Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 22.14it/s]
72
+ {'loss': '4.038', 'grad_norm': '1.095', 'learning_rate': '9.683e-05', 'epoch': '1.069'}
73
+ {'loss': '4.032', 'grad_norm': '1.074', 'learning_rate': '9.337e-05', 'epoch': '1.09'}
74
+ {'loss': '4.027', 'grad_norm': '1.18', 'learning_rate': '8.991e-05', 'epoch': '1.111'}
75
+ {'loss': '4.02', 'grad_norm': '1.193', 'learning_rate': '8.647e-05', 'epoch': '1.132'}
76
+ {'loss': '4.015', 'grad_norm': '1.291', 'learning_rate': '8.304e-05', 'epoch': '1.153'}
77
+ Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 59.08it/s]
78
+ {'loss': '4.012', 'grad_norm': '1.045', 'learning_rate': '7.964e-05', 'epoch': '1.174'}
79
+ {'loss': '4.008', 'grad_norm': '1.35', 'learning_rate': '7.625e-05', 'epoch': '1.195'}
80
+ {'loss': '4.002', 'grad_norm': '1.086', 'learning_rate': '7.29e-05', 'epoch': '1.216'}
81
+ {'loss': '3.999', 'grad_norm': '0.8626', 'learning_rate': '6.958e-05', 'epoch': '1.237'}
82
+ {'loss': '3.992', 'grad_norm': '1.381', 'learning_rate': '6.63e-05', 'epoch': '1.258'}
83
+ Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 59.27it/s]
84
+ {'loss': '3.99', 'grad_norm': '1.229', 'learning_rate': '6.305e-05', 'epoch': '1.279'}
85
+ {'loss': '3.987', 'grad_norm': '0.8244', 'learning_rate': '5.985e-05', 'epoch': '1.3'}
86
+ {'loss': '3.982', 'grad_norm': '0.9264', 'learning_rate': '5.67e-05', 'epoch': '1.321'}
87
+ {'loss': '3.982', 'grad_norm': '1.037', 'learning_rate': '5.36e-05', 'epoch': '1.342'}
88
+ {'loss': '3.976', 'grad_norm': '0.9665', 'learning_rate': '5.056e-05', 'epoch': '1.363'}
89
+ Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 60.27it/s]
90
+ {'loss': '3.975', 'grad_norm': '0.8869', 'learning_rate': '4.758e-05', 'epoch': '1.384'}
91
+ {'loss': '3.971', 'grad_norm': '0.7576', 'learning_rate': '4.466e-05', 'epoch': '1.405'}
92
+ {'loss': '3.968', 'grad_norm': '0.8313', 'learning_rate': '4.18e-05', 'epoch': '1.426'}
93
+ {'loss': '3.965', 'grad_norm': '0.7926', 'learning_rate': '3.902e-05', 'epoch': '1.447'}
94
+ {'loss': '3.963', 'grad_norm': '0.9134', 'learning_rate': '3.631e-05', 'epoch': '1.468'}
95
+ Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 58.90it/s]
96
+ {'loss': '3.963', 'grad_norm': '0.7194', 'learning_rate': '3.367e-05', 'epoch': '1.489'}
97
+ {'loss': '3.96', 'grad_norm': '0.6361', 'learning_rate': '3.112e-05', 'epoch': '1.51'}
98
+ {'loss': '3.957', 'grad_norm': '0.927', 'learning_rate': '2.865e-05', 'epoch': '1.531'}
99
+ {'loss': '3.957', 'grad_norm': '0.6016', 'learning_rate': '2.626e-05', 'epoch': '1.552'}
100
+ {'loss': '3.954', 'grad_norm': '0.6197', 'learning_rate': '2.397e-05', 'epoch': '1.573'}
101
+ Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 59.64it/s]
102
+ {'loss': '3.952', 'grad_norm': '0.577', 'learning_rate': '2.176e-05', 'epoch': '1.594'}
103
+ {'loss': '3.948', 'grad_norm': '0.5791', 'learning_rate': '1.965e-05', 'epoch': '1.615'}
104
+ {'loss': '3.951', 'grad_norm': '0.5636', 'learning_rate': '1.763e-05', 'epoch': '1.636'}
105
+ {'loss': '3.949', 'grad_norm': '0.5653', 'learning_rate': '1.572e-05', 'epoch': '1.657'}
106
+ {'loss': '3.948', 'grad_norm': '0.5782', 'learning_rate': '1.39e-05', 'epoch': '1.678'}
107
+ Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 59.44it/s]
108
+ {'loss': '3.946', 'grad_norm': '0.4793', 'learning_rate': '1.219e-05', 'epoch': '1.699'}
109
+ {'loss': '3.946', 'grad_norm': '0.4931', 'learning_rate': '1.058e-05', 'epoch': '1.72'}
110
+ {'loss': '3.945', 'grad_norm': '0.5097', 'learning_rate': '9.086e-06', 'epoch': '1.741'}
111
+ {'loss': '3.945', 'grad_norm': '0.5356', 'learning_rate': '7.697e-06', 'epoch': '1.761'}
112
+ {'loss': '3.945', 'grad_norm': '0.5223', 'learning_rate': '6.419e-06', 'epoch': '1.782'}
113
+ Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 59.33it/s]
114
+ {'loss': '3.944', 'grad_norm': '0.4148', 'learning_rate': '5.253e-06', 'epoch': '1.803'}
115
+ {'loss': '3.943', 'grad_norm': '0.4304', 'learning_rate': '4.201e-06', 'epoch': '1.824'}
116
+ {'loss': '3.943', 'grad_norm': '0.4192', 'learning_rate': '3.265e-06', 'epoch': '1.845'}
117
+ {'loss': '3.942', 'grad_norm': '0.4074', 'learning_rate': '2.444e-06', 'epoch': '1.866'}
118
+ {'loss': '3.941', 'grad_norm': '0.44', 'learning_rate': '1.741e-06', 'epoch': '1.887'}
119
+ Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 20.57it/s]
120
+ {'loss': '3.942', 'grad_norm': '0.4061', 'learning_rate': '1.156e-06', 'epoch': '1.908'}
121
+ {'loss': '3.941', 'grad_norm': '0.3939', 'learning_rate': '6.899e-07', 'epoch': '1.929'}
122
+ {'loss': '3.942', 'grad_norm': '0.3792', 'learning_rate': '3.431e-07', 'epoch': '1.95'}
123
+ {'loss': '3.944', 'grad_norm': '0.3599', 'learning_rate': '1.161e-07', 'epoch': '1.971'}
124
+ {'loss': '3.942', 'grad_norm': '0.3671', 'learning_rate': '9.142e-09', 'epoch': '1.992'}
125
+ Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 59.80it/s]
126
+ Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 59.16it/s]
127
+ {'train_runtime': '3.949e+04', 'train_samples_per_second': '247.3', 'train_steps_per_second': '0.242', 'train_loss': '4.414', 'epoch': '2'}
128
+ 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 9538/9538 [10:58:05<00:00, 4.14s/it]
129
+ Writing model shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 57.38it/s]
130
+ [*] Training finished.
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed1f09ec8e4fd507c505c722f8f8332bfd3288995ba900f334dd814118a9818a
3
+ size 5265