LH-Tech-AI committed on
Commit
208972d
·
verified ·
1 Parent(s): d999f97

Upload 9 files

Browse files
benchmarks.md ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ | Tasks |Version|Filter|n-shot| Metric | | Value | |Stderr|
2
+ |------------------------------------------------------------|------:|------|-----:|---------------|---|--------:|---|------|
3
+ |arc_easy | 1|none | 0|acc |↑ | 0.2727|± |0.0091|
4
+ | | |none | 0|acc_norm |↑ | 0.2816|± |0.0092|
5
+ |blimp | 2|none | |acc |↑ | 0.5526|± |0.0017|
6
+ | - blimp_adjunct_island | 1|none | 0|acc |↑ | 0.7330|± |0.0140|
7
+ | - blimp_anaphor_gender_agreement | 1|none | 0|acc |↑ | 0.3820|± |0.0154|
8
+ | - blimp_anaphor_number_agreement | 1|none | 0|acc |↑ | 0.5030|± |0.0158|
9
+ | - blimp_animate_subject_passive | 1|none | 0|acc |↑ | 0.5520|± |0.0157|
10
+ | - blimp_animate_subject_trans | 1|none | 0|acc |↑ | 0.7250|± |0.0141|
11
+ | - blimp_causative | 1|none | 0|acc |↑ | 0.5010|± |0.0158|
12
+ | - blimp_complex_NP_island | 1|none | 0|acc |↑ | 0.5640|± |0.0157|
13
+ | - blimp_coordinate_structure_constraint_complex_left_branch| 1|none | 0|acc |↑ | 0.0840|± |0.0088|
14
+ | - blimp_coordinate_structure_constraint_object_extraction | 1|none | 0|acc |↑ | 0.4930|± |0.0158|
15
+ | - blimp_determiner_noun_agreement_1 | 1|none | 0|acc |↑ | 0.7000|± |0.0145|
16
+ | - blimp_determiner_noun_agreement_2 | 1|none | 0|acc |↑ | 0.7070|± |0.0144|
17
+ | - blimp_determiner_noun_agreement_irregular_1 | 1|none | 0|acc |↑ | 0.5500|± |0.0157|
18
+ | - blimp_determiner_noun_agreement_irregular_2 | 1|none | 0|acc |↑ | 0.7110|± |0.0143|
19
+ | - blimp_determiner_noun_agreement_with_adj_2 | 1|none | 0|acc |↑ | 0.6170|± |0.0154|
20
+ | - blimp_determiner_noun_agreement_with_adj_irregular_1 | 1|none | 0|acc |↑ | 0.5010|± |0.0158|
21
+ | - blimp_determiner_noun_agreement_with_adj_irregular_2 | 1|none | 0|acc |↑ | 0.6180|± |0.0154|
22
+ | - blimp_determiner_noun_agreement_with_adjective_1 | 1|none | 0|acc |↑ | 0.6380|± |0.0152|
23
+ | - blimp_distractor_agreement_relational_noun | 1|none | 0|acc |↑ | 0.3050|± |0.0146|
24
+ | - blimp_distractor_agreement_relative_clause | 1|none | 0|acc |↑ | 0.2710|± |0.0141|
25
+ | - blimp_drop_argument | 1|none | 0|acc |↑ | 0.6970|± |0.0145|
26
+ | - blimp_ellipsis_n_bar_1 | 1|none | 0|acc |↑ | 0.2640|± |0.0139|
27
+ | - blimp_ellipsis_n_bar_2 | 1|none | 0|acc |↑ | 0.4140|± |0.0156|
28
+ | - blimp_existential_there_object_raising | 1|none | 0|acc |↑ | 0.7440|± |0.0138|
29
+ | - blimp_existential_there_quantifiers_1 | 1|none | 0|acc |↑ | 0.9030|± |0.0094|
30
+ | - blimp_existential_there_quantifiers_2 | 1|none | 0|acc |↑ | 0.1200|± |0.0103|
31
+ | - blimp_existential_there_subject_raising | 1|none | 0|acc |↑ | 0.6530|± |0.0151|
32
+ | - blimp_expletive_it_object_raising | 1|none | 0|acc |↑ | 0.6850|± |0.0147|
33
+ | - blimp_inchoative | 1|none | 0|acc |↑ | 0.4090|± |0.0156|
34
+ | - blimp_intransitive | 1|none | 0|acc |↑ | 0.5600|± |0.0157|
35
+ | - blimp_irregular_past_participle_adjectives | 1|none | 0|acc |↑ | 0.7220|± |0.0142|
36
+ | - blimp_irregular_past_participle_verbs | 1|none | 0|acc |↑ | 0.6330|± |0.0152|
37
+ | - blimp_irregular_plural_subject_verb_agreement_1 | 1|none | 0|acc |↑ | 0.6140|± |0.0154|
38
+ | - blimp_irregular_plural_subject_verb_agreement_2 | 1|none | 0|acc |↑ | 0.7250|± |0.0141|
39
+ | - blimp_left_branch_island_echo_question | 1|none | 0|acc |↑ | 0.6450|± |0.0151|
40
+ | - blimp_left_branch_island_simple_question | 1|none | 0|acc |↑ | 0.1690|± |0.0119|
41
+ | - blimp_matrix_question_npi_licensor_present | 1|none | 0|acc |↑ | 0.0020|± |0.0014|
42
+ | - blimp_npi_present_1 | 1|none | 0|acc |↑ | 0.3860|± |0.0154|
43
+ | - blimp_npi_present_2 | 1|none | 0|acc |↑ | 0.3810|± |0.0154|
44
+ | - blimp_only_npi_licensor_present | 1|none | 0|acc |↑ | 0.6120|± |0.0154|
45
+ | - blimp_only_npi_scope | 1|none | 0|acc |↑ | 0.4280|± |0.0157|
46
+ | - blimp_passive_1 | 1|none | 0|acc |↑ | 0.6450|± |0.0151|
47
+ | - blimp_passive_2 | 1|none | 0|acc |↑ | 0.6410|± |0.0152|
48
+ | - blimp_principle_A_c_command | 1|none | 0|acc |↑ | 0.6910|± |0.0146|
49
+ | - blimp_principle_A_case_1 | 1|none | 0|acc |↑ | 1.0000|± | 0|
50
+ | - blimp_principle_A_case_2 | 1|none | 0|acc |↑ | 0.5190|± |0.0158|
51
+ | - blimp_principle_A_domain_1 | 1|none | 0|acc |↑ | 0.9810|± |0.0043|
52
+ | - blimp_principle_A_domain_2 | 1|none | 0|acc |↑ | 0.5570|± |0.0157|
53
+ | - blimp_principle_A_domain_3 | 1|none | 0|acc |↑ | 0.4680|± |0.0158|
54
+ | - blimp_principle_A_reconstruction | 1|none | 0|acc |↑ | 0.2410|± |0.0135|
55
+ | - blimp_regular_plural_subject_verb_agreement_1 | 1|none | 0|acc |↑ | 0.7200|± |0.0142|
56
+ | - blimp_regular_plural_subject_verb_agreement_2 | 1|none | 0|acc |↑ | 0.6030|± |0.0155|
57
+ | - blimp_sentential_negation_npi_licensor_present | 1|none | 0|acc |↑ | 1.0000|± | 0|
58
+ | - blimp_sentential_negation_npi_scope | 1|none | 0|acc |↑ | 0.4990|± |0.0158|
59
+ | - blimp_sentential_subject_island | 1|none | 0|acc |↑ | 0.3440|± |0.0150|
60
+ | - blimp_superlative_quantifiers_1 | 1|none | 0|acc |↑ | 0.5400|± |0.0158|
61
+ | - blimp_superlative_quantifiers_2 | 1|none | 0|acc |↑ | 0.1780|± |0.0121|
62
+ | - blimp_tough_vs_raising_1 | 1|none | 0|acc |↑ | 0.4330|± |0.0157|
63
+ | - blimp_tough_vs_raising_2 | 1|none | 0|acc |↑ | 0.5950|± |0.0155|
64
+ | - blimp_transitive | 1|none | 0|acc |↑ | 0.6260|± |0.0153|
65
+ | - blimp_wh_island | 1|none | 0|acc |↑ | 0.4180|± |0.0156|
66
+ | - blimp_wh_questions_object_gap | 1|none | 0|acc |↑ | 0.5430|± |0.0158|
67
+ | - blimp_wh_questions_subject_gap | 1|none | 0|acc |↑ | 0.9160|± |0.0088|
68
+ | - blimp_wh_questions_subject_gap_long_distance | 1|none | 0|acc |↑ | 0.9410|± |0.0075|
69
+ | - blimp_wh_vs_that_no_gap | 1|none | 0|acc |↑ | 0.9800|± |0.0044|
70
+ | - blimp_wh_vs_that_no_gap_long_distance | 1|none | 0|acc |↑ | 0.9820|± |0.0042|
71
+ | - blimp_wh_vs_that_with_gap | 1|none | 0|acc |↑ | 0.0280|± |0.0052|
72
+ | - blimp_wh_vs_that_with_gap_long_distance | 1|none | 0|acc |↑ | 0.0150|± |0.0038|
73
+ |wikitext | 2|none | 0|bits_per_byte |↓ | 2.1661|± | N/A|
74
+ | | |none | 0|byte_perplexity|↓ | 4.4881|± | N/A|
75
+ | | |none | 0|word_perplexity|↓ |3068.2023|± | N/A|
76
+
77
+ |Groups|Version|Filter|n-shot|Metric| |Value | |Stderr|
78
+ |------|------:|------|------|------|---|-----:|---|-----:|
79
+ |blimp | 2|none | |acc |↑ |0.5526|± |0.0017|
config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 0,
8
+ "dtype": "float32",
9
+ "eos_token_id": 2,
10
+ "head_dim": 8,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 64,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 128,
15
+ "max_position_embeddings": 512,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 8,
19
+ "num_hidden_layers": 5,
20
+ "num_key_value_heads": 8,
21
+ "pad_token_id": 1,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-06,
24
+ "rope_parameters": {
25
+ "rope_theta": 10000.0,
26
+ "rope_type": "default"
27
+ },
28
+ "tie_word_embeddings": true,
29
+ "transformers_version": "5.8.1",
30
+ "use_cache": false,
31
+ "vocab_size": 4096
32
+ }
generation_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "eos_token_id": 2,
5
+ "output_attentions": false,
6
+ "output_hidden_states": false,
7
+ "pad_token_id": 1,
8
+ "transformers_version": "5.8.1",
9
+ "use_cache": true
10
+ }
inference.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Minimal sampling demo: load the saved checkpoint and tokenizer from
# `model_path`, generate a continuation for one fixed prompt, and print it.
print("[*] Loading libraries...")
import torch
from transformers import LlamaForCausalLM, PreTrainedTokenizerFast

# Directory written by train.py (trainer.save_model / tokenizer.save_pretrained).
model_path = "./Supra-Mini-v3-0.5M-FINAL"

print("[*] Loading tokenizer...")
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_path)

print("[*] Loading model...")
model = LlamaForCausalLM.from_pretrained(model_path)
model.eval()  # inference only

prompt = "The main concept of physics is "
print(f"[*] Prompt: {prompt!r}")

inputs = tokenizer(prompt, return_tensors="pt")

# Only input_ids and attention_mask are forwarded explicitly, so any extra
# fields the tokenizer may return are never passed to generate().
with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=150,
        do_sample=True,
        temperature=0.5,
        top_p=0.9,
        top_k=25,
        repetition_penalty=1.3,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

# outputs[0] is the single generated sequence (prompt + continuation).
print("[*] Output:", tokenizer.decode(outputs[0], skip_special_tokens=True))
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9e652fea30d654bd4e0e523b1056857c43745adb3288af1f35986282768cfc0
3
+ size 1875544
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "bos_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "model_max_length": 1000000000000000019884624838656,
6
+ "pad_token": "<pad>",
7
+ "tokenizer_class": "TokenizersBackend",
8
+ "unk_token": "<unk>"
9
+ }
train.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ © SupraLabs 2026 - Official pretraining code for Supra Mini v3 0.5M
3
+ """
4
+
5
+ import os
6
+ os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True"
7
+ os.environ["CUDA_VISIBLE_DEVICES"] = "0"
8
+
9
+ print("[*] Loading libraries...")
10
+ import torch
11
+ import math
12
+ import numpy as np
13
+ from datasets import load_dataset
14
+ from tokenizers import ByteLevelBPETokenizer
15
+ from transformers import (
16
+ LlamaConfig,
17
+ LlamaForCausalLM,
18
+ PreTrainedTokenizerFast,
19
+ Trainer,
20
+ TrainingArguments,
21
+ )
22
+ from torch.utils.data import Dataset
23
+ from tqdm import tqdm
24
+
25
print("[*] Loading tokenizer...")
# Byte-level BPE tokenizer loaded from previously produced vocab/merges files.
fast_tokenizer = ByteLevelBPETokenizer(
    "./custom_llama_tokenizer-vocab.json",
    "./custom_llama_tokenizer-merges.txt",
)
# transformers-side wrapper; supplies the special-token ids used for the model
# config and is what gets saved next to the final checkpoint.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=fast_tokenizer,
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
)

TOKEN_BIN = "./tokens.bin"      # flat uint16 token file backing the dataset
TARGET_TOKENS = 1_000_000_000   # number of tokens to collect into TOKEN_BIN
SEQ_LEN = 512                   # tokens per training chunk
BATCH_TEXTS = 1000              # documents per encode_batch() call
FLUSH_EVERY = 1_000_000         # buffered tokens that trigger a write to the memmap
43
+
44
+
45
def build_token_bin(fast_tokenizer, path=TOKEN_BIN, target_tokens=TARGET_TOKENS):
    """Stream FineWeb-Edu text, tokenize it, and store the ids as a uint16 file.

    Documents are encoded in batches of ``BATCH_TEXTS`` and their token ids are
    concatenated, in stream order, into a flat ``np.uint16`` memmap of exactly
    ``target_tokens`` entries. Token ids must therefore fit in 16 bits.

    Args:
        fast_tokenizer: Object exposing ``encode_batch(list[str])`` whose
            results carry an ``ids`` list.
        path: Destination file. If it already exists and is at least
            ``target_tokens * 2`` bytes, it is reused and nothing is done.
        target_tokens: Number of tokens to write.

    Returns:
        None. The result is the file at ``path``.
    """
    if os.path.exists(path) and os.path.getsize(path) >= target_tokens * 2:
        print(f"[=] Reusing existing token file: {path}")
        return

    print(f"[*] Streaming + tokenizing {target_tokens:,} tokens → {path}")
    # mode="w+" creates the file at its full final size right away. Build under
    # a temporary name and rename only after success, so an interrupted run
    # cannot leave a full-size, partly-zero file that the size check above
    # would later accept as complete.
    tmp_path = f"{path}.tmp"
    mm = np.memmap(tmp_path, dtype=np.uint16, mode="w+", shape=(target_tokens,))

    dataset = load_dataset(
        "HuggingFaceFW/fineweb-edu", "sample-10BT",
        split="train", streaming=True,
    )

    written = 0
    buf = []    # token ids waiting to be copied into the memmap
    texts = []  # raw documents waiting to be tokenized
    pbar = tqdm(total=target_tokens, desc="[*] Gathering tokens", unit="tok")

    def encode_pending():
        """Tokenize the queued documents and append their ids to ``buf``."""
        encs = fast_tokenizer.encode_batch(texts)
        texts.clear()
        for e in encs:
            buf.extend(e.ids)

    def flush_buf():
        """Copy buffered ids into the memmap; return True once it is full."""
        nonlocal written
        if not buf:
            return False
        # Never write past the end of the memmap.
        n = min(len(buf), target_tokens - written)
        mm[written:written + n] = np.asarray(buf[:n], dtype=np.uint16)
        written += n
        pbar.update(n)
        del buf[:n]
        return written >= target_tokens

    try:
        for example in dataset:
            texts.append(example["text"])
            if len(texts) < BATCH_TEXTS:
                continue
            encode_pending()
            # Leave the streaming loop as soon as the target is reached;
            # there is no point pulling the rest of the dataset.
            if len(buf) >= FLUSH_EVERY and flush_buf():
                break

        # Stream ended (or the last batch was partial): drain what is left.
        if written < target_tokens:
            if texts:
                encode_pending()
            flush_buf()
    finally:
        pbar.close()

    mm.flush()
    del mm  # release the mapping before renaming the file
    os.replace(tmp_path, path)

    if written < target_tokens:
        # The file still has the full size; the unwritten tail is zero-filled.
        print(f"[!] Stream ended early: only {written:,} of "
              f"{target_tokens:,} tokens were written")
    print(f"[+] Wrote {written:,} tokens to {path} "
          f"({os.path.getsize(path)/1e6:.1f} MB)")
97
+
98
+
99
class MemmapDataset(Dataset):
    """Fixed-length token chunks read from a flat uint16 token file.

    The file is viewed as ``total_tokens // seq_len`` consecutive,
    non-overlapping chunks of ``seq_len`` tokens; leftover tokens at the end
    are ignored. Each item is a dict with ``input_ids`` and ``labels`` (an
    independent copy of the same ids), both int64 tensors of length
    ``seq_len``.
    """

    def __init__(self, path, total_tokens, seq_len=SEQ_LEN):
        self.path = path
        self.seq_len = seq_len
        self.n_chunks = total_tokens // seq_len
        self._data = None  # lazy open (Multiprocessing-safe)

    def __getstate__(self):
        # Never ship an open memmap to another process: pickling it would
        # serialize the array contents. Each copy reopens the file lazily.
        state = self.__dict__.copy()
        state["_data"] = None
        return state

    @property
    def data(self):
        """Read-only memmap over the used part of the file, opened on first use."""
        if self._data is None:
            self._data = np.memmap(
                self.path, dtype=np.uint16, mode="r",
                shape=(self.n_chunks * self.seq_len,),
            )
        return self._data

    def __len__(self):
        return self.n_chunks

    def __getitem__(self, idx):
        """Return chunk ``idx``; negative indices count from the end.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        if idx < 0:
            idx += self.n_chunks
        if not 0 <= idx < self.n_chunks:
            # Without this check a bad index would slice past the data and
            # silently yield a short or empty tensor.
            raise IndexError(f"chunk index out of range: {idx}")
        s = idx * self.seq_len
        # Copy out of the memmap and widen uint16 -> int64 for the model.
        arr = np.asarray(self.data[s:s + self.seq_len], dtype=np.int64)
        ids = torch.from_numpy(arr)
        return {"input_ids": ids, "labels": ids.clone()}
123
+
124
+
125
def collate_fn(batch):
    """Stack per-sample tensors into one batch.

    Args:
        batch: Sequence of dicts, each holding ``input_ids`` and ``labels``
            tensors of identical shape.

    Returns:
        Dict with ``input_ids`` and ``labels``, each stacked along a new
        leading batch dimension.
    """
    return {
        key: torch.stack([sample[key] for sample in batch])
        for key in ("input_ids", "labels")
    }
129
+
130
+
131
# --- Data: build (or reuse) the token file and wrap it as a dataset ---------
print(f"[*] Preparing {TARGET_TOKENS:,} tokens (streaming, memmap-backed)...")
build_token_bin(fast_tokenizer, TOKEN_BIN, TARGET_TOKENS)
dataset = MemmapDataset(TOKEN_BIN, TARGET_TOKENS, seq_len=SEQ_LEN)
print(f"[+] Dataset ready: {len(dataset):,} chunks of {SEQ_LEN} tokens")

# --- Model: small Llama-architecture causal LM, randomly initialized --------
print("[*] Setting up model...")
config = LlamaConfig(
    vocab_size=len(tokenizer.get_vocab()),
    hidden_size=64,
    intermediate_size=128,
    num_hidden_layers=5,
    num_attention_heads=8,
    num_key_value_heads=8,
    max_position_embeddings=512,
    tie_word_embeddings=True,
    pad_token_id=tokenizer.pad_token_id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
model = LlamaForCausalLM(config)
print(f"[*] Model parameters: {model.num_parameters():,}")

# --- Training configuration -------------------------------------------------
print("[*] Defining training arguments...")
training_args = TrainingArguments(
    output_dir="./Supra-Mini-v3-0.5M",
    num_train_epochs=2,
    per_device_train_batch_size=256,
    gradient_accumulation_steps=4,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    weight_decay=0.01,
    fp16=False,
    bf16=True,
    push_to_hub=False,
    report_to="none",
    # os.cpu_count() may return None when the count cannot be determined;
    # fall back to 2 so the division never raises TypeError.
    dataloader_num_workers=(os.cpu_count() or 2) // 2,
    dataloader_pin_memory=True,
    learning_rate=5e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.02,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=collate_fn,
)

# --- Train, then save model and tokenizer side by side ----------------------
print("[*] Starting training...")
trainer.train()
trainer.save_model("./Supra-Mini-v3-0.5M-FINAL")
tokenizer.save_pretrained("./Supra-Mini-v3-0.5M-FINAL")
print("[*] Training finished.")
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6af52f41b76534782ca382fbeefc7f4a63a5a810768f8d1e40a0c6e2bd67d6dd
3
+ size 5265