LH-Tech-AI committed on
Commit
a795f8c
·
verified ·
1 Parent(s): 791f8b4

Upload 12 files

Browse files
benchmarks.md ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ | Tasks |Version|Filter|n-shot| Metric | | Value | |Stderr |
2
+ |------------------------------------------------------------|------:|------|-----:|---------------|---|-------:|---|-------|
3
+ |arc_easy | 1|none | 0|acc |↑ | 0.5223|± | 0.0102|
4
+ | | |none | 0|acc_norm |↑ | 0.4600|± | 0.0102|
5
+ |blimp | 2|none | |acc |↑ | 0.7631|± | 0.0014|
6
+ | - blimp_adjunct_island | 1|none | 0|acc |↑ | 0.8420|± | 0.0115|
7
+ | - blimp_anaphor_gender_agreement | 1|none | 0|acc |↑ | 0.8430|± | 0.0115|
8
+ | - blimp_anaphor_number_agreement | 1|none | 0|acc |↑ | 0.9620|± | 0.0060|
9
+ | - blimp_animate_subject_passive | 1|none | 0|acc |↑ | 0.7820|± | 0.0131|
10
+ | - blimp_animate_subject_trans | 1|none | 0|acc |↑ | 0.8040|± | 0.0126|
11
+ | - blimp_causative | 1|none | 0|acc |↑ | 0.6980|± | 0.0145|
12
+ | - blimp_complex_NP_island | 1|none | 0|acc |↑ | 0.4940|± | 0.0158|
13
+ | - blimp_coordinate_structure_constraint_complex_left_branch| 1|none | 0|acc |↑ | 0.7420|± | 0.0138|
14
+ | - blimp_coordinate_structure_constraint_object_extraction | 1|none | 0|acc |↑ | 0.7520|± | 0.0137|
15
+ | - blimp_determiner_noun_agreement_1 | 1|none | 0|acc |↑ | 0.9790|± | 0.0045|
16
+ | - blimp_determiner_noun_agreement_2 | 1|none | 0|acc |↑ | 0.9680|± | 0.0056|
17
+ | - blimp_determiner_noun_agreement_irregular_1 | 1|none | 0|acc |↑ | 0.8990|± | 0.0095|
18
+ | - blimp_determiner_noun_agreement_irregular_2 | 1|none | 0|acc |↑ | 0.9650|± | 0.0058|
19
+ | - blimp_determiner_noun_agreement_with_adj_2 | 1|none | 0|acc |↑ | 0.9340|± | 0.0079|
20
+ | - blimp_determiner_noun_agreement_with_adj_irregular_1 | 1|none | 0|acc |↑ | 0.8740|± | 0.0105|
21
+ | - blimp_determiner_noun_agreement_with_adj_irregular_2 | 1|none | 0|acc |↑ | 0.9270|± | 0.0082|
22
+ | - blimp_determiner_noun_agreement_with_adjective_1 | 1|none | 0|acc |↑ | 0.9410|± | 0.0075|
23
+ | - blimp_distractor_agreement_relational_noun | 1|none | 0|acc |↑ | 0.8780|± | 0.0104|
24
+ | - blimp_distractor_agreement_relative_clause | 1|none | 0|acc |↑ | 0.7210|± | 0.0142|
25
+ | - blimp_drop_argument | 1|none | 0|acc |↑ | 0.7500|± | 0.0137|
26
+ | - blimp_ellipsis_n_bar_1 | 1|none | 0|acc |↑ | 0.8060|± | 0.0125|
27
+ | - blimp_ellipsis_n_bar_2 | 1|none | 0|acc |↑ | 0.8820|± | 0.0102|
28
+ | - blimp_existential_there_object_raising | 1|none | 0|acc |↑ | 0.8750|± | 0.0105|
29
+ | - blimp_existential_there_quantifiers_1 | 1|none | 0|acc |↑ | 0.9730|± | 0.0051|
30
+ | - blimp_existential_there_quantifiers_2 | 1|none | 0|acc |↑ | 0.2070|± | 0.0128|
31
+ | - blimp_existential_there_subject_raising | 1|none | 0|acc |↑ | 0.8810|± | 0.0102|
32
+ | - blimp_expletive_it_object_raising | 1|none | 0|acc |↑ | 0.7830|± | 0.0130|
33
+ | - blimp_inchoative | 1|none | 0|acc |↑ | 0.6330|± | 0.0152|
34
+ | - blimp_intransitive | 1|none | 0|acc |↑ | 0.7310|± | 0.0140|
35
+ | - blimp_irregular_past_participle_adjectives | 1|none | 0|acc |↑ | 0.8620|± | 0.0109|
36
+ | - blimp_irregular_past_participle_verbs | 1|none | 0|acc |↑ | 0.8930|± | 0.0098|
37
+ | - blimp_irregular_plural_subject_verb_agreement_1 | 1|none | 0|acc |↑ | 0.8990|± | 0.0095|
38
+ | - blimp_irregular_plural_subject_verb_agreement_2 | 1|none | 0|acc |↑ | 0.9030|± | 0.0094|
39
+ | - blimp_left_branch_island_echo_question | 1|none | 0|acc |↑ | 0.3810|± | 0.0154|
40
+ | - blimp_left_branch_island_simple_question | 1|none | 0|acc |↑ | 0.6470|± | 0.0151|
41
+ | - blimp_matrix_question_npi_licensor_present | 1|none | 0|acc |↑ | 0.1260|± | 0.0105|
42
+ | - blimp_npi_present_1 | 1|none | 0|acc |↑ | 0.5710|± | 0.0157|
43
+ | - blimp_npi_present_2 | 1|none | 0|acc |↑ | 0.6190|± | 0.0154|
44
+ | - blimp_only_npi_licensor_present | 1|none | 0|acc |↑ | 0.6250|± | 0.0153|
45
+ | - blimp_only_npi_scope | 1|none | 0|acc |↑ | 0.5360|± | 0.0158|
46
+ | - blimp_passive_1 | 1|none | 0|acc |↑ | 0.8770|± | 0.0104|
47
+ | - blimp_passive_2 | 1|none | 0|acc |↑ | 0.8840|± | 0.0101|
48
+ | - blimp_principle_A_c_command | 1|none | 0|acc |↑ | 0.5560|± | 0.0157|
49
+ | - blimp_principle_A_case_1 | 1|none | 0|acc |↑ | 1.0000|± | 0|
50
+ | - blimp_principle_A_case_2 | 1|none | 0|acc |↑ | 0.9650|± | 0.0058|
51
+ | - blimp_principle_A_domain_1 | 1|none | 0|acc |↑ | 0.9430|± | 0.0073|
52
+ | - blimp_principle_A_domain_2 | 1|none | 0|acc |↑ | 0.8040|± | 0.0126|
53
+ | - blimp_principle_A_domain_3 | 1|none | 0|acc |↑ | 0.5200|± | 0.0158|
54
+ | - blimp_principle_A_reconstruction | 1|none | 0|acc |↑ | 0.2920|± | 0.0144|
55
+ | - blimp_regular_plural_subject_verb_agreement_1 | 1|none | 0|acc |↑ | 0.8930|± | 0.0098|
56
+ | - blimp_regular_plural_subject_verb_agreement_2 | 1|none | 0|acc |↑ | 0.9110|± | 0.0090|
57
+ | - blimp_sentential_negation_npi_licensor_present | 1|none | 0|acc |↑ | 0.9930|± | 0.0026|
58
+ | - blimp_sentential_negation_npi_scope | 1|none | 0|acc |↑ | 0.7100|± | 0.0144|
59
+ | - blimp_sentential_subject_island | 1|none | 0|acc |↑ | 0.3310|± | 0.0149|
60
+ | - blimp_superlative_quantifiers_1 | 1|none | 0|acc |↑ | 0.7800|± | 0.0131|
61
+ | - blimp_superlative_quantifiers_2 | 1|none | 0|acc |↑ | 0.7450|± | 0.0138|
62
+ | - blimp_tough_vs_raising_1 | 1|none | 0|acc |↑ | 0.5390|± | 0.0158|
63
+ | - blimp_tough_vs_raising_2 | 1|none | 0|acc |↑ | 0.8780|± | 0.0104|
64
+ | - blimp_transitive | 1|none | 0|acc |↑ | 0.8430|± | 0.0115|
65
+ | - blimp_wh_island | 1|none | 0|acc |↑ | 0.7190|± | 0.0142|
66
+ | - blimp_wh_questions_object_gap | 1|none | 0|acc |↑ | 0.7590|± | 0.0135|
67
+ | - blimp_wh_questions_subject_gap | 1|none | 0|acc |↑ | 0.9280|± | 0.0082|
68
+ | - blimp_wh_questions_subject_gap_long_distance | 1|none | 0|acc |↑ | 0.8550|± | 0.0111|
69
+ | - blimp_wh_vs_that_no_gap | 1|none | 0|acc |↑ | 0.9490|± | 0.0070|
70
+ | - blimp_wh_vs_that_no_gap_long_distance | 1|none | 0|acc |↑ | 0.9490|± | 0.0070|
71
+ | - blimp_wh_vs_that_with_gap | 1|none | 0|acc |↑ | 0.5920|± | 0.0155|
72
+ | - blimp_wh_vs_that_with_gap_long_distance | 1|none | 0|acc |↑ | 0.3280|± | 0.0149|
73
+ |hellaswag | 1|none | 0|acc |↑ | 0.2914|± | 0.0045|
74
+ | | |none | 0|acc_norm |↑ | 0.3178|± | 0.0046|
75
+ |lambada_openai | 1|none | 0|acc |↑ | 0.2591|± | 0.0061|
76
+ | | |none | 0|perplexity |↓ | 95.5121|± | 4.1325|
77
+ |lambada_standard | 1|none | 0|acc |↑ | 0.1716|± | 0.0053|
78
+ | | |none | 0|perplexity |↓ |488.2170|± |23.4634|
79
+ |piqa | 1|none | 0|acc |↑ | 0.6224|± | 0.0113|
80
+ | | |none | 0|acc_norm |↑ | 0.6208|± | 0.0113|
81
+ |sciq | 1|none | 0|acc |↑ | 0.7720|± | 0.0133|
82
+ | | |none | 0|acc_norm |↑ | 0.6810|± | 0.0147|
83
+ |wikitext | 2|none | 0|bits_per_byte |↓ | 1.0267|± | N/A |
84
+ | | |none | 0|byte_perplexity|↓ | 2.0374|± | N/A |
85
+ | | |none | 0|word_perplexity|↓ | 44.9548|± | N/A |
86
+ |winogrande | 1|none | 0|acc |↑ | 0.5099|± | 0.0140|
87
+
88
+ |Groups|Version|Filter|n-shot|Metric| |Value | |Stderr|
89
+ |------|------:|------|------|------|---|-----:|---|-----:|
90
+ |blimp | 2|none | |acc |↑ |0.7631|± |0.0014|
config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 0,
8
+ "dtype": "float32",
9
+ "eos_token_id": 2,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 512,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 1408,
15
+ "max_position_embeddings": 1024,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 8,
19
+ "num_hidden_layers": 12,
20
+ "num_key_value_heads": 4,
21
+ "pad_token_id": 1,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-06,
24
+ "rope_parameters": {
25
+ "rope_theta": 10000,
26
+ "rope_type": "default"
27
+ },
28
+ "tie_word_embeddings": true,
29
+ "transformers_version": "5.8.1",
30
+ "use_cache": false,
31
+ "vocab_size": 32000
32
+ }
generation_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "eos_token_id": 2,
5
+ "output_attentions": false,
6
+ "output_hidden_states": false,
7
+ "pad_token_id": 1,
8
+ "transformers_version": "5.8.1",
9
+ "use_cache": true
10
+ }
inference.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Minimal sampling demo: load the trained checkpoint from ./Chimera-FINAL and
# print one sampled continuation of a fixed prompt.
print("[*] Loading libraries...")
import torch
from transformers import LlamaForCausalLM, PreTrainedTokenizerFast

model_path = "./Chimera-FINAL"

print("[*] Loading tokenizer...")
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_path)

print("[*] Loading model...")
model = LlamaForCausalLM.from_pretrained(model_path)
model.eval()  # inference only

prompt = "Artificial intelligence is "  # "Artificial intelligence is " | "The main concept of physics is " | "Once upon a time, "
print(f"[*] Prompt: {prompt!r}")

# Tokenize without the trailing whitespace. A byte-level BPE tokenizer attaches
# a space to the word that FOLLOWS it, so a prompt ending in " " is encoded with
# a dangling space token and the model has to continue mid-word (the recorded
# samples all start with artefacts such as "is iffy" for that reason).
inputs = tokenizer(prompt.rstrip(), return_tensors="pt")

# Sampling settings, kept in one place so they are easy to tweak.
generation_kwargs = dict(
    max_new_tokens=256,
    do_sample=True,
    temperature=0.4,
    top_p=0.85,
    top_k=30,
    repetition_penalty=1.1,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        **generation_kwargs,
    )

print("[*] Output of Supra 50M Base:", tokenizer.decode(outputs[0], skip_special_tokens=True))
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b3287f381ae67127b09ac24939733792452cc7328925303b7ef7475f4f6f285
3
+ size 207157136
samples.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [*] Prompt: 'The main concept of physics is '
2
+ [*] Output of Supra 50M Base: The main concept of physics is iffy, and the idea that we can make things behave in a certain way. The most important part of physics is called quantum mechanics which states that all particles are made up of energy (energy) and matter (matter).
3
+ In physics, there are two types of particles: elementary particles and exotic ones. These particles have properties like mass, speed or momentum but they don’t interact with each other to form new objects. This is because these particles do not exist independently from one another. In this case, an exotic particle might be created by adding more energy into its structure than it would take for a normal particle. However, when you add additional energy to an exotic particle, the new object will become smaller and larger until it becomes too large to fit within the existing structure.
4
+ If you think about how light travels through space, it takes around 20 billion years before the light reaches our eyes. Light waves travel faster than light at high speeds so if we could create some kind of light wave, then we wouldn’t need any special equipment. It just needs a few hundred millionths of a second to produce light rays. So even though the light is moving along the same path as the current, the speed of light is different depending on where the light hits the
5
+
6
+ [*] Prompt: 'Artificial intelligence is '
7
+ [*] Output of Supra 50M Base: Artificial intelligence is iffy, it can be used to make intelligent machines that could take over the world.
8
+ What does Artificial Intelligence mean?
9
+ AI refers to artificial intelligence and machine learning technology which is a type of computer science (also known as artificial intelligence) in which computers are programmed with knowledge about their environment or other objects. The term AI comes from the Greek word "art" meaning "to create."
10
+ The most common uses for AI include:
11
+ - Machine Learning
12
+ This means using algorithms like natural language processing systems to learn how words work together to form sentences such as “I am going to go to the store.”
13
+ These programs will then use these rules to decide whether they should buy something or not so that you know what’s being sold on the internet. For example, if you purchase an ebook at Amazon, you may want to check its price first before purchasing it. If this happens, your shopping cart might look different than it did when purchased by someone else who bought it earlier.
14
+ You can also think of AI as a way to help people understand themselves better through training and reasoning rather than simply seeing them doing things differently. In fact, we often see AI models working very well because of the way humans interact with our minds. This ability makes us more effective
15
+
16
+ [*] Prompt: 'Once upon a time, '
17
+ [*] Output of Supra 50M Base: Once upon a time, ...... I was so excited about the new school year and wanted to make some changes in my life.
18
+ I had been looking for ways to help me become more self-aware. As an adult, I have always felt that there is no one way of doing things without thinking first. This has led me to start making small changes at home or at work. One such change was to create a space where I could be more mindful and aware of myself as well as other people around me. It’s important to remember that we all need our own personal growth and development. We can do this by taking responsibility for ourselves; being responsible for what happens outside us and keeping it within our control. By creating these smaller steps towards becoming more conscious of yourself, you will see how much better your future looks!The word "treaty" means something like "a treaty made with a king." The French word for "covenant," célèbre (French: cœle), comes from the Latin cecus ("to give up"). A covenant is not a binding agreement but rather an act of mutual understanding between two parties. In general terms, a contract is anything agreed on which someone agrees to agree to receive certain benefits. For example, if a person
19
+
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "bos_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "model_max_length": 1000000000000000019884624838656,
6
+ "pad_token": "<pad>",
7
+ "tokenizer_class": "TokenizersBackend",
8
+ "unk_token": "<unk>"
9
+ }
train.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ © SupraLabs 2026 - Official pretraining code for PROJECT CHIMERA - 50M Llama
3
+ """
4
+
5
+ import os
6
+ os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True"
7
+ os.environ["CUDA_VISIBLE_DEVICES"] = "0"
8
+
9
+ print("[*] Loading libraries...")
10
+ import torch
11
+ import math
12
+ import numpy as np
13
+ from datasets import load_dataset
14
+ from tokenizers import ByteLevelBPETokenizer
15
+ from transformers import (
16
+ LlamaConfig,
17
+ LlamaForCausalLM,
18
+ PreTrainedTokenizerFast,
19
+ Trainer,
20
+ TrainingArguments,
21
+ )
22
+ from torch.utils.data import Dataset
23
+ from tqdm import tqdm
24
+
25
print("[*] Loading tokenizer...")
# Raw byte-level BPE tokenizer, read from the vocab/merges files in the
# current working directory.
fast_tokenizer = ByteLevelBPETokenizer(
    vocab="custom_llama_tokenizer-vocab.json",
    merges="custom_llama_tokenizer-merges.txt",
)

# Transformers-side wrapper around the same tokenizer; this is the object
# whose special-token ids are copied into the model config and which is saved
# next to the final checkpoint.
_SPECIAL_TOKENS = {
    "bos_token": "<s>",
    "eos_token": "</s>",
    "unk_token": "<unk>",
    "pad_token": "<pad>",
}
tokenizer = PreTrainedTokenizerFast(tokenizer_object=fast_tokenizer, **_SPECIAL_TOKENS)

# --- Data pipeline constants -------------------------------------------------
TOKEN_BIN = "tokens.bin"          # flat uint16 token file produced by build_token_bin
TARGET_TOKENS = 20_000_000_000    # number of tokens to collect into TOKEN_BIN
SEQ_LEN = 1024                    # tokens per training sample
BATCH_TEXTS = 1000                # documents tokenized per encode_batch call
FLUSH_EVERY = 1_000_000           # buffered-token threshold before writing to disk
43
+
44
+
45
def build_token_bin(fast_tokenizer, path=TOKEN_BIN, target_tokens=TARGET_TOKENS,
                    doc_separator_id=None):
    """Stream FineWeb-Edu, tokenize it and store the ids in a flat uint16 file.

    The file at ``path`` holds exactly ``target_tokens`` uint16 values. If a
    file of at least that size already exists it is reused and nothing is
    written.

    The data is first written to ``<path>.partial`` and only renamed to
    ``path`` once the run has finished. ``np.memmap(mode="w+")`` allocates the
    full-size file immediately, so writing straight to ``path`` would let an
    interrupted run leave behind a zero-padded file that passes the size check
    above and is then silently reused.

    Args:
        fast_tokenizer: Tokenizer exposing ``encode_batch(list_of_str)`` whose
            results have an ``ids`` attribute. Ids must fit in uint16.
        path: Destination file.
        target_tokens: Number of tokens to write.
        doc_separator_id: Optional token id appended after every document.
            With the default ``None`` documents are concatenated back to back
            with no boundary marker (the original behaviour).

    Returns:
        None.
    """
    itemsize = np.dtype(np.uint16).itemsize
    if os.path.exists(path) and os.path.getsize(path) >= target_tokens * itemsize:
        print(f"[=] Reusing existing token file: {path}")
        return

    print(f"[*] Streaming + tokenizing {target_tokens:,} tokens → {path}")
    tmp_path = f"{path}.partial"
    mm = np.memmap(tmp_path, dtype=np.uint16, mode="w+", shape=(target_tokens,))

    dataset = load_dataset(
        "HuggingFaceFW/fineweb-edu", "sample-100BT",
        split="train", streaming=True,
    )

    written = 0
    buf = []    # token ids waiting to be copied into the memmap
    texts = []  # raw documents waiting to be tokenized
    pbar = tqdm(total=target_tokens, desc="[*] Gathering tokens", unit="tok")

    def encode_pending():
        """Tokenize the queued documents into ``buf`` and empty the queue."""
        for enc in fast_tokenizer.encode_batch(texts):
            buf.extend(enc.ids)
            if doc_separator_id is not None:
                buf.append(doc_separator_id)
        texts.clear()

    def flush_buf():
        """Copy buffered ids into the memmap; return True once the target is met."""
        nonlocal written
        n = min(len(buf), target_tokens - written)
        if n > 0:
            mm[written:written + n] = np.asarray(buf[:n], dtype=np.uint16)
            written += n
            pbar.update(n)
            del buf[:n]
        return written >= target_tokens

    try:
        for example in dataset:
            texts.append(example["text"])
            if len(texts) < BATCH_TEXTS:
                continue
            encode_pending()
            # Leave the streaming loop as soon as the file is full so no
            # further documents are downloaded or tokenized.
            if len(buf) >= FLUSH_EVERY and flush_buf():
                break

        if written < target_tokens:
            # The stream ended first: tokenize the last partial batch and
            # write whatever is still buffered.
            if texts:
                encode_pending()
            flush_buf()
    finally:
        pbar.close()
        mm.flush()
        del mm  # release the mapping before the file is renamed

    if written < target_tokens:
        print(f"[!] Stream ended after {written:,} of {target_tokens:,} tokens; "
              f"the remainder of {path} is zero-filled")

    # Reached only when no exception occurred, so `path` never names a file
    # from an aborted run.
    os.replace(tmp_path, path)
    print(f"[+] Wrote {written:,} tokens to {path} "
          f"({os.path.getsize(path)/1e6:.1f} MB)")
97
+
98
+
99
class MemmapDataset(Dataset):
    """Non-overlapping, fixed-length token windows read from a uint16 file.

    Sample ``i`` covers tokens ``[i * seq_len, (i + 1) * seq_len)`` of the
    file; tokens beyond the last full window are ignored.
    """

    def __init__(self, path, total_tokens, seq_len=SEQ_LEN):
        self.path = path
        self.seq_len = seq_len
        # Integer division drops any incomplete trailing window.
        self.n_chunks = total_tokens // seq_len
        # The file is mapped on first access instead of here
        # (noted as multiprocessing-safe by the original author).
        self._data = None

    @property
    def data(self):
        """Return the read-only memmap, creating it on first use."""
        if self._data is not None:
            return self._data
        n_tokens = self.n_chunks * self.seq_len
        self._data = np.memmap(
            self.path, dtype=np.uint16, mode="r", shape=(n_tokens,)
        )
        return self._data

    def __len__(self):
        """Number of full ``seq_len`` windows."""
        return self.n_chunks

    def __getitem__(self, idx):
        """Return window ``idx`` as int64 ``input_ids`` plus an independent copy as ``labels``."""
        start = idx * self.seq_len
        stop = start + self.seq_len
        # Copy the uint16 slice out of the mapping and widen it to int64.
        window = np.array(self.data[start:stop], dtype=np.int64)
        input_ids = torch.from_numpy(window)
        return {"input_ids": input_ids, "labels": input_ids.clone()}
123
+
124
+
125
def collate_fn(batch):
    """Stack per-sample ``input_ids`` and ``labels`` tensors along a new batch axis.

    Args:
        batch: Sequence of dicts, each holding equally shaped tensors under
            the keys ``"input_ids"`` and ``"labels"``.

    Returns:
        Dict with the same two keys, each mapped to the stacked tensor.
    """
    return {
        key: torch.stack([sample[key] for sample in batch])
        for key in ("input_ids", "labels")
    }
129
+
130
+
131
# ---------------------------------------------------------------------------
# Main training script: build (or reuse) the token file, create the model,
# train for one pass over the data and save model + tokenizer.
# ---------------------------------------------------------------------------
print(f"[*] Preparing {TARGET_TOKENS:,} tokens (streaming, memmap-backed)...")
build_token_bin(fast_tokenizer, TOKEN_BIN, TARGET_TOKENS)
dataset = MemmapDataset(TOKEN_BIN, TARGET_TOKENS, seq_len=SEQ_LEN)
print(f"[+] Dataset ready: {len(dataset):,} chunks of {SEQ_LEN} tokens")

print("[*] Setting up model...")
config = LlamaConfig(
    vocab_size=32_000,
    hidden_size=512,
    intermediate_size=1408,
    num_hidden_layers=12,
    num_attention_heads=8,
    num_key_value_heads=4,
    max_position_embeddings=1024,
    rope_theta=10_000,
    tie_word_embeddings=True,
    # Special-token ids come from the tokenizer so model and tokenizer agree.
    pad_token_id=tokenizer.pad_token_id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
model = LlamaForCausalLM(config)
print(f"[*] Model parameters: {model.num_parameters():,}")

print("[*] Defining training arguments...")
training_args = TrainingArguments(
    output_dir="./Chimera",
    num_train_epochs=1,
    # 32 sequences x 4 accumulation steps per optimizer step on each device.
    per_device_train_batch_size=32,
    gradient_accumulation_steps=4,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    weight_decay=0.1,
    fp16=False,
    bf16=True,
    push_to_hub=False,
    report_to="none",
    # os.cpu_count() may return None when the count cannot be determined;
    # fall back to 1 (i.e. 0 workers, loading in the main process) instead of
    # raising TypeError on `None // 2`.
    dataloader_num_workers=(os.cpu_count() or 1) // 2,
    dataloader_pin_memory=True,
    learning_rate=6e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.02,
    max_grad_norm=1.0,
    optim="adamw_torch_fused",
    adam_beta1=0.9,
    adam_beta2=0.95,
    torch_compile=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=collate_fn,
)

print("[*] Starting training...")
trainer.train()
# Model and tokenizer go to the same directory so it can be loaded directly
# with from_pretrained("./Chimera-FINAL").
trainer.save_model("./Chimera-FINAL")
tokenizer.save_pretrained("./Chimera-FINAL")
print("[*] Training finished.")
train_tokenizer.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Trains the 32k byte-level BPE tokenizer on the first 500,000 documents of
# the FineWeb-Edu "sample-10BT" stream and writes
# custom_llama_tokenizer-vocab.json / -merges.txt to the current directory.
print("[*] Loading libraries...")
from itertools import islice

from datasets import load_dataset
from tokenizers import ByteLevelBPETokenizer
from tqdm import tqdm

dataset = load_dataset("HuggingFaceFW/fineweb-edu", "sample-10BT", split="train", streaming=True)


def get_training_corpus():
    """Yield the text of at most 500,000 documents from the streamed dataset."""
    # islice simply stops if the stream ends early. Calling next() on the
    # iterator inside a generator would instead turn the resulting
    # StopIteration into a RuntimeError (PEP 479) and abort tokenizer training.
    limited = islice(dataset, 500_000)
    for example in tqdm(limited, total=500_000, desc="Feeding data"):
        yield example["text"]


tokenizer = ByteLevelBPETokenizer()

print("[*] Training tokenizer...")

tokenizer.train_from_iterator(
    get_training_corpus(),
    vocab_size=32_000,
    min_frequency=2,
    show_progress=True,
    # Listed in the order the special tokens should be registered.
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)

tokenizer.save_model(".", "custom_llama_tokenizer")
print("[*] Tokenizer training complete!")
training.log ADDED
The diff for this file is too large to render. See raw diff
 
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9855e7302ec83f189fff07d302011b54df5845f2cffd9bb57def7baaef536fc
3
+ size 5265