Harley-ml committed on
Commit
5aac0e4
·
verified ·
1 Parent(s): af8424d

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +178 -1
README.md CHANGED
@@ -54,4 +54,181 @@ LargeWord was trained on a NVIDIA RTX 2060 6GB for 2 epochs with a batch size of
54
  | 1500 | 0.91 | 1.3247 | 3.76 | 1.2682 | 3.55 |
55
  | 2000 | 1.21 | 1.2120 | 3.36 | 1.2026 | 3.33 |
56
  | 2500 | 1.51 | 1.1619 | 3.20 | 1.1667 | 3.21 |
57
- | 3000 | 1.82 | 1.1314 | 3.10 | 1.1378 | 3.12 |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  | 1500 | 0.91 | 1.3247 | 3.76 | 1.2682 | 3.55 |
55
  | 2000 | 1.21 | 1.2120 | 3.36 | 1.2026 | 3.33 |
56
  | 2500 | 1.51 | 1.1619 | 3.20 | 1.1667 | 3.21 |
57
+ | 3000 | 1.82 | 1.1314 | 3.10 | 1.1378 | 3.12 |
58
+
59
+ ![Training and Evaluation Curves](images/training_graph.png)
60
+
61
+ ## Generations
62
+
63
+ Prompt: `w`
64
+
65
+ Output:
66
+ ```
67
+ weldosfish's
68
+ ```
69
+
70
+ Prompt: `app`
71
+
72
+ Output:
73
+ ```
74
+ appardness
75
+ ```
76
+
77
+ Prompt: `z`
78
+
79
+ Output:
80
+ ```
81
+ zeething's
82
+ ```
83
+
84
+ ## Use Cases
85
+
86
+ 1. Education research
87
+ 2. Morphology/phonetic research
88
+ 3. Deployment on constrained devices
89
+ 4. Or, more simply, for fun.
90
+
91
+ # Inference
92
+
93
+ ```python
94
+ # =============================================================================
95
+ # LargeWord-1.5M — Inference
96
+ # =============================================================================
97
+
98
+ MODEL_DIR = "Harley-ml/LargeWord-1.5M" # path
99
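+ TOKENIZER_PATH = MODEL_DIR  # NOTE: load_tokenizer() resolves this as a local filesystem path and passes it as tokenizer_file; a Hub repo ID will raise FileNotFoundError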
100
+
101
+ # --- Generation settings ---
102
+ PROMPT = "a" # prompt
103
+ MAX_NEW_TOKENS = 16
104
+ TEMPERATURE = 1.2
105
+ TOP_P = 0.95
106
+ TOP_K = 200
107
+ REPETITION_PENALTY = 1.1
108
+ DO_SAMPLE = True
109
+
110
+ # =============================================================================
111
+
112
+ import torch
113
+ from pathlib import Path
114
+ from transformers import (
115
+ AutoModelForCausalLM,
116
+ PreTrainedTokenizerFast,
117
+ AddedToken,
118
+ )
119
+
120
+ # ---------------------------------------------------------------------------
121
+ # Device
122
+ # ---------------------------------------------------------------------------
123
+
124
+ device = (
125
+ "cuda" if torch.cuda.is_available() else
126
+ "mps" if torch.backends.mps.is_available() else
127
+ "cpu"
128
+ )
129
+ print(f"Device : {device}")
130
+
131
+ # ---------------------------------------------------------------------------
132
+ # Tokenizer (mirrors training setup)
133
+ # ---------------------------------------------------------------------------
134
+
135
+ def load_tokenizer(path: str):
136
+ p = Path(path).resolve()
137
+ if not p.exists():
138
+ raise FileNotFoundError(f"Tokenizer not found: {p}")
139
+ tok = PreTrainedTokenizerFast(tokenizer_file=str(p))
140
+ specials = {}
141
+ if tok.bos_token is None: specials["bos_token"] = AddedToken("<|bos|>", special=True)
142
+ if tok.eos_token is None: specials["eos_token"] = AddedToken("<|eos|>", special=True)
143
+ if tok.unk_token is None: specials["unk_token"] = AddedToken("<|unk|>", special=True)
144
+ if tok.pad_token is None:
145
+ if tok.eos_token is not None:
146
+ tok.pad_token = tok.eos_token
147
+ else:
148
+ specials["pad_token"] = AddedToken("<|pad|>", special=True)
149
+ if specials:
150
+ tok.add_special_tokens(specials)
151
+ tok.padding_side = "left" # left-pad for batched generation
152
+ return tok
153
+
154
+ print("Loading tokenizer...")
155
+ tokenizer = load_tokenizer(TOKENIZER_PATH)
156
+ print(f" Vocab size : {tokenizer.vocab_size}")
157
+ print(f" BOS : {tokenizer.bos_token!r}")
158
+ print(f" EOS : {tokenizer.eos_token!r}")
159
+ print(f" PAD : {tokenizer.pad_token!r} (id={tokenizer.pad_token_id})")
160
+
161
+ # ---------------------------------------------------------------------------
162
+ # Model
163
+ # ---------------------------------------------------------------------------
164
+
165
+ print(f"\nLoading model from {MODEL_DIR} ...")
166
+ model = AutoModelForCausalLM.from_pretrained(
167
+ MODEL_DIR,
168
+ dtype=torch.float16 if device == "cuda" else torch.float32,
169
+ low_cpu_mem_usage=True,
170
+ )
171
+ model.eval()
172
+ model.to(device)
173
+
174
+ total_params = sum(p.numel() for p in model.parameters())
175
+ print(f" Parameters : {total_params:,}")
176
+
177
+ # ---------------------------------------------------------------------------
178
+ # Generation helper
179
+ # ---------------------------------------------------------------------------
180
+
181
+ def generate(
182
+ prompt: str = PROMPT,
183
+ max_new_tokens: int = MAX_NEW_TOKENS,
184
+ temperature: float = TEMPERATURE,
185
+ top_p: float = TOP_P,
186
+ top_k: int = TOP_K,
187
+ repetition_penalty: float = REPETITION_PENALTY,
188
+ do_sample: bool = DO_SAMPLE,
189
+ ) -> str:
190
+
191
+ bos = tokenizer.bos_token or ""
192
+ full_prompt = bos + prompt
193
+
194
+ inputs = tokenizer(
195
+ full_prompt,
196
+ return_tensors="pt",
197
+ add_special_tokens=False,
198
+ ).to(device)
199
+ inputs.pop("token_type_ids", None) # Qwen3 doesn't use this
200
+
201
+ gen_kwargs = dict(
202
+ max_new_tokens = max_new_tokens,
203
+ do_sample = do_sample,
204
+ repetition_penalty = repetition_penalty,
205
+ eos_token_id = tokenizer.eos_token_id,
206
+ pad_token_id = tokenizer.pad_token_id,
207
+ )
208
+ if do_sample:
209
+ gen_kwargs["temperature"] = temperature
210
+ gen_kwargs["top_p"] = top_p
211
+ gen_kwargs["top_k"] = top_k
212
+
213
+ with torch.inference_mode():
214
+ output_ids = model.generate(**inputs, **gen_kwargs)
215
+
216
+ # Strip the prompt tokens so we only return what was generated
217
+ prompt_len = inputs["input_ids"].shape[-1]
218
+ new_ids = output_ids[0][prompt_len:]
219
+ return tokenizer.decode(new_ids, skip_special_tokens=True)
220
+
221
+
222
+ # ---------------------------------------------------------------------------
223
+ # Run
224
+ # ---------------------------------------------------------------------------
225
+
226
+ if __name__ == "__main__":
227
+ print(f"\nPrompt : {PROMPT!r}")
228
+ print("-" * 60)
229
+
230
+ output = generate(PROMPT)
231
+
232
+ print("Generated:")
233
+ print(output)
234
+ ```