Update README.md
Browse files
README.md
CHANGED
|
@@ -54,4 +54,181 @@ LargeWord was trained on a NVIDIA RTX 2060 6GB for 2 epochs with a batch size of
|
|
| 54 |
| 1500 | 0.91 | 1.3247 | 3.76 | 1.2682 | 3.55 |
|
| 55 |
| 2000 | 1.21 | 1.2120 | 3.36 | 1.2026 | 3.33 |
|
| 56 |
| 2500 | 1.51 | 1.1619 | 3.20 | 1.1667 | 3.21 |
|
| 57 |
-
| 3000 | 1.82 | 1.1314 | 3.10 | 1.1378 | 3.12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
| 1500 | 0.91 | 1.3247 | 3.76 | 1.2682 | 3.55 |
|
| 55 |
| 2000 | 1.21 | 1.2120 | 3.36 | 1.2026 | 3.33 |
|
| 56 |
| 2500 | 1.51 | 1.1619 | 3.20 | 1.1667 | 3.21 |
|
| 57 |
+
| 3000 | 1.82 | 1.1314 | 3.10 | 1.1378 | 3.12 |
|
| 58 |
+
|
| 59 |
+

|
| 60 |
+
|
| 61 |
+
## Generations
|
| 62 |
+
|
| 63 |
+
Prompt: `w`
|
| 64 |
+
|
| 65 |
+
Output:
|
| 66 |
+
```
|
| 67 |
+
weldosfish's
|
| 68 |
+
```
|
| 69 |
+
|
| 70 |
+
Prompt: `app`
|
| 71 |
+
|
| 72 |
+
Output:
|
| 73 |
+
```
|
| 74 |
+
appardness
|
| 75 |
+
```
|
| 76 |
+
|
| 77 |
+
Prompt: `z`
|
| 78 |
+
|
| 79 |
+
Output:
|
| 80 |
+
```
|
| 81 |
+
zeething's
|
| 82 |
+
```
|
| 83 |
+
|
| 84 |
+
## Use Cases
|
| 85 |
+
|
| 86 |
+
1. Education research
|
| 87 |
+
2. Morphology/phonetic research
|
| 88 |
+
3. Deployment on constrained devices
|
| 89 |
+
4. Or, more simply, for fun.
|
| 90 |
+
|
| 91 |
+
## Inference
|
| 92 |
+
|
| 93 |
+
```python
|
| 94 |
+
# =============================================================================
|
| 95 |
+
# LargeWord-1.5M — Inference
|
| 96 |
+
# =============================================================================
|
| 97 |
+
|
| 98 |
+
MODEL_DIR = "Harley-ml/LargeWord-1.5M" # path
|
| 99 |
+
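TOKENIZER_PATH = MODEL_DIR # NOTE: must exist on local disk; load_tokenizer() passes it as tokenizer_file (e.g. the path to tokenizer.json)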
|
| 100 |
+
|
| 101 |
+
# --- Generation settings ---
|
| 102 |
+
PROMPT = "a" # prompt
|
| 103 |
+
MAX_NEW_TOKENS = 16
|
| 104 |
+
TEMPERATURE = 1.2
|
| 105 |
+
TOP_P = 0.95
|
| 106 |
+
TOP_K = 200
|
| 107 |
+
REPETITION_PENALTY = 1.1
|
| 108 |
+
DO_SAMPLE = True
|
| 109 |
+
|
| 110 |
+
# =============================================================================
|
| 111 |
+
|
| 112 |
+
import torch
|
| 113 |
+
from pathlib import Path
|
| 114 |
+
from transformers import (
|
| 115 |
+
AutoModelForCausalLM,
|
| 116 |
+
PreTrainedTokenizerFast,
|
| 117 |
+
AddedToken,
|
| 118 |
+
)
|
| 119 |
+
|
| 120 |
+
# ---------------------------------------------------------------------------
|
| 121 |
+
# Device
|
| 122 |
+
# ---------------------------------------------------------------------------
|
| 123 |
+
|
| 124 |
+
device = (
|
| 125 |
+
"cuda" if torch.cuda.is_available() else
|
| 126 |
+
"mps" if torch.backends.mps.is_available() else
|
| 127 |
+
"cpu"
|
| 128 |
+
)
|
| 129 |
+
print(f"Device : {device}")
|
| 130 |
+
|
| 131 |
+
# ---------------------------------------------------------------------------
|
| 132 |
+
# Tokenizer (mirrors training setup)
|
| 133 |
+
# ---------------------------------------------------------------------------
|
| 134 |
+
|
| 135 |
+
def load_tokenizer(path: str):
|
| 136 |
+
p = Path(path).resolve()
|
| 137 |
+
if not p.exists():
|
| 138 |
+
raise FileNotFoundError(f"Tokenizer not found: {p}")
|
| 139 |
+
tok = PreTrainedTokenizerFast(tokenizer_file=str(p))
|
| 140 |
+
specials = {}
|
| 141 |
+
if tok.bos_token is None: specials["bos_token"] = AddedToken("<|bos|>", special=True)
|
| 142 |
+
if tok.eos_token is None: specials["eos_token"] = AddedToken("<|eos|>", special=True)
|
| 143 |
+
if tok.unk_token is None: specials["unk_token"] = AddedToken("<|unk|>", special=True)
|
| 144 |
+
if tok.pad_token is None:
|
| 145 |
+
if tok.eos_token is not None:
|
| 146 |
+
tok.pad_token = tok.eos_token
|
| 147 |
+
else:
|
| 148 |
+
specials["pad_token"] = AddedToken("<|pad|>", special=True)
|
| 149 |
+
if specials:
|
| 150 |
+
tok.add_special_tokens(specials)
|
| 151 |
+
tok.padding_side = "left" # left-pad for batched generation
|
| 152 |
+
return tok
|
| 153 |
+
|
| 154 |
+
print("Loading tokenizer...")
|
| 155 |
+
tokenizer = load_tokenizer(TOKENIZER_PATH)
|
| 156 |
+
print(f" Vocab size : {tokenizer.vocab_size}")
|
| 157 |
+
print(f" BOS : {tokenizer.bos_token!r}")
|
| 158 |
+
print(f" EOS : {tokenizer.eos_token!r}")
|
| 159 |
+
print(f" PAD : {tokenizer.pad_token!r} (id={tokenizer.pad_token_id})")
|
| 160 |
+
|
| 161 |
+
# ---------------------------------------------------------------------------
|
| 162 |
+
# Model
|
| 163 |
+
# ---------------------------------------------------------------------------
|
| 164 |
+
|
| 165 |
+
print(f"\nLoading model from {MODEL_DIR} ...")
|
| 166 |
+
model = AutoModelForCausalLM.from_pretrained(
|
| 167 |
+
MODEL_DIR,
|
| 168 |
+
dtype=torch.float16 if device == "cuda" else torch.float32,
|
| 169 |
+
low_cpu_mem_usage=True,
|
| 170 |
+
)
|
| 171 |
+
model.eval()
|
| 172 |
+
model.to(device)
|
| 173 |
+
|
| 174 |
+
total_params = sum(p.numel() for p in model.parameters())
|
| 175 |
+
print(f" Parameters : {total_params:,}")
|
| 176 |
+
|
| 177 |
+
# ---------------------------------------------------------------------------
|
| 178 |
+
# Generation helper
|
| 179 |
+
# ---------------------------------------------------------------------------
|
| 180 |
+
|
| 181 |
+
def generate(
|
| 182 |
+
prompt: str = PROMPT,
|
| 183 |
+
max_new_tokens: int = MAX_NEW_TOKENS,
|
| 184 |
+
temperature: float = TEMPERATURE,
|
| 185 |
+
top_p: float = TOP_P,
|
| 186 |
+
top_k: int = TOP_K,
|
| 187 |
+
repetition_penalty: float = REPETITION_PENALTY,
|
| 188 |
+
do_sample: bool = DO_SAMPLE,
|
| 189 |
+
) -> str:
|
| 190 |
+
|
| 191 |
+
bos = tokenizer.bos_token or ""
|
| 192 |
+
full_prompt = bos + prompt
|
| 193 |
+
|
| 194 |
+
inputs = tokenizer(
|
| 195 |
+
full_prompt,
|
| 196 |
+
return_tensors="pt",
|
| 197 |
+
add_special_tokens=False,
|
| 198 |
+
).to(device)
|
| 199 |
+
inputs.pop("token_type_ids", None) # Qwen3 doesn't use this
|
| 200 |
+
|
| 201 |
+
gen_kwargs = dict(
|
| 202 |
+
max_new_tokens = max_new_tokens,
|
| 203 |
+
do_sample = do_sample,
|
| 204 |
+
repetition_penalty = repetition_penalty,
|
| 205 |
+
eos_token_id = tokenizer.eos_token_id,
|
| 206 |
+
pad_token_id = tokenizer.pad_token_id,
|
| 207 |
+
)
|
| 208 |
+
if do_sample:
|
| 209 |
+
gen_kwargs["temperature"] = temperature
|
| 210 |
+
gen_kwargs["top_p"] = top_p
|
| 211 |
+
gen_kwargs["top_k"] = top_k
|
| 212 |
+
|
| 213 |
+
with torch.inference_mode():
|
| 214 |
+
output_ids = model.generate(**inputs, **gen_kwargs)
|
| 215 |
+
|
| 216 |
+
# Strip the prompt tokens so we only return what was generated
|
| 217 |
+
prompt_len = inputs["input_ids"].shape[-1]
|
| 218 |
+
new_ids = output_ids[0][prompt_len:]
|
| 219 |
+
return tokenizer.decode(new_ids, skip_special_tokens=True)
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
# ---------------------------------------------------------------------------
|
| 223 |
+
# Run
|
| 224 |
+
# ---------------------------------------------------------------------------
|
| 225 |
+
|
| 226 |
+
if __name__ == "__main__":
|
| 227 |
+
print(f"\nPrompt : {PROMPT!r}")
|
| 228 |
+
print("-" * 60)
|
| 229 |
+
|
| 230 |
+
output = generate(PROMPT)
|
| 231 |
+
|
| 232 |
+
print("Generated:")
|
| 233 |
+
print(output)
|
| 234 |
+
```
|