Harley-ml
/

LargeWord-1.5M

@@ -54,4 +54,181 @@ LargeWord was trained on a NVIDIA RTX 2060 6GB for 2 epochs with a batch size of
 | 1500 | 0.91  | 1.3247     | 3.76      | 1.2682    | 3.55     |
 | 2000 | 1.21  | 1.2120     | 3.36      | 1.2026    | 3.33     |
 | 2500 | 1.51  | 1.1619     | 3.20      | 1.1667    | 3.21     |
-| 3000 | 1.82  | 1.1314     | 3.10      | 1.1378    | 3.12     |

 | 1500 | 0.91  | 1.3247     | 3.76      | 1.2682    | 3.55     |
 | 2000 | 1.21  | 1.2120     | 3.36      | 1.2026    | 3.33     |
 | 2500 | 1.51  | 1.1619     | 3.20      | 1.1667    | 3.21     |
+| 3000 | 1.82  | 1.1314     | 3.10      | 1.1378    | 3.12     |
+![Training and Evaluation Curves](images/training_graph.png)
+## Generations
+Prompt: `w`
+Output:
+```
+weldosfish's
+```
+Prompt: `app`
+Output:
+```
+appardness
+```
+Prompt: `z`
+Output:
+```
+zeething's
+```
+## Use Cases
+1. Educational research
+2. Morphology/phonetic research
+3. Deployment on constrained devices
+4. Or, more simply, for fun.
+# Inference
+```python
+# =============================================================================
+# LargeWord-1.5M — Inference
+# =============================================================================
+# Model location handed to AutoModelForCausalLM.from_pretrained() below.
+MODEL_DIR      = "Harley-ml/LargeWord-1.5M"   # path
+# Tokenizer location handed to load_tokenizer(); defaults to the model location.
+TOKENIZER_PATH = MODEL_DIR
+# --- Generation settings ---
+# These are the default argument values of generate().
+PROMPT             = "a"   # prompt
+MAX_NEW_TOKENS     = 16    # forwarded as max_new_tokens
+# TEMPERATURE / TOP_P / TOP_K are only forwarded when DO_SAMPLE is True.
+TEMPERATURE        = 1.2
+TOP_P              = 0.95
+TOP_K              = 200
+REPETITION_PENALTY = 1.1   # always forwarded, sampling or not
+DO_SAMPLE          = True
+# =============================================================================
+import torch
+from pathlib import Path
+from transformers import (
+    AutoModelForCausalLM,
+    PreTrainedTokenizerFast,
+    AddedToken,
+)
+# ---------------------------------------------------------------------------
+# Device
+# ---------------------------------------------------------------------------
+device = (
+    "cuda" if torch.cuda.is_available() else
+    "mps"  if torch.backends.mps.is_available() else
+    "cpu"
+)
+print(f"Device : {device}")
+# ---------------------------------------------------------------------------
+# Tokenizer  (mirrors training setup)
+# ---------------------------------------------------------------------------
+def load_tokenizer(path: str):
+    p = Path(path).resolve()
+    if not p.exists():
+        raise FileNotFoundError(f"Tokenizer not found: {p}")
+    tok = PreTrainedTokenizerFast(tokenizer_file=str(p))
+    specials = {}
+    if tok.bos_token is None: specials["bos_token"] = AddedToken("<|bos|>", special=True)
+    if tok.eos_token is None: specials["eos_token"] = AddedToken("<|eos|>", special=True)
+    if tok.unk_token is None: specials["unk_token"] = AddedToken("<|unk|>", special=True)
+    if tok.pad_token is None:
+        if tok.eos_token is not None:
+            tok.pad_token = tok.eos_token
+        else:
+            specials["pad_token"] = AddedToken("<|pad|>", special=True)
+    if specials:
+        tok.add_special_tokens(specials)
+    tok.padding_side = "left"  # left-pad for batched generation
+    return tok
+print("Loading tokenizer...")
+tokenizer = load_tokenizer(TOKENIZER_PATH)
+print(f"  Vocab size : {tokenizer.vocab_size}")
+print(f"  BOS        : {tokenizer.bos_token!r}")
+print(f"  EOS        : {tokenizer.eos_token!r}")
+print(f"  PAD        : {tokenizer.pad_token!r}  (id={tokenizer.pad_token_id})")
+# ---------------------------------------------------------------------------
+# Model
+# ---------------------------------------------------------------------------
+print(f"\nLoading model from {MODEL_DIR} ...")
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_DIR,
+    dtype=torch.float16 if device == "cuda" else torch.float32,
+    low_cpu_mem_usage=True,
+)
+model.eval()
+model.to(device)
+total_params = sum(p.numel() for p in model.parameters())
+print(f"  Parameters : {total_params:,}")
+# ---------------------------------------------------------------------------
+# Generation helper
+# ---------------------------------------------------------------------------
+def generate(
+    prompt: str             = PROMPT,
+    max_new_tokens: int     = MAX_NEW_TOKENS,
+    temperature: float      = TEMPERATURE,
+    top_p: float            = TOP_P,
+    top_k: int              = TOP_K,
+    repetition_penalty: float = REPETITION_PENALTY,
+    do_sample: bool         = DO_SAMPLE,
+) -> str:
+    bos         = tokenizer.bos_token or ""
+    full_prompt = bos + prompt
+    inputs = tokenizer(
+        full_prompt,
+        return_tensors="pt",
+        add_special_tokens=False,
+    ).to(device)
+    inputs.pop("token_type_ids", None)  # Qwen3 doesn't use this
+    gen_kwargs = dict(
+        max_new_tokens     = max_new_tokens,
+        do_sample          = do_sample,
+        repetition_penalty = repetition_penalty,
+        eos_token_id       = tokenizer.eos_token_id,
+        pad_token_id       = tokenizer.pad_token_id,
+    )
+    if do_sample:
+        gen_kwargs["temperature"] = temperature
+        gen_kwargs["top_p"]       = top_p
+        gen_kwargs["top_k"]       = top_k
+    with torch.inference_mode():
+        output_ids = model.generate(**inputs, **gen_kwargs)
+    # Strip the prompt tokens so we only return what was generated
+    prompt_len = inputs["input_ids"].shape[-1]
+    new_ids    = output_ids[0][prompt_len:]
+    return tokenizer.decode(new_ids, skip_special_tokens=True)
+# ---------------------------------------------------------------------------
+# Run
+# ---------------------------------------------------------------------------
+if __name__ == "__main__":
+    print(f"\nPrompt : {PROMPT!r}")
+    print("-" * 60)
+    output = generate(PROMPT)
+    print("Generated:")
+    print(output)
+```