| --- |
| license: mit |
| language: |
| - en |
| - es |
| tags: |
| - small |
| - tiny |
| - tinyword |
| - theword |
| - harley-ml |
| - small-language-model |
| - word-generation |
| - word-generator |
| - text-generation |
| - qwen3 |
| datasets: |
| - Harley-ml/es-en-words |
| --- |
| |
| # TinyWord-v2-128k |
|
|
| TinyWord-v2 is a revamped and retrained version of v1. In v1, we noticed that weight tying was not enabled, which ate up about half of its parameters. This was misleading, as its effective size was the same as MicroWord's. |
| This version achieves much better performance than v1. |
|
|
| ## Architecture |
|
|
| | Parameter | Value | |
| |---|---| |
| | Hidden Layers | 2 | |
| | Hidden Size | 48 | |
| | Attention Heads | 1 | |
| | KV Heads | 1 | |
| | Vocab Size | 1,200 | |
| | Intermediate Size | 160 | |
| | RoPE Theta | 1,000 | |
| | Max Position Embeddings | 32 | |
| | Tie Word Embeddings | True | |
|
|
| ## Training |
|
|
| TinyWord-v2 was trained on 753,232 unique words (entries), totaling 3,225,398 tokens and 7,022,310 characters. Roughly 660k of those words are English and roughly 90k are Spanish. |
|
|
| ### Dataset |
|
|
| | Key | Value | |
| | :---------------------: | :-------: | |
| | Entries (words) | 753,232 | |
| | Tokens | 3,225,398 | |
| | Characters | 7,022,310 | |
| | Avg. Tokens Per Entry | ~4.2 | |
| | Avg. Words Per Entry | 1 | |
| | Avg. Chars Per Entry | ~9.3 | |
| | Longest Entry (Tokens) | 36 | |
| | Shortest Entry (Tokens) | 1 | |
| | English Words | ~660k | |
| | Spanish Words | ~90k | |
|
|
|
|
| ### Hardware |
|
|
| TinyWord-v2 was trained on an NVIDIA RTX 2060 6GB for 6 epochs with a batch size of 32. |
|
|
| ### Training Results |
|
|
| | Step | Train Loss | Val Loss | Train PPL | Val PPL | |
| |---|---|---|---|---| |
| | 2000 | 3.0579 | 2.5138 | 21.28 | 12.35 | |
| | 4000 | 2.0494 | 1.9456 | 7.76 | 6.99 | |
| | 6000 | 1.8572 | 1.7965 | 6.40 | 6.03 | |
| | 8000 | 1.7822 | 1.7294 | 5.94 | 5.64 | |
| | 10000 | 1.7360 | 1.6932 | 5.67 | 5.44 | |
|
|
| ## Generations |
|
|
| Prompt: `w` |
|
|
| Output: |
| ``` |
| wrtervulatoration |
| ``` |
|
|
| Prompt: `app` |
|
|
| Output: |
| ``` |
| appatating |
| ``` |
|
|
| Prompt: `a` |
|
|
| Output: |
| ``` |
| ay's |
| ``` |
|
|
| Prompt: `z` |
|
|
| Output: |
| ``` |
| aceae |
| ``` |
|
|
| ## Limitations |
|
|
| 1. It does not generate sentences, prose, code, or anything besides a single word-like sequence. |
| 2. It cannot reason or produce complex language. |
| 3. Generated words may not be real. The goal isn't real word generation but reflecting the lexicon and morphology of the English and Spanish languages through tiny language models. |
| 4. Output is non-deterministic. The same prompt can produce very different completions across runs. |
|
|
| ## Inference |
|
|
| ```python |
| # ============================================================================= |
| # Inference |
| # ============================================================================= |
| |
| MODEL_DIR = "Harley-ml/TinyWord2-128k" # Hub repo ID passed to from_pretrained(); a local model directory also works |
| TOKENIZER_PATH = "Harley-ml/TinyWord2-128k" |
| |
| # --- Generation settings --- |
| PROMPT = "w" # prefix to continue; generate() prepends BOS and returns only the new tokens |
| MAX_NEW_TOKENS = 32 |
| TEMPERATURE = 1.2 |
| TOP_P = 0.95 |
| TOP_K = 50 |
| REPETITION_PENALTY = 1.1 |
| DO_SAMPLE = True |
| |
| # ============================================================================= |
| |
| import torch |
| from pathlib import Path |
| from transformers import ( |
| AutoModelForCausalLM, |
| PreTrainedTokenizerFast, |
| AddedToken, |
| ) |
| |
| # --------------------------------------------------------------------------- |
| # Device: use CUDA when available, otherwise MPS, otherwise fall back to CPU |
| # --------------------------------------------------------------------------- |
| |
| device = ( |
| "cuda" if torch.cuda.is_available() else |
| "mps" if torch.backends.mps.is_available() else |
| "cpu" |
| ) |
| print(f"Device : {device}") |
| |
| # --------------------------------------------------------------------------- |
| # Tokenizer (mirrors training setup) |
| # --------------------------------------------------------------------------- |
| |
| def load_tokenizer(path: str): |
| p = Path(path).resolve() |
| if not p.exists(): |
| raise FileNotFoundError(f"Tokenizer not found: {p}") |
| tok = PreTrainedTokenizerFast(tokenizer_file=str(p)) |
| specials = {} |
| if tok.bos_token is None: specials["bos_token"] = AddedToken("<|bos|>", special=True) |
| if tok.eos_token is None: specials["eos_token"] = AddedToken("<|eos|>", special=True) |
| if tok.unk_token is None: specials["unk_token"] = AddedToken("<|unk|>", special=True) |
| if tok.pad_token is None: |
| if tok.eos_token is not None: |
| tok.pad_token = tok.eos_token |
| else: |
| specials["pad_token"] = AddedToken("<|pad|>", special=True) |
| if specials: |
| tok.add_special_tokens(specials) |
| tok.padding_side = "left" # left-pad for batched generation |
| return tok |
| |
| print("Loading tokenizer...") |
| tokenizer = load_tokenizer(TOKENIZER_PATH) |
| print(f" Vocab size : {tokenizer.vocab_size}") |
| print(f" BOS : {tokenizer.bos_token!r}") |
| print(f" EOS : {tokenizer.eos_token!r}") |
| print(f" PAD : {tokenizer.pad_token!r} (id={tokenizer.pad_token_id})") |
| |
| # --------------------------------------------------------------------------- |
| # Model |
| # --------------------------------------------------------------------------- |
| |
| print(f"\nLoading model from {MODEL_DIR} ...") |
| model = AutoModelForCausalLM.from_pretrained( |
| MODEL_DIR, |
| dtype=torch.float16 if device == "cuda" else torch.float32, |
| low_cpu_mem_usage=True, |
| ) |
| model.eval() |
| model.to(device) |
| |
| total_params = sum(p.numel() for p in model.parameters()) |
| print(f" Parameters : {total_params:,}") |
| |
| # --------------------------------------------------------------------------- |
| # Generation helper |
| # --------------------------------------------------------------------------- |
| |
| def generate( |
| prompt: str = PROMPT, |
| max_new_tokens: int = MAX_NEW_TOKENS, |
| temperature: float = TEMPERATURE, |
| top_p: float = TOP_P, |
| top_k: int = TOP_K, |
| repetition_penalty: float = REPETITION_PENALTY, |
| do_sample: bool = DO_SAMPLE, |
| ) -> str: |
| |
| bos = tokenizer.bos_token or "" |
| full_prompt = bos + prompt |
| |
| inputs = tokenizer( |
| full_prompt, |
| return_tensors="pt", |
| add_special_tokens=False, |
| ).to(device) |
| inputs.pop("token_type_ids", None) # Qwen3 doesn't use this |
| |
| gen_kwargs = dict( |
| max_new_tokens = max_new_tokens, |
| do_sample = do_sample, |
| repetition_penalty = repetition_penalty, |
| eos_token_id = tokenizer.eos_token_id, |
| pad_token_id = tokenizer.pad_token_id, |
| ) |
| if do_sample: |
| gen_kwargs["temperature"] = temperature |
| gen_kwargs["top_p"] = top_p |
| gen_kwargs["top_k"] = top_k |
| |
| with torch.inference_mode(): |
| output_ids = model.generate(**inputs, **gen_kwargs) |
| |
| # Strip the prompt tokens so we only return what was generated |
| prompt_len = inputs["input_ids"].shape[-1] |
| new_ids = output_ids[0][prompt_len:] |
| return tokenizer.decode(new_ids, skip_special_tokens=True) |
| |
| |
| # --------------------------------------------------------------------------- |
| # Run |
| # --------------------------------------------------------------------------- |
| |
| if __name__ == "__main__": |
| print(f"\nPrompt : {PROMPT!r}") |
| print("-" * 60) |
| |
| output = generate(PROMPT) |
| |
| print("Generated:") |
| print(output) |
| ``` |
|
|
| ## Related Models |
|
|
| 1. [PicoWord](https://huggingface.co/Harley-ml/PicoWord-5k) |
| 2. [MicroWord](https://huggingface.co/Harley-ml/MicroWord-23k) |
| 3. [TinyWord](https://huggingface.co/Harley-ml/TinyWord-134k) |
| 4. [MediumWord](https://huggingface.co/Harley-ml/MediumWord-559k) |
| 5. [LargeWord](https://huggingface.co/Harley-ml/LargeWord-1.5M) |
|
|
| ## Citation |
|
|
| ```bibtex |
| @misc{tinyword2-128k, |
| title = {TinyWord-v2-128k: A Test of Morphological Compression in TLMs}, |
| author = {Harley-ml}, |
| year = {2026}, |
| url = {https://huggingface.co/Harley-ml/TinyWord2-128k} |
| } |
| ``` |