Lgr54HFi
/

chimera

chimera51

custom_code

Model card Files Files and versions

xet

Community

Lgr54HFi committed 13 days ago

Commit

c4fa83f

verified ·

1 Parent(s): 092c193

Upload train.py

Browse files

Files changed (1) hide show

train.py +138 -12

train.py CHANGED Viewed

@@ -13,6 +13,7 @@ Optimizations implemented:
   7. Intel IPEX integration (optional) — auto-detected
   8. Cosine LR with warmup
   9. Standard AdamW with backprop as fallback mode
 Usage:
   # MeZO mode (recommended for CPU — no backward pass):
@@ -21,8 +22,11 @@ Usage:
   # AdamW mode (standard backprop with gradient checkpointing + bf16):
   python train.py --optimizer adamw --scale tiny --seq_len 64 --max_steps 100
-  # Full run:
-  python train.py --optimizer mezo --scale small --seq_len 256 --max_steps 10000 --compile
 """
 import os
@@ -255,25 +259,120 @@ class TokenDataset(Dataset):
         return {"input_ids": self.chunks[idx], "labels": self.chunks[idx]}
-def build_dataset(seq_len: int, max_samples=None, split: str = "train"):
-    """Build dataset from TinyStories with splintr tokenizer."""
     from datasets import load_dataset
     from chimera import ChimeraTokenizer
-    print(f"[DATA] Loading TinyStories ({split})...")
-    ds = load_dataset("roneneldan/TinyStories", split=split, streaming=True)
     print(f"[DATA] Loading tokenizer (splintr o200k_base)...")
     tok = ChimeraTokenizer(pretrained="o200k_base")
     all_ids = []
     target = max_samples * (seq_len + 1) if max_samples else float('inf')
     for i, ex in enumerate(ds):
-        all_ids.extend(tok.encode(ex["text"], add_special_tokens=False))
         all_ids.append(tok.eos_token_id)
         if len(all_ids) >= target:
             break
-        if (i + 1) % 10000 == 0:
-            print(f"  {i + 1} texts, {len(all_ids):,} tokens...")
     all_ids = torch.tensor(all_ids, dtype=torch.long)
     n = len(all_ids) // (seq_len + 1)
@@ -387,6 +486,11 @@ def train(args):
     print(f"Device:       CPU ({torch.get_num_threads()} threads)")
     print(f"IPEX:         {HAS_IPEX}")
     print(f"Tokenizer:    splintr o200k_base ({config['vocab_size']} tokens)")
     # ─── Build model ───
     model = Chimera51ForCausalLM(config)
@@ -423,8 +527,16 @@ def train(args):
         print("[OPT] Compilation deferred (will compile on first forward pass)")
     # ─── Dataset ───
-    dataset, tok = build_dataset(args.seq_len, max_samples=args.max_samples,
-                                  split="train")
     loader = DataLoader(
         dataset,
         batch_size=args.batch_size,
@@ -616,7 +728,21 @@ if __name__ == "__main__":
                    action="store_false", default=True,
                    help="Regenerate directions instead of caching them for the step")
-    # Data
     p.add_argument("--num_workers", type=int, default=4)
     p.add_argument("--log_every", type=int, default=10)
     p.add_argument("--save_every", type=int, default=1000)

   7. Intel IPEX integration (optional) — auto-detected
   8. Cosine LR with warmup
   9. Standard AdamW with backprop as fallback mode
+  10. Generic dataset loading — supports any HF dataset, messages/text columns, category filtering
 Usage:
   # MeZO mode (recommended for CPU — no backward pass):
   # AdamW mode (standard backprop with gradient checkpointing + bf16):
   python train.py --optimizer adamw --scale tiny --seq_len 64 --max_steps 100
+  # Full run with custom dataset and category filter:
+  python train.py --optimizer mezo --scale tiny --seq_len 64 --max_steps 10000 \
+    --dataset_name Roman1111111/claude-sonnet-4.6-120000x \
+    --dataset_split train --text_column messages \
+    --category_filter "C++,organic chemistry"
 """
 import os
         return {"input_ids": self.chunks[idx], "labels": self.chunks[idx]}
+def _matches_category_filter(ex: dict, filters: list) -> bool:
+    """Check if example matches any of the requested category substrings."""
+    cat = ex.get("category", "")
+    if not cat:
+        return False
+    cat_lower = cat.lower()
+    return any(f.lower() in cat_lower for f in filters)
+def _format_example(ex: dict, tok, text_column: str = "auto", include_reasoning: bool = False) -> str:
+    """Convert an example dict to a single text string for tokenization."""
+    # Auto-detect text column
+    if text_column == "auto":
+        if "messages" in ex:
+            text_column = "messages"
+        elif "text" in ex:
+            text_column = "text"
+        elif "content" in ex:
+            text_column = "content"
+        elif "conversation" in ex:
+            text_column = "conversation"
+        else:
+            text_column = None
+    if text_column == "messages" and "messages" in ex:
+        msgs = ex["messages"]
+        # Inject reasoning into assistant messages if requested
+        if include_reasoning and isinstance(msgs, list):
+            msgs = []
+            for m in ex["messages"]:
+                if isinstance(m, dict) and m.get("role") == "assistant" and "reasoning" in m:
+                    content = f"<|thinking|>\n{m['reasoning']}\n<|/thinking|>\n{m.get('content', '')}"
+                    msgs.append({"role": "assistant", "content": content})
+                else:
+                    msgs.append(m)
+        return tok.apply_chat_template(msgs)
+    if text_column and text_column in ex:
+        val = ex[text_column]
+        if isinstance(val, str):
+            return val
+        # Some datasets store conversation as list of dicts even in 'text' col
+        if isinstance(val, list) and len(val) > 0 and isinstance(val[0], dict):
+            return tok.apply_chat_template(val)
+        return str(val)
+    # Fallback: stringify the whole example
+    return str(ex)
+def build_dataset(seq_len: int, max_samples=None, split: str = "train",
+                  dataset_name: str = "roneneldan/TinyStories",
+                  dataset_config: str = None,
+                  text_column: str = "auto",
+                  category_filter: str = None,
+                  include_reasoning: bool = False):
+    """Build dataset from any HuggingFace dataset with splintr tokenizer.
+    Supports:
+      - Generic text columns ('text', 'content', etc.)
+      - Messages/chat format (auto-detected, uses apply_chat_template)
+      - Category filtering (comma-separated substrings)
+      - Streaming for huge datasets
+    """
     from datasets import load_dataset
     from chimera import ChimeraTokenizer
+    print(f"[DATA] Loading {dataset_name} ({split})...")
+    load_kwargs = {"split": split, "streaming": True}
+    if dataset_config:
+        load_kwargs["name"] = dataset_config
+    ds = load_dataset(dataset_name, **load_kwargs)
     print(f"[DATA] Loading tokenizer (splintr o200k_base)...")
     tok = ChimeraTokenizer(pretrained="o200k_base")
+    # Parse category filters
+    cat_filters = None
+    if category_filter:
+        cat_filters = [c.strip() for c in category_filter.split(",") if c.strip()]
+        print(f"[DATA] Filtering categories: {cat_filters}")
     all_ids = []
     target = max_samples * (seq_len + 1) if max_samples else float('inf')
+    processed = 0
+    skipped = 0
     for i, ex in enumerate(ds):
+        # Category filter
+        if cat_filters and not _matches_category_filter(ex, cat_filters):
+            skipped += 1
+            continue
+        text = _format_example(ex, tok, text_column, include_reasoning)
+        if not text or not text.strip():
+            skipped += 1
+            continue
+        all_ids.extend(tok.encode(text, add_special_tokens=False))
         all_ids.append(tok.eos_token_id)
+        processed += 1
         if len(all_ids) >= target:
             break
+        if (processed + 1) % 10000 == 0:
+            print(f"  {processed:,} examples, {len(all_ids):,} tokens...")
+    print(f"[DATA] Processed {processed:,} examples, skipped {skipped:,} (category/text mismatch)")
+    if len(all_ids) == 0:
+        raise ValueError(
+            f"No data matched filters. dataset={dataset_name}, "
+            f"category_filter={category_filter}, text_column={text_column}"
+        )
     all_ids = torch.tensor(all_ids, dtype=torch.long)
     n = len(all_ids) // (seq_len + 1)
     print(f"Device:       CPU ({torch.get_num_threads()} threads)")
     print(f"IPEX:         {HAS_IPEX}")
     print(f"Tokenizer:    splintr o200k_base ({config['vocab_size']} tokens)")
+    print(f"Dataset:      {args.dataset_name} / {args.dataset_split}")
+    if args.category_filter:
+        print(f"Category filter: {args.category_filter}")
+    if args.include_reasoning:
+        print("Reasoning:    INCLUDED (<|thinking|> ... <|/thinking|>)")
     # ─── Build model ───
     model = Chimera51ForCausalLM(config)
         print("[OPT] Compilation deferred (will compile on first forward pass)")
     # ─── Dataset ───
+    dataset, tok = build_dataset(
+        args.seq_len,
+        max_samples=args.max_samples,
+        split=args.dataset_split,
+        dataset_name=args.dataset_name,
+        dataset_config=args.dataset_config,
+        text_column=args.text_column,
+        category_filter=args.category_filter,
+        include_reasoning=args.include_reasoning,
+    )
     loader = DataLoader(
         dataset,
         batch_size=args.batch_size,
                    action="store_false", default=True,
                    help="Regenerate directions instead of caching them for the step")
+    # Data — fully configurable
+    p.add_argument("--dataset_name", default="roneneldan/TinyStories",
+                   help="HuggingFace dataset name (e.g. Roman1111111/claude-sonnet-4.6-120000x)")
+    p.add_argument("--dataset_config", default=None,
+                   help="Dataset config/subset name")
+    p.add_argument("--dataset_split", default="train",
+                   help="Dataset split to use")
+    p.add_argument("--text_column", default="auto",
+                   help="Column containing text. 'auto' detects 'messages'/'text'/'content'/'conversation'")
+    p.add_argument("--category_filter", default=None,
+                   help="Comma-separated category substrings to filter on (e.g. 'C++,python,math')")
+    p.add_argument("--include_reasoning", action="store_true", default=False,
+                   help="Include reasoning/thinking content from assistant messages as <|thinking|>...<|/thinking|>")
+    # Logging / Output
     p.add_argument("--num_workers", type=int, default=4)
     p.add_argument("--log_every", type=int, default=10)
     p.add_argument("--save_every", type=int, default=1000)