Spaces:

jampuramprem
/

AxiomForgeAI

Sleeping

File size: 26,259 Bytes

ec4ae03

#!/usr/bin/env python3
"""
Combined dataset pipeline — NuminaMath-CoT + OpenMathInstruct-2
================================================================
Downloads, filters, normalises, and merges two large math datasets into a single
JSONL file (train / val / test) that the GRPO training script can consume directly
via --gsm8k-data.

Why these two datasets
----------------------
  NuminaMath-CoT  (AI-MO/NuminaMath-CoT)
      860 K problems.  Clean \\boxed{} answers.  7 rich topic categories that map
      directly to ZPD skill_ids.  Sources span AMC, AIME, Chinese HS, olympiads,
      and synthetic — giving natural difficulty diversity.

  OpenMathInstruct-2  (nvidia/OpenMathInstruct-2)
      14 M synthetic problems with step-level CoT.  `expected_answer` is pre-verified.
      Diverse surface forms prevent pattern memorisation.  We skip any row whose
      problem_source is "gsm8k" (already in prior training).

Output schema (identical to gsm8k_sft.jsonl / aqua_train.jsonl)
---------------------------------------------------------------
  {
    "id":       "<source>_<split>_<idx>",
    "skill_id": "<topic_slug>",        ← used by ZPD CurriculumManager
    "source":   "<hf_dataset_name>",
    "split":    "train" | "val" | "test",
    "difficulty": 1 | 2 | 3,          ← 1=easy 2=medium 3=hard (for ZPD)
    "task_type": "solve",
    "messages": [
        {"role": "system",    "content": SOLVER_SYSTEM_PROMPT},
        {"role": "user",      "content": "Solve ... Problem:\\n<question>"},
        {"role": "assistant", "content": "Step 1: ...\\nFinal Answer: <answer>"}
    ]
  }

Usage
-----
  # Quick test (no download, just show stats)
  python scripts/prepare_combined_dataset.py --dry-run

  # Full pipeline (default caps: 20 K numina + 15 K openmath)
  python scripts/prepare_combined_dataset.py

  # Larger run
  python scripts/prepare_combined_dataset.py --max-numina 40000 --max-openmath 30000

  # Only one source
  python scripts/prepare_combined_dataset.py --skip-openmath
  python scripts/prepare_combined_dataset.py --skip-numina

  # Custom output dir
  python scripts/prepare_combined_dataset.py --output-dir data/sft/combined
"""

from __future__ import annotations

import argparse
import hashlib
import json
import logging
import math
import random
import re
import sys
from collections import Counter, defaultdict
from pathlib import Path
from typing import Any, Dict, Iterator, List, Optional, Tuple

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s  %(levelname)-8s  %(message)s",
    datefmt="%H:%M:%S",
)
log = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Constants — kept in sync with src/config/prompts.py
# ---------------------------------------------------------------------------

SOLVER_SYSTEM_PROMPT = (
    "You are a step-by-step math solver. "
    "Solve the given problem one step at a time. "
    "Each step must be on its own line, starting with 'Step N:'. "
    "End with a line starting with 'Final Answer:'. "
    "Write every mathematical expression in Python/SymPy syntax "
    "so it can be verified programmatically."
)

USER_WRAPPER = (
    "Solve the following problem. Show your reasoning as numbered steps, "
    "then give the final numeric answer on the last line.\n\nProblem:\n{question}"
)

# ---------------------------------------------------------------------------
# Skill-ID mappings  (drives ZPD CurriculumManager per-topic mastery)
# ---------------------------------------------------------------------------

# NuminaMath-CoT  `type` field → skill_id
NUMINA_TYPE_TO_SKILL: Dict[str, str] = {
    "algebra":                   "numina_algebra",
    "intermediate_algebra":      "numina_algebra",
    "prealgebra":                "numina_prealgebra",
    "number_theory":             "numina_number_theory",
    "geometry":                  "numina_geometry",
    "counting_and_probability":  "numina_combinatorics",
    "precalculus":               "numina_calculus",
    "calculus":                  "numina_calculus",
    "statistics":                "numina_statistics",
    "probability":               "numina_statistics",
    # competition-source buckets (fallback when type not in map above)
    "cn_k12":                    "numina_algebra",
    "olympiads":                 "numina_olympiad",
    "amc_aime":                  "numina_competition",
    "synthetic_math":            "numina_synthetic",
}

# NuminaMath source → approximate difficulty (1=easy 2=medium 3=hard)
NUMINA_SOURCE_DIFFICULTY: Dict[str, int] = {
    "cn_k12":        1,
    "synthetic_math": 2,
    "amc_aime":       2,
    "olympiads":      3,
}

# OpenMathInstruct-2 problem_source → skill_id / difficulty
OPENMATH_SOURCE_TO_SKILL: Dict[str, str] = {
    "math":                  "openmath_algebra",   # overridden per-row by subject
    "amc_aime_1983_2024":    "openmath_competition",
    "synthetic_math":        "openmath_synthetic",
    "number_theory":         "openmath_number_theory",
}

OPENMATH_SOURCE_DIFFICULTY: Dict[str, int] = {
    "math":                 2,
    "amc_aime_1983_2024":   3,
    "synthetic_math":       1,
}

# OpenMathInstruct MATH-subject → skill_id (when problem_source == "math")
OPENMATH_MATH_SUBJECT_SKILL: Dict[str, str] = {
    "Algebra":                   "openmath_algebra",
    "Number Theory":             "openmath_number_theory",
    "Geometry":                  "openmath_geometry",
    "Counting & Probability":    "openmath_combinatorics",
    "Intermediate Algebra":      "openmath_algebra",
    "Prealgebra":                "openmath_prealgebra",
    "Precalculus":               "openmath_calculus",
    "Calculus":                  "openmath_calculus",
}

# ---------------------------------------------------------------------------
# Answer normalisation
# ---------------------------------------------------------------------------

_BOXED_RE = re.compile(r"\\boxed\{((?:[^{}]|\{[^{}]*\})*)\}")
_LATEX_FRAC = re.compile(r"\\frac\{(\d+)\}\{(\d+)\}")
_PLAIN_FRAC = re.compile(r"^(-?\d+)\s*/\s*(\d+)$")
_CURRENCY    = re.compile(r"(?:Rs\.?|USD|\$|€|£)\s*", re.IGNORECASE)
_UNICODE_MINUS = str.maketrans({"\u2212": "-", "−": "-"})


def extract_boxed(text: str) -> Optional[str]:
    """Return the last \\boxed{} contents from a solution string."""
    matches = _BOXED_RE.findall(text)
    return matches[-1].strip() if matches else None


def normalise_numeric(raw: str) -> Optional[str]:
    """
    Convert a raw answer string to a clean numeric string.

    Returns None for:
      - multi-value answers ("3 and 5")
      - symbolic expressions ("3\\sqrt{2}", "x+1")
      - inequalities
      - fractions where num/den exceed safe range
    """
    text = raw.strip()

    # Remove currency symbols and commas in numbers
    text = _CURRENCY.sub("", text)
    text = text.replace(",", "").translate(_UNICODE_MINUS).strip()

    # Skip if still contains words other than units
    if re.search(r"\b(and|or|none|no solution|undefined)\b", text, re.IGNORECASE):
        return None

    # Skip if contains letters (symbolic)
    if re.search(r"[a-zA-Z]", text):
        return None

    # Skip inequalities / ranges
    if re.search(r"[≤≥<>]", text):
        return None

    # Handle LaTeX fractions: \frac{3}{4}
    m = _LATEX_FRAC.fullmatch(text)
    if m:
        num, den = int(m.group(1)), int(m.group(2))
        if den:
            v = num / den
            return str(int(v)) if v == int(v) else f"{v:.4f}"
        return None

    # Handle plain fractions: 3/4
    m = _PLAIN_FRAC.match(text)
    if m:
        num, den = int(m.group(1)), int(m.group(2))
        if den:
            v = num / den
            return str(int(v)) if v == int(v) else f"{v:.4f}"
        return None

    # Handle percentage → decimal
    pct = re.fullmatch(r"(-?\d+(?:\.\d+)?)\s*%", text)
    if pct:
        v = float(pct.group(1))
        return str(int(v)) if v == int(v) else f"{v:.4f}"

    # Plain integer or decimal (possibly negative, possibly with trailing unit like "km")
    m = re.match(r"^\s*(-?\d+(?:\.\d+)?)\s*(?:[^0-9.\s].*)?\s*$", text)
    if m:
        val_str = m.group(1)
        try:
            v = float(val_str)
            return str(int(v)) if v == int(v) else val_str
        except ValueError:
            pass

    return None


# ---------------------------------------------------------------------------
# Solution → Step N: format
# ---------------------------------------------------------------------------

_SKIP_LINE_RE = re.compile(
    r"^\s*("
    r"\\boxed\{|"
    r"(Therefore|Thus|Hence|So),?\s+(the\s+)?(final\s+)?answer\s+is|"
    r"The\s+(final\s+)?answer\s+is|"
    r"Answer\s*[:=]"
    r")",
    re.IGNORECASE,
)


def solution_to_steps(solution: str, final_answer: str, max_steps: int = 18) -> str:
    """
    Convert an arbitrary CoT solution to the pipeline's Step N: format.

    Strategy:
      1. Split on newlines.
      2. Drop blank lines and lines that just announce the final answer
         (those are replaced by the explicit Final Answer: line).
      3. Strip any existing "Step N:" prefix to avoid double-numbering.
      4. Re-number as "Step 1:", "Step 2:", …
      5. Append "Final Answer: <answer>".
    """
    raw_lines = [l.strip() for l in solution.split("\n") if l.strip()]
    clean: List[str] = []
    for line in raw_lines:
        if _SKIP_LINE_RE.match(line):
            continue
        # Strip old step prefix
        line = re.sub(r"^Step\s*\d+\s*[:.)]\s*", "", line)
        if line:
            clean.append(line)

    # Cap to max_steps to keep token count reasonable
    clean = clean[:max_steps]

    if not clean:
        return f"Final Answer: {final_answer}"

    parts = [f"Step {i}: {line}" for i, line in enumerate(clean, 1)]
    return "\n".join(parts) + f"\nFinal Answer: {final_answer}"


# ---------------------------------------------------------------------------
# Record builders
# ---------------------------------------------------------------------------

def build_record(
    idx: int,
    split: str,
    source_name: str,
    skill_id: str,
    difficulty: int,
    question: str,
    solution_text: str,
    final_answer: str,
) -> Dict[str, Any]:
    assistant_content = solution_to_steps(solution_text, final_answer)
    return {
        "id":         f"{source_name.replace('/', '_')}_{split}_{idx}",
        "skill_id":   skill_id,
        "source":     source_name,
        "split":      split,
        "difficulty": difficulty,
        "task_type":  "solve",
        "messages": [
            {"role": "system",    "content": SOLVER_SYSTEM_PROMPT},
            {"role": "user",      "content": USER_WRAPPER.format(question=question.strip())},
            {"role": "assistant", "content": assistant_content},
        ],
    }


# ---------------------------------------------------------------------------
# Deduplication
# ---------------------------------------------------------------------------

def problem_hash(text: str) -> str:
    """Fast 16-char hash for near-dedup (exact-match on normalised text)."""
    normalised = re.sub(r"\s+", " ", text.strip().lower())
    return hashlib.md5(normalised.encode()).hexdigest()[:16]


# ---------------------------------------------------------------------------
# NuminaMath-CoT processing
# ---------------------------------------------------------------------------

def _numina_skill_and_difficulty(row: Dict) -> Tuple[str, int]:
    topic = (row.get("type") or "").lower().strip()
    source = (row.get("source") or "").lower().strip()

    skill = NUMINA_TYPE_TO_SKILL.get(topic)
    if skill is None:
        skill = NUMINA_TYPE_TO_SKILL.get(source, "numina_general")

    difficulty = NUMINA_SOURCE_DIFFICULTY.get(source, 2)
    return skill, difficulty


def iter_numina(
    max_samples: int,
    per_skill_cap: int,
    skip_olympiad: bool,
    seed: int,
) -> Iterator[Dict[str, Any]]:
    """
    Stream NuminaMath-CoT from HuggingFace and yield cleaned records.
    Uses per-skill quota to guarantee topic diversity.
    """
    try:
        from datasets import load_dataset  # type: ignore
    except ImportError:
        log.error("pip install datasets huggingface_hub")
        sys.exit(1)

    log.info("Streaming AI-MO/NuminaMath-CoT …")
    ds = load_dataset("AI-MO/NuminaMath-CoT", split="train", streaming=True,
                      trust_remote_code=True)

    skill_counts: Counter = Counter()
    seen_hashes: set = set()
    total_yielded = 0

    rng = random.Random(seed)

    for row in ds:
        if total_yielded >= max_samples:
            break

        problem  = (row.get("problem") or "").strip()
        solution = (row.get("solution") or "").strip()
        if not problem or not solution:
            continue

        # Extract and normalise answer from \boxed{}
        raw_answer = extract_boxed(solution)
        if raw_answer is None:
            continue
        final_answer = normalise_numeric(raw_answer)
        if final_answer is None:
            continue

        skill, difficulty = _numina_skill_and_difficulty(row)

        # Optionally skip very hard olympiad problems
        if skip_olympiad and skill == "numina_olympiad":
            continue

        # Per-skill cap to guarantee diversity
        if skill_counts[skill] >= per_skill_cap:
            continue

        # Dedup
        h = problem_hash(problem)
        if h in seen_hashes:
            continue
        seen_hashes.add(h)

        skill_counts[skill] += 1
        total_yielded += 1

        yield build_record(
            idx=total_yielded,
            split="__assign__",
            source_name="AI-MO/NuminaMath-CoT",
            skill_id=skill,
            difficulty=difficulty,
            question=problem,
            solution_text=solution,
            final_answer=final_answer,
        )

    log.info("NuminaMath-CoT: yielded %d records | skill dist: %s",
             total_yielded, dict(skill_counts.most_common()))


# ---------------------------------------------------------------------------
# OpenMathInstruct-2 processing
# ---------------------------------------------------------------------------

def _openmath_skill_and_difficulty(row: Dict) -> Tuple[str, int]:
    src    = (row.get("problem_source") or "").lower().strip()
    subj   = (row.get("subject") or "").strip()

    if src == "math" and subj:
        skill = OPENMATH_MATH_SUBJECT_SKILL.get(subj, "openmath_algebra")
    else:
        skill = OPENMATH_SOURCE_TO_SKILL.get(src, "openmath_general")

    difficulty = OPENMATH_SOURCE_DIFFICULTY.get(src, 2)
    return skill, difficulty


def iter_openmath(
    max_samples: int,
    per_skill_cap: int,
    skip_gsm8k: bool,
    seed: int,
) -> Iterator[Dict[str, Any]]:
    """
    Stream OpenMathInstruct-2 from HuggingFace and yield cleaned records.
    Only yields rows where `is_correct_solution` is True (pre-verified by NVIDIA).
    """
    try:
        from datasets import load_dataset  # type: ignore
    except ImportError:
        log.error("pip install datasets huggingface_hub")
        sys.exit(1)

    log.info("Streaming nvidia/OpenMathInstruct-2 (this may take a moment) …")
    ds = load_dataset(
        "nvidia/OpenMathInstruct-2",
        split="train",
        streaming=True,
        trust_remote_code=True,
    )

    skill_counts: Counter = Counter()
    seen_hashes: set = set()
    total_yielded = 0

    for row in ds:
        if total_yielded >= max_samples:
            break

        # Filter: skip gsm8k (contamination risk)
        problem_src = (row.get("problem_source") or "").lower()
        if skip_gsm8k and "gsm8k" in problem_src:
            continue

        # Filter: only verified correct solutions
        if not row.get("is_correct_solution", True):
            continue

        problem  = (row.get("problem") or "").strip()
        solution = (row.get("generated_solution") or "").strip()
        expected = (row.get("expected_answer") or "").strip()

        if not problem or not solution or not expected:
            continue

        # Normalise the pre-extracted answer
        final_answer = normalise_numeric(expected)
        if final_answer is None:
            continue

        skill, difficulty = _openmath_skill_and_difficulty(row)

        # Per-skill cap
        if skill_counts[skill] >= per_skill_cap:
            continue

        # Dedup
        h = problem_hash(problem)
        if h in seen_hashes:
            continue
        seen_hashes.add(h)

        skill_counts[skill] += 1
        total_yielded += 1

        yield build_record(
            idx=total_yielded,
            split="__assign__",
            source_name="nvidia/OpenMathInstruct-2",
            skill_id=skill,
            difficulty=difficulty,
            question=problem,
            solution_text=solution,
            final_answer=final_answer,
        )

    log.info("OpenMathInstruct-2: yielded %d records | skill dist: %s",
             total_yielded, dict(skill_counts.most_common()))


# ---------------------------------------------------------------------------
# Dataset stats printer
# ---------------------------------------------------------------------------

def print_stats(records: List[Dict], label: str) -> None:
    skill_c: Counter = Counter(r["skill_id"] for r in records)
    diff_c:  Counter = Counter(r["difficulty"] for r in records)
    src_c:   Counter = Counter(r["source"] for r in records)
    split_c: Counter = Counter(r["split"] for r in records)

    log.info("─── %s  (%d records) ───────────────────────────────", label, len(records))
    log.info("  by split:      %s", dict(split_c))
    log.info("  by source:     %s", dict(src_c))
    log.info("  by difficulty: %s", dict(sorted(diff_c.items())))
    log.info("  by skill_id:")
    for sk, cnt in skill_c.most_common():
        log.info("    %-40s  %5d", sk, cnt)


# ---------------------------------------------------------------------------
# Write JSONL
# ---------------------------------------------------------------------------

def write_jsonl(records: List[Dict], path: Path) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as f:
        for rec in records:
            f.write(json.dumps(rec, ensure_ascii=False) + "\n")
    log.info("Wrote %d records → %s", len(records), path)


# ---------------------------------------------------------------------------
# Train / val / test split  (stratified by skill_id)
# ---------------------------------------------------------------------------

def stratified_split(
    records: List[Dict],
    train_frac: float = 0.85,
    val_frac:   float = 0.10,
    seed:       int   = 42,
) -> Tuple[List[Dict], List[Dict], List[Dict]]:
    """
    Stratified split by skill_id so every skill appears in all three sets.
    Remaining fraction after train+val goes to test.
    """
    rng = random.Random(seed)

    by_skill: Dict[str, List[Dict]] = defaultdict(list)
    for r in records:
        by_skill[r["skill_id"]].append(r)

    train_, val_, test_ = [], [], []
    for skill, items in by_skill.items():
        rng.shuffle(items)
        n = len(items)
        n_train = math.floor(n * train_frac)
        n_val   = math.floor(n * val_frac)
        train_ += items[:n_train]
        val_   += items[n_train: n_train + n_val]
        test_  += items[n_train + n_val:]

    for r in train_: r["split"] = "train"
    for r in val_:   r["split"] = "val"
    for r in test_:  r["split"] = "test"

    # Shuffle each split so skill interleaves during training
    rng.shuffle(train_)
    rng.shuffle(val_)
    rng.shuffle(test_)

    return train_, val_, test_


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def parse_args() -> argparse.Namespace:
    p = argparse.ArgumentParser(
        description="Build combined NuminaMath + OpenMathInstruct-2 training data."
    )
    p.add_argument("--output-dir",    default="data/sft",
                   help="Directory for output JSONL files.")
    p.add_argument("--max-numina",    type=int, default=20_000,
                   help="Max records from NuminaMath-CoT (default 20 000).")
    p.add_argument("--max-openmath",  type=int, default=15_000,
                   help="Max records from OpenMathInstruct-2 (default 15 000).")
    p.add_argument("--per-skill-cap", type=int, default=4_000,
                   help="Max records per skill_id to guarantee topic diversity.")
    p.add_argument("--skip-numina",   action="store_true",
                   help="Skip NuminaMath-CoT entirely.")
    p.add_argument("--skip-openmath", action="store_true",
                   help="Skip OpenMathInstruct-2 entirely.")
    p.add_argument("--skip-olympiad", action="store_true", default=True,
                   help="Skip numina_olympiad problems (too hard for 1.5B; default: True).")
    p.add_argument("--no-skip-olympiad", dest="skip_olympiad", action="store_false",
                   help="Include olympiad-level problems.")
    p.add_argument("--train-frac", type=float, default=0.85)
    p.add_argument("--val-frac",   type=float, default=0.10)
    p.add_argument("--seed",       type=int,   default=42)
    p.add_argument("--dry-run",    action="store_true",
                   help="Process only 500 rows from each source and show stats (no write).")
    return p.parse_args()


def main() -> None:
    args = parse_args()
    rng  = random.Random(args.seed)

    if args.dry_run:
        args.max_numina   = min(args.max_numina,   500)
        args.max_openmath = min(args.max_openmath, 500)
        log.info("DRY RUN — capped at 500 samples per source, nothing written to disk.")

    all_records: List[Dict] = []

    # ── NuminaMath-CoT ────────────────────────────────────────────────────
    if not args.skip_numina:
        numina_recs = list(iter_numina(
            max_samples   = args.max_numina,
            per_skill_cap = args.per_skill_cap,
            skip_olympiad = args.skip_olympiad,
            seed          = args.seed,
        ))
        all_records.extend(numina_recs)
        log.info("NuminaMath-CoT collected: %d records", len(numina_recs))
    else:
        log.info("Skipping NuminaMath-CoT (--skip-numina).")

    # ── OpenMathInstruct-2 ────────────────────────────────────────────────
    if not args.skip_openmath:
        openmath_recs = list(iter_openmath(
            max_samples   = args.max_openmath,
            per_skill_cap = args.per_skill_cap,
            skip_gsm8k    = True,
            seed          = args.seed,
        ))
        all_records.extend(openmath_recs)
        log.info("OpenMathInstruct-2 collected: %d records", len(openmath_recs))
    else:
        log.info("Skipping OpenMathInstruct-2 (--skip-openmath).")

    if not all_records:
        log.error("No records collected — check dataset availability.")
        sys.exit(1)

    # ── Deduplicate across sources ─────────────────────────────────────────
    seen: set = set()
    deduped: List[Dict] = []
    for r in all_records:
        question = r["messages"][1]["content"]
        h = problem_hash(question)
        if h not in seen:
            seen.add(h)
            deduped.append(r)

    log.info("After cross-source dedup: %d → %d records  (removed %d dupes)",
             len(all_records), len(deduped), len(all_records) - len(deduped))

    # ── Stratified split ──────────────────────────────────────────────────
    train_recs, val_recs, test_recs = stratified_split(
        deduped, args.train_frac, args.val_frac, args.seed
    )

    print_stats(train_recs + val_recs + test_recs, "COMBINED DATASET")

    # ── Write outputs ─────────────────────────────────────────────────────
    if args.dry_run:
        log.info("DRY RUN complete — no files written.")
        log.info("  would write: combined_train.jsonl  (%d rows)", len(train_recs))
        log.info("  would write: combined_val.jsonl    (%d rows)", len(val_recs))
        log.info("  would write: combined_test.jsonl   (%d rows)", len(test_recs))
        log.info("Sample record:")
        print(json.dumps(train_recs[0], indent=2, ensure_ascii=False))
        return

    out = Path(args.output_dir)
    write_jsonl(train_recs, out / "combined_train.jsonl")
    write_jsonl(val_recs,   out / "combined_val.jsonl")
    write_jsonl(test_recs,  out / "combined_test.jsonl")

    log.info("")
    log.info("╔══════════════════════════════════════════════════════════════╗")
    log.info("║  Pipeline complete.  Next step:                              ║")
    log.info("║    bash launch_grpo_combined.sh                              ║")
    log.info("╚══════════════════════════════════════════════════════════════╝")
    log.info("  train : %6d rows  → %s/combined_train.jsonl", len(train_recs), out)
    log.info("  val   : %6d rows  → %s/combined_val.jsonl",   len(val_recs),   out)
    log.info("  test  : %6d rows  → %s/combined_test.jsonl",  len(test_recs),  out)
    log.info("")
    log.info("Skill coverage (for ZPD CurriculumManager):")
    skill_c = Counter(r["skill_id"] for r in train_recs)
    for sk, cnt in sorted(skill_c.items()):
        log.info("  %-40s  %5d train samples", sk, cnt)


if __name__ == "__main__":
    main()