Spaces:
Runtime error
Runtime error
Ashira Pitchayapakayakul
v11(into-model): add 9 ingest datasets + Phase 0 hygiene + TruthRL ternary GRPO
a71a56a | #!/usr/bin/env python3 | |
| """Surrogate-1 V10 — feature-request extractor. | |
| Read all 748 past Claude conversation .jsonl files in ~/.claude/projects/ | |
| and extract every moment where the user asked Claude to BUILD a feature | |
| in Surrogate. These become FEATURE-DEMONSTRATION training pairs that | |
| get distilled into Surrogate's weights — so the model itself knows how | |
| to build the feature when asked again. | |
| Format of output JSONL (consumed by trainer via merge_external): | |
| { | |
| "prompt": "<user feature request, made self-contained>", | |
| "response": "<concrete implementation: code/diff/spec/etc>", | |
| "kind": "feature-build", | |
| "source": "<session-id>", | |
| "feature_class": "<heuristic-tag>" | |
| } | |
| Heuristic feature-class tags pulled from the user's actual asks: | |
| - "agent-coding-loop" — autonomous coding agent / dev chain | |
| - "monitoring" — 24/7 monitor, watchdog, observability | |
| - "training-technique" — new technique to add to trainer (RL, DPO, etc.) | |
| - "orchestration" — multi-agent, spawn/aggregate, parallel | |
| - "knowledge-ingest" — distill X into model | |
| - "self-improvement" — closed-loop improvement, flywheel | |
| - "release-pipeline" — autonomous release, CI/CD, draft PR | |
| - "incident-response" — auto-heal, diagnose, fix | |
| - "evaluation" — bench, scoring, verdict | |
| - "cost-efficiency" — frontier-style smarter-with-less | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import re | |
| import sys | |
| from pathlib import Path | |
PROJECTS = Path.home() / ".claude/projects"

# Heuristic patterns — match Thai + English request-style language.
# English keywords are anchored with \b so they do not fire on substrings
# (e.g. "can" inside "pecan", "train" inside "restrain", "research" inside
# "researched").  Thai alternatives are left unanchored on purpose: Thai
# letters are \w characters with no spaces between words, so a \b anchor
# mid-sentence would never match.
REQUEST_PATTERNS = [
    # Direct imperative "build/make/add/create X"
    r"\b(build|make|add|create|implement|wire|train|bake|ingest|distill)\s+\w+",
    # Thai imperatives (English training verbs word-anchored separately)
    r"(ทำ|สร้าง|เพิ่ม|เทรน)\s+|\b(implement|train|fine-?tune|fine_tune)\s+",
    # Feature-shaped asks
    r"\bfeature\s+",
    r"\b(can|please|let|let's|how about)\s+",
    # explicit "I want X" / "ผมอยาก"
    r"\b(I want|I need|I'd like)\b|ผมอยาก|ผมต้องการ|อยาก|ต้องการ",
    # research-asks that lead to features
    r"\b(go research|research|find out)\b|หามา|ลองหา|วิจัย",
]
# Feature-class tag → keyword list.  NOTE: classify() matches these as plain
# case-insensitive SUBSTRINGS, not regexes — every entry must be a literal.
# (The old "sub-?agent" entry was regex syntax and could never match; it is
# replaced by the two literal spellings.)
FEATURE_CLASS_KEYWORDS = {
    "agent-coding-loop": ["agent chain", "dev agent", "autonomous coding", "code 24", "spawn agent", "sub-agent", "subagent", "team agent"],
    "monitoring": ["monitor", "watchdog", "observability", "smoke", "health check", "anomaly", "incident detect"],
    "training-technique": ["GRPO", "DPO", "DAPO", "ORPO", "KTO", "RLVR", "SimPO", "Constitutional AI", "TruthRL", "Mask-DPO", "PiSSA", "LoftQ", "CorDA", "DoRA", "RSLoRA", "NEFTune", "Spectrum", "Quiet-STaR", "Reflexion", "Voyager", "Magpie", "self-rewarding", "knowledge distillation", "DistillKit", "active learning", "CoT", "PRM", "verify"],
    "orchestration": ["multi-agent", "spawn", "aggregate", "parallel", "orchestrat", "team", "shared context", "subagent", "dispatch"],
    "knowledge-ingest": ["ingest", "distill", "corpus", "training data", "training pair", "Q&A", "vault", "obsidian", "memory"],
    "self-improvement": ["self-improve", "flywheel", "improve over time", "เก่งขึ้น", "online RL", "continual learn"],
    "release-pipeline": ["release", "draft PR", "CI/CD", "deploy", "MVP", "v1 v2", "v10000", "ship feature", "auto-release"],
    "incident-response": ["incident", "auto-heal", "rollback", "diagnose", "patch", "remediate", "fix"],
    "evaluation": ["bench", "eval", "score", "verdict", "rubric", "HumanEval", "MBPP", "BFCL", "SWE-Bench", "test", "วัดผล"],
    "cost-efficiency": ["smarter with less", "frontier", "efficiency", "speculative", "MoE", "sliding window", "test-time compute", "ใช้ resource น้อย"],
    "role-persona": ["SRE", "DevSecOps", "Full Stack", "PM", "PO", "SA", "BD", "QE", "AI Engineer", "Marketing", "role", "persona", "engineer"],
    "long-context": ["long context", "32K", "64K", "128K", "1M", "YaRN", "RoPE", "context window"],
    "anti-hallucination": ["halluc", "หลอน", "factual", "truth", "correctness", "F-DPO", "TruthRL", "calibration"],
}
def classify(text: str) -> str:
    """Return the feature-class tag whose keywords occur most often in *text*.

    Matching is a case-insensitive plain-substring test against
    FEATURE_CLASS_KEYWORDS; ties go to the class declared first.  Falls back
    to "general" when nothing matches.
    """
    haystack = text.lower()
    hits = {
        cls: sum(1 for kw in kws if kw.lower() in haystack)
        for cls, kws in FEATURE_CLASS_KEYWORDS.items()
    }
    hits = {cls: n for cls, n in hits.items() if n}
    if not hits:
        return "general"
    return max(hits, key=hits.get)
def looks_like_feature_request(user_msg: str) -> bool:
    """True if the user message is asking for something to be built.

    Messages under 30 characters are rejected outright — too short to be a
    real request.  Everything longer (including very long critique messages,
    which often embed feature asks) is scanned case-insensitively against
    REQUEST_PATTERNS.  The old no-op `if len > 4000: pass` branch is removed;
    it changed nothing.
    """
    if len(user_msg) < 30:
        return False
    return any(re.search(pat, user_msg, re.I) for pat in REQUEST_PATTERNS)
def extract_pairs_from_session(jsonl_path: Path) -> list[dict]:
    """For each (user, assistant) pair where the user asked for a feature,
    return a training-ready dict (schema documented in the module docstring).

    Robustness: malformed JSON lines and non-dict "message" payloads (e.g.
    ``{"message": null}``) are skipped per-line instead of raising and
    discarding the whole session file.
    """
    out: list[dict] = []
    last_user: str | None = None
    # Session logs are UTF-8; be explicit so decoding does not depend on the
    # host locale (Thai text would corrupt under e.g. cp1252).
    for line in jsonl_path.read_text(encoding="utf-8", errors="replace").splitlines():
        try:
            ev = json.loads(line)
        except Exception:
            continue  # truncated/corrupt line — keep scanning the rest
        msg = ev.get("message") if isinstance(ev, dict) else None
        if not isinstance(msg, dict):
            continue  # null message or a non-object JSON line
        role = msg.get("role")
        content = msg.get("content", "")
        if isinstance(content, list):
            # Structured content blocks: keep only the text parts.
            content = "\n".join(b.get("text", "") for b in content
                                if isinstance(b, dict) and b.get("type") == "text")
        if not isinstance(content, str):
            content = str(content)
        if role == "user":
            last_user = content
        elif role == "assistant" and last_user:
            if looks_like_feature_request(last_user):
                # Skip if assistant response is too short or just clarification.
                if len(content) >= 100 and not content.startswith("?"):
                    out.append({
                        "prompt": last_user[:3000],   # bound prompt length
                        "response": content[:6000],   # bound response length
                        "kind": "feature-build",
                        "source": jsonl_path.stem,
                        "feature_class": classify(last_user + " " + content),
                    })
            # Consume the user turn either way so one ask yields one pair.
            last_user = None
    return out
def main() -> int:
    """Scan all session .jsonl files under PROJECTS, extract feature-build
    pairs, and write them to --out as JSONL.

    Returns a process exit code (0 on success).
    """
    p = argparse.ArgumentParser()
    p.add_argument("--out", default=str(Path.home() / ".surrogate/state/v10-ingest/conversations/feature-builds.jsonl"))
    p.add_argument("--limit", type=int, default=0,
                   help="scan at most N session files (0 = all)")
    args = p.parse_args()

    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    files = sorted(PROJECTS.rglob("*.jsonl"))
    if args.limit:
        files = files[:args.limit]
    print(f"scanning {len(files)} session files for feature-build pairs...")

    n_total = 0
    n_files = 0
    by_class: dict[str, int] = {}
    # ensure_ascii=False emits raw Thai text, so the file MUST be opened as
    # UTF-8 (locale default would crash on e.g. Windows cp125x).  The context
    # manager guarantees the handle is closed even if a scan raises.
    with out_path.open("w", encoding="utf-8") as f_out:
        for fp in files:
            try:
                pairs = extract_pairs_from_session(fp)
            except Exception as e:
                # Best-effort scan: report the broken session and move on.
                sys.stderr.write(f"  skip {fp.name}: {e}\n")
                continue
            if not pairs:
                continue
            n_files += 1
            for j in pairs:
                f_out.write(json.dumps(j, ensure_ascii=False) + "\n")
                by_class[j["feature_class"]] = by_class.get(j["feature_class"], 0) + 1
                n_total += 1
            if n_files % 50 == 0:
                print(f"  scanned {n_files} sessions, extracted {n_total} feature-build pairs")

    print(f"\nDONE: {n_total} feature-build pairs from {n_files} sessions → {out_path}")
    print("By feature class:")
    for cls, n in sorted(by_class.items(), key=lambda x: -x[1]):
        print(f"  {n:5d} {cls}")
    return 0
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())