#!/usr/bin/env python3
"""Surrogate-1 V10 — feature-request extractor.

Read the past Claude conversation ``.jsonl`` files under ``~/.claude/projects/``
(748 sessions when this script was written) and extract every moment where the
user asked Claude to BUILD a feature in Surrogate.

These become FEATURE-DEMONSTRATION training pairs that get distilled into
Surrogate's weights — so the model itself knows how to build the feature when
asked again.

Format of each output JSONL record (consumed by the trainer via
``merge_external``)::

    {
      "prompt":        the user's request, truncated to 3000 characters,
      "response":      the assistant's text reply, truncated to 6000 characters,
      "kind":          "feature-build",
      "source":        stem of the session file the pair came from,
      "feature_class": heuristic tag, see below
    }

Heuristic feature-class tags pulled from the user's actual asks:

- "agent-coding-loop"   — autonomous coding agent / dev chain
- "monitoring"          — 24/7 monitor, watchdog, observability
- "training-technique"  — new technique to add to trainer (RL, DPO, etc.)
- "orchestration"       — multi-agent, spawn/aggregate, parallel
- "knowledge-ingest"    — distill X into model
- "self-improvement"    — closed-loop improvement, flywheel
- "release-pipeline"    — autonomous release, CI/CD, draft PR
- "incident-response"   — auto-heal, diagnose, fix
- "evaluation"          — bench, scoring, verdict
- "cost-efficiency"     — frontier-style smarter-with-less
- "role-persona"        — SRE / PM / engineer style role asks
- "long-context"        — long context windows, YaRN, RoPE
- "anti-hallucination"  — factuality, truthfulness, calibration
- "general"             — fallback when no keyword matches
"""
from __future__ import annotations

import argparse
import json
import re
import sys
from collections import Counter
from functools import lru_cache
from pathlib import Path

PROJECTS = Path.home() / ".claude/projects"

# Size thresholds used when pairing a user request with an assistant reply.
_MIN_REQUEST_CHARS = 30
_MIN_RESPONSE_CHARS = 100
_MAX_PROMPT_CHARS = 3000
_MAX_RESPONSE_CHARS = 6000

# Keywords that are ASCII-alphanumeric, contain an uppercase letter and are at
# most this long are treated as acronyms and must appear as standalone tokens.
_ACRONYM_MAX_LEN = 5

# Heuristic patterns — match Thai + English request-style language
REQUEST_PATTERNS = [
    # Direct imperative "build/make/add/create X"
    r"\b(build|make|add|create|implement|wire|train|bake|ingest|distill)\s+\w+",
    # Thai imperatives
    r"(ทำ|สร้าง|เพิ่ม|implement|train|เทรน|fine-?tune|fine_tune)\s+",
    # Feature-shaped asks
    r"feature\s+",
    r"(can|please|let|let's|how about)\s+",
    # explicit "I want X" / "ผมอยาก"
    r"(I want|I need|I'd like|ผมอยาก|ผมต้องการ|อยาก|ต้องการ)",
    # research-asks that lead to features
    r"(go research|research|find out|หามา|ลองหา|วิจัย)",
]

# Keywords are matched literally and case-insensitively (they are NOT regular
# expressions). Lower-case entries act as substring stems ("orchestrat",
# "halluc"); short acronyms ("PM", "DPO", "1M") only match as whole tokens.
FEATURE_CLASS_KEYWORDS = {
    "agent-coding-loop": ["agent chain", "dev agent", "autonomous coding",
                          "code 24", "spawn agent", "sub-agent", "subagent",
                          "team agent"],
    "monitoring": ["monitor", "watchdog", "observability", "smoke",
                   "health check", "anomaly", "incident detect"],
    "training-technique": ["GRPO", "DPO", "DAPO", "ORPO", "KTO", "RLVR",
                           "SimPO", "Constitutional AI", "TruthRL",
                           "Mask-DPO", "PiSSA", "LoftQ", "CorDA", "DoRA",
                           "RSLoRA", "NEFTune", "Spectrum", "Quiet-STaR",
                           "Reflexion", "Voyager", "Magpie", "self-rewarding",
                           "knowledge distillation", "DistillKit",
                           "active learning", "CoT", "PRM", "verify"],
    "orchestration": ["multi-agent", "spawn", "aggregate", "parallel",
                      "orchestrat", "team", "shared context", "subagent",
                      "dispatch"],
    "knowledge-ingest": ["ingest", "distill", "corpus", "training data",
                         "training pair", "Q&A", "vault", "obsidian",
                         "memory"],
    "self-improvement": ["self-improve", "flywheel", "improve over time",
                         "เก่งขึ้น", "online RL", "continual learn"],
    "release-pipeline": ["release", "draft PR", "CI/CD", "deploy", "MVP",
                         "v1 v2", "v10000", "ship feature", "auto-release"],
    "incident-response": ["incident", "auto-heal", "rollback", "diagnose",
                          "patch", "remediate", "fix"],
    "evaluation": ["bench", "eval", "score", "verdict", "rubric", "HumanEval",
                   "MBPP", "BFCL", "SWE-Bench", "test", "วัดผล"],
    "cost-efficiency": ["smarter with less", "frontier", "efficiency",
                        "speculative", "MoE", "sliding window",
                        "test-time compute", "ใช้ resource น้อย"],
    "role-persona": ["SRE", "DevSecOps", "Full Stack", "PM", "PO", "SA", "BD",
                     "QE", "AI Engineer", "Marketing", "role", "persona",
                     "engineer"],
    "long-context": ["long context", "32K", "64K", "128K", "1M", "YaRN",
                     "RoPE", "context window"],
    "anti-hallucination": ["halluc", "หลอน", "factual", "truth",
                           "correctness", "F-DPO", "TruthRL", "calibration"],
}


@lru_cache(maxsize=None)
def _keyword_regex(keyword: str) -> re.Pattern[str]:
    """Compile *keyword* into a case-insensitive literal matcher.

    Short acronyms are wrapped in ASCII-only lookarounds so that "PO" does not
    fire inside "support" while still matching when glued to Thai text, which
    is written without spaces.
    """
    source = re.escape(keyword)
    is_acronym = (
        keyword.isascii()
        and keyword.isalnum()
        and len(keyword) <= _ACRONYM_MAX_LEN
        and keyword != keyword.lower()
    )
    if is_acronym:
        source = rf"(?<![A-Za-z0-9]){source}(?![A-Za-z0-9])"
    return re.compile(source, re.IGNORECASE)


def classify(text: str) -> str:
    """Return the feature class whose keywords match *text* most often.

    Each keyword of a class contributes at most one point. Ties are resolved
    in favour of the class listed first in ``FEATURE_CLASS_KEYWORDS``. When no
    keyword matches at all the result is ``"general"``.
    """
    scores: dict[str, int] = {}
    for cls, keywords in FEATURE_CLASS_KEYWORDS.items():
        hits = sum(1 for kw in keywords if _keyword_regex(kw).search(text))
        if hits:
            scores[cls] = hits
    if not scores:
        return "general"
    # max() returns the first maximal key, i.e. declaration order breaks ties.
    return max(scores, key=scores.get)


def looks_like_feature_request(user_msg: str) -> bool:
    """True if the user message is asking for something to be built.

    Messages shorter than 30 characters are rejected outright. There is
    deliberately no upper length bound: very long messages are usually
    critique rather than requests, but they often contain a request inside.
    """
    if len(user_msg) < _MIN_REQUEST_CHARS:
        return False
    return any(re.search(pat, user_msg, re.I) for pat in REQUEST_PATTERNS)


def _message_text(content: object) -> str:
    """Flatten a message ``content`` field into plain text.

    A list is treated as a sequence of blocks, of which only dict blocks with
    ``type == "text"`` and a string ``text`` are kept (joined by newlines).
    ``None`` yields an empty string; any other non-string value is stringified.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        return "\n".join(
            block["text"]
            for block in content
            if isinstance(block, dict)
            and block.get("type") == "text"
            and isinstance(block.get("text"), str)
        )
    return str(content)


def extract_pairs_from_session(jsonl_path: Path) -> list[dict]:
    """For each (user, assistant) pair where the user asked for a feature,
    return a training-ready dict.

    The file is read as UTF-8 (undecodable bytes are replaced), one JSON
    object per line. Lines that are not valid JSON, are not JSON objects, or
    have no dict ``message`` are ignored. Events that carry no text (for
    example messages made only of non-text blocks) are ignored as well, so
    they neither replace nor consume the pending user request.

    A pair is emitted when the pending user message passes
    ``looks_like_feature_request`` and the assistant text is at least 100
    characters long and does not start with "?". Any assistant text, whether
    or not it produced a pair, consumes the pending user request.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    pairs: list[dict] = []
    pending_request: str | None = None
    # Iterate the file object rather than str.splitlines(): the latter also
    # splits on U+2028/U+2029/NEL, which may occur unescaped inside a JSON
    # string and would cut a valid record in two.
    with jsonl_path.open(encoding="utf-8", errors="replace") as fh:
        for line in fh:
            try:
                event = json.loads(line)
            except json.JSONDecodeError:
                continue
            if not isinstance(event, dict):
                continue
            message = event.get("message")
            if not isinstance(message, dict):
                continue

            text = _message_text(message.get("content"))
            if not text.strip():
                continue

            role = message.get("role")
            if role == "user":
                pending_request = text
            elif role == "assistant" and pending_request:
                # Skip if assistant response is too short or just clarification
                if (
                    looks_like_feature_request(pending_request)
                    and len(text) >= _MIN_RESPONSE_CHARS
                    and not text.startswith("?")
                ):
                    pairs.append({
                        "prompt": pending_request[:_MAX_PROMPT_CHARS],
                        "response": text[:_MAX_RESPONSE_CHARS],
                        "kind": "feature-build",
                        "source": jsonl_path.stem,
                        "feature_class": classify(pending_request + " " + text),
                    })
                pending_request = None
    return pairs


def main(argv: list[str] | None = None) -> int:
    """Scan session files and write the extracted pairs as JSONL.

    Args:
        argv: command-line arguments without the program name; ``None`` (the
            default) makes argparse read ``sys.argv``.

    Options:
        --out       output JSONL path (parent directories are created).
        --limit     scan only the first N session files in sorted order;
                    0 means no limit.
        --projects  directory searched recursively for ``*.jsonl`` files;
                    defaults to ``PROJECTS``.

    Returns:
        0 on completion. Sessions that fail to load are reported on stderr
        and skipped.
    """
    p = argparse.ArgumentParser(
        description="Extract feature-build training pairs from Claude sessions."
    )
    p.add_argument(
        "--out",
        default=str(
            Path.home()
            / ".surrogate/state/v10-ingest/conversations/feature-builds.jsonl"
        ),
        help="output JSONL file",
    )
    p.add_argument("--limit", type=int, default=0,
                   help="scan at most this many session files (0 = all)")
    p.add_argument("--projects", default=str(PROJECTS),
                   help="directory searched recursively for *.jsonl sessions")
    args = p.parse_args(argv)

    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    files = sorted(Path(args.projects).rglob("*.jsonl"))
    if args.limit:
        files = files[:args.limit]
    print(f"scanning {len(files)} session files for feature-build pairs...")

    n_total = 0
    n_files = 0  # sessions that yielded at least one pair
    by_class: Counter[str] = Counter()

    # Records are written with ensure_ascii=False, so the encoding must be
    # explicit; the context manager closes the file even if a write fails.
    with out_path.open("w", encoding="utf-8") as f_out:
        for n_scanned, fp in enumerate(files, start=1):
            try:
                pairs = extract_pairs_from_session(fp)
            except Exception as e:  # best-effort: one bad session must not stop the run
                sys.stderr.write(f" skip {fp.name}: {e}\n")
                pairs = []
            if pairs:
                n_files += 1
                for pair in pairs:
                    f_out.write(json.dumps(pair, ensure_ascii=False) + "\n")
                    by_class[pair["feature_class"]] += 1
                    n_total += 1
            if n_scanned % 50 == 0:
                print(f" scanned {n_scanned} sessions, "
                      f"extracted {n_total} feature-build pairs")

    print(f"\nDONE: {n_total} feature-build pairs from {n_files} sessions → {out_path}")
    print("By feature class:")
    for cls, n in by_class.most_common():
        print(f" {n:5d} {cls}")
    return 0


if __name__ == "__main__":
    sys.exit(main())