Add results packaging script
Browse files- scripts/package_results.py +145 -0
scripts/package_results.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Package stage-1/stage-2 evaluation metrics into publication-ready artifacts.
|
| 3 |
+
|
| 4 |
+
Reads raw and normalized metrics from run directories and writes:
|
| 5 |
+
- results/metrics_summary.json
|
| 6 |
+
- results/stage1_vs_stage2_comparison.md
|
| 7 |
+
- results/stage1_normalized_metrics.json
|
| 8 |
+
- results/stage2_normalized_metrics.json
|
| 9 |
+
- results/stage1_raw_metrics.json
|
| 10 |
+
- results/stage2_raw_metrics.json
|
| 11 |
+
"""
|
| 12 |
+
import argparse
|
| 13 |
+
import json
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
from typing import Any, Dict, List
|
| 16 |
+
|
| 17 |
+
from tmf921_train.utils import write_json
|
| 18 |
+
|
| 19 |
+
# Evaluation splits compared by default: one in-distribution test set plus
# template/use-case/sector OOD holdouts and an adversarial set.
DEFAULT_SPLITS = ["test_in_distribution", "test_template_ood", "test_use_case_ood", "test_sector_ood", "test_adversarial"]
# Target layers treated as "weak" (stage-2 continuation focused on these);
# compared per-layer in the report. Presumably keys of `by_target_layer`
# in the normalized-metrics files — confirm against the eval pipeline.
DEFAULT_WEAK = ["o1_nrm", "a1_policy", "tmf921_lifecycle_report", "tmf921_lifecycle_monitor", "tmf921_lifecycle_scale"]
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def load_json(path: Path) -> Dict[str, Any]:
    """Read and parse a JSON file.

    Args:
        path: File to read.

    Returns:
        The deserialized JSON content.

    Raises:
        FileNotFoundError: If *path* does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # EAFP: read_text raises FileNotFoundError itself, which avoids the
    # TOCTOU race of a separate exists() pre-check. Encoding is pinned to
    # UTF-8 so decoding does not depend on the platform's locale default.
    return json.loads(path.read_text(encoding="utf-8"))
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def fmt(x, digits=4):
    """Render a metric value for a markdown table cell.

    ``None`` becomes the placeholder ``"n/a"``, floats are fixed-point
    formatted to *digits* decimal places, and any other value is simply
    stringified.
    """
    if x is None:
        return "n/a"
    return f"{x:.{digits}f}" if isinstance(x, float) else str(x)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def delta(a, b):
    """Return the change from *a* to *b* (``b - a``).

    Yields ``None`` when either operand is missing, so absent metrics
    propagate instead of raising.
    """
    missing = a is None or b is None
    return None if missing else b - a
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def metric_row(s1, s2, split, key):
    """Look up metric *key* under *split* in both stage summaries.

    Returns a ``(stage1_value, stage2_value, delta)`` triple; any entry
    is ``None`` when the split or key is absent from that summary.
    """
    stage1_value = s1.get(split, {}).get(key)
    stage2_value = s2.get(split, {}).get(key)
    return stage1_value, stage2_value, delta(stage1_value, stage2_value)
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def main():
    """Entry point: package stage-1 vs stage-2 eval metrics.

    Reads ``all_metrics.json`` and ``all_normalized_metrics.json`` from
    both eval directories, mirrors them into the output directory, and
    writes a machine-readable summary plus a human-readable markdown
    comparison report.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--stage1_eval_dir", required=True, help="e.g. runs/qwen3-.../eval_merged")
    ap.add_argument("--stage2_eval_dir", required=True, help="e.g. runs/stage2-.../eval")
    ap.add_argument("--output_dir", default="results")
    ap.add_argument("--splits", nargs="+", default=DEFAULT_SPLITS)
    ap.add_argument("--weak_layers", nargs="+", default=DEFAULT_WEAK)
    args = ap.parse_args()

    out = Path(args.output_dir)
    out.mkdir(parents=True, exist_ok=True)
    s1_dir = Path(args.stage1_eval_dir)
    s2_dir = Path(args.stage2_eval_dir)

    # Both eval dirs must contain these two files; load_json raises
    # FileNotFoundError otherwise, failing fast before any output is written.
    s1_raw = load_json(s1_dir / "all_metrics.json")
    s2_raw = load_json(s2_dir / "all_metrics.json")
    s1_norm = load_json(s1_dir / "all_normalized_metrics.json")
    s2_norm = load_json(s2_dir / "all_normalized_metrics.json")

    # Copy the inputs verbatim into the results dir so the published
    # artifacts are self-contained. write_json is a project helper
    # (tmf921_train.utils) — presumably pretty-printed JSON; confirm there.
    write_json(out / "stage1_raw_metrics.json", s1_raw)
    write_json(out / "stage2_raw_metrics.json", s2_raw)
    write_json(out / "stage1_normalized_metrics.json", s1_norm)
    write_json(out / "stage2_normalized_metrics.json", s2_norm)

    # Machine-readable counterpart of the markdown report, filled in as
    # the tables below are generated.
    summary = {
        "stage1_eval_dir": str(s1_dir),
        "stage2_eval_dir": str(s2_dir),
        "splits": {},
        "weak_layers": {},
        "decision": "stage1_primary_stage2_diagnostic",
    }

    # Markdown is accumulated line-by-line; A is a short alias for append.
    lines: List[str] = []
    A = lines.append
    A("# Stage 1 vs Stage 2 Results Comparison")
    A("")
    A("This artifact compares the primary stage-1 Qwen3-8B QLoRA adapter against the stage-2 weak-layer continuation adapter.")
    A("")
    A("## Decision")
    A("")
    A("**Stage 1 remains the primary model. Stage 2 is diagnostic and is not promoted.**")
    A("")
    A("Reason: stage 2 does not materially improve `o1_nrm` or `a1_policy`, slightly regresses global normalized metrics, and reduces adversarial robustness.")
    A("")

    # Per-split table of global normalized metrics (field F1, key F1,
    # JSON-parse rate) with stage-2-minus-stage-1 deltas.
    A("## Global normalized metrics")
    A("")
    A("| Split | Stage 1 norm field F1 | Stage 2 norm field F1 | Δ field F1 | Stage 1 norm key F1 | Stage 2 norm key F1 | Δ key F1 | Stage 1 parse | Stage 2 parse |")
    A("|---|---:|---:|---:|---:|---:|---:|---:|---:|")
    for split in args.splits:
        f1a, f1b, f1d = metric_row(s1_norm, s2_norm, split, "norm_field_f1")
        k1a, k1b, k1d = metric_row(s1_norm, s2_norm, split, "norm_key_f1")
        pa, pb, pd = metric_row(s1_norm, s2_norm, split, "parse_json")
        summary["splits"][split] = {
            "stage1_norm_field_f1": f1a, "stage2_norm_field_f1": f1b, "delta_norm_field_f1": f1d,
            "stage1_norm_key_f1": k1a, "stage2_norm_key_f1": k1b, "delta_norm_key_f1": k1d,
            "stage1_parse_json": pa, "stage2_parse_json": pb, "delta_parse_json": pd,
        }
        A(f"| `{split}` | {fmt(f1a)} | {fmt(f1b)} | {fmt(f1d, 4)} | {fmt(k1a)} | {fmt(k1b)} | {fmt(k1d, 4)} | {fmt(pa)} | {fmt(pb)} |")
    A("")

    # Per-layer field-F1 breakdown; the adversarial split is excluded
    # here (only the global table above covers it).
    A("## Weak-layer normalized field F1")
    A("")
    for split in [s for s in args.splits if s != "test_adversarial"]:
        A(f"### `{split}`")
        A("")
        A("| Layer | Stage 1 | Stage 2 | Δ |")
        A("|---|---:|---:|---:|")
        by1 = s1_norm.get(split, {}).get("by_target_layer", {})
        by2 = s2_norm.get(split, {}).get("by_target_layer", {})
        summary["weak_layers"][split] = {}
        for layer in args.weak_layers:
            # A layer is only reported when both stages measured it.
            if layer in by1 and layer in by2:
                a = by1[layer].get("norm_field_f1")
                b = by2[layer].get("norm_field_f1")
                d = delta(a, b)
                summary["weak_layers"][split][layer] = {"stage1": a, "stage2": b, "delta": d}
                A(f"| `{layer}` | {fmt(a)} | {fmt(b)} | {fmt(d, 4)} |")
        A("")

    A("## Interpretation")
    A("")
    A("Stage 2 is scientifically useful as a negative/diagnostic experiment: weak-layer exposure alone did not solve low-level O1 NRM or A1 policy value fidelity. The next step should be layer-specific semantic evaluation and better canonical data generation rather than more blind continuation training.")
    A("")
    A("## Current best model")
    A("")
    A("The primary model remains the stage-1 Qwen3-8B QLoRA adapter.")

    # Persist both artifacts and print their paths for the caller/CI log.
    write_json(out / "metrics_summary.json", summary)
    (out / "stage1_vs_stage2_comparison.md").write_text("\n".join(lines), encoding="utf-8")
    print(out / "stage1_vs_stage2_comparison.md")
    print(out / "metrics_summary.json")


if __name__ == "__main__":
    main()
|