File size: 5,697 Bytes
aaf8c59 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 | #!/usr/bin/env python3
"""Package stage-1/stage-2 evaluation metrics into publication-ready artifacts.
Reads raw and normalized metrics from run directories and writes:
- results/metrics_summary.json
- results/stage1_vs_stage2_comparison.md
- results/stage1_normalized_metrics.json
- results/stage2_normalized_metrics.json
- results/stage1_raw_metrics.json
- results/stage2_raw_metrics.json
"""
import argparse
import json
from pathlib import Path
from typing import Any, Dict, List
from tmf921_train.utils import write_json
# Evaluation splits reported when --splits is not given on the command line.
DEFAULT_SPLITS = [
    "test_in_distribution",
    "test_template_ood",
    "test_use_case_ood",
    "test_sector_ood",
    "test_adversarial",
]

# Target layers reported in the weak-layer tables when --weak_layers is not given.
DEFAULT_WEAK = [
    "o1_nrm",
    "a1_policy",
    "tmf921_lifecycle_report",
    "tmf921_lifecycle_monitor",
    "tmf921_lifecycle_scale",
]
def load_json(path: Path) -> Dict[str, Any]:
    """Read and parse the JSON document stored at *path*.

    Args:
        path: Location of the JSON file to load.

    Returns:
        The parsed JSON content.

    Raises:
        FileNotFoundError: If *path* does not exist. The path itself is the
            exception argument.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    if not path.exists():
        raise FileNotFoundError(path)
    # Decode explicitly as UTF-8 instead of relying on the locale default, so
    # non-ASCII metrics content loads the same way on every platform.
    return json.loads(path.read_text(encoding="utf-8"))
def fmt(x, digits=4):
    """Render a metric value as a table-cell string.

    ``None`` becomes ``"n/a"``, floats are rendered in fixed-point notation
    with *digits* decimal places, and every other value (ints, strings, ...)
    is passed through ``str()``.
    """
    if x is None:
        return "n/a"
    return format(x, f".{digits}f") if isinstance(x, float) else str(x)
def delta(a, b):
    """Return the change from *a* to *b* (``b - a``), or ``None`` if either is ``None``."""
    if a is not None and b is not None:
        return b - a
    return None
def metric_row(s1, s2, split, key):
    """Look up one metric for one split in both stages.

    Returns a ``(stage1_value, stage2_value, change)`` tuple. A value is
    ``None`` when the split or the key is absent from that stage's metrics,
    and ``change`` is whatever ``delta`` returns for the two values.
    """
    stage1_value, stage2_value = (
        metrics.get(split, {}).get(key) for metrics in (s1, s2)
    )
    return stage1_value, stage2_value, delta(stage1_value, stage2_value)
def main():
    """Build the stage-1 vs stage-2 comparison artifacts from two eval directories.

    Reads ``all_metrics.json`` and ``all_normalized_metrics.json`` from both
    ``--stage1_eval_dir`` and ``--stage2_eval_dir``, copies them into
    ``--output_dir`` under stage-prefixed names, then writes
    ``metrics_summary.json`` and ``stage1_vs_stage2_comparison.md`` there and
    prints the two generated paths.

    The decision, reason and interpretation paragraphs of the report are fixed
    text; only the tables are derived from the loaded metrics.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--stage1_eval_dir", required=True, help="e.g. runs/qwen3-.../eval_merged")
    parser.add_argument("--stage2_eval_dir", required=True, help="e.g. runs/stage2-.../eval")
    parser.add_argument("--output_dir", default="results")
    parser.add_argument("--splits", nargs="+", default=DEFAULT_SPLITS)
    parser.add_argument("--weak_layers", nargs="+", default=DEFAULT_WEAK)
    args = parser.parse_args()

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    stage1_dir = Path(args.stage1_eval_dir)
    stage2_dir = Path(args.stage2_eval_dir)
    eval_dirs = (stage1_dir, stage2_dir)

    # Load order: stage-1 raw, stage-2 raw, stage-1 normalized, stage-2 normalized.
    stage1_raw, stage2_raw = (load_json(d / "all_metrics.json") for d in eval_dirs)
    stage1_norm, stage2_norm = (load_json(d / "all_normalized_metrics.json") for d in eval_dirs)

    # Copy the inputs next to the generated artifacts.
    copies = (
        ("stage1_raw_metrics.json", stage1_raw),
        ("stage2_raw_metrics.json", stage2_raw),
        ("stage1_normalized_metrics.json", stage1_norm),
        ("stage2_normalized_metrics.json", stage2_norm),
    )
    for file_name, payload in copies:
        write_json(output_dir / file_name, payload)

    summary = {
        "stage1_eval_dir": str(stage1_dir),
        "stage2_eval_dir": str(stage2_dir),
        "splits": {},
        "weak_layers": {},
        "decision": "stage1_primary_stage2_diagnostic",
    }

    lines: List[str] = [
        "# Stage 1 vs Stage 2 Results Comparison",
        "",
        "This artifact compares the primary stage-1 Qwen3-8B QLoRA adapter against the stage-2 weak-layer continuation adapter.",
        "",
        "## Decision",
        "",
        "**Stage 1 remains the primary model. Stage 2 is diagnostic and is not promoted.**",
        "",
        "Reason: stage 2 does not materially improve `o1_nrm` or `a1_policy`, slightly regresses global normalized metrics, and reduces adversarial robustness.",
        "",
        "## Global normalized metrics",
        "",
        "| Split | Stage 1 norm field F1 | Stage 2 norm field F1 | Δ field F1 | Stage 1 norm key F1 | Stage 2 norm key F1 | Δ key F1 | Stage 1 parse | Stage 2 parse |",
        "|---|---:|---:|---:|---:|---:|---:|---:|---:|",
    ]

    # One table row and one summary entry per split.
    for split in args.splits:
        entry = {}
        for metric in ("norm_field_f1", "norm_key_f1", "parse_json"):
            stage1_value, stage2_value, change = metric_row(stage1_norm, stage2_norm, split, metric)
            entry[f"stage1_{metric}"] = stage1_value
            entry[f"stage2_{metric}"] = stage2_value
            entry[f"delta_{metric}"] = change
        summary["splits"][split] = entry
        # The parse delta is kept in the summary but has no column in the table.
        cells = [
            f"`{split}`",
            fmt(entry["stage1_norm_field_f1"]),
            fmt(entry["stage2_norm_field_f1"]),
            fmt(entry["delta_norm_field_f1"], 4),
            fmt(entry["stage1_norm_key_f1"]),
            fmt(entry["stage2_norm_key_f1"]),
            fmt(entry["delta_norm_key_f1"], 4),
            fmt(entry["stage1_parse_json"]),
            fmt(entry["stage2_parse_json"]),
        ]
        lines.append("| " + " | ".join(cells) + " |")

    lines += ["", "## Weak-layer normalized field F1", ""]

    # Per-layer tables are emitted for every split except the adversarial one.
    for split in args.splits:
        if split == "test_adversarial":
            continue
        lines += [
            f"### `{split}`",
            "",
            "| Layer | Stage 1 | Stage 2 | Δ |",
            "|---|---:|---:|---:|",
        ]
        stage1_layers = stage1_norm.get(split, {}).get("by_target_layer", {})
        stage2_layers = stage2_norm.get(split, {}).get("by_target_layer", {})
        layer_summary = {}
        summary["weak_layers"][split] = layer_summary
        for layer in args.weak_layers:
            # A layer is reported only when both stages have an entry for it.
            if layer not in stage1_layers or layer not in stage2_layers:
                continue
            stage1_f1 = stage1_layers[layer].get("norm_field_f1")
            stage2_f1 = stage2_layers[layer].get("norm_field_f1")
            change = delta(stage1_f1, stage2_f1)
            layer_summary[layer] = {"stage1": stage1_f1, "stage2": stage2_f1, "delta": change}
            lines.append(f"| `{layer}` | {fmt(stage1_f1)} | {fmt(stage2_f1)} | {fmt(change, 4)} |")
        lines.append("")

    lines += [
        "## Interpretation",
        "",
        "Stage 2 is scientifically useful as a negative/diagnostic experiment: weak-layer exposure alone did not solve low-level O1 NRM or A1 policy value fidelity. The next step should be layer-specific semantic evaluation and better canonical data generation rather than more blind continuation training.",
        "",
        "## Current best model",
        "",
        "The primary model remains the stage-1 Qwen3-8B QLoRA adapter.",
    ]

    summary_path = output_dir / "metrics_summary.json"
    report_path = output_dir / "stage1_vs_stage2_comparison.md"
    write_json(summary_path, summary)
    report_path.write_text("\n".join(lines), encoding="utf-8")
    print(report_path)
    print(summary_path)
# Run the packaging step only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|