Add results packaging script
Browse files- scripts/package_results.py +145 -0
scripts/package_results.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Package stage-1/stage-2 evaluation metrics into publication-ready artifacts.
|
| 3 |
+
|
| 4 |
+
Reads raw and normalized metrics from run directories and writes:
|
| 5 |
+
- results/metrics_summary.json
|
| 6 |
+
- results/stage1_vs_stage2_comparison.md
|
| 7 |
+
- results/stage1_normalized_metrics.json
|
| 8 |
+
- results/stage2_normalized_metrics.json
|
| 9 |
+
- results/stage1_raw_metrics.json
|
| 10 |
+
- results/stage2_raw_metrics.json
|
| 11 |
+
"""
|
| 12 |
+
import argparse
|
| 13 |
+
import json
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
from typing import Any, Dict, List
|
| 16 |
+
|
| 17 |
+
from tmf921_train.utils import write_json
|
| 18 |
+
|
| 19 |
+
# Evaluation splits compared by default: one in-distribution test set plus
# template/use-case/sector OOD holdouts and an adversarial set.
DEFAULT_SPLITS = ["test_in_distribution", "test_template_ood", "test_use_case_ood", "test_sector_ood", "test_adversarial"]
# Target layers treated as "weak" (stage-2 continuation focused on these);
# compared per-layer in the report. Presumably keys of `by_target_layer`
# in the normalized-metrics files — confirm against the eval pipeline.
DEFAULT_WEAK = ["o1_nrm", "a1_policy", "tmf921_lifecycle_report", "tmf921_lifecycle_monitor", "tmf921_lifecycle_scale"]
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def load_json(path: Path) -> Dict[str, Any]:
    """Read and parse a JSON file.

    Args:
        path: File to read.

    Returns:
        The deserialized JSON content.

    Raises:
        FileNotFoundError: If *path* does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # EAFP: read_text raises FileNotFoundError itself, which avoids the
    # TOCTOU race of a separate exists() pre-check. Encoding is pinned to
    # UTF-8 so decoding does not depend on the platform's locale default.
    return json.loads(path.read_text(encoding="utf-8"))
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def fmt(x, digits=4):
    """Render a metric value for a markdown table cell.

    ``None`` becomes the placeholder ``"n/a"``, floats are fixed-point
    formatted to *digits* decimal places, and any other value is simply
    stringified.
    """
    if x is None:
        return "n/a"
    return f"{x:.{digits}f}" if isinstance(x, float) else str(x)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def delta(a, b):
    """Return the change from *a* to *b* (``b - a``).

    Yields ``None`` when either operand is missing, so absent metrics
    propagate instead of raising.
    """
    missing = a is None or b is None
    return None if missing else b - a
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def metric_row(s1, s2, split, key):
    """Look up metric *key* under *split* in both stage summaries.

    Returns a ``(stage1_value, stage2_value, delta)`` triple; any entry
    is ``None`` when the split or key is absent from that summary.
    """
    stage1_value = s1.get(split, {}).get(key)
    stage2_value = s2.get(split, {}).get(key)
    return stage1_value, stage2_value, delta(stage1_value, stage2_value)
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def main():
    """Entry point: package stage-1 vs stage-2 eval metrics.

    Reads ``all_metrics.json`` and ``all_normalized_metrics.json`` from
    both eval directories, mirrors them into the output directory, and
    writes a machine-readable summary plus a human-readable markdown
    comparison report.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--stage1_eval_dir", required=True, help="e.g. runs/qwen3-.../eval_merged")
    ap.add_argument("--stage2_eval_dir", required=True, help="e.g. runs/stage2-.../eval")
    ap.add_argument("--output_dir", default="results")
    ap.add_argument("--splits", nargs="+", default=DEFAULT_SPLITS)
    ap.add_argument("--weak_layers", nargs="+", default=DEFAULT_WEAK)
    args = ap.parse_args()

    out = Path(args.output_dir)
    out.mkdir(parents=True, exist_ok=True)
    s1_dir = Path(args.stage1_eval_dir)
    s2_dir = Path(args.stage2_eval_dir)

    # Both eval dirs must contain these two files; load_json raises
    # FileNotFoundError otherwise, failing fast before any output is written.
    s1_raw = load_json(s1_dir / "all_metrics.json")
    s2_raw = load_json(s2_dir / "all_metrics.json")
    s1_norm = load_json(s1_dir / "all_normalized_metrics.json")
    s2_norm = load_json(s2_dir / "all_normalized_metrics.json")

    # Copy the inputs verbatim into the results dir so the published
    # artifacts are self-contained. write_json is a project helper
    # (tmf921_train.utils) — presumably pretty-printed JSON; confirm there.
    write_json(out / "stage1_raw_metrics.json", s1_raw)
    write_json(out / "stage2_raw_metrics.json", s2_raw)
    write_json(out / "stage1_normalized_metrics.json", s1_norm)
    write_json(out / "stage2_normalized_metrics.json", s2_norm)

    # Machine-readable counterpart of the markdown report, filled in as
    # the tables below are generated.
    summary = {
        "stage1_eval_dir": str(s1_dir),
        "stage2_eval_dir": str(s2_dir),
        "splits": {},
        "weak_layers": {},
        "decision": "stage1_primary_stage2_diagnostic",
    }

    # Markdown is accumulated line-by-line; A is a short alias for append.
    lines: List[str] = []
    A = lines.append
    A("# Stage 1 vs Stage 2 Results Comparison")
    A("")
    A("This artifact compares the primary stage-1 Qwen3-8B QLoRA adapter against the stage-2 weak-layer continuation adapter.")
    A("")
    A("## Decision")
    A("")
    A("**Stage 1 remains the primary model. Stage 2 is diagnostic and is not promoted.**")
    A("")
    A("Reason: stage 2 does not materially improve `o1_nrm` or `a1_policy`, slightly regresses global normalized metrics, and reduces adversarial robustness.")
    A("")

    # Per-split table of global normalized metrics (field F1, key F1,
    # JSON-parse rate) with stage-2-minus-stage-1 deltas.
    A("## Global normalized metrics")
    A("")
    A("| Split | Stage 1 norm field F1 | Stage 2 norm field F1 | Δ field F1 | Stage 1 norm key F1 | Stage 2 norm key F1 | Δ key F1 | Stage 1 parse | Stage 2 parse |")
    A("|---|---:|---:|---:|---:|---:|---:|---:|---:|")
    for split in args.splits:
        f1a, f1b, f1d = metric_row(s1_norm, s2_norm, split, "norm_field_f1")
        k1a, k1b, k1d = metric_row(s1_norm, s2_norm, split, "norm_key_f1")
        pa, pb, pd = metric_row(s1_norm, s2_norm, split, "parse_json")
        summary["splits"][split] = {
            "stage1_norm_field_f1": f1a, "stage2_norm_field_f1": f1b, "delta_norm_field_f1": f1d,
            "stage1_norm_key_f1": k1a, "stage2_norm_key_f1": k1b, "delta_norm_key_f1": k1d,
            "stage1_parse_json": pa, "stage2_parse_json": pb, "delta_parse_json": pd,
        }
        A(f"| `{split}` | {fmt(f1a)} | {fmt(f1b)} | {fmt(f1d, 4)} | {fmt(k1a)} | {fmt(k1b)} | {fmt(k1d, 4)} | {fmt(pa)} | {fmt(pb)} |")
    A("")

    # Per-layer field-F1 breakdown; the adversarial split is excluded
    # here (only the global table above covers it).
    A("## Weak-layer normalized field F1")
    A("")
    for split in [s for s in args.splits if s != "test_adversarial"]:
        A(f"### `{split}`")
        A("")
        A("| Layer | Stage 1 | Stage 2 | Δ |")
        A("|---|---:|---:|---:|")
        by1 = s1_norm.get(split, {}).get("by_target_layer", {})
        by2 = s2_norm.get(split, {}).get("by_target_layer", {})
        summary["weak_layers"][split] = {}
        for layer in args.weak_layers:
            # A layer is only reported when both stages measured it.
            if layer in by1 and layer in by2:
                a = by1[layer].get("norm_field_f1")
                b = by2[layer].get("norm_field_f1")
                d = delta(a, b)
                summary["weak_layers"][split][layer] = {"stage1": a, "stage2": b, "delta": d}
                A(f"| `{layer}` | {fmt(a)} | {fmt(b)} | {fmt(d, 4)} |")
        A("")

    A("## Interpretation")
    A("")
    A("Stage 2 is scientifically useful as a negative/diagnostic experiment: weak-layer exposure alone did not solve low-level O1 NRM or A1 policy value fidelity. The next step should be layer-specific semantic evaluation and better canonical data generation rather than more blind continuation training.")
    A("")
    A("## Current best model")
    A("")
    A("The primary model remains the stage-1 Qwen3-8B QLoRA adapter.")

    # Persist both artifacts and print their paths for the caller/CI log.
    write_json(out / "metrics_summary.json", summary)
    (out / "stage1_vs_stage2_comparison.md").write_text("\n".join(lines), encoding="utf-8")
    print(out / "stage1_vs_stage2_comparison.md")
    print(out / "metrics_summary.json")


if __name__ == "__main__":
    main()
|