"""Package stage-1/stage-2 evaluation metrics into publication-ready artifacts.

Reads raw and normalized metrics from run directories and writes:
- results/metrics_summary.json
- results/stage1_vs_stage2_comparison.md
- results/stage1_normalized_metrics.json
- results/stage2_normalized_metrics.json
- results/stage1_raw_metrics.json
- results/stage2_raw_metrics.json
"""
| import argparse |
| import json |
| from pathlib import Path |
| from typing import Any, Dict, List |
|
|
| from tmf921_train.utils import write_json |
|
|
# Evaluation splits reported by default: one in-distribution set plus the
# template/use-case/sector OOD probes and the adversarial set.
DEFAULT_SPLITS = ["test_in_distribution", "test_template_ood", "test_use_case_ood", "test_sector_ood", "test_adversarial"]
# Target layers broken out per-layer in the weak-layer section of the report
# (the layers stage 2 was intended to improve).
DEFAULT_WEAK = ["o1_nrm", "a1_policy", "tmf921_lifecycle_report", "tmf921_lifecycle_monitor", "tmf921_lifecycle_scale"]
|
|
|
|
def load_json(path: Path) -> Dict[str, Any]:
    """Parse *path* as JSON; raise FileNotFoundError (with the path) if missing."""
    if path.exists():
        return json.loads(path.read_text())
    raise FileNotFoundError(path)
|
|
|
|
def fmt(x, digits=4):
    """Render a metric value for a markdown table cell.

    None becomes "n/a", floats are shown with *digits* decimal places,
    and anything else is passed through str().
    """
    if x is None:
        return "n/a"
    return f"{x:.{digits}f}" if isinstance(x, float) else str(x)
|
|
|
|
def delta(a, b):
    """Return b - a (stage2 minus stage1), or None when either value is missing."""
    return None if a is None or b is None else b - a
|
|
|
|
def metric_row(s1, s2, split, key):
    """Look up *key* under *split* in both stage metric dicts.

    Returns (stage1_value, stage2_value, stage2 - stage1); the delta is
    None whenever either side is missing.
    """
    stage1_value = s1.get(split, {}).get(key)
    stage2_value = s2.get(split, {}).get(key)
    if stage1_value is None or stage2_value is None:
        return stage1_value, stage2_value, None
    return stage1_value, stage2_value, stage2_value - stage1_value
|
|
|
|
def main() -> None:
    """CLI entry point: compare stage-1 vs stage-2 eval metrics and write artifacts.

    Reads all_metrics.json / all_normalized_metrics.json from the two eval
    directories, copies them into the output directory, and emits a machine-
    readable summary (metrics_summary.json) plus a human-readable markdown
    comparison (stage1_vs_stage2_comparison.md).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--stage1_eval_dir", required=True, help="e.g. runs/qwen3-.../eval_merged")
    ap.add_argument("--stage2_eval_dir", required=True, help="e.g. runs/stage2-.../eval")
    ap.add_argument("--output_dir", default="results")
    ap.add_argument("--splits", nargs="+", default=DEFAULT_SPLITS)
    ap.add_argument("--weak_layers", nargs="+", default=DEFAULT_WEAK)
    args = ap.parse_args()

    out = Path(args.output_dir)
    out.mkdir(parents=True, exist_ok=True)
    s1_dir = Path(args.stage1_eval_dir)
    s2_dir = Path(args.stage2_eval_dir)

    # Load both stages' metrics; load_json raises FileNotFoundError if any
    # expected file is absent, so a partial run fails loudly here.
    s1_raw = load_json(s1_dir / "all_metrics.json")
    s2_raw = load_json(s2_dir / "all_metrics.json")
    s1_norm = load_json(s1_dir / "all_normalized_metrics.json")
    s2_norm = load_json(s2_dir / "all_normalized_metrics.json")

    # Archive verbatim copies of the inputs next to the derived artifacts.
    write_json(out / "stage1_raw_metrics.json", s1_raw)
    write_json(out / "stage2_raw_metrics.json", s2_raw)
    write_json(out / "stage1_normalized_metrics.json", s1_norm)
    write_json(out / "stage2_normalized_metrics.json", s2_norm)

    # Machine-readable summary; "splits" and "weak_layers" are filled in
    # while the markdown tables below are built.
    summary = {
        "stage1_eval_dir": str(s1_dir),
        "stage2_eval_dir": str(s2_dir),
        "splits": {},
        "weak_layers": {},
        "decision": "stage1_primary_stage2_diagnostic",
    }

    # Markdown is accumulated line-by-line; A is a shorthand for append.
    lines: List[str] = []
    A = lines.append
    A("# Stage 1 vs Stage 2 Results Comparison")
    A("")
    A("This artifact compares the primary stage-1 Qwen3-8B QLoRA adapter against the stage-2 weak-layer continuation adapter.")
    A("")
    A("## Decision")
    A("")
    A("**Stage 1 remains the primary model. Stage 2 is diagnostic and is not promoted.**")
    A("")
    A("Reason: stage 2 does not materially improve `o1_nrm` or `a1_policy`, slightly regresses global normalized metrics, and reduces adversarial robustness.")
    A("")

    # Per-split table of global normalized metrics (field F1, key F1, parse rate),
    # mirrored into summary["splits"].
    A("## Global normalized metrics")
    A("")
    A("| Split | Stage 1 norm field F1 | Stage 2 norm field F1 | Δ field F1 | Stage 1 norm key F1 | Stage 2 norm key F1 | Δ key F1 | Stage 1 parse | Stage 2 parse |")
    A("|---|---:|---:|---:|---:|---:|---:|---:|---:|")
    for split in args.splits:
        f1a, f1b, f1d = metric_row(s1_norm, s2_norm, split, "norm_field_f1")
        k1a, k1b, k1d = metric_row(s1_norm, s2_norm, split, "norm_key_f1")
        pa, pb, pd = metric_row(s1_norm, s2_norm, split, "parse_json")
        summary["splits"][split] = {
            "stage1_norm_field_f1": f1a, "stage2_norm_field_f1": f1b, "delta_norm_field_f1": f1d,
            "stage1_norm_key_f1": k1a, "stage2_norm_key_f1": k1b, "delta_norm_key_f1": k1d,
            "stage1_parse_json": pa, "stage2_parse_json": pb, "delta_parse_json": pd,
        }
        A(f"| `{split}` | {fmt(f1a)} | {fmt(f1b)} | {fmt(f1d, 4)} | {fmt(k1a)} | {fmt(k1b)} | {fmt(k1d, 4)} | {fmt(pa)} | {fmt(pb)} |")
    A("")

    # One table per non-adversarial split, restricted to the weak layers;
    # layers missing from either stage's by_target_layer are skipped.
    A("## Weak-layer normalized field F1")
    A("")
    for split in [s for s in args.splits if s != "test_adversarial"]:
        A(f"### `{split}`")
        A("")
        A("| Layer | Stage 1 | Stage 2 | Δ |")
        A("|---|---:|---:|---:|")
        by1 = s1_norm.get(split, {}).get("by_target_layer", {})
        by2 = s2_norm.get(split, {}).get("by_target_layer", {})
        summary["weak_layers"][split] = {}
        for layer in args.weak_layers:
            if layer in by1 and layer in by2:
                a = by1[layer].get("norm_field_f1")
                b = by2[layer].get("norm_field_f1")
                d = delta(a, b)
                summary["weak_layers"][split][layer] = {"stage1": a, "stage2": b, "delta": d}
                A(f"| `{layer}` | {fmt(a)} | {fmt(b)} | {fmt(d, 4)} |")
        A("")

    A("## Interpretation")
    A("")
    A("Stage 2 is scientifically useful as a negative/diagnostic experiment: weak-layer exposure alone did not solve low-level O1 NRM or A1 policy value fidelity. The next step should be layer-specific semantic evaluation and better canonical data generation rather than more blind continuation training.")
    A("")
    A("## Current best model")
    A("")
    A("The primary model remains the stage-1 Qwen3-8B QLoRA adapter.")

    # Write the two derived artifacts and print their paths for the caller.
    write_json(out / "metrics_summary.json", summary)
    (out / "stage1_vs_stage2_comparison.md").write_text("\n".join(lines), encoding="utf-8")
    print(out / "stage1_vs_stage2_comparison.md")
    print(out / "metrics_summary.json")
|
|
|
|
# Run the CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|