"""Package stage-1/stage-2 evaluation metrics into publication-ready artifacts.

Reads raw and normalized metrics from run directories and writes:
- results/metrics_summary.json
- results/stage1_vs_stage2_comparison.md
- results/stage1_normalized_metrics.json
- results/stage2_normalized_metrics.json
- results/stage1_raw_metrics.json
- results/stage2_raw_metrics.json
"""
| import argparse |
| import json |
| from pathlib import Path |
| from typing import Any, Dict, List |
|
|
| from tmf921_train.utils import write_json |
|
|
# Evaluation splits reported by default: one in-distribution set plus the
# template/use-case/sector OOD probes and the adversarial set.
DEFAULT_SPLITS = ["test_in_distribution", "test_template_ood", "test_use_case_ood", "test_sector_ood", "test_adversarial"]
# Target layers broken out per-layer in the weak-layer section of the report
# (the layers stage 2 was intended to improve).
DEFAULT_WEAK = ["o1_nrm", "a1_policy", "tmf921_lifecycle_report", "tmf921_lifecycle_monitor", "tmf921_lifecycle_scale"]
|
|
|
|
def load_json(path: Path) -> Dict[str, Any]:
    """Parse *path* as JSON; raise FileNotFoundError (with the path) if missing."""
    if path.exists():
        return json.loads(path.read_text())
    raise FileNotFoundError(path)
|
|
|
|
def fmt(x, digits=4):
    """Render a metric value for a markdown table cell.

    None becomes "n/a", floats are shown with *digits* decimal places,
    and anything else is passed through str().
    """
    if x is None:
        return "n/a"
    return f"{x:.{digits}f}" if isinstance(x, float) else str(x)
|
|
|
|
def delta(a, b):
    """Return b - a (stage2 minus stage1), or None when either value is missing."""
    return None if a is None or b is None else b - a
|
|
|
|
def metric_row(s1, s2, split, key):
    """Look up *key* under *split* in both stage metric dicts.

    Returns (stage1_value, stage2_value, stage2 - stage1); the delta is
    None whenever either side is missing.
    """
    stage1_value = s1.get(split, {}).get(key)
    stage2_value = s2.get(split, {}).get(key)
    if stage1_value is None or stage2_value is None:
        return stage1_value, stage2_value, None
    return stage1_value, stage2_value, stage2_value - stage1_value
|
|
|
|
def main() -> None:
    """CLI entry point: compare stage-1 vs stage-2 eval metrics and write artifacts.

    Reads all_metrics.json / all_normalized_metrics.json from the two eval
    directories, copies them into the output directory, and emits a machine-
    readable summary (metrics_summary.json) plus a human-readable markdown
    comparison (stage1_vs_stage2_comparison.md).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--stage1_eval_dir", required=True, help="e.g. runs/qwen3-.../eval_merged")
    ap.add_argument("--stage2_eval_dir", required=True, help="e.g. runs/stage2-.../eval")
    ap.add_argument("--output_dir", default="results")
    ap.add_argument("--splits", nargs="+", default=DEFAULT_SPLITS)
    ap.add_argument("--weak_layers", nargs="+", default=DEFAULT_WEAK)
    args = ap.parse_args()

    out = Path(args.output_dir)
    out.mkdir(parents=True, exist_ok=True)
    s1_dir = Path(args.stage1_eval_dir)
    s2_dir = Path(args.stage2_eval_dir)

    # Load both stages' metrics; load_json raises FileNotFoundError if any
    # expected file is absent, so a partial run fails loudly here.
    s1_raw = load_json(s1_dir / "all_metrics.json")
    s2_raw = load_json(s2_dir / "all_metrics.json")
    s1_norm = load_json(s1_dir / "all_normalized_metrics.json")
    s2_norm = load_json(s2_dir / "all_normalized_metrics.json")

    # Archive verbatim copies of the inputs next to the derived artifacts.
    write_json(out / "stage1_raw_metrics.json", s1_raw)
    write_json(out / "stage2_raw_metrics.json", s2_raw)
    write_json(out / "stage1_normalized_metrics.json", s1_norm)
    write_json(out / "stage2_normalized_metrics.json", s2_norm)

    # Machine-readable summary; "splits" and "weak_layers" are filled in
    # while the markdown tables below are built.
    summary = {
        "stage1_eval_dir": str(s1_dir),
        "stage2_eval_dir": str(s2_dir),
        "splits": {},
        "weak_layers": {},
        "decision": "stage1_primary_stage2_diagnostic",
    }

    # Markdown is accumulated line-by-line; A is a shorthand for append.
    lines: List[str] = []
    A = lines.append
    A("# Stage 1 vs Stage 2 Results Comparison")
    A("")
    A("This artifact compares the primary stage-1 Qwen3-8B QLoRA adapter against the stage-2 weak-layer continuation adapter.")
    A("")
    A("## Decision")
    A("")
    A("**Stage 1 remains the primary model. Stage 2 is diagnostic and is not promoted.**")
    A("")
    A("Reason: stage 2 does not materially improve `o1_nrm` or `a1_policy`, slightly regresses global normalized metrics, and reduces adversarial robustness.")
    A("")

    # Per-split table of global normalized metrics (field F1, key F1, parse rate),
    # mirrored into summary["splits"].
    A("## Global normalized metrics")
    A("")
    A("| Split | Stage 1 norm field F1 | Stage 2 norm field F1 | Δ field F1 | Stage 1 norm key F1 | Stage 2 norm key F1 | Δ key F1 | Stage 1 parse | Stage 2 parse |")
    A("|---|---:|---:|---:|---:|---:|---:|---:|---:|")
    for split in args.splits:
        f1a, f1b, f1d = metric_row(s1_norm, s2_norm, split, "norm_field_f1")
        k1a, k1b, k1d = metric_row(s1_norm, s2_norm, split, "norm_key_f1")
        pa, pb, pd = metric_row(s1_norm, s2_norm, split, "parse_json")
        summary["splits"][split] = {
            "stage1_norm_field_f1": f1a, "stage2_norm_field_f1": f1b, "delta_norm_field_f1": f1d,
            "stage1_norm_key_f1": k1a, "stage2_norm_key_f1": k1b, "delta_norm_key_f1": k1d,
            "stage1_parse_json": pa, "stage2_parse_json": pb, "delta_parse_json": pd,
        }
        A(f"| `{split}` | {fmt(f1a)} | {fmt(f1b)} | {fmt(f1d, 4)} | {fmt(k1a)} | {fmt(k1b)} | {fmt(k1d, 4)} | {fmt(pa)} | {fmt(pb)} |")
    A("")

    # One table per non-adversarial split, restricted to the weak layers;
    # layers missing from either stage's by_target_layer are skipped.
    A("## Weak-layer normalized field F1")
    A("")
    for split in [s for s in args.splits if s != "test_adversarial"]:
        A(f"### `{split}`")
        A("")
        A("| Layer | Stage 1 | Stage 2 | Δ |")
        A("|---|---:|---:|---:|")
        by1 = s1_norm.get(split, {}).get("by_target_layer", {})
        by2 = s2_norm.get(split, {}).get("by_target_layer", {})
        summary["weak_layers"][split] = {}
        for layer in args.weak_layers:
            if layer in by1 and layer in by2:
                a = by1[layer].get("norm_field_f1")
                b = by2[layer].get("norm_field_f1")
                d = delta(a, b)
                summary["weak_layers"][split][layer] = {"stage1": a, "stage2": b, "delta": d}
                A(f"| `{layer}` | {fmt(a)} | {fmt(b)} | {fmt(d, 4)} |")
        A("")

    A("## Interpretation")
    A("")
    A("Stage 2 is scientifically useful as a negative/diagnostic experiment: weak-layer exposure alone did not solve low-level O1 NRM or A1 policy value fidelity. The next step should be layer-specific semantic evaluation and better canonical data generation rather than more blind continuation training.")
    A("")
    A("## Current best model")
    A("")
    A("The primary model remains the stage-1 Qwen3-8B QLoRA adapter.")

    # Write the two derived artifacts and print their paths for the caller.
    write_json(out / "metrics_summary.json", summary)
    (out / "stage1_vs_stage2_comparison.md").write_text("\n".join(lines), encoding="utf-8")
    print(out / "stage1_vs_stage2_comparison.md")
    print(out / "metrics_summary.json")
|
|
|
|
# Run the CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|