PEFT
qlora
sft
trl
qwen3
tmf921
intent-based-networking
network-slicing
rtx-6000-ada
ml-intern
File size: 5,697 Bytes
aaf8c59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#!/usr/bin/env python3
"""Package stage-1/stage-2 evaluation metrics into publication-ready artifacts.

Reads raw and normalized metrics from run directories and writes:
- results/metrics_summary.json
- results/stage1_vs_stage2_comparison.md
- results/stage1_normalized_metrics.json
- results/stage2_normalized_metrics.json
- results/stage1_raw_metrics.json
- results/stage2_raw_metrics.json
"""
import argparse
import json
from pathlib import Path
from typing import Any, Dict, List

from tmf921_train.utils import write_json

# Evaluation splits reported when --splits is not given on the command line.
DEFAULT_SPLITS = [
    "test_in_distribution",
    "test_template_ood",
    "test_use_case_ood",
    "test_sector_ood",
    "test_adversarial",
]

# Target layers listed in the weak-layer tables when --weak_layers is not given.
DEFAULT_WEAK = [
    "o1_nrm",
    "a1_policy",
    "tmf921_lifecycle_report",
    "tmf921_lifecycle_monitor",
    "tmf921_lifecycle_scale",
]


def load_json(path: Path) -> Dict[str, Any]:
    """Read and parse the JSON document stored at ``path``.

    Args:
        path: Location of the JSON file to load.

    Returns:
        The decoded JSON content.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    if not path.exists():
        raise FileNotFoundError(path)
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # locale encoding; this also matches the UTF-8 report written by main().
    return json.loads(path.read_text(encoding="utf-8"))


def fmt(x, digits=4):
    """Render a metric value as text for a Markdown table cell.

    ``None`` becomes ``"n/a"``, a float is printed with ``digits`` decimal
    places, and any other value is converted with ``str``.
    """
    if x is None:
        return "n/a"
    return f"{x:.{digits}f}" if isinstance(x, float) else str(x)


def delta(a, b):
    """Return ``b - a``, or ``None`` when either operand is ``None``."""
    both_present = a is not None and b is not None
    return b - a if both_present else None


def metric_row(s1, s2, split, key):
    """Look up ``key`` for ``split`` in both stage metric mappings.

    Returns a ``(stage1_value, stage2_value, difference)`` tuple. A split or
    key that is absent from a mapping yields ``None`` for that stage; the
    difference is whatever ``delta`` returns for the two values.
    """
    stage1_value, stage2_value = (
        stage.get(split, {}).get(key) for stage in (s1, s2)
    )
    return stage1_value, stage2_value, delta(stage1_value, stage2_value)


def main():
    """Command-line entry point: package stage-1/stage-2 metrics into ``--output_dir``.

    Loads ``all_metrics.json`` and ``all_normalized_metrics.json`` from both
    evaluation directories, copies them into the output directory, and writes
    ``metrics_summary.json`` plus the Markdown comparison report. The paths of
    the report and the summary are printed once they are written.

    The Decision, Interpretation and "Current best model" paragraphs are fixed
    text; they are not derived from the loaded metrics.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--stage1_eval_dir", required=True, help="e.g. runs/qwen3-.../eval_merged")
    parser.add_argument("--stage2_eval_dir", required=True, help="e.g. runs/stage2-.../eval")
    parser.add_argument("--output_dir", default="results")
    parser.add_argument("--splits", nargs="+", default=DEFAULT_SPLITS)
    parser.add_argument("--weak_layers", nargs="+", default=DEFAULT_WEAK)
    args = parser.parse_args()

    out = Path(args.output_dir)
    out.mkdir(parents=True, exist_ok=True)
    stage1_dir = Path(args.stage1_eval_dir)
    stage2_dir = Path(args.stage2_eval_dir)

    # Load every input before writing any copy, so a missing file aborts early.
    raw1 = load_json(stage1_dir / "all_metrics.json")
    raw2 = load_json(stage2_dir / "all_metrics.json")
    norm1 = load_json(stage1_dir / "all_normalized_metrics.json")
    norm2 = load_json(stage2_dir / "all_normalized_metrics.json")

    copies = (
        ("stage1_raw_metrics.json", raw1),
        ("stage2_raw_metrics.json", raw2),
        ("stage1_normalized_metrics.json", norm1),
        ("stage2_normalized_metrics.json", norm2),
    )
    for filename, payload in copies:
        write_json(out / filename, payload)

    summary = {
        "stage1_eval_dir": str(stage1_dir),
        "stage2_eval_dir": str(stage2_dir),
        "splits": {},
        "weak_layers": {},
        "decision": "stage1_primary_stage2_diagnostic",
    }

    # The report is assembled as a list of lines and joined once at the end.
    report: List[str] = [
        "# Stage 1 vs Stage 2 Results Comparison",
        "",
        "This artifact compares the primary stage-1 Qwen3-8B QLoRA adapter against the stage-2 weak-layer continuation adapter.",
        "",
        "## Decision",
        "",
        "**Stage 1 remains the primary model. Stage 2 is diagnostic and is not promoted.**",
        "",
        "Reason: stage 2 does not materially improve `o1_nrm` or `a1_policy`, slightly regresses global normalized metrics, and reduces adversarial robustness.",
        "",
        "## Global normalized metrics",
        "",
        "| Split | Stage 1 norm field F1 | Stage 2 norm field F1 | Δ field F1 | Stage 1 norm key F1 | Stage 2 norm key F1 | Δ key F1 | Stage 1 parse | Stage 2 parse |",
        "|---|---:|---:|---:|---:|---:|---:|---:|---:|",
    ]

    # One table row (and one summary entry) per requested split.
    for split in args.splits:
        field1, field2, field_delta = metric_row(norm1, norm2, split, "norm_field_f1")
        key1, key2, key_delta = metric_row(norm1, norm2, split, "norm_key_f1")
        parse1, parse2, parse_delta = metric_row(norm1, norm2, split, "parse_json")
        summary["splits"][split] = {
            "stage1_norm_field_f1": field1,
            "stage2_norm_field_f1": field2,
            "delta_norm_field_f1": field_delta,
            "stage1_norm_key_f1": key1,
            "stage2_norm_key_f1": key2,
            "delta_norm_key_f1": key_delta,
            "stage1_parse_json": parse1,
            "stage2_parse_json": parse2,
            "delta_parse_json": parse_delta,
        }
        report.append(
            f"| `{split}` | {fmt(field1)} | {fmt(field2)} | {fmt(field_delta, 4)} "
            f"| {fmt(key1)} | {fmt(key2)} | {fmt(key_delta, 4)} "
            f"| {fmt(parse1)} | {fmt(parse2)} |"
        )

    report += ["", "## Weak-layer normalized field F1", ""]

    # Per-layer tables are emitted for every split except the adversarial one.
    for split in args.splits:
        if split == "test_adversarial":
            continue
        report += [
            f"### `{split}`",
            "",
            "| Layer | Stage 1 | Stage 2 | Δ |",
            "|---|---:|---:|---:|",
        ]
        layers1 = norm1.get(split, {}).get("by_target_layer", {})
        layers2 = norm2.get(split, {}).get("by_target_layer", {})
        split_summary = {}
        summary["weak_layers"][split] = split_summary
        for layer in args.weak_layers:
            # Only layers present in both stages get a row.
            if layer not in layers1 or layer not in layers2:
                continue
            before = layers1[layer].get("norm_field_f1")
            after = layers2[layer].get("norm_field_f1")
            change = delta(before, after)
            split_summary[layer] = {"stage1": before, "stage2": after, "delta": change}
            report.append(f"| `{layer}` | {fmt(before)} | {fmt(after)} | {fmt(change, 4)} |")
        report.append("")

    report += [
        "## Interpretation",
        "",
        "Stage 2 is scientifically useful as a negative/diagnostic experiment: weak-layer exposure alone did not solve low-level O1 NRM or A1 policy value fidelity. The next step should be layer-specific semantic evaluation and better canonical data generation rather than more blind continuation training.",
        "",
        "## Current best model",
        "",
        "The primary model remains the stage-1 Qwen3-8B QLoRA adapter.",
    ]

    summary_path = out / "metrics_summary.json"
    markdown_path = out / "stage1_vs_stage2_comparison.md"
    write_json(summary_path, summary)
    markdown_path.write_text("\n".join(report), encoding="utf-8")
    print(markdown_path)
    print(summary_path)


# Script entry point: run the CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()