PEFT
qlora
sft
trl
qwen3
tmf921
intent-based-networking
network-slicing
rtx-6000-ada
ml-intern
nraptisss committed on
Commit
aaf8c59
·
verified ·
1 Parent(s): 0a2edf2

Add results packaging script

Browse files
Files changed (1) hide show
  1. scripts/package_results.py +145 -0
scripts/package_results.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Package stage-1/stage-2 evaluation metrics into publication-ready artifacts.
3
+
4
+ Reads raw and normalized metrics from run directories and writes:
5
+ - results/metrics_summary.json
6
+ - results/stage1_vs_stage2_comparison.md
7
+ - results/stage1_normalized_metrics.json
8
+ - results/stage2_normalized_metrics.json
9
+ - results/stage1_raw_metrics.json
10
+ - results/stage2_raw_metrics.json
11
+ """
12
+ import argparse
13
+ import json
14
+ from pathlib import Path
15
+ from typing import Any, Dict, List
16
+
17
+ from tmf921_train.utils import write_json
18
+
19
# Evaluation splits reported when --splits is not given on the command line.
DEFAULT_SPLITS = [
    "test_in_distribution",
    "test_template_ood",
    "test_use_case_ood",
    "test_sector_ood",
    "test_adversarial",
]

# Target layers listed in the weak-layer tables when --weak_layers is not given.
DEFAULT_WEAK = [
    "o1_nrm",
    "a1_policy",
    "tmf921_lifecycle_report",
    "tmf921_lifecycle_monitor",
    "tmf921_lifecycle_scale",
]
21
+
22
+
23
def load_json(path: Path) -> Dict[str, Any]:
    """Load and return the JSON document stored at ``path``.

    The file is decoded explicitly as UTF-8 instead of relying on the
    platform's locale encoding, so metrics files containing non-ASCII text
    load identically on every machine.

    Args:
        path: Location of the JSON file to read.

    Returns:
        The value produced by ``json.loads`` for the file's contents.

    Raises:
        FileNotFoundError: If ``path`` does not exist; the path itself is the
            exception's only argument.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    if not path.exists():
        raise FileNotFoundError(path)
    return json.loads(path.read_text(encoding="utf-8"))
27
+
28
+
29
def fmt(x, digits=4):
    """Render a metric value as text for a Markdown table cell.

    ``None`` is shown as ``"n/a"``; a ``float`` is shown in fixed-point
    notation with ``digits`` decimal places; any other value is passed
    through ``str``.
    """
    if x is None:
        return "n/a"
    return format(x, f".{digits}f") if isinstance(x, float) else str(x)
35
+
36
+
37
def delta(a, b):
    """Return the change from ``a`` to ``b`` (``b - a``).

    If either operand is ``None`` the difference is undefined and ``None``
    is returned instead.
    """
    return None if (a is None or b is None) else b - a
41
+
42
+
43
def metric_row(s1, s2, split, key):
    """Look up one metric for a split in both stages and compute its change.

    ``s1`` and ``s2`` are mappings from split name to a mapping of metric
    values. A missing split or a missing metric yields ``None`` for that
    stage.

    Returns:
        A ``(stage1_value, stage2_value, change)`` tuple, where ``change`` is
        ``delta(stage1_value, stage2_value)``.
    """
    stage1_value, stage2_value = (
        metrics.get(split, {}).get(key) for metrics in (s1, s2)
    )
    return stage1_value, stage2_value, delta(stage1_value, stage2_value)
48
+
49
+
50
+ def main():
51
+ ap = argparse.ArgumentParser()
52
+ ap.add_argument("--stage1_eval_dir", required=True, help="e.g. runs/qwen3-.../eval_merged")
53
+ ap.add_argument("--stage2_eval_dir", required=True, help="e.g. runs/stage2-.../eval")
54
+ ap.add_argument("--output_dir", default="results")
55
+ ap.add_argument("--splits", nargs="+", default=DEFAULT_SPLITS)
56
+ ap.add_argument("--weak_layers", nargs="+", default=DEFAULT_WEAK)
57
+ args = ap.parse_args()
58
+
59
+ out = Path(args.output_dir)
60
+ out.mkdir(parents=True, exist_ok=True)
61
+ s1_dir = Path(args.stage1_eval_dir)
62
+ s2_dir = Path(args.stage2_eval_dir)
63
+
64
+ s1_raw = load_json(s1_dir / "all_metrics.json")
65
+ s2_raw = load_json(s2_dir / "all_metrics.json")
66
+ s1_norm = load_json(s1_dir / "all_normalized_metrics.json")
67
+ s2_norm = load_json(s2_dir / "all_normalized_metrics.json")
68
+
69
+ write_json(out / "stage1_raw_metrics.json", s1_raw)
70
+ write_json(out / "stage2_raw_metrics.json", s2_raw)
71
+ write_json(out / "stage1_normalized_metrics.json", s1_norm)
72
+ write_json(out / "stage2_normalized_metrics.json", s2_norm)
73
+
74
+ summary = {
75
+ "stage1_eval_dir": str(s1_dir),
76
+ "stage2_eval_dir": str(s2_dir),
77
+ "splits": {},
78
+ "weak_layers": {},
79
+ "decision": "stage1_primary_stage2_diagnostic",
80
+ }
81
+
82
+ lines: List[str] = []
83
+ A = lines.append
84
+ A("# Stage 1 vs Stage 2 Results Comparison")
85
+ A("")
86
+ A("This artifact compares the primary stage-1 Qwen3-8B QLoRA adapter against the stage-2 weak-layer continuation adapter.")
87
+ A("")
88
+ A("## Decision")
89
+ A("")
90
+ A("**Stage 1 remains the primary model. Stage 2 is diagnostic and is not promoted.**")
91
+ A("")
92
+ A("Reason: stage 2 does not materially improve `o1_nrm` or `a1_policy`, slightly regresses global normalized metrics, and reduces adversarial robustness.")
93
+ A("")
94
+
95
+ A("## Global normalized metrics")
96
+ A("")
97
+ A("| Split | Stage 1 norm field F1 | Stage 2 norm field F1 | Δ field F1 | Stage 1 norm key F1 | Stage 2 norm key F1 | Δ key F1 | Stage 1 parse | Stage 2 parse |")
98
+ A("|---|---:|---:|---:|---:|---:|---:|---:|---:|")
99
+ for split in args.splits:
100
+ f1a, f1b, f1d = metric_row(s1_norm, s2_norm, split, "norm_field_f1")
101
+ k1a, k1b, k1d = metric_row(s1_norm, s2_norm, split, "norm_key_f1")
102
+ pa, pb, pd = metric_row(s1_norm, s2_norm, split, "parse_json")
103
+ summary["splits"][split] = {
104
+ "stage1_norm_field_f1": f1a, "stage2_norm_field_f1": f1b, "delta_norm_field_f1": f1d,
105
+ "stage1_norm_key_f1": k1a, "stage2_norm_key_f1": k1b, "delta_norm_key_f1": k1d,
106
+ "stage1_parse_json": pa, "stage2_parse_json": pb, "delta_parse_json": pd,
107
+ }
108
+ A(f"| `{split}` | {fmt(f1a)} | {fmt(f1b)} | {fmt(f1d, 4)} | {fmt(k1a)} | {fmt(k1b)} | {fmt(k1d, 4)} | {fmt(pa)} | {fmt(pb)} |")
109
+ A("")
110
+
111
+ A("## Weak-layer normalized field F1")
112
+ A("")
113
+ for split in [s for s in args.splits if s != "test_adversarial"]:
114
+ A(f"### `{split}`")
115
+ A("")
116
+ A("| Layer | Stage 1 | Stage 2 | Δ |")
117
+ A("|---|---:|---:|---:|")
118
+ by1 = s1_norm.get(split, {}).get("by_target_layer", {})
119
+ by2 = s2_norm.get(split, {}).get("by_target_layer", {})
120
+ summary["weak_layers"][split] = {}
121
+ for layer in args.weak_layers:
122
+ if layer in by1 and layer in by2:
123
+ a = by1[layer].get("norm_field_f1")
124
+ b = by2[layer].get("norm_field_f1")
125
+ d = delta(a, b)
126
+ summary["weak_layers"][split][layer] = {"stage1": a, "stage2": b, "delta": d}
127
+ A(f"| `{layer}` | {fmt(a)} | {fmt(b)} | {fmt(d, 4)} |")
128
+ A("")
129
+
130
+ A("## Interpretation")
131
+ A("")
132
+ A("Stage 2 is scientifically useful as a negative/diagnostic experiment: weak-layer exposure alone did not solve low-level O1 NRM or A1 policy value fidelity. The next step should be layer-specific semantic evaluation and better canonical data generation rather than more blind continuation training.")
133
+ A("")
134
+ A("## Current best model")
135
+ A("")
136
+ A("The primary model remains the stage-1 Qwen3-8B QLoRA adapter.")
137
+
138
+ write_json(out / "metrics_summary.json", summary)
139
+ (out / "stage1_vs_stage2_comparison.md").write_text("\n".join(lines), encoding="utf-8")
140
+ print(out / "stage1_vs_stage2_comparison.md")
141
+ print(out / "metrics_summary.json")
142
+
143
+
144
# Script entry point: run the packaging CLI only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()