QuantHive-Research-Tech committed on
Commit
62ad9f6
·
verified ·
1 Parent(s): 7ffc277

Upload results/benchmark_results.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. results/benchmark_results.json +136 -0
results/benchmark_results.json ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "timestamp": "2026-05-08T00:12:00.713088",
3
+ "checkpoint": "checkpoints/phase9_production/step_200000.pt",
4
+ "model_params": 172376579,
5
+ "total_time_s": 1562.0058193206787,
6
+ "n_seeds": 5,
7
+ "guidance_scale": 2.0,
8
+ "results": [
9
+ {
10
+ "test_id": "A1",
11
+ "tier": "A",
12
+ "name": "Consistency",
13
+ "n_cases": 100,
14
+ "n_pass": 100,
15
+ "pass_rate": 1.0,
16
+ "passed": true,
17
+ "details": {},
18
+ "duration_s": 571.2158267498016
19
+ },
20
+ {
21
+ "test_id": "A2",
22
+ "tier": "A",
23
+ "name": "Causal Asymmetry",
24
+ "n_cases": 19,
25
+ "n_pass": 19,
26
+ "pass_rate": 1.0,
27
+ "passed": true,
28
+ "details": {
29
+ "mean_corr": 0.5317086431628028
30
+ },
31
+ "duration_s": 85.96794986724854
32
+ },
33
+ {
34
+ "test_id": "A3",
35
+ "tier": "A",
36
+ "name": "Compositionality",
37
+ "n_cases": 30,
38
+ "n_pass": 28,
39
+ "pass_rate": 0.9333333333333333,
40
+ "passed": true,
41
+ "details": {},
42
+ "duration_s": 214.8946647644043
43
+ },
44
+ {
45
+ "test_id": "A4",
46
+ "tier": "A",
47
+ "name": "Counterfactual Coherence",
48
+ "n_cases": 50,
49
+ "n_pass": 46,
50
+ "pass_rate": 0.92,
51
+ "passed": true,
52
+ "details": {
53
+ "median_corr": 0.999642601137857,
54
+ "min_corr": 0.7887732292567268
55
+ },
56
+ "duration_s": 214.578773021698
57
+ },
58
+ {
59
+ "test_id": "A5",
60
+ "tier": "A",
61
+ "name": "Robustness",
62
+ "n_cases": 56,
63
+ "n_pass": 56,
64
+ "pass_rate": 1.0,
65
+ "passed": true,
66
+ "details": {},
67
+ "duration_s": 88.48681592941284
68
+ },
69
+ {
70
+ "test_id": "B1",
71
+ "tier": "B",
72
+ "name": "Placebo (non-edges)",
73
+ "n_cases": 39,
74
+ "n_pass": 23,
75
+ "pass_rate": 0.5897435897435898,
76
+ "passed": false,
77
+ "details": {
78
+ "mean_effect": 0.12529101967811584,
79
+ "max_effect": 0.444835901260376
80
+ },
81
+ "duration_s": 90.49227929115295
82
+ },
83
+ {
84
+ "test_id": "B2",
85
+ "tier": "B",
86
+ "name": "Real effects (edges)",
87
+ "n_cases": 19,
88
+ "n_pass": 19,
89
+ "pass_rate": 1.0,
90
+ "passed": true,
91
+ "details": {
92
+ "mean_effect": 0.3813858926296234
93
+ },
94
+ "duration_s": 45.237102031707764
95
+ },
96
+ {
97
+ "test_id": "B4",
98
+ "tier": "B",
99
+ "name": "Sensitivity monotonicity",
100
+ "n_cases": 10,
101
+ "n_pass": 10,
102
+ "pass_rate": 1.0,
103
+ "passed": true,
104
+ "details": {},
105
+ "duration_s": 140.24205374717712
106
+ },
107
+ {
108
+ "test_id": "C1",
109
+ "tier": "C",
110
+ "name": "RBI Rate Decisions",
111
+ "n_cases": 42,
112
+ "n_pass": 42,
113
+ "pass_rate": 1.0,
114
+ "passed": true,
115
+ "details": {
116
+ "rate_changes_tested": 42
117
+ },
118
+ "duration_s": 99.34343814849854
119
+ }
120
+ ],
121
+ "tier_summary": {
122
+ "A": {
123
+ "total": 5,
124
+ "passed": 5
125
+ },
126
+ "B": {
127
+ "total": 3,
128
+ "passed": 2
129
+ },
130
+ "C": {
131
+ "total": 1,
132
+ "passed": 1
133
+ }
134
+ },
135
+ "overall_pass": false
136
+ }