horizon-v1 / results /benchmark_results.json
QuantHive-Research-Tech's picture
Upload results/benchmark_results.json with huggingface_hub
62ad9f6 verified
{
"timestamp": "2026-05-08T00:12:00.713088",
"checkpoint": "checkpoints/phase9_production/step_200000.pt",
"model_params": 172376579,
"total_time_s": 1562.0058193206787,
"n_seeds": 5,
"guidance_scale": 2.0,
"results": [
{
"test_id": "A1",
"tier": "A",
"name": "Consistency",
"n_cases": 100,
"n_pass": 100,
"pass_rate": 1.0,
"passed": true,
"details": {},
"duration_s": 571.2158267498016
},
{
"test_id": "A2",
"tier": "A",
"name": "Causal Asymmetry",
"n_cases": 19,
"n_pass": 19,
"pass_rate": 1.0,
"passed": true,
"details": {
"mean_corr": 0.5317086431628028
},
"duration_s": 85.96794986724854
},
{
"test_id": "A3",
"tier": "A",
"name": "Compositionality",
"n_cases": 30,
"n_pass": 28,
"pass_rate": 0.9333333333333333,
"passed": true,
"details": {},
"duration_s": 214.8946647644043
},
{
"test_id": "A4",
"tier": "A",
"name": "Counterfactual Coherence",
"n_cases": 50,
"n_pass": 46,
"pass_rate": 0.92,
"passed": true,
"details": {
"median_corr": 0.999642601137857,
"min_corr": 0.7887732292567268
},
"duration_s": 214.578773021698
},
{
"test_id": "A5",
"tier": "A",
"name": "Robustness",
"n_cases": 56,
"n_pass": 56,
"pass_rate": 1.0,
"passed": true,
"details": {},
"duration_s": 88.48681592941284
},
{
"test_id": "B1",
"tier": "B",
"name": "Placebo (non-edges)",
"n_cases": 39,
"n_pass": 23,
"pass_rate": 0.5897435897435898,
"passed": false,
"details": {
"mean_effect": 0.12529101967811584,
"max_effect": 0.444835901260376
},
"duration_s": 90.49227929115295
},
{
"test_id": "B2",
"tier": "B",
"name": "Real effects (edges)",
"n_cases": 19,
"n_pass": 19,
"pass_rate": 1.0,
"passed": true,
"details": {
"mean_effect": 0.3813858926296234
},
"duration_s": 45.237102031707764
},
{
"test_id": "B4",
"tier": "B",
"name": "Sensitivity monotonicity",
"n_cases": 10,
"n_pass": 10,
"pass_rate": 1.0,
"passed": true,
"details": {},
"duration_s": 140.24205374717712
},
{
"test_id": "C1",
"tier": "C",
"name": "RBI Rate Decisions",
"n_cases": 42,
"n_pass": 42,
"pass_rate": 1.0,
"passed": true,
"details": {
"rate_changes_tested": 42
},
"duration_s": 99.34343814849854
}
],
"tier_summary": {
"A": {
"total": 5,
"passed": 5
},
"B": {
"total": 3,
"passed": 2
},
"C": {
"total": 1,
"passed": 1
}
},
"overall_pass": false
}