| { |
| "timestamp": "2026-05-08T00:12:00.713088", |
| "checkpoint": "checkpoints/phase9_production/step_200000.pt", |
| "model_params": 172376579, |
| "total_time_s": 1562.0058193206787, |
| "n_seeds": 5, |
| "guidance_scale": 2.0, |
| "results": [ |
| { |
| "test_id": "A1", |
| "tier": "A", |
| "name": "Consistency", |
| "n_cases": 100, |
| "n_pass": 100, |
| "pass_rate": 1.0, |
| "passed": true, |
| "details": {}, |
| "duration_s": 571.2158267498016 |
| }, |
| { |
| "test_id": "A2", |
| "tier": "A", |
| "name": "Causal Asymmetry", |
| "n_cases": 19, |
| "n_pass": 19, |
| "pass_rate": 1.0, |
| "passed": true, |
| "details": { |
| "mean_corr": 0.5317086431628028 |
| }, |
| "duration_s": 85.96794986724854 |
| }, |
| { |
| "test_id": "A3", |
| "tier": "A", |
| "name": "Compositionality", |
| "n_cases": 30, |
| "n_pass": 28, |
| "pass_rate": 0.9333333333333333, |
| "passed": true, |
| "details": {}, |
| "duration_s": 214.8946647644043 |
| }, |
| { |
| "test_id": "A4", |
| "tier": "A", |
| "name": "Counterfactual Coherence", |
| "n_cases": 50, |
| "n_pass": 46, |
| "pass_rate": 0.92, |
| "passed": true, |
| "details": { |
| "median_corr": 0.999642601137857, |
| "min_corr": 0.7887732292567268 |
| }, |
| "duration_s": 214.578773021698 |
| }, |
| { |
| "test_id": "A5", |
| "tier": "A", |
| "name": "Robustness", |
| "n_cases": 56, |
| "n_pass": 56, |
| "pass_rate": 1.0, |
| "passed": true, |
| "details": {}, |
| "duration_s": 88.48681592941284 |
| }, |
| { |
| "test_id": "B1", |
| "tier": "B", |
| "name": "Placebo (non-edges)", |
| "n_cases": 39, |
| "n_pass": 23, |
| "pass_rate": 0.5897435897435898, |
| "passed": false, |
| "details": { |
| "mean_effect": 0.12529101967811584, |
| "max_effect": 0.444835901260376 |
| }, |
| "duration_s": 90.49227929115295 |
| }, |
| { |
| "test_id": "B2", |
| "tier": "B", |
| "name": "Real effects (edges)", |
| "n_cases": 19, |
| "n_pass": 19, |
| "pass_rate": 1.0, |
| "passed": true, |
| "details": { |
| "mean_effect": 0.3813858926296234 |
| }, |
| "duration_s": 45.237102031707764 |
| }, |
| { |
| "test_id": "B4", |
| "tier": "B", |
| "name": "Sensitivity monotonicity", |
| "n_cases": 10, |
| "n_pass": 10, |
| "pass_rate": 1.0, |
| "passed": true, |
| "details": {}, |
| "duration_s": 140.24205374717712 |
| }, |
| { |
| "test_id": "C1", |
| "tier": "C", |
| "name": "RBI Rate Decisions", |
| "n_cases": 42, |
| "n_pass": 42, |
| "pass_rate": 1.0, |
| "passed": true, |
| "details": { |
| "rate_changes_tested": 42 |
| }, |
| "duration_s": 99.34343814849854 |
| } |
| ], |
| "tier_summary": { |
| "A": { |
| "total": 5, |
| "passed": 5 |
| }, |
| "B": { |
| "total": 3, |
| "passed": 2 |
| }, |
| "C": { |
| "total": 1, |
| "passed": 1 |
| } |
| }, |
| "overall_pass": false |
| } |