{
"timestamp": 1774055916.062495,
"results": {
"Phase_6_Only": {
"overall_accuracy": 0.42857142857142855,
"accuracy_by_category": {
"factual_easy": 0.5,
"factual_medium": 0.0,
"conceptual_medium": 0.5,
"reasoning_medium": 1.0,
"tricky_medium": 1.0,
"nuanced_hard": 0.0,
"meta_loop_prone": 0.0
},
"accuracy_by_difficulty": {
"1": 0.5,
"2": 0.625,
"3": 0.0
},
"avg_latency_ms": 0.1,
"total_tests": 14,
"correct_count": 6,
"category_stats": {
"factual_easy": {
"accuracy": 0.5,
"count": 2,
"avg_latency_ms": 0.1
},
"factual_medium": {
"accuracy": 0.0,
"count": 2,
"avg_latency_ms": 0.1
},
"conceptual_medium": {
"accuracy": 0.5,
"count": 2,
"avg_latency_ms": 0.1
},
"reasoning_medium": {
"accuracy": 1.0,
"count": 2,
"avg_latency_ms": 0.1
},
"tricky_medium": {
"accuracy": 1.0,
"count": 2,
"avg_latency_ms": 0.1
},
"nuanced_hard": {
"accuracy": 0.0,
"count": 2,
"avg_latency_ms": 0.1
},
"meta_loop_prone": {
"accuracy": 0.0,
"count": 2,
"avg_latency_ms": 0.1
}
}
},
"Phase_6_Plus_13": {
"overall_accuracy": 0.5714285714285714,
"accuracy_by_category": {
"factual_easy": 0.5,
"factual_medium": 0.5,
"conceptual_medium": 1.0,
"reasoning_medium": 1.0,
"tricky_medium": 0.5,
"nuanced_hard": 0.0,
"meta_loop_prone": 0.5
},
"accuracy_by_difficulty": {
"1": 0.5,
"2": 0.75,
"3": 0.25
},
"avg_latency_ms": 0.1,
"total_tests": 14,
"correct_count": 8,
"category_stats": {
"factual_easy": {
"accuracy": 0.5,
"count": 2,
"avg_latency_ms": 0.1
},
"factual_medium": {
"accuracy": 0.5,
"count": 2,
"avg_latency_ms": 0.1
},
"conceptual_medium": {
"accuracy": 1.0,
"count": 2,
"avg_latency_ms": 0.1
},
"reasoning_medium": {
"accuracy": 1.0,
"count": 2,
"avg_latency_ms": 0.1
},
"tricky_medium": {
"accuracy": 0.5,
"count": 2,
"avg_latency_ms": 0.1
},
"nuanced_hard": {
"accuracy": 0.0,
"count": 2,
"avg_latency_ms": 0.1
},
"meta_loop_prone": {
"accuracy": 0.5,
"count": 2,
"avg_latency_ms": 0.1
}
}
},
"Phase_6_Plus_13_Plus_14": {
"overall_accuracy": 0.7857142857142857,
"accuracy_by_category": {
"factual_easy": 1.0,
"factual_medium": 0.5,
"conceptual_medium": 1.0,
"reasoning_medium": 0.5,
"tricky_medium": 1.0,
"nuanced_hard": 1.0,
"meta_loop_prone": 0.5
},
"accuracy_by_difficulty": {
"1": 1.0,
"2": 0.75,
"3": 0.75
},
"avg_latency_ms": 0.1,
"total_tests": 14,
"correct_count": 11,
"category_stats": {
"factual_easy": {
"accuracy": 1.0,
"count": 2,
"avg_latency_ms": 0.1
},
"factual_medium": {
"accuracy": 0.5,
"count": 2,
"avg_latency_ms": 0.1
},
"conceptual_medium": {
"accuracy": 1.0,
"count": 2,
"avg_latency_ms": 0.1
},
"reasoning_medium": {
"accuracy": 0.5,
"count": 2,
"avg_latency_ms": 0.1
},
"tricky_medium": {
"accuracy": 1.0,
"count": 2,
"avg_latency_ms": 0.1
},
"nuanced_hard": {
"accuracy": 1.0,
"count": 2,
"avg_latency_ms": 0.1
},
"meta_loop_prone": {
"accuracy": 0.5,
"count": 2,
"avg_latency_ms": 0.1
}
}
}
},
"summary": {
"phase6_accuracy": 0.42857142857142855,
"phase6_13_accuracy": 0.5714285714285714,
"phase6_13_14_accuracy": 0.7857142857142857,
"improvement_13_pct": 33.33333333333333,
"improvement_14_pct": 37.50000000000001,
"total_improvement_pct": 83.33333333333334
}
}