| { |
| "timestamp": 1774055916.062495, |
| "results": { |
| "Phase_6_Only": { |
| "overall_accuracy": 0.42857142857142855, |
| "accuracy_by_category": { |
| "factual_easy": 0.5, |
| "factual_medium": 0.0, |
| "conceptual_medium": 0.5, |
| "reasoning_medium": 1.0, |
| "tricky_medium": 1.0, |
| "nuanced_hard": 0.0, |
| "meta_loop_prone": 0.0 |
| }, |
| "accuracy_by_difficulty": { |
| "1": 0.5, |
| "2": 0.625, |
| "3": 0.0 |
| }, |
| "avg_latency_ms": 0.1, |
| "total_tests": 14, |
| "correct_count": 6, |
| "category_stats": { |
| "factual_easy": { |
| "accuracy": 0.5, |
| "count": 2, |
| "avg_latency_ms": 0.1 |
| }, |
| "factual_medium": { |
| "accuracy": 0.0, |
| "count": 2, |
| "avg_latency_ms": 0.1 |
| }, |
| "conceptual_medium": { |
| "accuracy": 0.5, |
| "count": 2, |
| "avg_latency_ms": 0.1 |
| }, |
| "reasoning_medium": { |
| "accuracy": 1.0, |
| "count": 2, |
| "avg_latency_ms": 0.1 |
| }, |
| "tricky_medium": { |
| "accuracy": 1.0, |
| "count": 2, |
| "avg_latency_ms": 0.1 |
| }, |
| "nuanced_hard": { |
| "accuracy": 0.0, |
| "count": 2, |
| "avg_latency_ms": 0.1 |
| }, |
| "meta_loop_prone": { |
| "accuracy": 0.0, |
| "count": 2, |
| "avg_latency_ms": 0.1 |
| } |
| } |
| }, |
| "Phase_6_Plus_13": { |
| "overall_accuracy": 0.5714285714285714, |
| "accuracy_by_category": { |
| "factual_easy": 0.5, |
| "factual_medium": 0.5, |
| "conceptual_medium": 1.0, |
| "reasoning_medium": 1.0, |
| "tricky_medium": 0.5, |
| "nuanced_hard": 0.0, |
| "meta_loop_prone": 0.5 |
| }, |
| "accuracy_by_difficulty": { |
| "1": 0.5, |
| "2": 0.75, |
| "3": 0.25 |
| }, |
| "avg_latency_ms": 0.1, |
| "total_tests": 14, |
| "correct_count": 8, |
| "category_stats": { |
| "factual_easy": { |
| "accuracy": 0.5, |
| "count": 2, |
| "avg_latency_ms": 0.1 |
| }, |
| "factual_medium": { |
| "accuracy": 0.5, |
| "count": 2, |
| "avg_latency_ms": 0.1 |
| }, |
| "conceptual_medium": { |
| "accuracy": 1.0, |
| "count": 2, |
| "avg_latency_ms": 0.1 |
| }, |
| "reasoning_medium": { |
| "accuracy": 1.0, |
| "count": 2, |
| "avg_latency_ms": 0.1 |
| }, |
| "tricky_medium": { |
| "accuracy": 0.5, |
| "count": 2, |
| "avg_latency_ms": 0.1 |
| }, |
| "nuanced_hard": { |
| "accuracy": 0.0, |
| "count": 2, |
| "avg_latency_ms": 0.1 |
| }, |
| "meta_loop_prone": { |
| "accuracy": 0.5, |
| "count": 2, |
| "avg_latency_ms": 0.1 |
| } |
| } |
| }, |
| "Phase_6_Plus_13_Plus_14": { |
| "overall_accuracy": 0.7857142857142857, |
| "accuracy_by_category": { |
| "factual_easy": 1.0, |
| "factual_medium": 0.5, |
| "conceptual_medium": 1.0, |
| "reasoning_medium": 0.5, |
| "tricky_medium": 1.0, |
| "nuanced_hard": 1.0, |
| "meta_loop_prone": 0.5 |
| }, |
| "accuracy_by_difficulty": { |
| "1": 1.0, |
| "2": 0.75, |
| "3": 0.75 |
| }, |
| "avg_latency_ms": 0.1, |
| "total_tests": 14, |
| "correct_count": 11, |
| "category_stats": { |
| "factual_easy": { |
| "accuracy": 1.0, |
| "count": 2, |
| "avg_latency_ms": 0.1 |
| }, |
| "factual_medium": { |
| "accuracy": 0.5, |
| "count": 2, |
| "avg_latency_ms": 0.1 |
| }, |
| "conceptual_medium": { |
| "accuracy": 1.0, |
| "count": 2, |
| "avg_latency_ms": 0.1 |
| }, |
| "reasoning_medium": { |
| "accuracy": 0.5, |
| "count": 2, |
| "avg_latency_ms": 0.1 |
| }, |
| "tricky_medium": { |
| "accuracy": 1.0, |
| "count": 2, |
| "avg_latency_ms": 0.1 |
| }, |
| "nuanced_hard": { |
| "accuracy": 1.0, |
| "count": 2, |
| "avg_latency_ms": 0.1 |
| }, |
| "meta_loop_prone": { |
| "accuracy": 0.5, |
| "count": 2, |
| "avg_latency_ms": 0.1 |
| } |
| } |
| } |
| }, |
| "summary": { |
| "phase6_accuracy": 0.42857142857142855, |
| "phase6_13_accuracy": 0.5714285714285714, |
| "phase6_13_14_accuracy": 0.7857142857142857, |
| "improvement_13_pct": 33.33333333333333, |
| "improvement_14_pct": 37.50000000000001, |
| "total_improvement_pct": 227.38095238095238 |
| } |
| } |