agent-cost-optimizer / eval /swe_v10_results.json
narcolepticchicken's picture
Upload eval/swe_v10_results.json with huggingface_hub
fb62d8b verified
{
"v10_direct": {
"success": 0.766,
"avg_cost": 0.1878,
"costRed": 40.7
},
"v10_cascade": {
"success": 0.756,
"avg_cost": 0.1767,
"costRed": 44.2
},
"v10_feedback": {
"success": 0.848,
"avg_cost": 0.2014,
"costRed": 36.4
},
"v8_synthetic": {
"success": 0.658,
"avg_cost": 0.3534,
"costRed": -11.6
},
"frontier": {
"success": 0.782,
"avg_cost": 0.3167,
"costRed": 0.0
},
"oracle": {
"success": 0.87,
"avg_cost": 0.0624,
"costRed": 80.3
},
"always_cheap": {
"success": 0.632,
"avg_cost": 0.0142,
"costRed": 95.5
},
"key_finding": "v10 trained on REAL SWE-Router data achieves 36-44% cost reduction vs 8% for synthetic-trained v8. v10_feedback achieves HIGHER success than always-frontier (84.8% vs 78.2%) at 36.4% cost reduction."
}