Spaces:
Sleeping
Sleeping
| { | |
| "base": { | |
| "results": [ | |
| { | |
| "seed": 42, | |
| "difficulty": "easy", | |
| "score": 0.1736, | |
| "correct_flags": 1, | |
| "false_positives": 0, | |
| "correct_approvals": 0, | |
| "missed_errors": 5, | |
| "total_errors": 6, | |
| "actions_taken": 4, | |
| "actions_parsed": 4 | |
| }, | |
| { | |
| "seed": 137, | |
| "difficulty": "easy", | |
| "score": 0.0167, | |
| "correct_flags": 0, | |
| "false_positives": 1, | |
| "correct_approvals": 0, | |
| "missed_errors": 6, | |
| "total_errors": 6, | |
| "actions_taken": 4, | |
| "actions_parsed": 4 | |
| }, | |
| { | |
| "seed": 256, | |
| "difficulty": "easy", | |
| "score": 0.0389, | |
| "correct_flags": 0, | |
| "false_positives": 0, | |
| "correct_approvals": 0, | |
| "missed_errors": 6, | |
| "total_errors": 6, | |
| "actions_taken": 7, | |
| "actions_parsed": 7 | |
| }, | |
| { | |
| "seed": 512, | |
| "difficulty": "easy", | |
| "score": 0.0333, | |
| "correct_flags": 0, | |
| "false_positives": 0, | |
| "correct_approvals": 0, | |
| "missed_errors": 6, | |
| "total_errors": 6, | |
| "actions_taken": 6, | |
| "actions_parsed": 6 | |
| }, | |
| { | |
| "seed": 1024, | |
| "difficulty": "easy", | |
| "score": 0.1736, | |
| "correct_flags": 1, | |
| "false_positives": 0, | |
| "correct_approvals": 0, | |
| "missed_errors": 5, | |
| "total_errors": 6, | |
| "actions_taken": 4, | |
| "actions_parsed": 4 | |
| }, | |
| { | |
| "seed": 42, | |
| "difficulty": "medium", | |
| "score": 0.0179, | |
| "correct_flags": 0, | |
| "false_positives": 0, | |
| "correct_approvals": 0, | |
| "missed_errors": 13, | |
| "total_errors": 13, | |
| "actions_taken": 7, | |
| "actions_parsed": 7 | |
| }, | |
| { | |
| "seed": 137, | |
| "difficulty": "medium", | |
| "score": 0.0256, | |
| "correct_flags": 0, | |
| "false_positives": 0, | |
| "correct_approvals": 0, | |
| "missed_errors": 13, | |
| "total_errors": 13, | |
| "actions_taken": 10, | |
| "actions_parsed": 10 | |
| }, | |
| { | |
| "seed": 256, | |
| "difficulty": "medium", | |
| "score": 0.01, | |
| "correct_flags": 0, | |
| "false_positives": 1, | |
| "correct_approvals": 0, | |
| "missed_errors": 13, | |
| "total_errors": 13, | |
| "actions_taken": 4, | |
| "actions_parsed": 4 | |
| }, | |
| { | |
| "seed": 512, | |
| "difficulty": "medium", | |
| "score": 0.0256, | |
| "correct_flags": 0, | |
| "false_positives": 0, | |
| "correct_approvals": 0, | |
| "missed_errors": 13, | |
| "total_errors": 13, | |
| "actions_taken": 10, | |
| "actions_parsed": 10 | |
| }, | |
| { | |
| "seed": 1024, | |
| "difficulty": "medium", | |
| "score": 0.01, | |
| "correct_flags": 0, | |
| "false_positives": 1, | |
| "correct_approvals": 0, | |
| "missed_errors": 13, | |
| "total_errors": 13, | |
| "actions_taken": 4, | |
| "actions_parsed": 4 | |
| }, | |
| { | |
| "seed": 42, | |
| "difficulty": "hard", | |
| "score": 0.01, | |
| "correct_flags": 0, | |
| "false_positives": 1, | |
| "correct_approvals": 0, | |
| "missed_errors": 16, | |
| "total_errors": 16, | |
| "actions_taken": 4, | |
| "actions_parsed": 4 | |
| }, | |
| { | |
| "seed": 137, | |
| "difficulty": "hard", | |
| "score": 0.01, | |
| "correct_flags": 0, | |
| "false_positives": 1, | |
| "correct_approvals": 0, | |
| "missed_errors": 17, | |
| "total_errors": 17, | |
| "actions_taken": 4, | |
| "actions_parsed": 4 | |
| }, | |
| { | |
| "seed": 256, | |
| "difficulty": "hard", | |
| "score": 0.01, | |
| "correct_flags": 0, | |
| "false_positives": 1, | |
| "correct_approvals": 0, | |
| "missed_errors": 14, | |
| "total_errors": 14, | |
| "actions_taken": 4, | |
| "actions_parsed": 4 | |
| }, | |
| { | |
| "seed": 512, | |
| "difficulty": "hard", | |
| "score": 0.0214, | |
| "correct_flags": 0, | |
| "false_positives": 0, | |
| "correct_approvals": 0, | |
| "missed_errors": 14, | |
| "total_errors": 14, | |
| "actions_taken": 9, | |
| "actions_parsed": 9 | |
| }, | |
| { | |
| "seed": 1024, | |
| "difficulty": "hard", | |
| "score": 0.0235, | |
| "correct_flags": 0, | |
| "false_positives": 0, | |
| "correct_approvals": 0, | |
| "missed_errors": 17, | |
| "total_errors": 17, | |
| "actions_taken": 12, | |
| "actions_parsed": 12 | |
| } | |
| ], | |
| "overall": 0.04 | |
| }, | |
| "trained": { | |
| "results": [ | |
| { | |
| "seed": 42, | |
| "difficulty": "easy", | |
| "score": 0.2958, | |
| "correct_flags": 1, | |
| "false_positives": 0, | |
| "correct_approvals": 2, | |
| "missed_errors": 5, | |
| "total_errors": 6, | |
| "actions_taken": 7, | |
| "actions_parsed": 7 | |
| }, | |
| { | |
| "seed": 137, | |
| "difficulty": "easy", | |
| "score": 0.25, | |
| "correct_flags": 0, | |
| "false_positives": 1, | |
| "correct_approvals": 2, | |
| "missed_errors": 6, | |
| "total_errors": 6, | |
| "actions_taken": 9, | |
| "actions_parsed": 9 | |
| }, | |
| { | |
| "seed": 256, | |
| "difficulty": "easy", | |
| "score": 0.3402, | |
| "correct_flags": 1, | |
| "false_positives": 0, | |
| "correct_approvals": 4, | |
| "missed_errors": 5, | |
| "total_errors": 6, | |
| "actions_taken": 9, | |
| "actions_parsed": 9 | |
| }, | |
| { | |
| "seed": 512, | |
| "difficulty": "easy", | |
| "score": 0.2712, | |
| "correct_flags": 1, | |
| "false_positives": 2, | |
| "correct_approvals": 1, | |
| "missed_errors": 5, | |
| "total_errors": 6, | |
| "actions_taken": 12, | |
| "actions_parsed": 12 | |
| }, | |
| { | |
| "seed": 1024, | |
| "difficulty": "easy", | |
| "score": 0.2791, | |
| "correct_flags": 1, | |
| "false_positives": 0, | |
| "correct_approvals": 1, | |
| "missed_errors": 5, | |
| "total_errors": 6, | |
| "actions_taken": 10, | |
| "actions_parsed": 10 | |
| }, | |
| { | |
| "seed": 42, | |
| "difficulty": "medium", | |
| "score": 0.1308, | |
| "correct_flags": 0, | |
| "false_positives": 1, | |
| "correct_approvals": 2, | |
| "missed_errors": 13, | |
| "total_errors": 13, | |
| "actions_taken": 12, | |
| "actions_parsed": 12 | |
| }, | |
| { | |
| "seed": 137, | |
| "difficulty": "medium", | |
| "score": 0.01, | |
| "correct_flags": 0, | |
| "false_positives": 0, | |
| "correct_approvals": 0, | |
| "missed_errors": 13, | |
| "total_errors": 13, | |
| "actions_taken": 0, | |
| "actions_parsed": 0 | |
| }, | |
| { | |
| "seed": 256, | |
| "difficulty": "medium", | |
| "score": 0.0923, | |
| "correct_flags": 0, | |
| "false_positives": 1, | |
| "correct_approvals": 1, | |
| "missed_errors": 13, | |
| "total_errors": 13, | |
| "actions_taken": 10, | |
| "actions_parsed": 10 | |
| }, | |
| { | |
| "seed": 512, | |
| "difficulty": "medium", | |
| "score": 0.2393, | |
| "correct_flags": 1, | |
| "false_positives": 0, | |
| "correct_approvals": 6, | |
| "missed_errors": 12, | |
| "total_errors": 13, | |
| "actions_taken": 12, | |
| "actions_parsed": 12 | |
| }, | |
| { | |
| "seed": 1024, | |
| "difficulty": "medium", | |
| "score": 0.1735, | |
| "correct_flags": 1, | |
| "false_positives": 3, | |
| "correct_approvals": 1, | |
| "missed_errors": 12, | |
| "total_errors": 13, | |
| "actions_taken": 16, | |
| "actions_parsed": 16 | |
| }, | |
| { | |
| "seed": 42, | |
| "difficulty": "hard", | |
| "score": 0.0271, | |
| "correct_flags": 0, | |
| "false_positives": 1, | |
| "correct_approvals": 0, | |
| "missed_errors": 16, | |
| "total_errors": 16, | |
| "actions_taken": 14, | |
| "actions_parsed": 14 | |
| }, | |
| { | |
| "seed": 137, | |
| "difficulty": "hard", | |
| "score": 0.0235, | |
| "correct_flags": 0, | |
| "false_positives": 1, | |
| "correct_approvals": 0, | |
| "missed_errors": 17, | |
| "total_errors": 17, | |
| "actions_taken": 13, | |
| "actions_parsed": 13 | |
| }, | |
| { | |
| "seed": 256, | |
| "difficulty": "hard", | |
| "score": 0.0262, | |
| "correct_flags": 0, | |
| "false_positives": 1, | |
| "correct_approvals": 0, | |
| "missed_errors": 14, | |
| "total_errors": 14, | |
| "actions_taken": 12, | |
| "actions_parsed": 12 | |
| }, | |
| { | |
| "seed": 512, | |
| "difficulty": "hard", | |
| "score": 0.0771, | |
| "correct_flags": 1, | |
| "false_positives": 0, | |
| "correct_approvals": 0, | |
| "missed_errors": 13, | |
| "total_errors": 14, | |
| "actions_taken": 4, | |
| "actions_parsed": 4 | |
| }, | |
| { | |
| "seed": 1024, | |
| "difficulty": "hard", | |
| "score": 0.0638, | |
| "correct_flags": 1, | |
| "false_positives": 0, | |
| "correct_approvals": 0, | |
| "missed_errors": 16, | |
| "total_errors": 17, | |
| "actions_taken": 4, | |
| "actions_parsed": 4 | |
| } | |
| ], | |
| "overall": 0.1533 | |
| }, | |
| "improvement": 283.25 | |
| } |