SynthAudit-Env / outputs /post_training_eval.json
Timusgeorge's picture
Upload post_training_eval.json
e63870d verified
{
"base": {
"results": [
{
"seed": 42,
"difficulty": "easy",
"score": 0.1736,
"correct_flags": 1,
"false_positives": 0,
"correct_approvals": 0,
"missed_errors": 5,
"total_errors": 6,
"actions_taken": 4,
"actions_parsed": 4
},
{
"seed": 137,
"difficulty": "easy",
"score": 0.0167,
"correct_flags": 0,
"false_positives": 1,
"correct_approvals": 0,
"missed_errors": 6,
"total_errors": 6,
"actions_taken": 4,
"actions_parsed": 4
},
{
"seed": 256,
"difficulty": "easy",
"score": 0.0389,
"correct_flags": 0,
"false_positives": 0,
"correct_approvals": 0,
"missed_errors": 6,
"total_errors": 6,
"actions_taken": 7,
"actions_parsed": 7
},
{
"seed": 512,
"difficulty": "easy",
"score": 0.0333,
"correct_flags": 0,
"false_positives": 0,
"correct_approvals": 0,
"missed_errors": 6,
"total_errors": 6,
"actions_taken": 6,
"actions_parsed": 6
},
{
"seed": 1024,
"difficulty": "easy",
"score": 0.1736,
"correct_flags": 1,
"false_positives": 0,
"correct_approvals": 0,
"missed_errors": 5,
"total_errors": 6,
"actions_taken": 4,
"actions_parsed": 4
},
{
"seed": 42,
"difficulty": "medium",
"score": 0.0179,
"correct_flags": 0,
"false_positives": 0,
"correct_approvals": 0,
"missed_errors": 13,
"total_errors": 13,
"actions_taken": 7,
"actions_parsed": 7
},
{
"seed": 137,
"difficulty": "medium",
"score": 0.0256,
"correct_flags": 0,
"false_positives": 0,
"correct_approvals": 0,
"missed_errors": 13,
"total_errors": 13,
"actions_taken": 10,
"actions_parsed": 10
},
{
"seed": 256,
"difficulty": "medium",
"score": 0.01,
"correct_flags": 0,
"false_positives": 1,
"correct_approvals": 0,
"missed_errors": 13,
"total_errors": 13,
"actions_taken": 4,
"actions_parsed": 4
},
{
"seed": 512,
"difficulty": "medium",
"score": 0.0256,
"correct_flags": 0,
"false_positives": 0,
"correct_approvals": 0,
"missed_errors": 13,
"total_errors": 13,
"actions_taken": 10,
"actions_parsed": 10
},
{
"seed": 1024,
"difficulty": "medium",
"score": 0.01,
"correct_flags": 0,
"false_positives": 1,
"correct_approvals": 0,
"missed_errors": 13,
"total_errors": 13,
"actions_taken": 4,
"actions_parsed": 4
},
{
"seed": 42,
"difficulty": "hard",
"score": 0.01,
"correct_flags": 0,
"false_positives": 1,
"correct_approvals": 0,
"missed_errors": 16,
"total_errors": 16,
"actions_taken": 4,
"actions_parsed": 4
},
{
"seed": 137,
"difficulty": "hard",
"score": 0.01,
"correct_flags": 0,
"false_positives": 1,
"correct_approvals": 0,
"missed_errors": 17,
"total_errors": 17,
"actions_taken": 4,
"actions_parsed": 4
},
{
"seed": 256,
"difficulty": "hard",
"score": 0.01,
"correct_flags": 0,
"false_positives": 1,
"correct_approvals": 0,
"missed_errors": 14,
"total_errors": 14,
"actions_taken": 4,
"actions_parsed": 4
},
{
"seed": 512,
"difficulty": "hard",
"score": 0.0214,
"correct_flags": 0,
"false_positives": 0,
"correct_approvals": 0,
"missed_errors": 14,
"total_errors": 14,
"actions_taken": 9,
"actions_parsed": 9
},
{
"seed": 1024,
"difficulty": "hard",
"score": 0.0235,
"correct_flags": 0,
"false_positives": 0,
"correct_approvals": 0,
"missed_errors": 17,
"total_errors": 17,
"actions_taken": 12,
"actions_parsed": 12
}
],
"overall": 0.04
},
"trained": {
"results": [
{
"seed": 42,
"difficulty": "easy",
"score": 0.2958,
"correct_flags": 1,
"false_positives": 0,
"correct_approvals": 2,
"missed_errors": 5,
"total_errors": 6,
"actions_taken": 7,
"actions_parsed": 7
},
{
"seed": 137,
"difficulty": "easy",
"score": 0.25,
"correct_flags": 0,
"false_positives": 1,
"correct_approvals": 2,
"missed_errors": 6,
"total_errors": 6,
"actions_taken": 9,
"actions_parsed": 9
},
{
"seed": 256,
"difficulty": "easy",
"score": 0.3402,
"correct_flags": 1,
"false_positives": 0,
"correct_approvals": 4,
"missed_errors": 5,
"total_errors": 6,
"actions_taken": 9,
"actions_parsed": 9
},
{
"seed": 512,
"difficulty": "easy",
"score": 0.2712,
"correct_flags": 1,
"false_positives": 2,
"correct_approvals": 1,
"missed_errors": 5,
"total_errors": 6,
"actions_taken": 12,
"actions_parsed": 12
},
{
"seed": 1024,
"difficulty": "easy",
"score": 0.2791,
"correct_flags": 1,
"false_positives": 0,
"correct_approvals": 1,
"missed_errors": 5,
"total_errors": 6,
"actions_taken": 10,
"actions_parsed": 10
},
{
"seed": 42,
"difficulty": "medium",
"score": 0.1308,
"correct_flags": 0,
"false_positives": 1,
"correct_approvals": 2,
"missed_errors": 13,
"total_errors": 13,
"actions_taken": 12,
"actions_parsed": 12
},
{
"seed": 137,
"difficulty": "medium",
"score": 0.01,
"correct_flags": 0,
"false_positives": 0,
"correct_approvals": 0,
"missed_errors": 13,
"total_errors": 13,
"actions_taken": 0,
"actions_parsed": 0
},
{
"seed": 256,
"difficulty": "medium",
"score": 0.0923,
"correct_flags": 0,
"false_positives": 1,
"correct_approvals": 1,
"missed_errors": 13,
"total_errors": 13,
"actions_taken": 10,
"actions_parsed": 10
},
{
"seed": 512,
"difficulty": "medium",
"score": 0.2393,
"correct_flags": 1,
"false_positives": 0,
"correct_approvals": 6,
"missed_errors": 12,
"total_errors": 13,
"actions_taken": 12,
"actions_parsed": 12
},
{
"seed": 1024,
"difficulty": "medium",
"score": 0.1735,
"correct_flags": 1,
"false_positives": 3,
"correct_approvals": 1,
"missed_errors": 12,
"total_errors": 13,
"actions_taken": 16,
"actions_parsed": 16
},
{
"seed": 42,
"difficulty": "hard",
"score": 0.0271,
"correct_flags": 0,
"false_positives": 1,
"correct_approvals": 0,
"missed_errors": 16,
"total_errors": 16,
"actions_taken": 14,
"actions_parsed": 14
},
{
"seed": 137,
"difficulty": "hard",
"score": 0.0235,
"correct_flags": 0,
"false_positives": 1,
"correct_approvals": 0,
"missed_errors": 17,
"total_errors": 17,
"actions_taken": 13,
"actions_parsed": 13
},
{
"seed": 256,
"difficulty": "hard",
"score": 0.0262,
"correct_flags": 0,
"false_positives": 1,
"correct_approvals": 0,
"missed_errors": 14,
"total_errors": 14,
"actions_taken": 12,
"actions_parsed": 12
},
{
"seed": 512,
"difficulty": "hard",
"score": 0.0771,
"correct_flags": 1,
"false_positives": 0,
"correct_approvals": 0,
"missed_errors": 13,
"total_errors": 14,
"actions_taken": 4,
"actions_parsed": 4
},
{
"seed": 1024,
"difficulty": "hard",
"score": 0.0638,
"correct_flags": 1,
"false_positives": 0,
"correct_approvals": 0,
"missed_errors": 16,
"total_errors": 17,
"actions_taken": 4,
"actions_parsed": 4
}
],
"overall": 0.1533
},
"improvement": 283.25
}