Spaces:
Running
Running
deploy: update reports/eval_report.md
Browse files- reports/eval_report.md +38 -38
reports/eval_report.md
CHANGED
|
@@ -1,38 +1,38 @@
|
|
| 1 |
-
# Evaluation Report
|
| 2 |
-
|
| 3 |
-
Generated at: 2026-04-25T18:12:09.069260+00:00
|
| 4 |
-
Base URL: http://localhost:7860
|
| 5 |
-
Tasks: clean_claim, contradictory_claim, coordinated_fraud, distribution_shift_claim, identity_fraud
|
| 6 |
-
Seeds: 7, 11, 13, 19, 25
|
| 7 |
-
Distinct variant_ids: [0, 1, 2, 3, 4]
|
| 8 |
-
|
| 9 |
-
| Task | Seed | Variant | Steps | Done | Reward | Evidence Quality | Exploit Penalty |
|
| 10 |
-
|---|---:|---:|---:|:---:|---:|---:|---:|
|
| 11 |
-
| clean_claim | 7 | 2 | 4 | yes | 0.8725 | 1.0000 | 0.0000 |
|
| 12 |
-
| clean_claim | 11 | 1 | 4 | yes | 0.8725 | 1.0000 | 0.0000 |
|
| 13 |
-
| clean_claim | 13 | 3 | 4 | yes | 0.8725 | 1.0000 | 0.0000 |
|
| 14 |
-
| clean_claim | 19 | 4 | 4 | yes | 0.8725 | 1.0000 | 0.0000 |
|
| 15 |
-
| clean_claim | 25 | 0 | 4 | yes | 0.8725 | 1.0000 | 0.0000 |
|
| 16 |
-
| contradictory_claim | 7 | 2 | 8 | yes | 0.7497 | 1.0000 | 0.0000 |
|
| 17 |
-
| contradictory_claim | 11 | 1 | 8 | yes | 0.7497 | 1.0000 | 0.0000 |
|
| 18 |
-
| contradictory_claim | 13 | 3 | 8 | yes | 0.7497 | 1.0000 | 0.0000 |
|
| 19 |
-
| contradictory_claim | 19 | 4 | 8 | yes | 0.7497 | 1.0000 | 0.0000 |
|
| 20 |
-
| contradictory_claim | 25 | 0 | 8 | yes | 0.7497 | 1.0000 | 0.0000 |
|
| 21 |
-
| coordinated_fraud | 7 | 2 | 12 | yes | 0.8230 | 1.0000 | 0.0000 |
|
| 22 |
-
| coordinated_fraud | 11 | 1 | 12 | yes | 0.8230 | 1.0000 | 0.0000 |
|
| 23 |
-
| coordinated_fraud | 13 | 3 | 12 | yes | 0.8230 | 1.0000 | 0.0000 |
|
| 24 |
-
| coordinated_fraud | 19 | 4 | 12 | yes | 0.8230 | 1.0000 | 0.0000 |
|
| 25 |
-
| coordinated_fraud | 25 | 0 | 12 | yes | 0.8230 | 1.0000 | 0.0000 |
|
| 26 |
-
| distribution_shift_claim | 7 | 2 | 12 | yes | 0.7827 | 1.0000 | 0.0000 |
|
| 27 |
-
| distribution_shift_claim | 11 | 1 | 12 | yes | 0.7827 | 1.0000 | 0.0000 |
|
| 28 |
-
| distribution_shift_claim | 13 | 3 | 12 | yes | 0.7827 | 1.0000 | 0.0000 |
|
| 29 |
-
| distribution_shift_claim | 19 | 4 | 12 | yes | 0.7827 | 1.0000 | 0.0000 |
|
| 30 |
-
| distribution_shift_claim | 25 | 0 | 12 | yes | 0.7827 | 1.0000 | 0.0000 |
|
| 31 |
-
| identity_fraud | 7 | 2 | 10 | yes | 0.8180 | 1.0000 | 0.0000 |
|
| 32 |
-
| identity_fraud | 11 | 1 | 10 | yes | 0.8180 | 1.0000 | 0.0000 |
|
| 33 |
-
| identity_fraud | 13 | 3 | 10 | yes | 0.8180 | 1.0000 | 0.0000 |
|
| 34 |
-
| identity_fraud | 19 | 4 | 10 | yes | 0.8180 | 1.0000 | 0.0000 |
|
| 35 |
-
| identity_fraud | 25 | 0 | 10 | yes | 0.8180 | 1.0000 | 0.0000 |
|
| 36 |
-
|
| 37 |
-
Average Reward: 0.8092
|
| 38 |
-
Completion Rate: 100.00%
|
|
|
|
| 1 |
+
# Evaluation Report
|
| 2 |
+
|
| 3 |
+
Generated at: 2026-04-25T18:12:09.069260+00:00
|
| 4 |
+
Base URL: http://localhost:7860
|
| 5 |
+
Tasks: clean_claim, contradictory_claim, coordinated_fraud, distribution_shift_claim, identity_fraud
|
| 6 |
+
Seeds: 7, 11, 13, 19, 25
|
| 7 |
+
Distinct variant_ids: [0, 1, 2, 3, 4]
|
| 8 |
+
|
| 9 |
+
| Task | Seed | Variant | Steps | Done | Reward | Evidence Quality | Exploit Penalty |
|
| 10 |
+
|---|---:|---:|---:|:---:|---:|---:|---:|
|
| 11 |
+
| clean_claim | 7 | 2 | 4 | yes | 0.8725 | 1.0000 | 0.0000 |
|
| 12 |
+
| clean_claim | 11 | 1 | 4 | yes | 0.8725 | 1.0000 | 0.0000 |
|
| 13 |
+
| clean_claim | 13 | 3 | 4 | yes | 0.8725 | 1.0000 | 0.0000 |
|
| 14 |
+
| clean_claim | 19 | 4 | 4 | yes | 0.8725 | 1.0000 | 0.0000 |
|
| 15 |
+
| clean_claim | 25 | 0 | 4 | yes | 0.8725 | 1.0000 | 0.0000 |
|
| 16 |
+
| contradictory_claim | 7 | 2 | 8 | yes | 0.7497 | 1.0000 | 0.0000 |
|
| 17 |
+
| contradictory_claim | 11 | 1 | 8 | yes | 0.7497 | 1.0000 | 0.0000 |
|
| 18 |
+
| contradictory_claim | 13 | 3 | 8 | yes | 0.7497 | 1.0000 | 0.0000 |
|
| 19 |
+
| contradictory_claim | 19 | 4 | 8 | yes | 0.7497 | 1.0000 | 0.0000 |
|
| 20 |
+
| contradictory_claim | 25 | 0 | 8 | yes | 0.7497 | 1.0000 | 0.0000 |
|
| 21 |
+
| coordinated_fraud | 7 | 2 | 12 | yes | 0.8230 | 1.0000 | 0.0000 |
|
| 22 |
+
| coordinated_fraud | 11 | 1 | 12 | yes | 0.8230 | 1.0000 | 0.0000 |
|
| 23 |
+
| coordinated_fraud | 13 | 3 | 12 | yes | 0.8230 | 1.0000 | 0.0000 |
|
| 24 |
+
| coordinated_fraud | 19 | 4 | 12 | yes | 0.8230 | 1.0000 | 0.0000 |
|
| 25 |
+
| coordinated_fraud | 25 | 0 | 12 | yes | 0.8230 | 1.0000 | 0.0000 |
|
| 26 |
+
| distribution_shift_claim | 7 | 2 | 12 | yes | 0.7827 | 1.0000 | 0.0000 |
|
| 27 |
+
| distribution_shift_claim | 11 | 1 | 12 | yes | 0.7827 | 1.0000 | 0.0000 |
|
| 28 |
+
| distribution_shift_claim | 13 | 3 | 12 | yes | 0.7827 | 1.0000 | 0.0000 |
|
| 29 |
+
| distribution_shift_claim | 19 | 4 | 12 | yes | 0.7827 | 1.0000 | 0.0000 |
|
| 30 |
+
| distribution_shift_claim | 25 | 0 | 12 | yes | 0.7827 | 1.0000 | 0.0000 |
|
| 31 |
+
| identity_fraud | 7 | 2 | 10 | yes | 0.8180 | 1.0000 | 0.0000 |
|
| 32 |
+
| identity_fraud | 11 | 1 | 10 | yes | 0.8180 | 1.0000 | 0.0000 |
|
| 33 |
+
| identity_fraud | 13 | 3 | 10 | yes | 0.8180 | 1.0000 | 0.0000 |
|
| 34 |
+
| identity_fraud | 19 | 4 | 10 | yes | 0.8180 | 1.0000 | 0.0000 |
|
| 35 |
+
| identity_fraud | 25 | 0 | 10 | yes | 0.8180 | 1.0000 | 0.0000 |
|
| 36 |
+
|
| 37 |
+
Average Reward: 0.8092
|
| 38 |
+
Completion Rate: 100.00%
|