Agnuxo commited on
Commit
82cbcff
·
verified ·
1 Parent(s): e2c341c

Add harness_best.json

Browse files
Files changed (1) hide show
  1. harness_best.json +496 -0
harness_best.json ADDED
@@ -0,0 +1,496 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_score": 6.2,
3
+ "best_run": 78,
4
+ "best_result": {
5
+ "run_id": 78,
6
+ "model": "cajal-4b-f16",
7
+ "topic": "zkSNARK-Proven Correctness of Leader Rotation in Permissionless Consensus",
8
+ "score": 6.2,
9
+ "scores": {
10
+ "sections": {
11
+ "abstract": 5.5,
12
+ "introduction": 5.1,
13
+ "methodology": 4.1,
14
+ "results": 3.3,
15
+ "discussion": 3.6,
16
+ "conclusion": 4.3,
17
+ "references": 5.1
18
+ },
19
+ "overall": 6.2,
20
+ "novelty": 5.6,
21
+ "reproducibility": 5,
22
+ "citation_quality": 5.3,
23
+ "judges": [
24
+ "Cerebras-Qwen235B",
25
+ "Cerebras-Llama8B",
26
+ "Mistral",
27
+ "Sarvam",
28
+ "NVIDIA",
29
+ "Inception-Mercury2",
30
+ "Cohere-CommandA",
31
+ "Cloudflare-Qwen3",
32
+ "NVIDIA-Devstral",
33
+ "Cohere-Command-A",
34
+ "Cohere-R7B",
35
+ "Mistral-Medium",
36
+ "Mistral-Large",
37
+ "Sarvam-KeyVariant-2",
38
+ "Sarvam-KeyVariant-3",
39
+ "OpenRouter-GPT-OSS-Free",
40
+ "Cohere-Aya-Expanse",
41
+ "Inception-Mercury2-Key2",
42
+ "Cerebras-Qwen235B-Key2"
43
+ ],
44
+ "judge_count": 19,
45
+ "judge_details": [
46
+ {
47
+ "judge": "Cerebras-Qwen235B",
48
+ "scores": {
49
+ "abstract": 7,
50
+ "introduction": 7,
51
+ "methodology": 5,
52
+ "results": 4,
53
+ "discussion": 5,
54
+ "conclusion": 6,
55
+ "references": 3,
56
+ "novelty": 5,
57
+ "reproducibility": 4,
58
+ "citation_quality": 3
59
+ },
60
+ "feedback": null
61
+ },
62
+ {
63
+ "judge": "Cerebras-Llama8B",
64
+ "scores": {
65
+ "abstract": 8,
66
+ "introduction": 8,
67
+ "methodology": 6,
68
+ "results": 7,
69
+ "discussion": 6,
70
+ "conclusion": 7,
71
+ "references": 9,
72
+ "novelty": 8,
73
+ "reproducibility": 8,
74
+ "citation_quality": 9
75
+ },
76
+ "feedback": null
77
+ },
78
+ {
79
+ "judge": "Mistral",
80
+ "scores": {
81
+ "abstract": 7,
82
+ "introduction": 6,
83
+ "methodology": 4,
84
+ "results": 4,
85
+ "discussion": 5,
86
+ "conclusion": 5,
87
+ "references": 5,
88
+ "novelty": 6,
89
+ "reproducibility": 3,
90
+ "citation_quality": 4
91
+ },
92
+ "feedback": null
93
+ },
94
+ {
95
+ "judge": "Sarvam",
96
+ "scores": {
97
+ "abstract": 7,
98
+ "introduction": 6,
99
+ "methodology": 5,
100
+ "results": 4,
101
+ "discussion": 5,
102
+ "conclusion": 7,
103
+ "references": 4,
104
+ "novelty": 7,
105
+ "reproducibility": 3,
106
+ "citation_quality": 5
107
+ },
108
+ "feedback": null
109
+ },
110
+ {
111
+ "judge": "NVIDIA",
112
+ "scores": {
113
+ "abstract": 8,
114
+ "introduction": 9,
115
+ "methodology": 8,
116
+ "results": 7,
117
+ "discussion": 8,
118
+ "conclusion": 8,
119
+ "references": 9,
120
+ "novelty": 7,
121
+ "reproducibility": 8,
122
+ "citation_quality": 9
123
+ },
124
+ "feedback": null
125
+ },
126
+ {
127
+ "judge": "Inception-Mercury2",
128
+ "scores": {
129
+ "abstract": 6,
130
+ "introduction": 6,
131
+ "methodology": 5,
132
+ "results": 4,
133
+ "discussion": 5,
134
+ "conclusion": 6,
135
+ "references": 4,
136
+ "novelty": 4,
137
+ "reproducibility": 3,
138
+ "citation_quality": 4
139
+ },
140
+ "feedback": null
141
+ },
142
+ {
143
+ "judge": "Cohere-CommandA",
144
+ "scores": {
145
+ "abstract": 8,
146
+ "introduction": 9,
147
+ "methodology": 8,
148
+ "results": 4,
149
+ "discussion": 7,
150
+ "conclusion": 7,
151
+ "references": 7,
152
+ "novelty": 7,
153
+ "reproducibility": 9,
154
+ "citation_quality": 8
155
+ },
156
+ "feedback": null
157
+ },
158
+ {
159
+ "judge": "Cloudflare-Qwen3",
160
+ "scores": {
161
+ "abstract": 6,
162
+ "introduction": 5,
163
+ "methodology": 5,
164
+ "results": 4,
165
+ "discussion": 5,
166
+ "conclusion": 5,
167
+ "references": 5,
168
+ "novelty": 5,
169
+ "reproducibility": 5,
170
+ "citation_quality": 4
171
+ },
172
+ "feedback": null
173
+ },
174
+ {
175
+ "judge": "NVIDIA-Devstral",
176
+ "scores": {
177
+ "abstract": 8,
178
+ "introduction": 7,
179
+ "methodology": 6,
180
+ "results": 5,
181
+ "discussion": 5,
182
+ "conclusion": 6,
183
+ "references": 6,
184
+ "novelty": 7,
185
+ "reproducibility": 7,
186
+ "citation_quality": 6
187
+ },
188
+ "feedback": null
189
+ },
190
+ {
191
+ "judge": "Cohere-Command-A",
192
+ "scores": {
193
+ "abstract": 8,
194
+ "introduction": 7,
195
+ "methodology": 6,
196
+ "results": 5,
197
+ "discussion": 6,
198
+ "conclusion": 7,
199
+ "references": 7,
200
+ "novelty": 7,
201
+ "reproducibility": 6,
202
+ "citation_quality": 8
203
+ },
204
+ "feedback": null
205
+ },
206
+ {
207
+ "judge": "Cohere-R7B",
208
+ "scores": {
209
+ "abstract": 8,
210
+ "introduction": 8,
211
+ "methodology": 7,
212
+ "results": 7,
213
+ "discussion": 7,
214
+ "conclusion": 7,
215
+ "references": 8,
216
+ "novelty": 8,
217
+ "reproducibility": 8,
218
+ "citation_quality": 8
219
+ },
220
+ "feedback": null
221
+ },
222
+ {
223
+ "judge": "Mistral-Medium",
224
+ "scores": {
225
+ "abstract": 9,
226
+ "introduction": 8,
227
+ "methodology": 6,
228
+ "results": 5,
229
+ "discussion": 4,
230
+ "conclusion": 5,
231
+ "references": 5,
232
+ "novelty": 7,
233
+ "reproducibility": 6,
234
+ "citation_quality": 6
235
+ },
236
+ "feedback": null
237
+ },
238
+ {
239
+ "judge": "Mistral-Large",
240
+ "scores": {
241
+ "abstract": 7,
242
+ "introduction": 6,
243
+ "methodology": 5,
244
+ "results": 4,
245
+ "discussion": 3,
246
+ "conclusion": 5,
247
+ "references": 6,
248
+ "novelty": 6,
249
+ "reproducibility": 5,
250
+ "citation_quality": 7
251
+ },
252
+ "feedback": null
253
+ },
254
+ {
255
+ "judge": "Sarvam-KeyVariant-2",
256
+ "scores": {
257
+ "abstract": 7,
258
+ "introduction": 6,
259
+ "methodology": 4,
260
+ "results": 3,
261
+ "discussion": 3,
262
+ "conclusion": 5,
263
+ "references": 1,
264
+ "novelty": 6,
265
+ "reproducibility": 3,
266
+ "citation_quality": 1
267
+ },
268
+ "feedback": null
269
+ },
270
+ {
271
+ "judge": "Sarvam-KeyVariant-3",
272
+ "scores": {
273
+ "abstract": 7,
274
+ "introduction": 6,
275
+ "methodology": 5,
276
+ "results": 4,
277
+ "discussion": 5,
278
+ "conclusion": 7,
279
+ "references": 4,
280
+ "novelty": 7,
281
+ "reproducibility": 3,
282
+ "citation_quality": 5
283
+ },
284
+ "feedback": null
285
+ },
286
+ {
287
+ "judge": "OpenRouter-GPT-OSS-Free",
288
+ "scores": {
289
+ "abstract": 5,
290
+ "introduction": 5,
291
+ "methodology": 3,
292
+ "results": 3,
293
+ "discussion": 3,
294
+ "conclusion": 4,
295
+ "references": 5,
296
+ "novelty": 4,
297
+ "reproducibility": 3,
298
+ "citation_quality": 5
299
+ },
300
+ "feedback": null
301
+ },
302
+ {
303
+ "judge": "Cohere-Aya-Expanse",
304
+ "scores": {
305
+ "abstract": 9,
306
+ "introduction": 8,
307
+ "methodology": 7,
308
+ "results": 6,
309
+ "discussion": 7,
310
+ "conclusion": 8,
311
+ "references": 9,
312
+ "novelty": 6,
313
+ "reproducibility": 7,
314
+ "citation_quality": 9
315
+ },
316
+ "feedback": null
317
+ },
318
+ {
319
+ "judge": "Inception-Mercury2-Key2",
320
+ "scores": {
321
+ "abstract": 7,
322
+ "introduction": 6,
323
+ "methodology": 5,
324
+ "results": 3,
325
+ "discussion": 0,
326
+ "conclusion": 0,
327
+ "references": 7,
328
+ "novelty": 5,
329
+ "reproducibility": 3,
330
+ "citation_quality": 6
331
+ },
332
+ "feedback": null
333
+ },
334
+ {
335
+ "judge": "Cerebras-Qwen235B-Key2",
336
+ "scores": {
337
+ "abstract": 7,
338
+ "introduction": 7,
339
+ "methodology": 5,
340
+ "results": 4,
341
+ "discussion": 5,
342
+ "conclusion": 6,
343
+ "references": 3,
344
+ "novelty": 5,
345
+ "reproducibility": 4,
346
+ "citation_quality": 3
347
+ },
348
+ "feedback": null
349
+ }
350
+ ],
351
+ "consensus": {
352
+ "abstract": 0.8,
353
+ "introduction": 0.76,
354
+ "methodology": 0.75,
355
+ "results": 0.75,
356
+ "discussion": 0.64,
357
+ "conclusion": 0.65,
358
+ "references": 0.56,
359
+ "novelty": 0.76,
360
+ "reproducibility": 0.58,
361
+ "citation_quality": 0.55
362
+ },
363
+ "overall_consensus": 0.68,
364
+ "feedback": null,
365
+ "scored_at": "2026-05-07T13:18:06.650Z",
366
+ "paper_type": "research",
367
+ "calibration": {
368
+ "field": "cs-distributed",
369
+ "field_confidence": 1,
370
+ "signals_summary": {
371
+ "word_count": 5971,
372
+ "sections_present": 7,
373
+ "sections_missing": [],
374
+ "red_flags": [
375
+ "low_vocabulary_diversity",
376
+ "excessive_repetition_ratio_0.277"
377
+ ],
378
+ "red_flag_count": 2,
379
+ "has_formal_proofs": true,
380
+ "has_equations": true,
381
+ "has_code": true,
382
+ "unique_refs": 17,
383
+ "has_placeholder_refs": false,
384
+ "depth_score": 1,
385
+ "evidence_markers": 11,
386
+ "deception_count": 0,
387
+ "deception_matches": [],
388
+ "grammar": {
389
+ "vocabulary_diversity": 0.234,
390
+ "is_monotone": false,
391
+ "is_low_vocabulary": true
392
+ },
393
+ "repetition_ratio": 0.277,
394
+ "code_quality": {
395
+ "blocks": 6,
396
+ "has_real_code": false,
397
+ "has_python": true
398
+ },
399
+ "math_formulas": 11,
400
+ "lean4": "none",
401
+ "tables": 0
402
+ },
403
+ "adjustments": {
404
+ "abstract": [
405
+ "red_flag_penalty: -3 (low_vocabulary_diversity, excessive_repetition_ratio_0.277, code_blocks_are_template_not_real)",
406
+ "llm_inflation_correction: 4.3 -> 4"
407
+ ],
408
+ "introduction": [
409
+ "red_flag_penalty: -3 (low_vocabulary_diversity, excessive_repetition_ratio_0.277, code_blocks_are_template_not_real)",
410
+ "llm_inflation_correction: 3.8 -> 3.6"
411
+ ],
412
+ "methodology": [
413
+ "red_flag_penalty: -3 (low_vocabulary_diversity, excessive_repetition_ratio_0.277, code_blocks_are_template_not_real)",
414
+ "llm_inflation_correction: 2.5 -> 2.6"
415
+ ],
416
+ "results": [
417
+ "red_flag_penalty: -3 (low_vocabulary_diversity, excessive_repetition_ratio_0.277, code_blocks_are_template_not_real)",
418
+ "llm_inflation_correction: 1.6 -> 1.8"
419
+ ],
420
+ "discussion": [
421
+ "red_flag_penalty: -3 (low_vocabulary_diversity, excessive_repetition_ratio_0.277, code_blocks_are_template_not_real)",
422
+ "llm_inflation_correction: 1.9 -> 2.1"
423
+ ],
424
+ "conclusion": [
425
+ "red_flag_penalty: -3 (low_vocabulary_diversity, excessive_repetition_ratio_0.277, code_blocks_are_template_not_real)"
426
+ ],
427
+ "references": [
428
+ "red_flag_penalty: -3 (low_vocabulary_diversity, excessive_repetition_ratio_0.277, code_blocks_are_template_not_real)"
429
+ ],
430
+ "novelty": [
431
+ "red_flag_penalty: -3 (low_vocabulary_diversity, excessive_repetition_ratio_0.277, code_blocks_are_template_not_real)",
432
+ "llm_inflation_correction: 3.2 -> 3.1"
433
+ ],
434
+ "reproducibility": [
435
+ "red_flag_penalty: -3 (low_vocabulary_diversity, excessive_repetition_ratio_0.277, code_blocks_are_template_not_real)",
436
+ "llm_inflation_correction: 2.2 -> 2.3"
437
+ ],
438
+ "citation_quality": [
439
+ "red_flag_penalty: -3 (low_vocabulary_diversity, excessive_repetition_ratio_0.277, code_blocks_are_template_not_real)"
440
+ ]
441
+ },
442
+ "adjustment_count": 10,
443
+ "reference_papers": [
444
+ "The Byzantine Generals Problem",
445
+ "Bitcoin: A Peer-to-Peer Electronic Cash System",
446
+ "In Search of an Understandable Consensus Algorithm"
447
+ ],
448
+ "false_positive_corrected": "code_blocks_are_template_not_real (live verification confirmed code executes)"
449
+ },
450
+ "live_verification": {
451
+ "verification_time_ms": 18217,
452
+ "citations": {
453
+ "total": 8,
454
+ "verified": 7,
455
+ "verification_rate": 88
456
+ },
457
+ "novelty": {
458
+ "searched": true,
459
+ "total_found": 5,
460
+ "novelty_concern": "low",
461
+ "max_similarity": 33
462
+ },
463
+ "code_execution": {
464
+ "total": 5,
465
+ "passed": 3,
466
+ "failed": 0
467
+ },
468
+ "lean4": {
469
+ "blocks_found": 0,
470
+ "verified": 0,
471
+ "has_unsubstantiated_claim": true
472
+ },
473
+ "adjustments": {
474
+ "reproducibility": "claims_formal_verification_without_lean4_code: cap at 3",
475
+ "reproducibility_cap": 3
476
+ },
477
+ "bonuses": {
478
+ "references": "crossref_verified_7/8(88%): +1 bonus",
479
+ "references_bonus": 1,
480
+ "citation_quality": "crossref_high_rate: +1 bonus",
481
+ "citation_quality_bonus": 1,
482
+ "novelty": "arxiv_no_similar_papers: +1 novelty bonus",
483
+ "novelty_bonus": 1,
484
+ "reproducibility": "code_executed_3/5_passed: +2 reproducibility bonus",
485
+ "reproducibility_bonus": 2,
486
+ "execution_proof_bonus": 1.5,
487
+ "execution_proof_note": "3 code block(s) executed successfully: +1.5 overall bonus (capped at 1.5)"
488
+ }
489
+ }
490
+ },
491
+ "words": 5915,
492
+ "paper_id": "paper-1778159681193",
493
+ "ts": "2026-05-07T15:18:12.388195"
494
+ },
495
+ "ts": "2026-05-07T15:18:12.389185"
496
+ }