{
"best_score": 6.2,
"best_run": 78,
"best_result": {
"run_id": 78,
"model": "cajal-4b-f16",
"topic": "zkSNARK-Proven Correctness of Leader Rotation in Permissionless Consensus",
"score": 6.2,
"scores": {
"sections": {
"abstract": 5.5,
"introduction": 5.1,
"methodology": 4.1,
"results": 3.3,
"discussion": 3.6,
"conclusion": 4.3,
"references": 5.1
},
"overall": 6.2,
"novelty": 5.6,
"reproducibility": 5,
"citation_quality": 5.3,
"judges": [
"Cerebras-Qwen235B",
"Cerebras-Llama8B",
"Mistral",
"Sarvam",
"NVIDIA",
"Inception-Mercury2",
"Cohere-CommandA",
"Cloudflare-Qwen3",
"NVIDIA-Devstral",
"Cohere-Command-A",
"Cohere-R7B",
"Mistral-Medium",
"Mistral-Large",
"Sarvam-KeyVariant-2",
"Sarvam-KeyVariant-3",
"OpenRouter-GPT-OSS-Free",
"Cohere-Aya-Expanse",
"Inception-Mercury2-Key2",
"Cerebras-Qwen235B-Key2"
],
"judge_count": 19,
"judge_details": [
{
"judge": "Cerebras-Qwen235B",
"scores": {
"abstract": 7,
"introduction": 7,
"methodology": 5,
"results": 4,
"discussion": 5,
"conclusion": 6,
"references": 3,
"novelty": 5,
"reproducibility": 4,
"citation_quality": 3
},
"feedback": null
},
{
"judge": "Cerebras-Llama8B",
"scores": {
"abstract": 8,
"introduction": 8,
"methodology": 6,
"results": 7,
"discussion": 6,
"conclusion": 7,
"references": 9,
"novelty": 8,
"reproducibility": 8,
"citation_quality": 9
},
"feedback": null
},
{
"judge": "Mistral",
"scores": {
"abstract": 7,
"introduction": 6,
"methodology": 4,
"results": 4,
"discussion": 5,
"conclusion": 5,
"references": 5,
"novelty": 6,
"reproducibility": 3,
"citation_quality": 4
},
"feedback": null
},
{
"judge": "Sarvam",
"scores": {
"abstract": 7,
"introduction": 6,
"methodology": 5,
"results": 4,
"discussion": 5,
"conclusion": 7,
"references": 4,
"novelty": 7,
"reproducibility": 3,
"citation_quality": 5
},
"feedback": null
},
{
"judge": "NVIDIA",
"scores": {
"abstract": 8,
"introduction": 9,
"methodology": 8,
"results": 7,
"discussion": 8,
"conclusion": 8,
"references": 9,
"novelty": 7,
"reproducibility": 8,
"citation_quality": 9
},
"feedback": null
},
{
"judge": "Inception-Mercury2",
"scores": {
"abstract": 6,
"introduction": 6,
"methodology": 5,
"results": 4,
"discussion": 5,
"conclusion": 6,
"references": 4,
"novelty": 4,
"reproducibility": 3,
"citation_quality": 4
},
"feedback": null
},
{
"judge": "Cohere-CommandA",
"scores": {
"abstract": 8,
"introduction": 9,
"methodology": 8,
"results": 4,
"discussion": 7,
"conclusion": 7,
"references": 7,
"novelty": 7,
"reproducibility": 9,
"citation_quality": 8
},
"feedback": null
},
{
"judge": "Cloudflare-Qwen3",
"scores": {
"abstract": 6,
"introduction": 5,
"methodology": 5,
"results": 4,
"discussion": 5,
"conclusion": 5,
"references": 5,
"novelty": 5,
"reproducibility": 5,
"citation_quality": 4
},
"feedback": null
},
{
"judge": "NVIDIA-Devstral",
"scores": {
"abstract": 8,
"introduction": 7,
"methodology": 6,
"results": 5,
"discussion": 5,
"conclusion": 6,
"references": 6,
"novelty": 7,
"reproducibility": 7,
"citation_quality": 6
},
"feedback": null
},
{
"judge": "Cohere-Command-A",
"scores": {
"abstract": 8,
"introduction": 7,
"methodology": 6,
"results": 5,
"discussion": 6,
"conclusion": 7,
"references": 7,
"novelty": 7,
"reproducibility": 6,
"citation_quality": 8
},
"feedback": null
},
{
"judge": "Cohere-R7B",
"scores": {
"abstract": 8,
"introduction": 8,
"methodology": 7,
"results": 7,
"discussion": 7,
"conclusion": 7,
"references": 8,
"novelty": 8,
"reproducibility": 8,
"citation_quality": 8
},
"feedback": null
},
{
"judge": "Mistral-Medium",
"scores": {
"abstract": 9,
"introduction": 8,
"methodology": 6,
"results": 5,
"discussion": 4,
"conclusion": 5,
"references": 5,
"novelty": 7,
"reproducibility": 6,
"citation_quality": 6
},
"feedback": null
},
{
"judge": "Mistral-Large",
"scores": {
"abstract": 7,
"introduction": 6,
"methodology": 5,
"results": 4,
"discussion": 3,
"conclusion": 5,
"references": 6,
"novelty": 6,
"reproducibility": 5,
"citation_quality": 7
},
"feedback": null
},
{
"judge": "Sarvam-KeyVariant-2",
"scores": {
"abstract": 7,
"introduction": 6,
"methodology": 4,
"results": 3,
"discussion": 3,
"conclusion": 5,
"references": 1,
"novelty": 6,
"reproducibility": 3,
"citation_quality": 1
},
"feedback": null
},
{
"judge": "Sarvam-KeyVariant-3",
"scores": {
"abstract": 7,
"introduction": 6,
"methodology": 5,
"results": 4,
"discussion": 5,
"conclusion": 7,
"references": 4,
"novelty": 7,
"reproducibility": 3,
"citation_quality": 5
},
"feedback": null
},
{
"judge": "OpenRouter-GPT-OSS-Free",
"scores": {
"abstract": 5,
"introduction": 5,
"methodology": 3,
"results": 3,
"discussion": 3,
"conclusion": 4,
"references": 5,
"novelty": 4,
"reproducibility": 3,
"citation_quality": 5
},
"feedback": null
},
{
"judge": "Cohere-Aya-Expanse",
"scores": {
"abstract": 9,
"introduction": 8,
"methodology": 7,
"results": 6,
"discussion": 7,
"conclusion": 8,
"references": 9,
"novelty": 6,
"reproducibility": 7,
"citation_quality": 9
},
"feedback": null
},
{
"judge": "Inception-Mercury2-Key2",
"scores": {
"abstract": 7,
"introduction": 6,
"methodology": 5,
"results": 3,
"discussion": 0,
"conclusion": 0,
"references": 7,
"novelty": 5,
"reproducibility": 3,
"citation_quality": 6
},
"feedback": null
},
{
"judge": "Cerebras-Qwen235B-Key2",
"scores": {
"abstract": 7,
"introduction": 7,
"methodology": 5,
"results": 4,
"discussion": 5,
"conclusion": 6,
"references": 3,
"novelty": 5,
"reproducibility": 4,
"citation_quality": 3
},
"feedback": null
}
],
"consensus": {
"abstract": 0.8,
"introduction": 0.76,
"methodology": 0.75,
"results": 0.75,
"discussion": 0.64,
"conclusion": 0.65,
"references": 0.56,
"novelty": 0.76,
"reproducibility": 0.58,
"citation_quality": 0.55
},
"overall_consensus": 0.68,
"feedback": null,
"scored_at": "2026-05-07T13:18:06.650Z",
"paper_type": "research",
"calibration": {
"field": "cs-distributed",
"field_confidence": 1,
"signals_summary": {
"word_count": 5971,
"sections_present": 7,
"sections_missing": [],
"red_flags": [
"low_vocabulary_diversity",
"excessive_repetition_ratio_0.277"
],
"red_flag_count": 2,
"has_formal_proofs": true,
"has_equations": true,
"has_code": true,
"unique_refs": 17,
"has_placeholder_refs": false,
"depth_score": 1,
"evidence_markers": 11,
"deception_count": 0,
"deception_matches": [],
"grammar": {
"vocabulary_diversity": 0.234,
"is_monotone": false,
"is_low_vocabulary": true
},
"repetition_ratio": 0.277,
"code_quality": {
"blocks": 6,
"has_real_code": false,
"has_python": true
},
"math_formulas": 11,
"lean4": "none",
"tables": 0
},
"adjustments": {
"abstract": [
"red_flag_penalty: -3 (low_vocabulary_diversity, excessive_repetition_ratio_0.277, code_blocks_are_template_not_real)",
"llm_inflation_correction: 4.3 -> 4"
],
"introduction": [
"red_flag_penalty: -3 (low_vocabulary_diversity, excessive_repetition_ratio_0.277, code_blocks_are_template_not_real)",
"llm_inflation_correction: 3.8 -> 3.6"
],
"methodology": [
"red_flag_penalty: -3 (low_vocabulary_diversity, excessive_repetition_ratio_0.277, code_blocks_are_template_not_real)",
"llm_inflation_correction: 2.5 -> 2.6"
],
"results": [
"red_flag_penalty: -3 (low_vocabulary_diversity, excessive_repetition_ratio_0.277, code_blocks_are_template_not_real)",
"llm_inflation_correction: 1.6 -> 1.8"
],
"discussion": [
"red_flag_penalty: -3 (low_vocabulary_diversity, excessive_repetition_ratio_0.277, code_blocks_are_template_not_real)",
"llm_inflation_correction: 1.9 -> 2.1"
],
"conclusion": [
"red_flag_penalty: -3 (low_vocabulary_diversity, excessive_repetition_ratio_0.277, code_blocks_are_template_not_real)"
],
"references": [
"red_flag_penalty: -3 (low_vocabulary_diversity, excessive_repetition_ratio_0.277, code_blocks_are_template_not_real)"
],
"novelty": [
"red_flag_penalty: -3 (low_vocabulary_diversity, excessive_repetition_ratio_0.277, code_blocks_are_template_not_real)",
"llm_inflation_correction: 3.2 -> 3.1"
],
"reproducibility": [
"red_flag_penalty: -3 (low_vocabulary_diversity, excessive_repetition_ratio_0.277, code_blocks_are_template_not_real)",
"llm_inflation_correction: 2.2 -> 2.3"
],
"citation_quality": [
"red_flag_penalty: -3 (low_vocabulary_diversity, excessive_repetition_ratio_0.277, code_blocks_are_template_not_real)"
]
},
"adjustment_count": 10,
"reference_papers": [
"The Byzantine Generals Problem",
"Bitcoin: A Peer-to-Peer Electronic Cash System",
"In Search of an Understandable Consensus Algorithm"
],
"false_positive_corrected": "code_blocks_are_template_not_real (live verification confirmed code executes)"
},
"live_verification": {
"verification_time_ms": 18217,
"citations": {
"total": 8,
"verified": 7,
"verification_rate": 88
},
"novelty": {
"searched": true,
"total_found": 5,
"novelty_concern": "low",
"max_similarity": 33
},
"code_execution": {
"total": 5,
"passed": 3,
"failed": 0
},
"lean4": {
"blocks_found": 0,
"verified": 0,
"has_unsubstantiated_claim": true
},
"adjustments": {
"reproducibility": "claims_formal_verification_without_lean4_code: cap at 3",
"reproducibility_cap": 3
},
"bonuses": {
"references": "crossref_verified_7/8(88%): +1 bonus",
"references_bonus": 1,
"citation_quality": "crossref_high_rate: +1 bonus",
"citation_quality_bonus": 1,
"novelty": "arxiv_no_similar_papers: +1 novelty bonus",
"novelty_bonus": 1,
"reproducibility": "code_executed_3/5_passed: +2 reproducibility bonus",
"reproducibility_bonus": 2,
"execution_proof_bonus": 1.5,
"execution_proof_note": "3 code block(s) executed successfully: +1.5 overall bonus (capped at 1.5)"
}
}
},
"words": 5915,
"paper_id": "paper-1778159681193",
"ts": "2026-05-07T15:18:12.388195"
},
"ts": "2026-05-07T15:18:12.389185"
}