Add generate_dataset.py
Browse files- generate_dataset.py +814 -0
generate_dataset.py
ADDED
|
@@ -0,0 +1,814 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PhD Research OS — Synthetic Training Dataset Generator
|
| 3 |
+
=====================================================
|
| 4 |
+
Generates multi-task SFT dataset in TRL conversational format for:
|
| 5 |
+
Task 1: Scientific Claim Extraction (text → structured claims JSON)
|
| 6 |
+
Task 2: Epistemic Classification (Fact / Interpretation / Hypothesis / Conflict_Hypothesis)
|
| 7 |
+
Task 3: Confidence Scoring (evidence_strength × study_quality × journal_tier × completeness)
|
| 8 |
+
Task 4: Contradiction Detection (claim pair → conflict analysis)
|
| 9 |
+
Task 5: Query Decomposition (broad question → sub-queries)
|
| 10 |
+
Task 6: Decision Object Generation (gaps + goals → proposed actions with info gain)
|
| 11 |
+
|
| 12 |
+
Output: HF Dataset with "messages" column in conversational ChatML format.
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
import json
|
| 16 |
+
import random
|
| 17 |
+
import hashlib
|
| 18 |
+
from datasets import Dataset, DatasetDict
|
| 19 |
+
|
| 20 |
+
# Seed the module-level RNG with a fixed value so that every random draw made
# by the generators below is repeatable from run to run.
random.seed(42)
|
| 21 |
+
|
| 22 |
+
# ============================================================
|
| 23 |
+
# SYSTEM PROMPTS (one per task — stored in /config/prompts/)
|
| 24 |
+
# ============================================================
|
| 25 |
+
|
| 26 |
+
SYSTEM_CLAIM_EXTRACTION = """You are the Researcher Agent of a PhD Research OS. Your role is to extract structured scientific claims from research paper text.
|
| 27 |
+
|
| 28 |
+
For each claim, output a JSON object with these fields:
|
| 29 |
+
- claim_id: string (CLM_XXXX format)
|
| 30 |
+
- text: the claim text as stated in the paper
|
| 31 |
+
- epistemic_tag: one of "Fact", "Interpretation", "Hypothesis", "Conflict_Hypothesis"
|
| 32 |
+
- confidence: float [0,1] computed as evidence_strength × study_quality_weight × journal_tier_weight × completeness_penalty
|
| 33 |
+
- evidence_strength: float [0,1] based on directness of evidence
|
| 34 |
+
- study_type: one of "primary_experimental", "in_vitro", "simulation", "review_non_systematic", "meta_analysis", "case_study"
|
| 35 |
+
- missing_fields: list of field names that could not be determined from the text
|
| 36 |
+
- status: "Complete" if no missing fields, else "Incomplete"
|
| 37 |
+
- parameters: dict of key experimental parameters mentioned (concentrations, temperatures, etc.)
|
| 38 |
+
|
| 39 |
+
Output must be valid JSON: {"claims": [...]}
|
| 40 |
+
Always classify epistemic tags conservatively. When uncertain, prefer "Interpretation" over "Fact"."""
|
| 41 |
+
|
| 42 |
+
SYSTEM_EPISTEMIC_CLASSIFIER = """You are the Epistemic Classifier of a PhD Research OS. Given a scientific statement, classify it into exactly one category:
|
| 43 |
+
|
| 44 |
+
- Fact: Directly supported by experimental data with quantitative evidence. Reproducible measurements.
|
| 45 |
+
- Interpretation: Author's explanation of data. Goes beyond what the numbers strictly show. Often uses words like "suggests", "indicates", "consistent with".
|
| 46 |
+
- Hypothesis: Proposed mechanism or prediction not yet tested. Uses "may", "could", "we propose", "it is possible".
|
| 47 |
+
- Conflict_Hypothesis: A claim that explicitly contradicts another established claim in the field. Evidence exists on both sides.
|
| 48 |
+
|
| 49 |
+
Output JSON: {"epistemic_tag": "...", "reasoning": "...", "confidence_in_classification": float}
|
| 50 |
+
Be conservative: if a statement mixes fact and interpretation, classify as Interpretation."""
|
| 51 |
+
|
| 52 |
+
SYSTEM_CONFIDENCE_SCORER = """You are the Confidence Scorer of a PhD Research OS. Score the confidence of a scientific claim using this formula:
|
| 53 |
+
|
| 54 |
+
confidence = evidence_strength × study_quality_weight × journal_tier_weight × completeness_penalty
|
| 55 |
+
|
| 56 |
+
Where:
|
| 57 |
+
- evidence_strength [0,1]: How directly the evidence supports the claim
|
| 58 |
+
- study_quality_weight: primary_experimental=1.0, in_vitro=0.8, simulation=0.6, review_non_systematic=0.4, meta_analysis=1.0, case_study=0.3
|
| 59 |
+
- journal_tier_weight: tier1=1.0, tier2=0.85, tier3=0.7, preprint=0.5
|
| 60 |
+
- completeness_penalty: 1.0 if all parameters reported, 0.7 if missing key parameters
|
| 61 |
+
|
| 62 |
+
Output JSON: {"confidence": float, "evidence_strength": float, "study_quality_weight": float, "journal_tier_weight": float, "completeness_penalty": float, "reasoning": "..."}
|
| 63 |
+
|
| 64 |
+
Use fixed-point scaled integers internally (multiply by 1000, round, divide by 1000) to avoid floating-point drift."""
|
| 65 |
+
|
| 66 |
+
SYSTEM_CONFLICT_DETECTOR = """You are the Verifier Agent of a PhD Research OS. Given two scientific claims, determine if they contradict each other.
|
| 67 |
+
|
| 68 |
+
Analyze the claims and output a Conflict Resolution Object:
|
| 69 |
+
- conflict_detected: boolean
|
| 70 |
+
- conflict_type: one of "value_mismatch", "methodology_difference", "scope_difference", "no_conflict"
|
| 71 |
+
- generated_hypothesis: text explaining the possible cause of the conflict
|
| 72 |
+
- hypothesis_confidence: always "low" (never auto-set to high — human review required)
|
| 73 |
+
- resolution_status: "Unresolved"
|
| 74 |
+
- key_differences: list of specific parameter/methodology differences
|
| 75 |
+
- recommended_action: what the researcher should investigate to resolve this
|
| 76 |
+
|
| 77 |
+
Output valid JSON. Be thorough but conservative — flag real conflicts, not superficial differences."""
|
| 78 |
+
|
| 79 |
+
SYSTEM_QUERY_DECOMPOSER = """You are the Query Planner of a PhD Research OS. Given a broad research question, decompose it into 2-4 specific sub-queries that can be independently searched in a scientific knowledge base.
|
| 80 |
+
|
| 81 |
+
Each sub-query should:
|
| 82 |
+
- Target a specific aspect of the question
|
| 83 |
+
- Be answerable from individual paper claims
|
| 84 |
+
- Together, cover the full scope of the original question
|
| 85 |
+
|
| 86 |
+
Output JSON: {"original_query": "...", "sub_queries": ["...", "..."], "reasoning": "..."}"""
|
| 87 |
+
|
| 88 |
+
SYSTEM_DECISION_GENERATOR = """You are the Decision Agent of a PhD Research OS. Given the current research goals, knowledge gaps, and incomplete/low-confidence claims, propose a Decision Object.
|
| 89 |
+
|
| 90 |
+
A Decision Object includes:
|
| 91 |
+
- decision_id: string (DEC_XXXX)
|
| 92 |
+
- recommended_action: one of "experiment", "literature_search", "collaboration", "replication", "methodology_review"
|
| 93 |
+
- action_description: specific description of what to do
|
| 94 |
+
- expected_information_gain: float [0,1] = uncertainty_of_claim × impact_on_goal
|
| 95 |
+
- linked_goal_id: which research goal this addresses
|
| 96 |
+
- linked_claim_ids: which claims this would resolve
|
| 97 |
+
- priority: "high", "medium", "low"
|
| 98 |
+
- estimated_effort: rough time estimate
|
| 99 |
+
|
| 100 |
+
Output valid JSON. Prioritize actions with highest information gain per unit effort."""
|
| 101 |
+
|
| 102 |
+
# ============================================================
|
| 103 |
+
# STEM DOMAIN KNOWLEDGE BASE (for generating realistic examples)
|
| 104 |
+
# ============================================================
|
| 105 |
+
|
| 106 |
+
# Per-domain vocabulary used to fill the synthetic examples.
# Each top-level key is a domain name; each value is a dict with:
#   "topics"      - topic phrases, sampled for excerpt titles and claim text
#   "parameters"  - parameter names, sampled as measured quantities
#   "journals_t1" / "journals_t2" / "journals_t3" - journal names; the
#                   generators pick from the list matching the drawn tier
# The generators sample up to four parameters per example, so every domain
# lists more than that.
STEM_DOMAINS = {
    "biosensors": {
        "topics": ["graphene FET sensors", "Debye length screening", "aptamer functionalization",
                   "limit of detection", "signal-to-noise ratio", "ionic strength effects",
                   "surface chemistry", "biomarker detection", "point-of-care diagnostics"],
        "parameters": ["concentration", "ionic_strength_mM", "temperature_C", "pH",
                       "incubation_time_min", "gate_voltage_V", "drain_current_uA",
                       "sensitivity_mV_per_decade", "LOD_fM", "selectivity_ratio"],
        "journals_t1": ["Nature Biotechnology", "ACS Nano", "Nano Letters", "Biosensors and Bioelectronics"],
        "journals_t2": ["Analytical Chemistry", "Lab on a Chip", "Sensors and Actuators B"],
        "journals_t3": ["IEEE Sensors Journal", "Microchimica Acta"]
    },
    "materials_science": {
        "topics": ["2D materials", "MoS2 synthesis", "CVD growth", "defect engineering",
                   "band gap tuning", "heterostructures", "strain engineering"],
        "parameters": ["thickness_nm", "growth_temperature_C", "pressure_torr", "carrier_gas_flow_sccm",
                       "grain_size_um", "mobility_cm2_Vs", "bandgap_eV", "defect_density_cm2"],
        "journals_t1": ["Nature Materials", "Advanced Materials", "ACS Nano"],
        "journals_t2": ["Chemistry of Materials", "2D Materials", "Nanoscale"],
        "journals_t3": ["Materials Research Express", "Journal of Materials Science"]
    },
    "electrochemistry": {
        "topics": ["battery electrolytes", "solid-state batteries", "lithium-ion transport",
                   "electrode-electrolyte interface", "impedance spectroscopy", "cycling stability"],
        "parameters": ["ionic_conductivity_S_cm", "activation_energy_eV", "cycle_number",
                       "capacity_retention_pct", "voltage_window_V", "current_density_mA_cm2",
                       "coulombic_efficiency_pct", "electrode_thickness_um"],
        "journals_t1": ["Nature Energy", "Joule", "Advanced Energy Materials"],
        "journals_t2": ["Journal of the Electrochemical Society", "Electrochimica Acta"],
        "journals_t3": ["Batteries", "Journal of Power Sources"]
    },
    "computational_biology": {
        "topics": ["protein folding", "molecular dynamics", "drug-target interaction",
                   "genomic analysis", "CRISPR efficiency prediction", "gene regulatory networks"],
        "parameters": ["RMSD_angstrom", "binding_affinity_kcal_mol", "simulation_time_ns",
                       "accuracy_pct", "AUC_ROC", "precision", "recall", "F1_score"],
        "journals_t1": ["Nature Methods", "Nature Biotechnology", "Cell Systems"],
        "journals_t2": ["Bioinformatics", "PLOS Computational Biology", "BMC Genomics"],
        "journals_t3": ["Journal of Computational Biology", "Computational Biology and Chemistry"]
    },
    "quantum_computing": {
        "topics": ["qubit coherence", "quantum error correction", "superconducting circuits",
                   "quantum algorithms", "quantum supremacy benchmarks", "topological qubits"],
        "parameters": ["T1_us", "T2_us", "gate_fidelity_pct", "qubit_count",
                       "error_rate", "circuit_depth", "quantum_volume"],
        "journals_t1": ["Nature", "Science", "Physical Review Letters"],
        "journals_t2": ["Physical Review A", "Quantum", "npj Quantum Information"],
        "journals_t3": ["Quantum Science and Technology", "Journal of Physics A"]
    }
}
|
| 156 |
+
|
| 157 |
+
# ============================================================
|
| 158 |
+
# EXAMPLE GENERATORS
|
| 159 |
+
# ============================================================
|
| 160 |
+
|
| 161 |
+
# Quality weight per study design (same numbers as SYSTEM_CONFIDENCE_SCORER).
_CLAIM_STUDY_QUALITY_WEIGHTS = {
    "primary_experimental": 1.0, "in_vitro": 0.8, "simulation": 0.6,
    "review_non_systematic": 0.4, "meta_analysis": 1.0, "case_study": 0.3,
}
# Weight per journal tier; tier 1 is the highest tier.
_CLAIM_JOURNAL_TIER_WEIGHTS = {1: 1.0, 2: 0.85, 3: 0.7}


def _fixed_point_confidence(evidence_strength, sq_weight, jt_weight, completeness):
    """Return the product of the four factors rounded to three decimals."""
    raw = evidence_strength * sq_weight * jt_weight * completeness
    # Scale, ROUND to the nearest integer, then scale back. Using int() here
    # would truncate (e.g. 0.5957 -> 0.595) and contradict the scoring prompt.
    return round(raw * 1000) / 1000


def _build_paper_excerpt(topic, journal, study_type, params):
    """Render the synthetic paper excerpt shown to the model as user input."""
    primary = params[0]
    has_secondary = len(params) > 1
    return f"""[Excerpt from: "{topic}: Recent Advances" — Published in {journal}, 2024]

In this study, we examined {topic} with particular focus on the relationship between {primary} and {params[1] if has_secondary else 'system performance'}. Using {study_type.replace('_', ' ')} methodology, we measured {primary} under varying conditions of {params[1] if has_secondary else 'standard parameters'}.

Our primary finding is that {primary} exhibits a {random.choice(['linear', 'exponential', 'logarithmic', 'sigmoidal'])} dependence on {params[1] if has_secondary else 'the control variable'}, with a correlation coefficient of {round(random.uniform(0.7, 0.99), 3)}. The optimal value of {primary} was found to be {round(random.uniform(0.1, 100), 2)} under conditions where {params[1] if has_secondary else 'temperature'} = {round(random.uniform(20, 200), 1)}.

{random.choice(['Statistical analysis confirmed', 'ANOVA testing revealed', 'Mann-Whitney U test showed'])} significance at p < {random.choice(['0.001', '0.01', '0.05'])} (n = {random.choice([3, 5, 10, 20, 50])}). These results {'align with' if random.random() > 0.3 else 'contradict'} previous reports by {random.choice(['Smith et al.', 'Zhang et al.', 'Kumar et al.', 'Johnson et al.'])} who found {primary} values of {round(random.uniform(0.1, 100), 2)}.

We interpret these findings as evidence that {random.choice(['the proposed mechanism involves', 'the dominant factor is', 'surface interactions govern'])} the observed behavior. We hypothesize that {random.choice(['further optimization could achieve', 'a threshold effect exists at', 'competing mechanisms dominate above'])} {round(random.uniform(10, 500), 1)}."""


def _claim_text_for_tag(epistemic, topic, params):
    """Return a claim sentence whose wording matches the given epistemic tag."""
    primary = params[0]
    if epistemic == "Fact":
        # Two factual phrasings: a direct measurement or an observed correlation.
        if random.random() < 0.5:
            return f"The {primary} was measured at {round(random.uniform(0.1, 100), 2)} under standard conditions."
        return f"A {random.choice(['positive', 'negative', 'non-linear'])} correlation was observed between {primary} and {params[1] if len(params) > 1 else 'output'}."
    if epistemic == "Interpretation":
        return f"The proposed mechanism suggests that {topic} is primarily governed by {random.choice(['surface effects', 'bulk properties', 'interfacial phenomena'])}."
    if epistemic == "Hypothesis":
        return f"We hypothesize that optimizing {primary} beyond {round(random.uniform(50, 200), 1)} could yield {random.choice(['enhanced', 'diminished', 'qualitatively different'])} results."
    if epistemic == "Conflict_Hypothesis":
        return f"These findings contradict the established model by {random.choice(['Smith et al.', 'Zhang et al.', 'Lee et al.'])}, who reported {random.choice(['opposite', 'significantly different', 'null'])} effects."
    raise ValueError(f"Unknown epistemic tag: {epistemic!r}")


def _build_claim(claim_id, topic, params, study_type, sq_weight, jt_weight):
    """Assemble one structured claim dict with internally consistent fields."""
    epistemic = random.choices(
        ["Fact", "Interpretation", "Hypothesis", "Conflict_Hypothesis"],
        weights=[0.4, 0.3, 0.2, 0.1]
    )[0]
    evidence_strength = round(random.uniform(0.5, 1.0), 3)

    has_missing = random.random() < 0.25
    completeness = 0.7 if has_missing else 1.0
    # Never ask random.sample for more names than params actually holds.
    missing = random.sample(params, random.randint(1, min(2, len(params)))) if has_missing else []

    # A field flagged as missing must not also be reported with a value.
    reportable = [p for p in params if p not in missing]
    reported = random.sample(reportable, min(2, len(reportable)))

    return {
        "claim_id": claim_id,
        "text": _claim_text_for_tag(epistemic, topic, params),
        "epistemic_tag": epistemic,
        "confidence": _fixed_point_confidence(evidence_strength, sq_weight, jt_weight, completeness),
        "evidence_strength": evidence_strength,
        "study_type": study_type,
        "study_quality_weight": sq_weight,
        "journal_tier_weight": jt_weight,
        "completeness_penalty": completeness,
        "missing_fields": missing,
        "status": "Incomplete" if has_missing else "Complete",
        "parameters": {p: round(random.uniform(0.1, 100), 2) for p in reported}
    }


def generate_claim_extraction_examples(n=500):
    """Generate claim extraction training examples.

    Each example is a three-message conversation: the claim-extraction system
    prompt, a user message containing a synthetic paper excerpt, and an
    assistant message holding a JSON document with 3-7 structured claims.

    Args:
        n: Number of examples to generate.

    Returns:
        A list of ``{"messages": [...]}`` dicts, one per example.

    NOTE(review): each claim carries study_quality_weight, journal_tier_weight
    and completeness_penalty, and the response carries source_doi and
    paper_domain, none of which are listed in SYSTEM_CLAIM_EXTRACTION's
    schema - confirm this is intended.
    """
    study_types = ["primary_experimental", "in_vitro", "simulation", "review_non_systematic", "meta_analysis"]
    study_type_weights = [0.4, 0.2, 0.15, 0.1, 0.15]
    domain_names = list(STEM_DOMAINS)

    examples = []
    for i in range(n):
        domain_name = random.choice(domain_names)
        domain = STEM_DOMAINS[domain_name]
        topic = random.choice(domain["topics"])
        params = random.sample(domain["parameters"], min(4, len(domain["parameters"])))

        # Draw the publication venue and study design for this synthetic paper.
        journal_tier = random.choices([1, 2, 3], weights=[0.3, 0.4, 0.3])[0]
        journal = random.choice(domain[f"journals_t{journal_tier}"])
        study_type = random.choices(study_types, weights=study_type_weights)[0]

        # Build paper excerpt (simplified — real version would use LLM)
        excerpt = _build_paper_excerpt(topic, journal, study_type, params)

        sq_weight = _CLAIM_STUDY_QUALITY_WEIGHTS[study_type]
        jt_weight = _CLAIM_JOURNAL_TIER_WEIGHTS[journal_tier]

        # i*10+j stays unique across examples because a paper has at most 7 claims.
        claims = [
            _build_claim(f"CLM_{i*10+j:04d}", topic, params, study_type, sq_weight, jt_weight)
            for j in range(random.randint(3, 7))
        ]

        response = json.dumps({"claims": claims, "source_doi": f"10.1234/example.{i:04d}", "paper_domain": domain_name}, indent=2)

        examples.append({
            "messages": [
                {"role": "system", "content": SYSTEM_CLAIM_EXTRACTION},
                {"role": "user", "content": f"Extract all scientific claims from the following paper excerpt:\n\n{excerpt}"},
                {"role": "assistant", "content": response}
            ]
        })

    return examples
|
| 290 |
+
|
| 291 |
+
|
| 292 |
+
def generate_epistemic_classification_examples(n=400):
    """Build ``n`` chat-format samples for the epistemic-tag classifier task.

    For every sample a tag is drawn with fixed weights, a statement is picked
    from that tag's pool, and the assistant answer is a JSON object holding
    the tag, a canned rationale for that tag, and a random classification
    confidence between 0.75 and 0.98.

    Returns:
        A list of ``{"messages": [system, user, assistant]}`` dicts.
    """
    labels = ["Fact", "Interpretation", "Hypothesis", "Conflict_Hypothesis"]
    label_weights = [0.35, 0.30, 0.20, 0.15]

    # Pool of hand-written statements, keyed by the tag they exemplify.
    statement_bank = {
        "Fact": [
            "The measured ionic conductivity of the LLZO pellet was 4.2 × 10⁻⁴ S/cm at 25°C.",
            "Graphene field-effect transistors showed a Dirac point shift of 45 mV upon target binding.",
            "The crystal structure was confirmed as tetragonal by XRD analysis (JCPDS 00-024-0867).",
            "Cell viability remained above 95% after 48 hours of exposure to nanoparticle concentrations up to 100 μg/mL.",
            "The reaction yield increased from 32% to 87% when the catalyst loading was doubled from 5 mol% to 10 mol%.",
            "Raman spectroscopy revealed a G/2D peak ratio of 0.35, consistent with monolayer graphene.",
            "The bandgap energy was determined to be 1.85 eV from UV-Vis absorption spectroscopy.",
            "Atomic force microscopy confirmed a film thickness of 12.3 ± 0.4 nm across 20 measurement points.",
            "The protein folding simulation converged after 850 ns with RMSD < 2.0 Å.",
            "HPLC analysis showed 99.2% purity of the synthesized compound.",
            "The qubit T1 relaxation time was measured at 152 ± 8 μs at 15 mK.",
            "Mass spectrometry confirmed the molecular ion peak at m/z = 342.18, consistent with the expected product.",
            "The transistor exhibited an on/off current ratio of 10⁶ with a subthreshold swing of 68 mV/decade.",
            "Flow cytometry analysis revealed 78.3% of cells were in the G1 phase after treatment.",
            "The electrode maintained 94.2% capacity retention after 500 charge-discharge cycles at 1C rate.",
        ],
        "Interpretation": [
            "The observed Dirac point shift suggests successful functionalization of the graphene surface.",
            "These results indicate that the ion transport mechanism is primarily governed by grain boundary diffusion.",
            "The non-linear dose-response curve is consistent with a cooperative binding model.",
            "The improved performance at elevated temperatures points to thermally activated charge transport.",
            "Our data support the hypothesis that surface defects play a critical role in catalytic activity.",
            "The correlation between particle size and reactivity implies surface-area-dependent kinetics.",
            "These findings are consistent with a two-step nucleation mechanism rather than classical nucleation theory.",
            "The asymmetric peak broadening in XRD patterns suggests the presence of microstrain.",
            "The enhanced fluorescence lifetime indicates reduced non-radiative recombination pathways.",
            "The inverse relationship between ionic strength and sensitivity aligns with Debye screening predictions.",
            "Based on the activation energy of 0.32 eV, we conclude that lithium diffusion occurs via an interstitial mechanism.",
            "The observed blue shift in photoluminescence is attributable to quantum confinement effects.",
            "The saturation behavior above 100 nM concentration reflects receptor site limitation.",
        ],
        "Hypothesis": [
            "We propose that the anomalous conductivity enhancement arises from a percolation network of amorphous regions.",
            "It is possible that the observed bistability originates from competing ferroelectric and antiferroelectric phases.",
            "Future experiments with isotope labeling could determine whether proton hopping or vehicle mechanism dominates.",
            "We hypothesize that introducing tensile strain into the MoS2 lattice will reduce the bandgap below 1.5 eV.",
            "A possible explanation is that the protein undergoes a conformational change upon ligand binding that exposes a hidden epitope.",
            "We speculate that the unexpected catalytic activity may arise from edge-site defects not captured in bulk characterization.",
            "If the proposed mechanism is correct, replacing the counter-ion should produce a measurable shift in the voltammetric response.",
            "The anomalous transport behavior could potentially be explained by a polaron hopping model.",
            "We conjecture that the system exhibits a quantum phase transition at a critical doping concentration of approximately 0.15.",
            "A theoretical framework based on Marcus theory predicts that electron transfer rates should increase by 10× at these reorganization energies.",
            "It remains to be tested whether this enhancement persists under physiological buffer conditions.",
        ],
        "Conflict_Hypothesis": [
            "Our observation of decreasing sensitivity at high ionic strength directly contradicts Chen et al. (2022), who reported sensitivity enhancement under similar conditions.",
            "While the established model predicts a linear relationship between film thickness and resistance, our data show clear deviation above 20 nm.",
            "These results challenge the widely accepted Langmuir adsorption model, suggesting that multilayer formation occurs at concentrations previously considered sub-monolayer.",
            "Contrary to the predictions of density functional theory calculations by Park et al., we observe metallic rather than semiconducting behavior in this phase.",
            "The measured activation energy of 0.52 eV is significantly higher than the 0.28 eV reported by three independent groups, suggesting a fundamentally different transport mechanism.",
            "Our finding that the reaction proceeds without the proposed intermediate contradicts the established mechanism.",
            "The negative correlation we observe between grain size and conductivity opposes the conventional understanding based on brick-layer model predictions.",
            "While Johnson et al. reported complete stability over 1000 cycles, our replication attempt shows measurable degradation beginning at cycle 300.",
        ],
    }

    # One fixed rationale per tag; it does not depend on the chosen statement.
    rationale_by_label = {
        "Fact": "This statement reports a direct measurement/observation with specific quantitative data. It does not include author interpretation or speculation.",
        "Interpretation": "This statement goes beyond raw data to explain or attribute meaning to observations. The author draws conclusions that are not strictly contained in the measurements alone.",
        "Hypothesis": "This statement proposes an untested mechanism or prediction. Key indicators: uses speculative language (may, could, hypothesize, propose, possible).",
        "Conflict_Hypothesis": "This statement explicitly contradicts an established finding or widely accepted model, with evidence supporting both positions.",
    }

    def _make_sample():
        # Draw order (tag, statement, confidence) fixes the RNG sequence.
        label = random.choices(labels, weights=label_weights)[0]
        sentence = random.choice(statement_bank[label])
        certainty = round(random.uniform(0.75, 0.98), 2)

        payload = {
            "epistemic_tag": label,
            "reasoning": rationale_by_label[label],
            "confidence_in_classification": certainty,
        }
        system_msg = {"role": "system", "content": SYSTEM_EPISTEMIC_CLASSIFIER}
        user_msg = {
            "role": "user",
            "content": f'Classify the epistemic status of this scientific statement:\n\n"{sentence}"',
        }
        assistant_msg = {"role": "assistant", "content": json.dumps(payload, indent=2)}
        return {"messages": [system_msg, user_msg, assistant_msg]}

    return [_make_sample() for _ in range(n)]
|
| 385 |
+
|
| 386 |
+
|
| 387 |
+
def _truncate_confidence(evidence_strength, sq_weight, jt_weight, completeness):
    """Return the product of the four factors, truncated to three decimals.

    The multiplication is done in decimal arithmetic on the printed form of
    each factor, so the result always agrees with the
    "a × b × c × d = e" equation embedded in the generated reasoning text.
    Binary floats do not: ``0.7 * 0.7`` evaluates to 0.48999999999999994,
    which a float-based truncation turns into 0.489 instead of 0.49.

    Args:
        evidence_strength: Evidence strength factor.
        sq_weight: Study-quality weight.
        jt_weight: Journal-tier weight.
        completeness: Completeness penalty factor.

    Returns:
        The truncated (not rounded) product as a float.
    """
    # Function-scope import keeps this block self-contained.
    from decimal import ROUND_DOWN, Decimal

    product = Decimal(1)
    for factor in (evidence_strength, sq_weight, jt_weight, completeness):
        product *= Decimal(str(factor))
    return float(product.quantize(Decimal("0.001"), rounding=ROUND_DOWN))


def generate_confidence_scoring_examples(n=300):
    """Generate confidence scoring training examples.

    Every example is a three-message chat (system, user, assistant).  The
    user message describes a synthetic claim built from a random entry of
    ``STEM_DOMAINS`` (keys read here: ``journals_t1``/``journals_t2``/
    ``journals_t3``, ``topics``, ``parameters``).  The assistant message is a
    JSON document with the four scoring factors, their truncated product as
    ``confidence`` and a textual derivation.

    Args:
        n: Number of examples to generate.

    Returns:
        A list of ``{"messages": [...]}`` dicts, one per example.
    """
    # Loop-invariant lookup tables.
    study_types = ["primary_experimental", "in_vitro", "simulation",
                   "review_non_systematic", "meta_analysis", "case_study"]
    study_type_odds = [0.35, 0.20, 0.15, 0.10, 0.10, 0.10]
    tiers = [1, 2, 3, "preprint"]
    tier_odds = [0.25, 0.35, 0.25, 0.15]
    sq_map = {"primary_experimental": 1.0, "in_vitro": 0.8, "simulation": 0.6,
              "review_non_systematic": 0.4, "meta_analysis": 1.0, "case_study": 0.3}
    jt_map = {1: 1.0, 2: 0.85, 3: 0.7, "preprint": 0.5}
    domain_names = list(STEM_DOMAINS)

    examples = []
    for _ in range(n):
        # The sequence of random draws below determines what a seeded run
        # produces, so the draws are kept in one fixed order.
        domain = STEM_DOMAINS[random.choice(domain_names)]
        study_type = random.choices(study_types, weights=study_type_odds)[0]
        journal_tier = random.choices(tiers, weights=tier_odds)[0]

        if journal_tier == "preprint":
            journal = "arXiv preprint"
        else:
            journal = random.choice(domain[f"journals_t{journal_tier}"])

        evidence_strength = round(random.uniform(0.3, 1.0), 3)
        sq_weight = sq_map[study_type]
        jt_weight = jt_map[journal_tier]

        # Roughly 30% of the claims omit key parameters and are penalised.
        has_missing = random.random() < 0.3
        completeness = 0.7 if has_missing else 1.0

        # Fixed-point calculation: exact product, truncated to 3 decimals.
        confidence = _truncate_confidence(
            evidence_strength, sq_weight, jt_weight, completeness
        )

        topic = random.choice(domain["topics"])
        parameter = random.choice(domain["parameters"])
        measured_value = round(random.uniform(0.1, 100), 2)
        claim_text = (
            f"Claim from {journal} ({study_type.replace('_', ' ')} study): "
            f"{topic} measured {parameter} at {measured_value}."
        )
        if has_missing:
            claim_text += " [Note: sample size and error margins not reported]"

        if has_missing:
            completeness_note = "Missing key parameters reduce completeness penalty to 0.7."
        else:
            completeness_note = "All parameters reported, completeness 1.0."
        reasoning = (
            f"Study type '{study_type}' gets weight {sq_weight}. "
            f"Journal '{journal}' is tier {journal_tier} (weight {jt_weight}). "
            f"Evidence strength assessed at {evidence_strength} based on directness of measurement. "
            f"{completeness_note} "
            f"Final: {evidence_strength} × {sq_weight} × {jt_weight} × {completeness} = {confidence}."
        )

        response = json.dumps({
            "confidence": confidence,
            "evidence_strength": evidence_strength,
            "study_quality_weight": sq_weight,
            "journal_tier_weight": jt_weight,
            "completeness_penalty": completeness,
            "reasoning": reasoning,
        }, indent=2)

        user_prompt = (
            f"Score the confidence of this claim:\n\n{claim_text}\n\n"
            f"Source: {journal}\nStudy type: {study_type}\nJournal tier: {journal_tier}"
        )
        examples.append({
            "messages": [
                {"role": "system", "content": SYSTEM_CONFIDENCE_SCORER},
                {"role": "user", "content": user_prompt},
                {"role": "assistant", "content": response},
            ]
        })

    return examples
|
| 451 |
+
|
| 452 |
+
|
| 453 |
+
def generate_conflict_detection_examples(n=300):
    """Generate contradiction detection training examples.

    Each example is a three-message chat (system, user, assistant).  A claim
    pair is drawn at random (with replacement) from a fixed pool of eight
    hand-written pairs; the assistant message is a JSON verdict for it.

    Args:
        n: Number of examples to generate.

    Returns:
        A list of ``{"messages": [...]}`` dicts, one per example.
    """
    conflict_pairs = [
        # Real conflicts
        {
            "claim_a": "Graphene FET sensitivity increases monotonically with decreasing ionic strength (measured range: 0.1-100 mM PBS).",
            "claim_b": "Below 1 mM ionic strength, GFET sensitivity plateaus due to parasitic charge screening from surface-adsorbed species.",
            "conflict": True,
            "conflict_type": "value_mismatch",
            "hypothesis": "The discrepancy likely arises from different surface functionalization protocols. Claim A used bare graphene while Claim B used PEG-passivated surfaces, which would accumulate different surface charges at very low ionic strength.",
            "key_diffs": ["surface treatment protocol", "ionic strength range tested", "measurement technique"],
        },
        {
            "claim_a": "The Li-ion conductivity of LLZO was 1.2 × 10⁻³ S/cm at room temperature.",
            "claim_b": "LLZO pellets sintered under identical conditions showed conductivity of 2.8 × 10⁻⁴ S/cm at 25°C.",
            "conflict": True,
            "conflict_type": "value_mismatch",
            "hypothesis": "The order-of-magnitude difference may stem from different grain boundary densities. Claim A likely reports single-crystal or highly-densified samples, while Claim B may include significant grain boundary resistance.",
            "key_diffs": ["sample preparation method", "densification level", "measurement geometry"],
        },
        {
            "claim_a": "MoS2 monolayers grown by CVD at 700°C show exclusively 2H phase.",
            "claim_b": "CVD-grown MoS2 at 700°C contains 30-40% 1T phase, as confirmed by XPS peak deconvolution.",
            "conflict": True,
            "conflict_type": "methodology_difference",
            "hypothesis": "The 1T phase in Claim B may be induced during the transfer process (common with wet-chemical transfer) or by the choice of sulfur precursor. Claim A may have used a different transfer method or in-situ characterization.",
            "key_diffs": ["sulfur precursor", "transfer method", "characterization timing (in-situ vs ex-situ)"],
        },
        {
            "claim_a": "The protein binding affinity (Kd) for aptamer X was 2.3 nM in buffer.",
            "claim_b": "Aptamer X showed Kd of 180 nM when tested in 50% human serum.",
            "conflict": False,
            "conflict_type": "scope_difference",
            "hypothesis": "These measurements are not contradictory — they reflect different measurement conditions. The 78× decrease in affinity in serum is expected due to non-specific protein interactions, ionic strength differences, and potential aptamer degradation by nucleases.",
            "key_diffs": ["measurement medium (buffer vs serum)", "matrix effects"],
        },
        {
            "claim_a": "Quantum dot fluorescence quenching follows Stern-Volmer kinetics with KSV = 4.5 × 10⁴ M⁻¹.",
            "claim_b": "The same QD-quencher system shows non-linear Stern-Volmer behavior above 10 μM quencher concentration.",
            "conflict": False,
            "conflict_type": "scope_difference",
            "hypothesis": "Both claims can be simultaneously true. The Stern-Volmer relationship is linear at low quencher concentrations (Claim A's measurement range) but deviates at higher concentrations (Claim B) due to static quenching or ground-state complex formation.",
            "key_diffs": ["concentration range", "quenching mechanism regime"],
        },
        # More conflicts
        {
            "claim_a": "Sonication for 30 minutes produces graphene flakes with average lateral size of 500 nm.",
            "claim_b": "Extended sonication (30 min) yields graphene fragments predominantly below 100 nm with significant edge defects.",
            "conflict": True,
            "conflict_type": "value_mismatch",
            "hypothesis": "The 5× difference in reported flake size likely stems from different sonication power/frequency settings, solvent choice (NMP vs water/surfactant), or measurement method (DLS vs AFM). Additionally, definition of 'average' may differ (number-averaged vs volume-averaged).",
            "key_diffs": ["sonication parameters (power, frequency)", "solvent system", "size measurement technique", "averaging method"],
        },
        {
            "claim_a": "The neural network achieved 96.3% accuracy on the protein structure prediction benchmark.",
            "claim_b": "On the same benchmark, the identical architecture achieved only 84.1% accuracy when trained with a different random seed.",
            "conflict": True,
            "conflict_type": "methodology_difference",
            "hypothesis": "The 12.2% accuracy gap from random seed variation alone suggests the model is highly sensitive to initialization. This may indicate overfitting to specific training data partitions or instability in the optimization landscape.",
            "key_diffs": ["random seed", "potentially different train/test splits", "convergence criteria"],
        },
        {
            "claim_a": "The catalyst achieves 99% conversion at 80°C.",
            "claim_b": "The catalyst achieves 99% conversion at 80°C with fresh reagents, but only 45% with recycled catalyst after 3 cycles.",
            "conflict": False,
            "conflict_type": "scope_difference",
            "hypothesis": "These are not contradictory. Claim A reports initial performance while Claim B adds information about durability. The 54% drop after 3 cycles reveals catalyst deactivation, possibly from active site poisoning or structural degradation.",
            "key_diffs": ["catalyst reuse cycle", "implicit freshness assumption in Claim A"],
        },
    ]

    examples = []
    for _ in range(n):
        pair = random.choice(conflict_pairs)

        if pair["conflict"]:
            recommended_action = "Investigate the specific methodological differences identified. Request raw data from both groups if possible."
        else:
            recommended_action = "No conflict resolution needed. Both claims are valid within their respective scopes. Document the scope boundary."

        # NOTE(review): "hypothesis_confidence" and "resolution_status" are
        # the same constants for every pair, including the non-conflicting
        # ones -- confirm this matches the schema the system prompt expects.
        response = json.dumps({
            "conflict_detected": pair["conflict"],
            "conflict_type": pair["conflict_type"],
            "generated_hypothesis": pair["hypothesis"],
            "hypothesis_confidence": "low",
            "resolution_status": "Unresolved",
            "key_differences": pair["key_diffs"],
            "recommended_action": recommended_action,
        }, indent=2)

        user_prompt = (
            "Analyze these two claims for contradictions:\n\n"
            f"Claim A: \"{pair['claim_a']}\"\n\n"
            f"Claim B: \"{pair['claim_b']}\""
        )
        examples.append({
            "messages": [
                {"role": "system", "content": SYSTEM_CONFLICT_DETECTOR},
                {"role": "user", "content": user_prompt},
                {"role": "assistant", "content": response},
            ]
        })

    return examples
|
| 548 |
+
|
| 549 |
+
|
| 550 |
+
def generate_query_decomposition_examples(n=200):
    """Generate query decomposition examples.

    Each example is a three-message chat (system, user, assistant).  A
    research question is drawn at random (with replacement) from a fixed pool
    of six; the assistant message is a JSON document holding the question,
    its sub-queries and the reasoning behind the split.

    Args:
        n: Number of examples to generate.

    Returns:
        A list of ``{"messages": [...]}`` dicts, one per example.
    """
    queries = [
        {
            "query": "What is the current state of graphene-based biosensors for cancer biomarker detection?",
            "sub_queries": [
                "What cancer biomarkers have been detected using graphene FET sensors?",
                "What are the reported limits of detection for graphene biosensors targeting cancer markers?",
                "How does graphene biosensor performance compare to ELISA and other standard methods?",
                "What are the main challenges preventing clinical translation of graphene biosensors?",
            ],
            "reasoning": "This broad question spans detection targets, performance metrics, comparative assessment, and translational barriers. Each sub-query targets a specific knowledge domain that maps to different claim types in the database.",
        },
        {
            "query": "How does ionic strength affect the performance of field-effect transistor biosensors?",
            "sub_queries": [
                "What is the Debye screening length at different ionic strength values?",
                "How does sensitivity change as a function of buffer ionic strength for FET biosensors?",
                "What strategies have been developed to overcome Debye screening limitations?",
            ],
            "reasoning": "The question involves fundamental physics (Debye length), empirical relationships (sensitivity vs ionic strength), and engineering solutions (overcoming limitations).",
        },
        {
            "query": "What are the best solid-state electrolytes for next-generation lithium batteries?",
            "sub_queries": [
                "What ionic conductivities have been achieved in garnet-type, sulfide, and polymer electrolytes?",
                "What are the interfacial stability challenges between solid electrolytes and lithium metal anodes?",
                "How do manufacturing scalability and cost compare across solid electrolyte families?",
                "What degradation mechanisms limit cycle life in solid-state batteries?",
            ],
            "reasoning": "This question spans material performance (conductivity), interface engineering, practical considerations (cost/scale), and durability. Each is a distinct research sub-domain.",
        },
        {
            "query": "Can CRISPR-Cas9 efficiency be predicted computationally?",
            "sub_queries": [
                "What machine learning models have been developed for CRISPR guide RNA efficiency prediction?",
                "What features (sequence, structure, chromatin) are most predictive of CRISPR cutting efficiency?",
                "How do computational predictions compare to experimental validation data?",
            ],
            "reasoning": "The question bridges computational methods, feature engineering, and experimental validation — three distinct knowledge areas.",
        },
        {
            "query": "What determines qubit coherence time in superconducting quantum processors?",
            "sub_queries": [
                "What are the dominant decoherence mechanisms (T1, T2) in transmon qubits?",
                "How do materials and fabrication choices affect qubit coherence?",
                "What is the current state-of-the-art for superconducting qubit coherence times?",
            ],
            "reasoning": "This question involves fundamental physics (decoherence mechanisms), engineering (fabrication), and benchmarking (state-of-the-art values).",
        },
        {
            "query": "How effective are 2D material heterostructures for photocatalytic water splitting?",
            "sub_queries": [
                "What 2D material combinations have been tested for photocatalytic hydrogen evolution?",
                "What are the reported solar-to-hydrogen conversion efficiencies for 2D heterostructure photocatalysts?",
                "How does band alignment in 2D heterostructures affect charge separation and catalytic activity?",
                "What stability and durability challenges exist for 2D photocatalysts?",
            ],
            "reasoning": "Covers materials (combinations), performance metrics (efficiency), mechanism (band alignment), and practical challenges (stability) — four distinct research angles.",
        },
    ]

    examples = []
    for _ in range(n):
        q = random.choice(queries)

        response = json.dumps({
            "original_query": q["query"],
            "sub_queries": q["sub_queries"],
            "reasoning": q["reasoning"],
        }, indent=2)

        user_prompt = (
            "Decompose this research question into specific sub-queries:\n\n"
            f"\"{q['query']}\""
        )
        examples.append({
            "messages": [
                {"role": "system", "content": SYSTEM_QUERY_DECOMPOSER},
                {"role": "user", "content": user_prompt},
                {"role": "assistant", "content": response},
            ]
        })

    return examples
|
| 631 |
+
|
| 632 |
+
|
| 633 |
+
def generate_decision_object_examples(n=200):
    """Generate Decision Object training examples.

    Each example is a three-message chat (system, user, assistant).  A
    scenario is drawn at random (with replacement) from a fixed pool of five.
    The user message lists the scenario's goal, knowledge gaps and
    low-confidence claims; the assistant message is the scenario's decision
    serialised as JSON.

    Args:
        n: Number of examples to generate.

    Returns:
        A list of ``{"messages": [...]}`` dicts, one per example.
    """
    # NOTE(review): claim id CLM_0089 appears in two scenarios with different
    # claim texts -- confirm whether ids are meant to be unique across
    # scenarios.
    scenarios = [
        {
            "goal": "Achieve sub-femtomolar detection limit for cardiac troponin I using GFET biosensor",
            "gaps": ["Optimal aptamer sequence for cTnI not determined", "Debye screening at physiological ionic strength limits sensitivity", "No data on sensor-to-sensor reproducibility"],
            "low_confidence_claims": ["CLM_0042: PEG spacer length of 5 kDa optimal (confidence: 0.35)", "CLM_0089: Desalting step preserves >90% of target protein (confidence: 0.41)"],
            "decision": {
                "decision_id": "DEC_0001",
                "recommended_action": "experiment",
                "action_description": "Systematic optimization of aptamer surface density and PEG spacer length on GFET. Test 3 aptamer concentrations × 3 PEG lengths × 3 ionic strength conditions. Include negative controls.",
                "expected_information_gain": 0.72,
                "linked_goal_id": "GOAL_001",
                "linked_claim_ids": ["CLM_0042", "CLM_0089"],
                "priority": "high",
                "estimated_effort": "2-3 weeks of lab work + 1 week analysis",
            },
        },
        {
            "goal": "Understand degradation mechanism in solid-state lithium battery at >100 cycles",
            "gaps": ["Interface evolution not characterized in-situ", "Role of grain boundary resistance vs bulk unclear", "Temperature dependence of degradation unknown"],
            "low_confidence_claims": ["CLM_0156: Lithium dendrite penetration through grain boundaries (confidence: 0.28)", "CLM_0201: SEI formation at cathode interface dominates early degradation (confidence: 0.39)"],
            "decision": {
                "decision_id": "DEC_0002",
                "recommended_action": "literature_search",
                "action_description": "Comprehensive search for in-situ/operando characterization studies of LLZO-lithium interfaces during cycling. Focus on synchrotron XRD and cryo-TEM studies from 2022-2024.",
                "expected_information_gain": 0.58,
                "linked_goal_id": "GOAL_003",
                "linked_claim_ids": ["CLM_0156", "CLM_0201"],
                "priority": "medium",
                "estimated_effort": "1 week literature search + synthesis",
            },
        },
        {
            "goal": "Validate computational model of protein-aptamer binding",
            "gaps": ["MD simulation parameters not benchmarked against experimental Kd", "Force field choice may introduce systematic bias", "Solvent model effects unexplored"],
            "low_confidence_claims": ["CLM_0312: AMBER ff14SB adequate for aptamer-protein complexes (confidence: 0.32)"],
            "decision": {
                "decision_id": "DEC_0003",
                "recommended_action": "collaboration",
                "action_description": "Contact the computational chemistry group (Prof. Martinez) for force field parameterization expertise. Their recent paper on RNA-protein interactions used an optimized force field that may apply here.",
                "expected_information_gain": 0.65,
                "linked_goal_id": "GOAL_005",
                "linked_claim_ids": ["CLM_0312"],
                "priority": "medium",
                "estimated_effort": "Initial meeting + 2-4 weeks of collaborative work",
            },
        },
        {
            "goal": "Establish reproducibility of the nanofabrication process",
            "gaps": ["Batch-to-batch variation not quantified", "Critical process parameters not identified", "No statistical process control"],
            "low_confidence_claims": ["CLM_0089: Yield >80% for device fabrication (confidence: 0.25, based on single batch)"],
            "decision": {
                "decision_id": "DEC_0004",
                "recommended_action": "replication",
                "action_description": "Fabricate 5 independent batches of 20 devices each over 5 separate days. Measure key performance metrics. Perform ANOVA to identify day-to-day and within-batch variation. Calculate Cpk for critical parameters.",
                "expected_information_gain": 0.81,
                "linked_goal_id": "GOAL_002",
                "linked_claim_ids": ["CLM_0089"],
                "priority": "high",
                "estimated_effort": "3-4 weeks fabrication + 1 week characterization + 1 week analysis",
            },
        },
        {
            "goal": "Resolve contradictory claims about qubit decoherence mechanism",
            "gaps": ["Two competing theories with similar explanatory power", "No experiment designed to distinguish between mechanisms"],
            "low_confidence_claims": ["CLM_0445: TLS-dominated decoherence (confidence: 0.42)", "CLM_0446: Quasiparticle tunneling dominates above 50mK (confidence: 0.38)"],
            "decision": {
                "decision_id": "DEC_0005",
                "recommended_action": "methodology_review",
                "action_description": "Design a discriminating experiment: measure T1 as a function of temperature (10-200 mK) and applied magnetic field (0-100 mT). TLS model predicts logarithmic T1(T) while quasiparticle model predicts exponential. This single experiment can resolve the contradiction.",
                "expected_information_gain": 0.89,
                "linked_goal_id": "GOAL_007",
                "linked_claim_ids": ["CLM_0445", "CLM_0446"],
                "priority": "high",
                "estimated_effort": "1 week experiment design + 2 weeks measurement + 1 week analysis",
            },
        },
    ]

    examples = []
    for _ in range(n):
        scenario = random.choice(scenarios)

        # Bulleted sections, one "- item" line per gap / claim.
        gap_lines = "\n".join("- " + g for g in scenario["gaps"])
        claim_lines = "\n".join("- " + c for c in scenario["low_confidence_claims"])
        user_prompt = "\n".join([
            f"Current research goal: {scenario['goal']}",
            "",
            "Knowledge gaps:",
            gap_lines,
            "",
            "Low-confidence claims requiring resolution:",
            claim_lines,
            "",
            "Propose a Decision Object with the highest expected information gain.",
        ])

        response = json.dumps(scenario["decision"], indent=2)

        examples.append({
            "messages": [
                {"role": "system", "content": SYSTEM_DECISION_GENERATOR},
                {"role": "user", "content": user_prompt},
                {"role": "assistant", "content": response},
            ]
        })

    return examples
|
| 739 |
+
|
| 740 |
+
|
| 741 |
+
# ============================================================
|
| 742 |
+
# MAIN: Generate and combine all task datasets
|
| 743 |
+
# ============================================================
|
| 744 |
+
|
| 745 |
+
def main(output_dir="/app"):
    """Generate all task datasets, combine them, and write them to disk.

    Runs the six task generators, shuffles the combined examples, splits them
    90/10 into train/test, saves the result with ``DatasetDict.save_to_disk``
    and dumps the first ten shuffled examples to a JSON file for inspection.
    Progress and a per-task tally are printed to stdout.

    Args:
        output_dir: Directory that receives ``phd_research_os_dataset`` and
            ``sample_examples.json``.  Defaults to ``/app``.

    Returns:
        The ``DatasetDict`` with ``"train"`` and ``"test"`` splits.
    """
    # Function-scope imports keep this block self-contained.
    from collections import Counter
    from pathlib import Path

    out_root = Path(output_dir)
    dataset_path = out_root / "phd_research_os_dataset"
    samples_path = out_root / "sample_examples.json"

    print("Generating PhD Research OS training dataset...")

    # (label, generator, number of examples); generators run in this order.
    task_plan = [
        ("Claim Extraction", generate_claim_extraction_examples, 500),
        ("Epistemic Classification", generate_epistemic_classification_examples, 400),
        ("Confidence Scoring", generate_confidence_scoring_examples, 300),
        ("Conflict Detection", generate_conflict_detection_examples, 300),
        ("Query Decomposition", generate_query_decomposition_examples, 200),
        ("Decision Objects", generate_decision_object_examples, 200),
    ]

    # Combine all examples
    all_examples = []
    for number, (label, generator, count) in enumerate(task_plan, start=1):
        print(f" Task {number}: {label} ({count} examples)...")
        all_examples.extend(generator(count))

    # Shuffle
    random.shuffle(all_examples)

    print(f"\n Total examples: {len(all_examples)}")

    # Split into train/eval (90/10)
    # NOTE(review): the conflict, query and decision generators sample with
    # replacement from pools of 8, 6 and 5 items, so identical examples can
    # end up on both sides of this split.
    split_idx = int(len(all_examples) * 0.9)
    train_data = all_examples[:split_idx]
    eval_data = all_examples[split_idx:]

    print(f" Train: {len(train_data)}, Eval: {len(eval_data)}")

    # Create HF Dataset
    dataset_dict = DatasetDict({
        "train": Dataset.from_list(train_data),
        "test": Dataset.from_list(eval_data),
    })

    # Save locally
    dataset_dict.save_to_disk(str(dataset_path))
    print(f"\n Dataset saved to {dataset_path}")

    # Also save as JSON for inspection
    with open(samples_path, "w", encoding="utf-8") as f:
        json.dump(all_examples[:10], f, indent=2)
    print(f" Sample examples saved to {samples_path}")

    # Print distribution stats, keyed by the first 50 characters of each
    # example's system prompt.
    task_counts = Counter(ex["messages"][0]["content"][:50] for ex in all_examples)

    print("\n Task distribution:")
    for task, count in task_counts.items():
        print(f" {task}... : {count}")

    return dataset_dict
|
| 811 |
+
|
| 812 |
+
|
| 813 |
+
# Run the generator only when this file is executed as a script.
if __name__ == "__main__":
    main()
|