File size: 51,592 Bytes
b44c1ed | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 
525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 | """
PhD Research OS — Synthetic Training Dataset Generator
=====================================================
Generates multi-task SFT dataset in TRL conversational format for:
Task 1: Scientific Claim Extraction (text → structured claims JSON)
Task 2: Epistemic Classification (Fact / Interpretation / Hypothesis / Conflict_Hypothesis)
Task 3: Confidence Scoring (evidence_strength × study_quality × journal_tier × completeness)
Task 4: Contradiction Detection (claim pair → conflict analysis)
Task 5: Query Decomposition (broad question → sub-queries)
Task 6: Decision Object Generation (gaps + goals → proposed actions with info gain)
Output: HF Dataset with "messages" column in conversational ChatML format.
"""
import json
import random
import hashlib
from datasets import Dataset, DatasetDict
random.seed(42)
# ============================================================
# SYSTEM PROMPTS (one per task — stored in /config/prompts/)
# ============================================================
SYSTEM_CLAIM_EXTRACTION = """You are the Researcher Agent of a PhD Research OS. Your role is to extract structured scientific claims from research paper text.
For each claim, output a JSON object with these fields:
- claim_id: string (CLM_XXXX format)
- text: the claim text as stated in the paper
- epistemic_tag: one of "Fact", "Interpretation", "Hypothesis", "Conflict_Hypothesis"
- confidence: float [0,1] computed as evidence_strength × study_quality_weight × journal_tier_weight × completeness_penalty
- evidence_strength: float [0,1] based on directness of evidence
- study_type: one of "primary_experimental", "in_vitro", "simulation", "review_non_systematic", "meta_analysis", "case_study"
- missing_fields: list of field names that could not be determined from the text
- status: "Complete" if no missing fields, else "Incomplete"
- parameters: dict of key experimental parameters mentioned (concentrations, temperatures, etc.)
Output must be valid JSON: {"claims": [...]}
Always classify epistemic tags conservatively. When uncertain, prefer "Interpretation" over "Fact"."""
SYSTEM_EPISTEMIC_CLASSIFIER = """You are the Epistemic Classifier of a PhD Research OS. Given a scientific statement, classify it into exactly one category:
- Fact: Directly supported by experimental data with quantitative evidence. Reproducible measurements.
- Interpretation: Author's explanation of data. Goes beyond what the numbers strictly show. Often uses words like "suggests", "indicates", "consistent with".
- Hypothesis: Proposed mechanism or prediction not yet tested. Uses "may", "could", "we propose", "it is possible".
- Conflict_Hypothesis: A claim that explicitly contradicts another established claim in the field. Evidence exists on both sides.
Output JSON: {"epistemic_tag": "...", "reasoning": "...", "confidence_in_classification": float}
Be conservative: if a statement mixes fact and interpretation, classify as Interpretation."""
SYSTEM_CONFIDENCE_SCORER = """You are the Confidence Scorer of a PhD Research OS. Score the confidence of a scientific claim using this formula:
confidence = evidence_strength × study_quality_weight × journal_tier_weight × completeness_penalty
Where:
- evidence_strength [0,1]: How directly the evidence supports the claim
- study_quality_weight: primary_experimental=1.0, in_vitro=0.8, simulation=0.6, review_non_systematic=0.4, meta_analysis=1.0, case_study=0.3
- journal_tier_weight: tier1=1.0, tier2=0.85, tier3=0.7, preprint=0.5
- completeness_penalty: 1.0 if all parameters reported, 0.7 if missing key parameters
Output JSON: {"confidence": float, "evidence_strength": float, "study_quality_weight": float, "journal_tier_weight": float, "completeness_penalty": float, "reasoning": "..."}
Use fixed-point scaled integers internally (multiply by 1000, round, divide by 1000) to avoid floating-point drift."""
SYSTEM_CONFLICT_DETECTOR = """You are the Verifier Agent of a PhD Research OS. Given two scientific claims, determine if they contradict each other.
Analyze the claims and output a Conflict Resolution Object:
- conflict_detected: boolean
- conflict_type: one of "value_mismatch", "methodology_difference", "scope_difference", "no_conflict"
- generated_hypothesis: text explaining the possible cause of the conflict
- hypothesis_confidence: always "low" (never auto-set to high — human review required)
- resolution_status: "Unresolved"
- key_differences: list of specific parameter/methodology differences
- recommended_action: what the researcher should investigate to resolve this
Output valid JSON. Be thorough but conservative — flag real conflicts, not superficial differences."""
SYSTEM_QUERY_DECOMPOSER = """You are the Query Planner of a PhD Research OS. Given a broad research question, decompose it into 2-4 specific sub-queries that can be independently searched in a scientific knowledge base.
Each sub-query should:
- Target a specific aspect of the question
- Be answerable from individual paper claims
- Together, cover the full scope of the original question
Output JSON: {"original_query": "...", "sub_queries": ["...", "..."], "reasoning": "..."}"""
SYSTEM_DECISION_GENERATOR = """You are the Decision Agent of a PhD Research OS. Given the current research goals, knowledge gaps, and incomplete/low-confidence claims, propose a Decision Object.
A Decision Object includes:
- decision_id: string (DEC_XXXX)
- recommended_action: one of "experiment", "literature_search", "collaboration", "replication", "methodology_review"
- action_description: specific description of what to do
- expected_information_gain: float [0,1] = uncertainty_of_claim × impact_on_goal
- linked_goal_id: which research goal this addresses
- linked_claim_ids: which claims this would resolve
- priority: "high", "medium", "low"
- estimated_effort: rough time estimate
Output valid JSON. Prioritize actions with highest information gain per unit effort."""
# ============================================================
# STEM DOMAIN KNOWLEDGE BASE (for generating realistic examples)
# ============================================================
# Per-domain vocabulary used by the example generators. Each entry provides:
#   topics            — subject phrases sampled into synthetic excerpts/claims
#   parameters        — measurable-quantity names (also used as claim params)
#   journals_t1/t2/t3 — journal pools by tier; the tier sampled at generation
#                       time selects the pool and the journal_tier_weight
STEM_DOMAINS = {
    "biosensors": {
        "topics": ["graphene FET sensors", "Debye length screening", "aptamer functionalization",
                   "limit of detection", "signal-to-noise ratio", "ionic strength effects",
                   "surface chemistry", "biomarker detection", "point-of-care diagnostics"],
        "parameters": ["concentration", "ionic_strength_mM", "temperature_C", "pH",
                       "incubation_time_min", "gate_voltage_V", "drain_current_uA",
                       "sensitivity_mV_per_decade", "LOD_fM", "selectivity_ratio"],
        "journals_t1": ["Nature Biotechnology", "ACS Nano", "Nano Letters", "Biosensors and Bioelectronics"],
        "journals_t2": ["Analytical Chemistry", "Lab on a Chip", "Sensors and Actuators B"],
        "journals_t3": ["IEEE Sensors Journal", "Microchimica Acta"]
    },
    "materials_science": {
        "topics": ["2D materials", "MoS2 synthesis", "CVD growth", "defect engineering",
                   "band gap tuning", "heterostructures", "strain engineering"],
        "parameters": ["thickness_nm", "growth_temperature_C", "pressure_torr", "carrier_gas_flow_sccm",
                       "grain_size_um", "mobility_cm2_Vs", "bandgap_eV", "defect_density_cm2"],
        "journals_t1": ["Nature Materials", "Advanced Materials", "ACS Nano"],
        "journals_t2": ["Chemistry of Materials", "2D Materials", "Nanoscale"],
        "journals_t3": ["Materials Research Express", "Journal of Materials Science"]
    },
    "electrochemistry": {
        "topics": ["battery electrolytes", "solid-state batteries", "lithium-ion transport",
                   "electrode-electrolyte interface", "impedance spectroscopy", "cycling stability"],
        "parameters": ["ionic_conductivity_S_cm", "activation_energy_eV", "cycle_number",
                       "capacity_retention_pct", "voltage_window_V", "current_density_mA_cm2",
                       "coulombic_efficiency_pct", "electrode_thickness_um"],
        "journals_t1": ["Nature Energy", "Joule", "Advanced Energy Materials"],
        "journals_t2": ["Journal of the Electrochemical Society", "Electrochimica Acta"],
        "journals_t3": ["Batteries", "Journal of Power Sources"]
    },
    "computational_biology": {
        "topics": ["protein folding", "molecular dynamics", "drug-target interaction",
                   "genomic analysis", "CRISPR efficiency prediction", "gene regulatory networks"],
        "parameters": ["RMSD_angstrom", "binding_affinity_kcal_mol", "simulation_time_ns",
                       "accuracy_pct", "AUC_ROC", "precision", "recall", "F1_score"],
        "journals_t1": ["Nature Methods", "Nature Biotechnology", "Cell Systems"],
        "journals_t2": ["Bioinformatics", "PLOS Computational Biology", "BMC Genomics"],
        "journals_t3": ["Journal of Computational Biology", "Computational Biology and Chemistry"]
    },
    "quantum_computing": {
        "topics": ["qubit coherence", "quantum error correction", "superconducting circuits",
                   "quantum algorithms", "quantum supremacy benchmarks", "topological qubits"],
        "parameters": ["T1_us", "T2_us", "gate_fidelity_pct", "qubit_count",
                       "error_rate", "circuit_depth", "quantum_volume"],
        "journals_t1": ["Nature", "Science", "Physical Review Letters"],
        "journals_t2": ["Physical Review A", "Quantum", "npj Quantum Information"],
        "journals_t3": ["Quantum Science and Technology", "Journal of Physics A"]
    }
}
# ============================================================
# EXAMPLE GENERATORS
# ============================================================
def generate_claim_extraction_examples(n=500):
    """Generate claim-extraction training examples (Task 1).

    Builds a synthetic paper excerpt from a random STEM_DOMAINS entry and
    pairs it with a structured JSON claims object, packaged as a ChatML
    conversation (system / user / assistant) under the "messages" key.

    Args:
        n: number of examples to generate (default 500).

    Returns:
        list[dict]: TRL conversational records, one per synthetic paper.

    Fixes vs previous version:
    - confidence now uses round() in the fixed-point step, as the
      SYSTEM_CONFIDENCE_SCORER spec says ("multiply by 1000, round, divide
      by 1000"); int() truncated and could drop a thousandth to binary
      float error (e.g. 0.595 * 1000 -> 594.999... -> 594).
    - claim text is now chosen consistently with the sampled epistemic_tag;
      previously text and tag were sampled independently, so a
      "We hypothesize..." sentence could be labelled "Fact", creating
      contradictory supervision.
    - removed the dead `paper_templates` local (its text/num_claims/
      claim_types fields were never consumed by the generation logic).
    """
    examples = []
    # Loop-invariant weight tables; values mirror SYSTEM_CONFIDENCE_SCORER.
    study_quality_map = {
        "primary_experimental": 1.0, "in_vitro": 0.8, "simulation": 0.6,
        "review_non_systematic": 0.4, "meta_analysis": 1.0, "case_study": 0.3
    }
    journal_tier_map = {1: 1.0, 2: 0.85, 3: 0.7}
    study_types = ["primary_experimental", "in_vitro", "simulation", "review_non_systematic", "meta_analysis"]
    # Indices into claim_text_options that are stylistically consistent with
    # each epistemic tag (0/1 measurements, 2 mechanism, 3 hypothesis, 4 conflict).
    tag_text_indices = {
        "Fact": [0, 1],
        "Interpretation": [2],
        "Hypothesis": [3],
        "Conflict_Hypothesis": [4],
    }
    for i in range(n):
        domain_name = random.choice(list(STEM_DOMAINS.keys()))
        domain = STEM_DOMAINS[domain_name]
        topic = random.choice(domain["topics"])
        params = random.sample(domain["parameters"], min(4, len(domain["parameters"])))
        # Journal tier drives both the journal pool and the tier weight below.
        journal_tier = random.choices([1, 2, 3], weights=[0.3, 0.4, 0.3])[0]
        if journal_tier == 1:
            journal = random.choice(domain["journals_t1"])
        elif journal_tier == 2:
            journal = random.choice(domain["journals_t2"])
        else:
            journal = random.choice(domain["journals_t3"])
        study_type = random.choices(study_types, weights=[0.4, 0.2, 0.15, 0.1, 0.15])[0]
        # Build paper excerpt (simplified — real version would use LLM)
        excerpt = f"""[Excerpt from: "{topic}: Recent Advances" — Published in {journal}, 2024]
In this study, we examined {topic} with particular focus on the relationship between {params[0]} and {params[1] if len(params) > 1 else 'system performance'}. Using {study_type.replace('_', ' ')} methodology, we measured {params[0]} under varying conditions of {params[1] if len(params) > 1 else 'standard parameters'}.
Our primary finding is that {params[0]} exhibits a {random.choice(['linear', 'exponential', 'logarithmic', 'sigmoidal'])} dependence on {params[1] if len(params) > 1 else 'the control variable'}, with a correlation coefficient of {round(random.uniform(0.7, 0.99), 3)}. The optimal value of {params[0]} was found to be {round(random.uniform(0.1, 100), 2)} under conditions where {params[1] if len(params) > 1 else 'temperature'} = {round(random.uniform(20, 200), 1)}.
{random.choice(['Statistical analysis confirmed', 'ANOVA testing revealed', 'Mann-Whitney U test showed'])} significance at p < {random.choice(['0.001', '0.01', '0.05'])} (n = {random.choice([3, 5, 10, 20, 50])}). These results {'align with' if random.random() > 0.3 else 'contradict'} previous reports by {random.choice(['Smith et al.', 'Zhang et al.', 'Kumar et al.', 'Johnson et al.'])} who found {params[0]} values of {round(random.uniform(0.1, 100), 2)}.
We interpret these findings as evidence that {random.choice(['the proposed mechanism involves', 'the dominant factor is', 'surface interactions govern'])} the observed behavior. We hypothesize that {random.choice(['further optimization could achieve', 'a threshold effect exists at', 'competing mechanisms dominate above'])} {round(random.uniform(10, 500), 1)}."""
        # Generate the structured claims that form the assistant target.
        num_claims = random.randint(3, 7)
        claims = []
        for j in range(num_claims):
            epistemic = random.choices(
                ["Fact", "Interpretation", "Hypothesis", "Conflict_Hypothesis"],
                weights=[0.4, 0.3, 0.2, 0.1]
            )[0]
            evidence_strength = round(random.uniform(0.5, 1.0), 3)
            sq_weight = study_quality_map.get(study_type, 0.6)
            jt_weight = journal_tier_map.get(journal_tier, 0.7)
            has_missing = random.random() < 0.25
            completeness = 0.7 if has_missing else 1.0
            missing = random.sample(params, random.randint(1, 2)) if has_missing else []
            # Fixed-point confidence per spec: scale by 1000, round, divide.
            conf_raw = evidence_strength * sq_weight * jt_weight * completeness
            confidence = round(conf_raw * 1000) / 1000
            claim_text_options = [
                f"The {params[0]} was measured at {round(random.uniform(0.1, 100), 2)} under standard conditions.",
                f"A {random.choice(['positive', 'negative', 'non-linear'])} correlation was observed between {params[0]} and {params[1] if len(params) > 1 else 'output'}.",
                f"The proposed mechanism suggests that {topic} is primarily governed by {random.choice(['surface effects', 'bulk properties', 'interfacial phenomena'])}.",
                f"We hypothesize that optimizing {params[0]} beyond {round(random.uniform(50, 200), 1)} could yield {random.choice(['enhanced', 'diminished', 'qualitatively different'])} results.",
                f"These findings contradict the established model by {random.choice(['Smith et al.', 'Zhang et al.', 'Lee et al.'])}, who reported {random.choice(['opposite', 'significantly different', 'null'])} effects.",
            ]
            # Select text that matches the label so tag supervision is coherent.
            claim_text = claim_text_options[random.choice(tag_text_indices[epistemic])]
            claim_obj = {
                # j < 7 < 10, so i*10+j is unique across the whole dataset.
                "claim_id": f"CLM_{i*10+j:04d}",
                "text": claim_text,
                "epistemic_tag": epistemic,
                "confidence": confidence,
                "evidence_strength": evidence_strength,
                "study_type": study_type,
                "study_quality_weight": sq_weight,
                "journal_tier_weight": jt_weight,
                "completeness_penalty": completeness,
                "missing_fields": missing,
                "status": "Incomplete" if has_missing else "Complete",
                "parameters": {p: round(random.uniform(0.1, 100), 2) for p in random.sample(params, min(2, len(params)))}
            }
            claims.append(claim_obj)
        response = json.dumps({"claims": claims, "source_doi": f"10.1234/example.{i:04d}", "paper_domain": domain_name}, indent=2)
        examples.append({
            "messages": [
                {"role": "system", "content": SYSTEM_CLAIM_EXTRACTION},
                {"role": "user", "content": f"Extract all scientific claims from the following paper excerpt:\n\n{excerpt}"},
                {"role": "assistant", "content": response}
            ]
        })
    return examples
def generate_epistemic_classification_examples(n=400):
    """Generate epistemic-tag classification examples (Task 2).

    Samples a ground-truth tag, picks a canned statement of that kind from
    the statement bank, and emits the expected classifier JSON (tag +
    canned reasoning + sampled classification confidence) as the assistant
    turn of a ChatML conversation.

    Args:
        n: number of examples to generate (default 400).

    Returns:
        list[dict]: TRL conversational records.

    Fix vs previous version: `reasoning_templates` was rebuilt inside the
    loop on every iteration although it is loop-invariant; it is now built
    once. The unused loop index was also replaced with `_`.
    """
    examples = []
    # Statement bank keyed by the ground-truth epistemic tag.
    statements = {
        "Fact": [
            "The measured ionic conductivity of the LLZO pellet was 4.2 × 10⁻⁴ S/cm at 25°C.",
            "Graphene field-effect transistors showed a Dirac point shift of 45 mV upon target binding.",
            "The crystal structure was confirmed as tetragonal by XRD analysis (JCPDS 00-024-0867).",
            "Cell viability remained above 95% after 48 hours of exposure to nanoparticle concentrations up to 100 μg/mL.",
            "The reaction yield increased from 32% to 87% when the catalyst loading was doubled from 5 mol% to 10 mol%.",
            "Raman spectroscopy revealed a G/2D peak ratio of 0.35, consistent with monolayer graphene.",
            "The bandgap energy was determined to be 1.85 eV from UV-Vis absorption spectroscopy.",
            "Atomic force microscopy confirmed a film thickness of 12.3 ± 0.4 nm across 20 measurement points.",
            "The protein folding simulation converged after 850 ns with RMSD < 2.0 Å.",
            "HPLC analysis showed 99.2% purity of the synthesized compound.",
            "The qubit T1 relaxation time was measured at 152 ± 8 μs at 15 mK.",
            "Mass spectrometry confirmed the molecular ion peak at m/z = 342.18, consistent with the expected product.",
            "The transistor exhibited an on/off current ratio of 10⁶ with a subthreshold swing of 68 mV/decade.",
            "Flow cytometry analysis revealed 78.3% of cells were in the G1 phase after treatment.",
            "The electrode maintained 94.2% capacity retention after 500 charge-discharge cycles at 1C rate."
        ],
        "Interpretation": [
            "The observed Dirac point shift suggests successful functionalization of the graphene surface.",
            "These results indicate that the ion transport mechanism is primarily governed by grain boundary diffusion.",
            "The non-linear dose-response curve is consistent with a cooperative binding model.",
            "The improved performance at elevated temperatures points to thermally activated charge transport.",
            "Our data support the hypothesis that surface defects play a critical role in catalytic activity.",
            "The correlation between particle size and reactivity implies surface-area-dependent kinetics.",
            "These findings are consistent with a two-step nucleation mechanism rather than classical nucleation theory.",
            "The asymmetric peak broadening in XRD patterns suggests the presence of microstrain.",
            "The enhanced fluorescence lifetime indicates reduced non-radiative recombination pathways.",
            "The inverse relationship between ionic strength and sensitivity aligns with Debye screening predictions.",
            "Based on the activation energy of 0.32 eV, we conclude that lithium diffusion occurs via an interstitial mechanism.",
            "The observed blue shift in photoluminescence is attributable to quantum confinement effects.",
            "The saturation behavior above 100 nM concentration reflects receptor site limitation."
        ],
        "Hypothesis": [
            "We propose that the anomalous conductivity enhancement arises from a percolation network of amorphous regions.",
            "It is possible that the observed bistability originates from competing ferroelectric and antiferroelectric phases.",
            "Future experiments with isotope labeling could determine whether proton hopping or vehicle mechanism dominates.",
            "We hypothesize that introducing tensile strain into the MoS2 lattice will reduce the bandgap below 1.5 eV.",
            "A possible explanation is that the protein undergoes a conformational change upon ligand binding that exposes a hidden epitope.",
            "We speculate that the unexpected catalytic activity may arise from edge-site defects not captured in bulk characterization.",
            "If the proposed mechanism is correct, replacing the counter-ion should produce a measurable shift in the voltammetric response.",
            "The anomalous transport behavior could potentially be explained by a polaron hopping model.",
            "We conjecture that the system exhibits a quantum phase transition at a critical doping concentration of approximately 0.15.",
            "A theoretical framework based on Marcus theory predicts that electron transfer rates should increase by 10× at these reorganization energies.",
            "It remains to be tested whether this enhancement persists under physiological buffer conditions."
        ],
        "Conflict_Hypothesis": [
            "Our observation of decreasing sensitivity at high ionic strength directly contradicts Chen et al. (2022), who reported sensitivity enhancement under similar conditions.",
            "While the established model predicts a linear relationship between film thickness and resistance, our data show clear deviation above 20 nm.",
            "These results challenge the widely accepted Langmuir adsorption model, suggesting that multilayer formation occurs at concentrations previously considered sub-monolayer.",
            "Contrary to the predictions of density functional theory calculations by Park et al., we observe metallic rather than semiconducting behavior in this phase.",
            "The measured activation energy of 0.52 eV is significantly higher than the 0.28 eV reported by three independent groups, suggesting a fundamentally different transport mechanism.",
            "Our finding that the reaction proceeds without the proposed intermediate contradicts the established mechanism.",
            "The negative correlation we observe between grain size and conductivity opposes the conventional understanding based on brick-layer model predictions.",
            "While Johnson et al. reported complete stability over 1000 cycles, our replication attempt shows measurable degradation beginning at cycle 300."
        ]
    }
    # Canned per-tag reasoning; loop-invariant, built once (previously this
    # dict was reconstructed on every loop iteration).
    reasoning_templates = {
        "Fact": "This statement reports a direct measurement/observation with specific quantitative data. It does not include author interpretation or speculation.",
        "Interpretation": "This statement goes beyond raw data to explain or attribute meaning to observations. The author draws conclusions that are not strictly contained in the measurements alone.",
        "Hypothesis": "This statement proposes an untested mechanism or prediction. Key indicators: uses speculative language (may, could, hypothesize, propose, possible).",
        "Conflict_Hypothesis": "This statement explicitly contradicts an established finding or widely accepted model, with evidence supporting both positions."
    }
    for _ in range(n):
        tag = random.choices(
            ["Fact", "Interpretation", "Hypothesis", "Conflict_Hypothesis"],
            weights=[0.35, 0.30, 0.20, 0.15]
        )[0]
        statement = random.choice(statements[tag])
        conf_in_class = round(random.uniform(0.75, 0.98), 2)
        response = json.dumps({
            "epistemic_tag": tag,
            "reasoning": reasoning_templates[tag],
            "confidence_in_classification": conf_in_class
        }, indent=2)
        examples.append({
            "messages": [
                {"role": "system", "content": SYSTEM_EPISTEMIC_CLASSIFIER},
                {"role": "user", "content": f"Classify the epistemic status of this scientific statement:\n\n\"{statement}\""},
                {"role": "assistant", "content": response}
            ]
        })
    return examples
def generate_confidence_scoring_examples(n=300):
    """Generate confidence-scoring training examples (Task 3).

    Draws a study type, journal tier, and evidence strength, computes the
    multiplicative confidence per the SYSTEM_CONFIDENCE_SCORER spec, and
    emits the component breakdown plus a worked-reasoning string as the
    assistant JSON of a ChatML conversation.

    Args:
        n: number of examples to generate (default 300).

    Returns:
        list[dict]: TRL conversational records.

    Fixes vs previous version:
    - fixed-point step now uses round() as the spec says ("multiply by 1000,
      round, divide by 1000"); int() truncated and could lose a thousandth
      to binary float representation error.
    - the sq_map/jt_map weight tables were rebuilt on every loop iteration;
      they are loop-invariant and are now built once.
    """
    examples = []
    # Weight tables mirror SYSTEM_CONFIDENCE_SCORER; hoisted out of the loop.
    sq_map = {"primary_experimental": 1.0, "in_vitro": 0.8, "simulation": 0.6,
              "review_non_systematic": 0.4, "meta_analysis": 1.0, "case_study": 0.3}
    jt_map = {1: 1.0, 2: 0.85, 3: 0.7, "preprint": 0.5}
    for _ in range(n):
        domain_name = random.choice(list(STEM_DOMAINS.keys()))
        domain = STEM_DOMAINS[domain_name]
        study_type = random.choices(
            ["primary_experimental", "in_vitro", "simulation", "review_non_systematic", "meta_analysis", "case_study"],
            weights=[0.35, 0.20, 0.15, 0.10, 0.10, 0.10]
        )[0]
        # journal_tier is an int (1-3) or the string "preprint".
        journal_tier = random.choices([1, 2, 3, "preprint"], weights=[0.25, 0.35, 0.25, 0.15])[0]
        if journal_tier == 1:
            journal = random.choice(domain["journals_t1"])
        elif journal_tier == 2:
            journal = random.choice(domain["journals_t2"])
        elif journal_tier == 3:
            journal = random.choice(domain["journals_t3"])
        else:
            journal = "arXiv preprint"
        evidence_strength = round(random.uniform(0.3, 1.0), 3)
        sq_weight = sq_map[study_type]
        jt_weight = jt_map[journal_tier]
        has_missing = random.random() < 0.3
        completeness = 0.7 if has_missing else 1.0
        # Fixed-point confidence per spec: scale by 1000, round, divide.
        raw = evidence_strength * sq_weight * jt_weight * completeness
        confidence = round(raw * 1000) / 1000
        claim_text = f"Claim from {journal} ({study_type.replace('_', ' ')} study): {random.choice(domain['topics'])} measured {random.choice(domain['parameters'])} at {round(random.uniform(0.1, 100), 2)}."
        if has_missing:
            claim_text += " [Note: sample size and error margins not reported]"
        reasoning = f"Study type '{study_type}' gets weight {sq_weight}. Journal '{journal}' is tier {journal_tier} (weight {jt_weight}). Evidence strength assessed at {evidence_strength} based on directness of measurement. {'Missing key parameters reduce completeness penalty to 0.7.' if has_missing else 'All parameters reported, completeness 1.0.'} Final: {evidence_strength} × {sq_weight} × {jt_weight} × {completeness} = {confidence}."
        response = json.dumps({
            "confidence": confidence,
            "evidence_strength": evidence_strength,
            "study_quality_weight": sq_weight,
            "journal_tier_weight": jt_weight,
            "completeness_penalty": completeness,
            "reasoning": reasoning
        }, indent=2)
        examples.append({
            "messages": [
                {"role": "system", "content": SYSTEM_CONFIDENCE_SCORER},
                {"role": "user", "content": f"Score the confidence of this claim:\n\n{claim_text}\n\nSource: {journal}\nStudy type: {study_type}\nJournal tier: {journal_tier}"},
                {"role": "assistant", "content": response}
            ]
        })
    return examples
def generate_conflict_detection_examples(n=300):
    """Generate contradiction detection training examples.

    Samples (with replacement) from a hand-curated set of claim pairs —
    some genuinely conflicting, some merely scope-different — and emits
    chat-format examples whose assistant turn is a JSON conflict analysis.

    Args:
        n: Number of examples to generate.

    Returns:
        List of dicts, each with a "messages" list (system/user/assistant).
    """
    examples = []
    conflict_pairs = [
        # Real conflicts
        {
            "claim_a": "Graphene FET sensitivity increases monotonically with decreasing ionic strength (measured range: 0.1-100 mM PBS).",
            "claim_b": "Below 1 mM ionic strength, GFET sensitivity plateaus due to parasitic charge screening from surface-adsorbed species.",
            "conflict": True,
            "conflict_type": "value_mismatch",
            "hypothesis": "The discrepancy likely arises from different surface functionalization protocols. Claim A used bare graphene while Claim B used PEG-passivated surfaces, which would accumulate different surface charges at very low ionic strength.",
            "key_diffs": ["surface treatment protocol", "ionic strength range tested", "measurement technique"]
        },
        {
            "claim_a": "The Li-ion conductivity of LLZO was 1.2 × 10⁻³ S/cm at room temperature.",
            "claim_b": "LLZO pellets sintered under identical conditions showed conductivity of 2.8 × 10⁻⁴ S/cm at 25°C.",
            "conflict": True,
            "conflict_type": "value_mismatch",
            "hypothesis": "The order-of-magnitude difference may stem from different grain boundary densities. Claim A likely reports single-crystal or highly-densified samples, while Claim B may include significant grain boundary resistance.",
            "key_diffs": ["sample preparation method", "densification level", "measurement geometry"]
        },
        {
            "claim_a": "MoS2 monolayers grown by CVD at 700°C show exclusively 2H phase.",
            "claim_b": "CVD-grown MoS2 at 700°C contains 30-40% 1T phase, as confirmed by XPS peak deconvolution.",
            "conflict": True,
            "conflict_type": "methodology_difference",
            "hypothesis": "The 1T phase in Claim B may be induced during the transfer process (common with wet-chemical transfer) or by the choice of sulfur precursor. Claim A may have used a different transfer method or in-situ characterization.",
            "key_diffs": ["sulfur precursor", "transfer method", "characterization timing (in-situ vs ex-situ)"]
        },
        {
            "claim_a": "The protein binding affinity (Kd) for aptamer X was 2.3 nM in buffer.",
            "claim_b": "Aptamer X showed Kd of 180 nM when tested in 50% human serum.",
            "conflict": False,
            "conflict_type": "scope_difference",
            "hypothesis": "These measurements are not contradictory — they reflect different measurement conditions. The 78× decrease in affinity in serum is expected due to non-specific protein interactions, ionic strength differences, and potential aptamer degradation by nucleases.",
            "key_diffs": ["measurement medium (buffer vs serum)", "matrix effects"]
        },
        {
            "claim_a": "Quantum dot fluorescence quenching follows Stern-Volmer kinetics with KSV = 4.5 × 10⁴ M⁻¹.",
            "claim_b": "The same QD-quencher system shows non-linear Stern-Volmer behavior above 10 μM quencher concentration.",
            "conflict": False,
            "conflict_type": "scope_difference",
            "hypothesis": "Both claims can be simultaneously true. The Stern-Volmer relationship is linear at low quencher concentrations (Claim A's measurement range) but deviates at higher concentrations (Claim B) due to static quenching or ground-state complex formation.",
            "key_diffs": ["concentration range", "quenching mechanism regime"]
        },
        # More conflicts
        {
            "claim_a": "Sonication for 30 minutes produces graphene flakes with average lateral size of 500 nm.",
            "claim_b": "Extended sonication (30 min) yields graphene fragments predominantly below 100 nm with significant edge defects.",
            "conflict": True,
            "conflict_type": "value_mismatch",
            "hypothesis": "The 5× difference in reported flake size likely stems from different sonication power/frequency settings, solvent choice (NMP vs water/surfactant), or measurement method (DLS vs AFM). Additionally, definition of 'average' may differ (number-averaged vs volume-averaged).",
            "key_diffs": ["sonication parameters (power, frequency)", "solvent system", "size measurement technique", "averaging method"]
        },
        {
            "claim_a": "The neural network achieved 96.3% accuracy on the protein structure prediction benchmark.",
            "claim_b": "On the same benchmark, the identical architecture achieved only 84.1% accuracy when trained with a different random seed.",
            "conflict": True,
            "conflict_type": "methodology_difference",
            "hypothesis": "The 12.2% accuracy gap from random seed variation alone suggests the model is highly sensitive to initialization. This may indicate overfitting to specific training data partitions or instability in the optimization landscape.",
            "key_diffs": ["random seed", "potentially different train/test splits", "convergence criteria"]
        },
        {
            "claim_a": "The catalyst achieves 99% conversion at 80°C.",
            "claim_b": "The catalyst achieves 99% conversion at 80°C with fresh reagents, but only 45% with recycled catalyst after 3 cycles.",
            "conflict": False,
            "conflict_type": "scope_difference",
            "hypothesis": "These are not contradictory. Claim A reports initial performance while Claim B adds information about durability. The 54% drop after 3 cycles reveals catalyst deactivation, possibly from active site poisoning or structural degradation.",
            "key_diffs": ["catalyst reuse cycle", "implicit freshness assumption in Claim A"]
        }
    ]
    for _ in range(n):
        pair = random.choice(conflict_pairs)
        # Recommended follow-up depends on whether a genuine conflict exists.
        # (The original wrapped this conditional in a redundant f-string.)
        if pair["conflict"]:
            recommended_action = "Investigate the specific methodological differences identified. Request raw data from both groups if possible."
        else:
            recommended_action = "No conflict resolution needed. Both claims are valid within their respective scopes. Document the scope boundary."
        response = json.dumps({
            "conflict_detected": pair["conflict"],
            "conflict_type": pair["conflict_type"],
            "generated_hypothesis": pair["hypothesis"],
            "hypothesis_confidence": "low",
            "resolution_status": "Unresolved",
            "key_differences": pair["key_diffs"],
            "recommended_action": recommended_action
        }, indent=2)
        examples.append({
            "messages": [
                {"role": "system", "content": SYSTEM_CONFLICT_DETECTOR},
                {"role": "user", "content": f"Analyze these two claims for contradictions:\n\nClaim A: \"{pair['claim_a']}\"\n\nClaim B: \"{pair['claim_b']}\""},
                {"role": "assistant", "content": response}
            ]
        })
    return examples
def generate_query_decomposition_examples(n=200):
    """Generate query decomposition examples.

    Draws n templates (with replacement) from a fixed pool of research
    questions, each paired with hand-written sub-queries and reasoning,
    and packages them as system/user/assistant chat examples.
    """
    templates = [
        {
            "query": "What is the current state of graphene-based biosensors for cancer biomarker detection?",
            "sub_queries": [
                "What cancer biomarkers have been detected using graphene FET sensors?",
                "What are the reported limits of detection for graphene biosensors targeting cancer markers?",
                "How does graphene biosensor performance compare to ELISA and other standard methods?",
                "What are the main challenges preventing clinical translation of graphene biosensors?"
            ],
            "reasoning": "This broad question spans detection targets, performance metrics, comparative assessment, and translational barriers. Each sub-query targets a specific knowledge domain that maps to different claim types in the database."
        },
        {
            "query": "How does ionic strength affect the performance of field-effect transistor biosensors?",
            "sub_queries": [
                "What is the Debye screening length at different ionic strength values?",
                "How does sensitivity change as a function of buffer ionic strength for FET biosensors?",
                "What strategies have been developed to overcome Debye screening limitations?"
            ],
            "reasoning": "The question involves fundamental physics (Debye length), empirical relationships (sensitivity vs ionic strength), and engineering solutions (overcoming limitations)."
        },
        {
            "query": "What are the best solid-state electrolytes for next-generation lithium batteries?",
            "sub_queries": [
                "What ionic conductivities have been achieved in garnet-type, sulfide, and polymer electrolytes?",
                "What are the interfacial stability challenges between solid electrolytes and lithium metal anodes?",
                "How do manufacturing scalability and cost compare across solid electrolyte families?",
                "What degradation mechanisms limit cycle life in solid-state batteries?"
            ],
            "reasoning": "This question spans material performance (conductivity), interface engineering, practical considerations (cost/scale), and durability. Each is a distinct research sub-domain."
        },
        {
            "query": "Can CRISPR-Cas9 efficiency be predicted computationally?",
            "sub_queries": [
                "What machine learning models have been developed for CRISPR guide RNA efficiency prediction?",
                "What features (sequence, structure, chromatin) are most predictive of CRISPR cutting efficiency?",
                "How do computational predictions compare to experimental validation data?"
            ],
            "reasoning": "The question bridges computational methods, feature engineering, and experimental validation — three distinct knowledge areas."
        },
        {
            "query": "What determines qubit coherence time in superconducting quantum processors?",
            "sub_queries": [
                "What are the dominant decoherence mechanisms (T1, T2) in transmon qubits?",
                "How do materials and fabrication choices affect qubit coherence?",
                "What is the current state-of-the-art for superconducting qubit coherence times?"
            ],
            "reasoning": "This question involves fundamental physics (decoherence mechanisms), engineering (fabrication), and benchmarking (state-of-the-art values)."
        },
        {
            "query": "How effective are 2D material heterostructures for photocatalytic water splitting?",
            "sub_queries": [
                "What 2D material combinations have been tested for photocatalytic hydrogen evolution?",
                "What are the reported solar-to-hydrogen conversion efficiencies for 2D heterostructure photocatalysts?",
                "How does band alignment in 2D heterostructures affect charge separation and catalytic activity?",
                "What stability and durability challenges exist for 2D photocatalysts?"
            ],
            "reasoning": "Covers materials (combinations), performance metrics (efficiency), mechanism (band alignment), and practical challenges (stability) — four distinct research angles."
        }
    ]
    examples = []
    for _ in range(n):
        chosen = random.choice(templates)
        # Assistant turn: the decomposition serialized as pretty-printed JSON.
        payload = json.dumps({
            "original_query": chosen["query"],
            "sub_queries": chosen["sub_queries"],
            "reasoning": chosen["reasoning"]
        }, indent=2)
        user_turn = f"Decompose this research question into specific sub-queries:\n\n\"{chosen['query']}\""
        examples.append({
            "messages": [
                {"role": "system", "content": SYSTEM_QUERY_DECOMPOSER},
                {"role": "user", "content": user_turn},
                {"role": "assistant", "content": payload}
            ]
        })
    return examples
def generate_decision_object_examples(n=200):
    """Generate Decision Object training examples.

    Samples (with replacement) from fixed research scenarios — each with a
    goal, knowledge gaps, low-confidence claims, and a pre-written Decision
    Object — and renders them as system/user/assistant chat examples.

    Args:
        n: Number of examples to generate.

    Returns:
        List of dicts, each with a "messages" list (system/user/assistant).
    """
    examples = []
    scenarios = [
        {
            "goal": "Achieve sub-femtomolar detection limit for cardiac troponin I using GFET biosensor",
            "gaps": ["Optimal aptamer sequence for cTnI not determined", "Debye screening at physiological ionic strength limits sensitivity", "No data on sensor-to-sensor reproducibility"],
            "low_confidence_claims": ["CLM_0042: PEG spacer length of 5 kDa optimal (confidence: 0.35)", "CLM_0089: Desalting step preserves >90% of target protein (confidence: 0.41)"],
            "decision": {
                "decision_id": "DEC_0001",
                "recommended_action": "experiment",
                "action_description": "Systematic optimization of aptamer surface density and PEG spacer length on GFET. Test 3 aptamer concentrations × 3 PEG lengths × 3 ionic strength conditions. Include negative controls.",
                "expected_information_gain": 0.72,
                "linked_goal_id": "GOAL_001",
                "linked_claim_ids": ["CLM_0042", "CLM_0089"],
                "priority": "high",
                "estimated_effort": "2-3 weeks of lab work + 1 week analysis"
            }
        },
        {
            "goal": "Understand degradation mechanism in solid-state lithium battery at >100 cycles",
            "gaps": ["Interface evolution not characterized in-situ", "Role of grain boundary resistance vs bulk unclear", "Temperature dependence of degradation unknown"],
            "low_confidence_claims": ["CLM_0156: Lithium dendrite penetration through grain boundaries (confidence: 0.28)", "CLM_0201: SEI formation at cathode interface dominates early degradation (confidence: 0.39)"],
            "decision": {
                "decision_id": "DEC_0002",
                "recommended_action": "literature_search",
                "action_description": "Comprehensive search for in-situ/operando characterization studies of LLZO-lithium interfaces during cycling. Focus on synchrotron XRD and cryo-TEM studies from 2022-2024.",
                "expected_information_gain": 0.58,
                "linked_goal_id": "GOAL_003",
                "linked_claim_ids": ["CLM_0156", "CLM_0201"],
                "priority": "medium",
                "estimated_effort": "1 week literature search + synthesis"
            }
        },
        {
            "goal": "Validate computational model of protein-aptamer binding",
            "gaps": ["MD simulation parameters not benchmarked against experimental Kd", "Force field choice may introduce systematic bias", "Solvent model effects unexplored"],
            "low_confidence_claims": ["CLM_0312: AMBER ff14SB adequate for aptamer-protein complexes (confidence: 0.32)"],
            "decision": {
                "decision_id": "DEC_0003",
                "recommended_action": "collaboration",
                "action_description": "Contact the computational chemistry group (Prof. Martinez) for force field parameterization expertise. Their recent paper on RNA-protein interactions used an optimized force field that may apply here.",
                "expected_information_gain": 0.65,
                "linked_goal_id": "GOAL_005",
                "linked_claim_ids": ["CLM_0312"],
                "priority": "medium",
                "estimated_effort": "Initial meeting + 2-4 weeks of collaborative work"
            }
        },
        {
            "goal": "Establish reproducibility of the nanofabrication process",
            "gaps": ["Batch-to-batch variation not quantified", "Critical process parameters not identified", "No statistical process control"],
            "low_confidence_claims": ["CLM_0089: Yield >80% for device fabrication (confidence: 0.25, based on single batch)"],
            "decision": {
                "decision_id": "DEC_0004",
                "recommended_action": "replication",
                "action_description": "Fabricate 5 independent batches of 20 devices each over 5 separate days. Measure key performance metrics. Perform ANOVA to identify day-to-day and within-batch variation. Calculate Cpk for critical parameters.",
                "expected_information_gain": 0.81,
                "linked_goal_id": "GOAL_002",
                "linked_claim_ids": ["CLM_0089"],
                "priority": "high",
                "estimated_effort": "3-4 weeks fabrication + 1 week characterization + 1 week analysis"
            }
        },
        {
            "goal": "Resolve contradictory claims about qubit decoherence mechanism",
            "gaps": ["Two competing theories with similar explanatory power", "No experiment designed to distinguish between mechanisms"],
            "low_confidence_claims": ["CLM_0445: TLS-dominated decoherence (confidence: 0.42)", "CLM_0446: Quasiparticle tunneling dominates above 50mK (confidence: 0.38)"],
            "decision": {
                "decision_id": "DEC_0005",
                "recommended_action": "methodology_review",
                "action_description": "Design a discriminating experiment: measure T1 as a function of temperature (10-200 mK) and applied magnetic field (0-100 mT). TLS model predicts logarithmic T1(T) while quasiparticle model predicts exponential. This single experiment can resolve the contradiction.",
                "expected_information_gain": 0.89,
                "linked_goal_id": "GOAL_007",
                "linked_claim_ids": ["CLM_0445", "CLM_0446"],
                "priority": "high",
                "estimated_effort": "1 week experiment design + 2 weeks measurement + 1 week analysis"
            }
        }
    ]
    for _ in range(n):
        scenario = random.choice(scenarios)
        # Pre-join the bullet lists so the f-string below stays readable.
        # (The original used chr(10).join(...) inside the f-string to dodge
        # the pre-3.12 ban on backslashes in f-string expressions.)
        gaps_block = "\n".join("- " + g for g in scenario["gaps"])
        claims_block = "\n".join("- " + c for c in scenario["low_confidence_claims"])
        user_prompt = f"""Current research goal: {scenario['goal']}
Knowledge gaps:
{gaps_block}
Low-confidence claims requiring resolution:
{claims_block}
Propose a Decision Object with the highest expected information gain."""
        response = json.dumps(scenario["decision"], indent=2)
        examples.append({
            "messages": [
                {"role": "system", "content": SYSTEM_DECISION_GENERATOR},
                {"role": "user", "content": user_prompt},
                {"role": "assistant", "content": response}
            ]
        })
    return examples
# ============================================================
# MAIN: Generate and combine all task datasets
# ============================================================
def main():
    """Generate all six task datasets, shuffle, split 90/10, and persist.

    Writes a HuggingFace DatasetDict to /app/phd_research_os_dataset and a
    JSON sample of 10 examples to /app/sample_examples.json, then prints
    the per-task distribution.

    Returns:
        The saved DatasetDict with "train" and "test" splits.
    """
    from collections import Counter  # stdlib; local import keeps top of file unchanged

    print("Generating PhD Research OS training dataset...")
    print(" Task 1: Claim Extraction (500 examples)...")
    claim_examples = generate_claim_extraction_examples(500)
    print(" Task 2: Epistemic Classification (400 examples)...")
    epistemic_examples = generate_epistemic_classification_examples(400)
    print(" Task 3: Confidence Scoring (300 examples)...")
    confidence_examples = generate_confidence_scoring_examples(300)
    print(" Task 4: Conflict Detection (300 examples)...")
    conflict_examples = generate_conflict_detection_examples(300)
    print(" Task 5: Query Decomposition (200 examples)...")
    query_examples = generate_query_decomposition_examples(200)
    print(" Task 6: Decision Objects (200 examples)...")
    decision_examples = generate_decision_object_examples(200)
    # Combine all examples
    all_examples = (claim_examples + epistemic_examples + confidence_examples +
                    conflict_examples + query_examples + decision_examples)
    # Shuffle so tasks are interleaved in both splits
    random.shuffle(all_examples)
    print(f"\n Total examples: {len(all_examples)}")
    # Split into train/eval (90/10)
    split_idx = int(len(all_examples) * 0.9)
    train_data = all_examples[:split_idx]
    eval_data = all_examples[split_idx:]
    print(f" Train: {len(train_data)}, Eval: {len(eval_data)}")
    # Create HF Dataset
    train_dataset = Dataset.from_list(train_data)
    eval_dataset = Dataset.from_list(eval_data)
    dataset_dict = DatasetDict({
        "train": train_dataset,
        "test": eval_dataset
    })
    # Save locally
    dataset_dict.save_to_disk("/app/phd_research_os_dataset")
    print("\n Dataset saved to /app/phd_research_os_dataset")
    # Also save as JSON for inspection
    with open("/app/sample_examples.json", "w") as f:
        json.dump(all_examples[:10], f, indent=2)
    print(" Sample examples saved to /app/sample_examples.json")
    # Print distribution stats, keyed by the first 50 chars of the system
    # prompt (a cheap proxy for the task type)
    task_counts = Counter(ex["messages"][0]["content"][:50] for ex in all_examples)
    print("\n Task distribution:")
    for task, count in task_counts.items():
        print(f" {task}... : {count}")
    return dataset_dict
# Script entry point: build and save the dataset when run directly.
if __name__ == "__main__":
    main()
|