Add generate_dataset.py

Browse files

Files changed (1) hide show

generate_dataset.py +814 -0

generate_dataset.py ADDED Viewed

	@@ -0,0 +1,814 @@

+"""
+PhD Research OS — Synthetic Training Dataset Generator
+=====================================================
+Generates a multi-task SFT dataset in TRL conversational format for:
+  Task 1: Scientific Claim Extraction (text → structured claims JSON)
+  Task 2: Epistemic Classification (Fact / Interpretation / Hypothesis / Conflict_Hypothesis)
+  Task 3: Confidence Scoring (evidence_strength × study_quality × journal_tier × completeness)
+  Task 4: Contradiction Detection (claim pair → conflict analysis)
+  Task 5: Query Decomposition (broad question → sub-queries)
+  Task 6: Decision Object Generation (gaps + goals → proposed actions with info gain)
+Output: HF Dataset with "messages" column in conversational ChatML format.
+"""
+import json
+import random
+import hashlib
+from datasets import Dataset, DatasetDict
+random.seed(42)
+# ============================================================
+# SYSTEM PROMPTS (one per task — stored in /config/prompts/)
+# ============================================================
+# System prompt for Task 1 (claim extraction). Sent verbatim as the "system"
+# message of every example built by generate_claim_extraction_examples.
+# NOTE(review): the field list in this prompt does not mention
+# study_quality_weight, journal_tier_weight, completeness_penalty, source_doi
+# or paper_domain, all of which appear in the generated assistant responses —
+# confirm which side reflects the intended output schema.
+SYSTEM_CLAIM_EXTRACTION = """You are the Researcher Agent of a PhD Research OS. Your role is to extract structured scientific claims from research paper text.
+For each claim, output a JSON object with these fields:
+- claim_id: string (CLM_XXXX format)
+- text: the claim text as stated in the paper
+- epistemic_tag: one of "Fact", "Interpretation", "Hypothesis", "Conflict_Hypothesis"
+- confidence: float [0,1] computed as evidence_strength × study_quality_weight × journal_tier_weight × completeness_penalty
+- evidence_strength: float [0,1] based on directness of evidence
+- study_type: one of "primary_experimental", "in_vitro", "simulation", "review_non_systematic", "meta_analysis", "case_study"
+- missing_fields: list of field names that could not be determined from the text
+- status: "Complete" if no missing fields, else "Incomplete"
+- parameters: dict of key experimental parameters mentioned (concentrations, temperatures, etc.)
+Output must be valid JSON: {"claims": [...]}
+Always classify epistemic tags conservatively. When uncertain, prefer "Interpretation" over "Fact"."""
+# System prompt for Task 2 (epistemic classification). Sent verbatim as the
+# "system" message of every example built by
+# generate_epistemic_classification_examples; the four category names listed
+# here are the same four tags that generator samples from.
+SYSTEM_EPISTEMIC_CLASSIFIER = """You are the Epistemic Classifier of a PhD Research OS. Given a scientific statement, classify it into exactly one category:
+- Fact: Directly supported by experimental data with quantitative evidence. Reproducible measurements.
+- Interpretation: Author's explanation of data. Goes beyond what the numbers strictly show. Often uses words like "suggests", "indicates", "consistent with".
+- Hypothesis: Proposed mechanism or prediction not yet tested. Uses "may", "could", "we propose", "it is possible".
+- Conflict_Hypothesis: A claim that explicitly contradicts another established claim in the field. Evidence exists on both sides.
+Output JSON: {"epistemic_tag": "...", "reasoning": "...", "confidence_in_classification": float}
+Be conservative: if a statement mixes fact and interpretation, classify as Interpretation."""
+# System prompt for Task 3 (confidence scoring). Sent verbatim as the "system"
+# message of every example built by generate_confidence_scoring_examples.
+# The study-quality and journal-tier weights quoted in this text are duplicated
+# as dict literals inside the generator functions; keep both in sync when
+# editing either.
+SYSTEM_CONFIDENCE_SCORER = """You are the Confidence Scorer of a PhD Research OS. Score the confidence of a scientific claim using this formula:
+confidence = evidence_strength × study_quality_weight × journal_tier_weight × completeness_penalty
+Where:
+- evidence_strength [0,1]: How directly the evidence supports the claim
+- study_quality_weight: primary_experimental=1.0, in_vitro=0.8, simulation=0.6, review_non_systematic=0.4, meta_analysis=1.0, case_study=0.3
+- journal_tier_weight: tier1=1.0, tier2=0.85, tier3=0.7, preprint=0.5
+- completeness_penalty: 1.0 if all parameters reported, 0.7 if missing key parameters
+Output JSON: {"confidence": float, "evidence_strength": float, "study_quality_weight": float, "journal_tier_weight": float, "completeness_penalty": float, "reasoning": "..."}
+Use fixed-point scaled integers internally (multiply by 1000, round, divide by 1000) to avoid floating-point drift."""
+# System prompt for Task 4 (contradiction detection between two claims).
+# NOTE(review): the code that consumes this constant is not fully visible in
+# this excerpt — presumably generate_conflict_detection_examples; verify.
+SYSTEM_CONFLICT_DETECTOR = """You are the Verifier Agent of a PhD Research OS. Given two scientific claims, determine if they contradict each other.
+Analyze the claims and output a Conflict Resolution Object:
+- conflict_detected: boolean
+- conflict_type: one of "value_mismatch", "methodology_difference", "scope_difference", "no_conflict"
+- generated_hypothesis: text explaining the possible cause of the conflict
+- hypothesis_confidence: always "low" (never auto-set to high — human review required)
+- resolution_status: "Unresolved"
+- key_differences: list of specific parameter/methodology differences
+- recommended_action: what the researcher should investigate to resolve this
+Output valid JSON. Be thorough but conservative — flag real conflicts, not superficial differences."""
+# System prompt for Task 5 (query decomposition: broad question -> sub-queries).
+# NOTE(review): no consumer of this constant is visible in this excerpt.
+SYSTEM_QUERY_DECOMPOSER = """You are the Query Planner of a PhD Research OS. Given a broad research question, decompose it into 2-4 specific sub-queries that can be independently searched in a scientific knowledge base.
+Each sub-query should:
+- Target a specific aspect of the question
+- Be answerable from individual paper claims
+- Together, cover the full scope of the original question
+Output JSON: {"original_query": "...", "sub_queries": ["...", "..."], "reasoning": "..."}"""
+# System prompt for Task 6 (decision object generation from goals and gaps).
+# NOTE(review): no consumer of this constant is visible in this excerpt.
+SYSTEM_DECISION_GENERATOR = """You are the Decision Agent of a PhD Research OS. Given the current research goals, knowledge gaps, and incomplete/low-confidence claims, propose a Decision Object.
+A Decision Object includes:
+- decision_id: string (DEC_XXXX)
+- recommended_action: one of "experiment", "literature_search", "collaboration", "replication", "methodology_review"
+- action_description: specific description of what to do
+- expected_information_gain: float [0,1] = uncertainty_of_claim × impact_on_goal
+- linked_goal_id: which research goal this addresses
+- linked_claim_ids: which claims this would resolve
+- priority: "high", "medium", "low"
+- estimated_effort: rough time estimate
+Output valid JSON. Prioritize actions with highest information gain per unit effort."""
+# ============================================================
+# STEM DOMAIN KNOWLEDGE BASE (for generating realistic examples)
+# ============================================================
+# Per-domain vocabulary used to fill the synthetic examples.
+# Schema of each entry (every key is read by the generator functions):
+#   "topics"      - topic phrases; one is sampled per example
+#   "parameters"  - parameter names (units encoded in the suffix); sampled to
+#                   build excerpts, claim texts and "parameters" dicts
+#   "journals_t1" / "journals_t2" / "journals_t3"
+#                 - journal names for tiers 1-3; the sampled tier selects
+#                   which list the journal is drawn from
+# The same journal name may appear under more than one domain.
+STEM_DOMAINS = {
+    "biosensors": {
+        "topics": ["graphene FET sensors", "Debye length screening", "aptamer functionalization",
+                   "limit of detection", "signal-to-noise ratio", "ionic strength effects",
+                   "surface chemistry", "biomarker detection", "point-of-care diagnostics"],
+        "parameters": ["concentration", "ionic_strength_mM", "temperature_C", "pH",
+                       "incubation_time_min", "gate_voltage_V", "drain_current_uA",
+                       "sensitivity_mV_per_decade", "LOD_fM", "selectivity_ratio"],
+        "journals_t1": ["Nature Biotechnology", "ACS Nano", "Nano Letters", "Biosensors and Bioelectronics"],
+        "journals_t2": ["Analytical Chemistry", "Lab on a Chip", "Sensors and Actuators B"],
+        "journals_t3": ["IEEE Sensors Journal", "Microchimica Acta"]
+    },
+    "materials_science": {
+        "topics": ["2D materials", "MoS2 synthesis", "CVD growth", "defect engineering",
+                   "band gap tuning", "heterostructures", "strain engineering"],
+        "parameters": ["thickness_nm", "growth_temperature_C", "pressure_torr", "carrier_gas_flow_sccm",
+                       "grain_size_um", "mobility_cm2_Vs", "bandgap_eV", "defect_density_cm2"],
+        "journals_t1": ["Nature Materials", "Advanced Materials", "ACS Nano"],
+        "journals_t2": ["Chemistry of Materials", "2D Materials", "Nanoscale"],
+        "journals_t3": ["Materials Research Express", "Journal of Materials Science"]
+    },
+    "electrochemistry": {
+        "topics": ["battery electrolytes", "solid-state batteries", "lithium-ion transport",
+                   "electrode-electrolyte interface", "impedance spectroscopy", "cycling stability"],
+        "parameters": ["ionic_conductivity_S_cm", "activation_energy_eV", "cycle_number",
+                       "capacity_retention_pct", "voltage_window_V", "current_density_mA_cm2",
+                       "coulombic_efficiency_pct", "electrode_thickness_um"],
+        "journals_t1": ["Nature Energy", "Joule", "Advanced Energy Materials"],
+        "journals_t2": ["Journal of the Electrochemical Society", "Electrochimica Acta"],
+        "journals_t3": ["Batteries", "Journal of Power Sources"]
+    },
+    "computational_biology": {
+        "topics": ["protein folding", "molecular dynamics", "drug-target interaction",
+                   "genomic analysis", "CRISPR efficiency prediction", "gene regulatory networks"],
+        "parameters": ["RMSD_angstrom", "binding_affinity_kcal_mol", "simulation_time_ns",
+                       "accuracy_pct", "AUC_ROC", "precision", "recall", "F1_score"],
+        "journals_t1": ["Nature Methods", "Nature Biotechnology", "Cell Systems"],
+        "journals_t2": ["Bioinformatics", "PLOS Computational Biology", "BMC Genomics"],
+        "journals_t3": ["Journal of Computational Biology", "Computational Biology and Chemistry"]
+    },
+    "quantum_computing": {
+        "topics": ["qubit coherence", "quantum error correction", "superconducting circuits",
+                   "quantum algorithms", "quantum supremacy benchmarks", "topological qubits"],
+        "parameters": ["T1_us", "T2_us", "gate_fidelity_pct", "qubit_count",
+                       "error_rate", "circuit_depth", "quantum_volume"],
+        "journals_t1": ["Nature", "Science", "Physical Review Letters"],
+        "journals_t2": ["Physical Review A", "Quantum", "npj Quantum Information"],
+        "journals_t3": ["Quantum Science and Technology", "Journal of Physics A"]
+    }
+}
+# ============================================================
+# EXAMPLE GENERATORS
+# ============================================================
+def generate_claim_extraction_examples(n=500):
+    """Generate claim extraction training examples."""
+    examples = []
+    paper_templates = [
+        # Template 1: Experimental results paper
+        {
+            "text": "We investigated the effect of {param1} on {topic} using {method}. Our results demonstrate that increasing {param1} from {val1} to {val2} led to a {change_pct}% {direction} in {metric}. The {metric} reached a maximum value of {max_val} {unit} at {param1} = {optimal_val}. Statistical analysis ({stat_test}, n={n_samples}) confirmed significance (p < {p_val}). These findings suggest that {interpretation}. We hypothesize that {hypothesis}.",
+            "num_claims": 5,
+            "claim_types": ["Fact", "Fact", "Fact", "Interpretation", "Hypothesis"]
+        },
+        # Template 2: Comparison study
+        {
+            "text": "A comparative analysis of {method_a} and {method_b} for {application} was conducted. {method_a} achieved a {metric} of {val_a} ± {err_a} {unit}, while {method_b} yielded {val_b} ± {err_b} {unit} under identical conditions ({conditions}). The {better_method} outperformed by {diff_pct}% (p = {p_val}). However, {worse_method} showed superior {alt_metric} ({alt_val_w} vs {alt_val_b}). These results indicate that the choice between methods depends on {factor}. Future work should explore {future_direction}.",
+            "num_claims": 6,
+            "claim_types": ["Fact", "Fact", "Fact", "Fact", "Interpretation", "Hypothesis"]
+        },
+        # Template 3: Review/synthesis
+        {
+            "text": "Recent studies have established that {established_fact}. Multiple groups have reported {metric} values ranging from {range_low} to {range_high} {unit} ({ref1}; {ref2}; {ref3}). The consensus view is that {consensus}. However, {conflicting_author} reported contradictory findings, showing {contradicting_claim}. This discrepancy may arise from {possible_explanation}. A unified model has been proposed where {model_description}.",
+            "num_claims": 5,
+            "claim_types": ["Fact", "Fact", "Interpretation", "Conflict_Hypothesis", "Hypothesis"]
+        },
+        # Template 4: Methodology paper
+        {
+            "text": "We present a novel {technique} for {application} that achieves {metric} of {value} {unit}. The method involves {step1}, followed by {step2}, and final {step3}. Calibration was performed using {calibration_method} with {standard}. The limit of detection was determined to be {lod} {lod_unit} (S/N = 3). Reproducibility was assessed over {n_trials} independent measurements, yielding RSD of {rsd}%. The technique is applicable to {sample_types} with matrix effects below {matrix_effect}%.",
+            "num_claims": 6,
+            "claim_types": ["Fact", "Fact", "Fact", "Fact", "Fact", "Interpretation"]
+        },
+        # Template 5: Negative/unexpected results
+        {
+            "text": "Contrary to predictions from {theory}, our measurements of {parameter} in {system} showed no significant dependence on {variable} (p = {p_val_ns}, n = {n_samples}). The observed {parameter} remained at {constant_val} ± {error} {unit} across the entire range of {variable} tested ({range}). This null result suggests that {null_interpretation}. One possible explanation is that {alternative_mechanism}. These findings challenge the assumption that {challenged_assumption}.",
+            "num_claims": 5,
+            "claim_types": ["Fact", "Fact", "Interpretation", "Hypothesis", "Conflict_Hypothesis"]
+        }
+    ]
+    for i in range(n):
+        domain_name = random.choice(list(STEM_DOMAINS.keys()))
+        domain = STEM_DOMAINS[domain_name]
+        template = random.choice(paper_templates)
+        topic = random.choice(domain["topics"])
+        params = random.sample(domain["parameters"], min(4, len(domain["parameters"])))
+        # Generate realistic parameter values
+        journal_tier = random.choices([1, 2, 3], weights=[0.3, 0.4, 0.3])[0]
+        if journal_tier == 1:
+            journal = random.choice(domain["journals_t1"])
+        elif journal_tier == 2:
+            journal = random.choice(domain["journals_t2"])
+        else:
+            journal = random.choice(domain["journals_t3"])
+        study_types = ["primary_experimental", "in_vitro", "simulation", "review_non_systematic", "meta_analysis"]
+        study_type = random.choices(study_types, weights=[0.4, 0.2, 0.15, 0.1, 0.15])[0]
+        # Build paper excerpt (simplified — real version would use LLM)
+        excerpt = f"""[Excerpt from: "{topic}: Recent Advances" — Published in {journal}, 2024]
+In this study, we examined {topic} with particular focus on the relationship between {params[0]} and {params[1] if len(params) > 1 else 'system performance'}. Using {study_type.replace('_', ' ')} methodology, we measured {params[0]} under varying conditions of {params[1] if len(params) > 1 else 'standard parameters'}.
+Our primary finding is that {params[0]} exhibits a {random.choice(['linear', 'exponential', 'logarithmic', 'sigmoidal'])} dependence on {params[1] if len(params) > 1 else 'the control variable'}, with a correlation coefficient of {round(random.uniform(0.7, 0.99), 3)}. The optimal value of {params[0]} was found to be {round(random.uniform(0.1, 100), 2)} under conditions where {params[1] if len(params) > 1 else 'temperature'} = {round(random.uniform(20, 200), 1)}.
+{random.choice(['Statistical analysis confirmed', 'ANOVA testing revealed', 'Mann-Whitney U test showed'])} significance at p < {random.choice(['0.001', '0.01', '0.05'])} (n = {random.choice([3, 5, 10, 20, 50])}). These results {'align with' if random.random() > 0.3 else 'contradict'} previous reports by {random.choice(['Smith et al.', 'Zhang et al.', 'Kumar et al.', 'Johnson et al.'])} who found {params[0]} values of {round(random.uniform(0.1, 100), 2)}.
+We interpret these findings as evidence that {random.choice(['the proposed mechanism involves', 'the dominant factor is', 'surface interactions govern'])} the observed behavior. We hypothesize that {random.choice(['further optimization could achieve', 'a threshold effect exists at', 'competing mechanisms dominate above'])} {round(random.uniform(10, 500), 1)}."""
+        # Generate structured claims
+        study_quality_map = {
+            "primary_experimental": 1.0, "in_vitro": 0.8, "simulation": 0.6,
+            "review_non_systematic": 0.4, "meta_analysis": 1.0, "case_study": 0.3
+        }
+        journal_tier_map = {1: 1.0, 2: 0.85, 3: 0.7}
+        num_claims = random.randint(3, 7)
+        claims = []
+        for j in range(num_claims):
+            epistemic = random.choices(
+                ["Fact", "Interpretation", "Hypothesis", "Conflict_Hypothesis"],
+                weights=[0.4, 0.3, 0.2, 0.1]
+            )[0]
+            evidence_strength = round(random.uniform(0.5, 1.0), 3)
+            sq_weight = study_quality_map.get(study_type, 0.6)
+            jt_weight = journal_tier_map.get(journal_tier, 0.7)
+            has_missing = random.random() < 0.25
+            completeness = 0.7 if has_missing else 1.0
+            missing = random.sample(params, random.randint(1, 2)) if has_missing else []
+            # Fixed-point calculation (multiply by 1000, round, divide)
+            conf_raw = evidence_strength * sq_weight * jt_weight * completeness
+            confidence = round(int(conf_raw * 1000) / 1000, 3)
+            claim_text_options = [
+                f"The {params[0]} was measured at {round(random.uniform(0.1, 100), 2)} under standard conditions.",
+                f"A {random.choice(['positive', 'negative', 'non-linear'])} correlation was observed between {params[0]} and {params[1] if len(params) > 1 else 'output'}.",
+                f"The proposed mechanism suggests that {topic} is primarily governed by {random.choice(['surface effects', 'bulk properties', 'interfacial phenomena'])}.",
+                f"We hypothesize that optimizing {params[0]} beyond {round(random.uniform(50, 200), 1)} could yield {random.choice(['enhanced', 'diminished', 'qualitatively different'])} results.",
+                f"These findings contradict the established model by {random.choice(['Smith et al.', 'Zhang et al.', 'Lee et al.'])}, who reported {random.choice(['opposite', 'significantly different', 'null'])} effects.",
+            ]
+            claim_obj = {
+                "claim_id": f"CLM_{i*10+j:04d}",
+                "text": random.choice(claim_text_options),
+                "epistemic_tag": epistemic,
+                "confidence": confidence,
+                "evidence_strength": evidence_strength,
+                "study_type": study_type,
+                "study_quality_weight": sq_weight,
+                "journal_tier_weight": jt_weight,
+                "completeness_penalty": completeness,
+                "missing_fields": missing,
+                "status": "Incomplete" if has_missing else "Complete",
+                "parameters": {p: round(random.uniform(0.1, 100), 2) for p in random.sample(params, min(2, len(params)))}
+            }
+            claims.append(claim_obj)
+        response = json.dumps({"claims": claims, "source_doi": f"10.1234/example.{i:04d}", "paper_domain": domain_name}, indent=2)
+        examples.append({
+            "messages": [
+                {"role": "system", "content": SYSTEM_CLAIM_EXTRACTION},
+                {"role": "user", "content": f"Extract all scientific claims from the following paper excerpt:\n\n{excerpt}"},
+                {"role": "assistant", "content": response}
+            ]
+        })
+    return examples
+def generate_epistemic_classification_examples(n=400):
+    """Generate epistemic tag classification examples."""
+    examples = []
+    statements = {
+        "Fact": [
+            "The measured ionic conductivity of the LLZO pellet was 4.2 × 10⁻⁴ S/cm at 25°C.",
+            "Graphene field-effect transistors showed a Dirac point shift of 45 mV upon target binding.",
+            "The crystal structure was confirmed as tetragonal by XRD analysis (JCPDS 00-024-0867).",
+            "Cell viability remained above 95% after 48 hours of exposure to nanoparticle concentrations up to 100 μg/mL.",
+            "The reaction yield increased from 32% to 87% when the catalyst loading was doubled from 5 mol% to 10 mol%.",
+            "Raman spectroscopy revealed a G/2D peak ratio of 0.35, consistent with monolayer graphene.",
+            "The bandgap energy was determined to be 1.85 eV from UV-Vis absorption spectroscopy.",
+            "Atomic force microscopy confirmed a film thickness of 12.3 ± 0.4 nm across 20 measurement points.",
+            "The protein folding simulation converged after 850 ns with RMSD < 2.0 Å.",
+            "HPLC analysis showed 99.2% purity of the synthesized compound.",
+            "The qubit T1 relaxation time was measured at 152 ± 8 μs at 15 mK.",
+            "Mass spectrometry confirmed the molecular ion peak at m/z = 342.18, consistent with the expected product.",
+            "The transistor exhibited an on/off current ratio of 10⁶ with a subthreshold swing of 68 mV/decade.",
+            "Flow cytometry analysis revealed 78.3% of cells were in the G1 phase after treatment.",
+            "The electrode maintained 94.2% capacity retention after 500 charge-discharge cycles at 1C rate."
+        ],
+        "Interpretation": [
+            "The observed Dirac point shift suggests successful functionalization of the graphene surface.",
+            "These results indicate that the ion transport mechanism is primarily governed by grain boundary diffusion.",
+            "The non-linear dose-response curve is consistent with a cooperative binding model.",
+            "The improved performance at elevated temperatures points to thermally activated charge transport.",
+            "Our data support the hypothesis that surface defects play a critical role in catalytic activity.",
+            "The correlation between particle size and reactivity implies surface-area-dependent kinetics.",
+            "These findings are consistent with a two-step nucleation mechanism rather than classical nucleation theory.",
+            "The asymmetric peak broadening in XRD patterns suggests the presence of microstrain.",
+            "The enhanced fluorescence lifetime indicates reduced non-radiative recombination pathways.",
+            "The inverse relationship between ionic strength and sensitivity aligns with Debye screening predictions.",
+            "Based on the activation energy of 0.32 eV, we conclude that lithium diffusion occurs via an interstitial mechanism.",
+            "The observed blue shift in photoluminescence is attributable to quantum confinement effects.",
+            "The saturation behavior above 100 nM concentration reflects receptor site limitation."
+        ],
+        "Hypothesis": [
+            "We propose that the anomalous conductivity enhancement arises from a percolation network of amorphous regions.",
+            "It is possible that the observed bistability originates from competing ferroelectric and antiferroelectric phases.",
+            "Future experiments with isotope labeling could determine whether proton hopping or vehicle mechanism dominates.",
+            "We hypothesize that introducing tensile strain into the MoS2 lattice will reduce the bandgap below 1.5 eV.",
+            "A possible explanation is that the protein undergoes a conformational change upon ligand binding that exposes a hidden epitope.",
+            "We speculate that the unexpected catalytic activity may arise from edge-site defects not captured in bulk characterization.",
+            "If the proposed mechanism is correct, replacing the counter-ion should produce a measurable shift in the voltammetric response.",
+            "The anomalous transport behavior could potentially be explained by a polaron hopping model.",
+            "We conjecture that the system exhibits a quantum phase transition at a critical doping concentration of approximately 0.15.",
+            "A theoretical framework based on Marcus theory predicts that electron transfer rates should increase by 10× at these reorganization energies.",
+            "It remains to be tested whether this enhancement persists under physiological buffer conditions."
+        ],
+        "Conflict_Hypothesis": [
+            "Our observation of decreasing sensitivity at high ionic strength directly contradicts Chen et al. (2022), who reported sensitivity enhancement under similar conditions.",
+            "While the established model predicts a linear relationship between film thickness and resistance, our data show clear deviation above 20 nm.",
+            "These results challenge the widely accepted Langmuir adsorption model, suggesting that multilayer formation occurs at concentrations previously considered sub-monolayer.",
+            "Contrary to the predictions of density functional theory calculations by Park et al., we observe metallic rather than semiconducting behavior in this phase.",
+            "The measured activation energy of 0.52 eV is significantly higher than the 0.28 eV reported by three independent groups, suggesting a fundamentally different transport mechanism.",
+            "Our finding that the reaction proceeds without the proposed intermediate contradicts the established mechanism.",
+            "The negative correlation we observe between grain size and conductivity opposes the conventional understanding based on brick-layer model predictions.",
+            "While Johnson et al. reported complete stability over 1000 cycles, our replication attempt shows measurable degradation beginning at cycle 300."
+        ]
+    }
+    for i in range(n):
+        tag = random.choices(
+            ["Fact", "Interpretation", "Hypothesis", "Conflict_Hypothesis"],
+            weights=[0.35, 0.30, 0.20, 0.15]
+        )[0]
+        statement = random.choice(statements[tag])
+        reasoning_templates = {
+            "Fact": "This statement reports a direct measurement/observation with specific quantitative data. It does not include author interpretation or speculation.",
+            "Interpretation": "This statement goes beyond raw data to explain or attribute meaning to observations. The author draws conclusions that are not strictly contained in the measurements alone.",
+            "Hypothesis": "This statement proposes an untested mechanism or prediction. Key indicators: uses speculative language (may, could, hypothesize, propose, possible).",
+            "Conflict_Hypothesis": "This statement explicitly contradicts an established finding or widely accepted model, with evidence supporting both positions."
+        }
+        conf_in_class = round(random.uniform(0.75, 0.98), 2)
+        response = json.dumps({
+            "epistemic_tag": tag,
+            "reasoning": reasoning_templates[tag],
+            "confidence_in_classification": conf_in_class
+        }, indent=2)
+        examples.append({
+            "messages": [
+                {"role": "system", "content": SYSTEM_EPISTEMIC_CLASSIFIER},
+                {"role": "user", "content": f"Classify the epistemic status of this scientific statement:\n\n\"{statement}\""},
+                {"role": "assistant", "content": response}
+            ]
+        })
+    return examples
+def generate_confidence_scoring_examples(n=300):
+    """Generate confidence scoring training examples."""
+    examples = []
+    for i in range(n):
+        domain_name = random.choice(list(STEM_DOMAINS.keys()))
+        domain = STEM_DOMAINS[domain_name]
+        study_type = random.choices(
+            ["primary_experimental", "in_vitro", "simulation", "review_non_systematic", "meta_analysis", "case_study"],
+            weights=[0.35, 0.20, 0.15, 0.10, 0.10, 0.10]
+        )[0]
+        journal_tier = random.choices([1, 2, 3, "preprint"], weights=[0.25, 0.35, 0.25, 0.15])[0]
+        if journal_tier == 1:
+            journal = random.choice(domain["journals_t1"])
+        elif journal_tier == 2:
+            journal = random.choice(domain["journals_t2"])
+        elif journal_tier == 3:
+            journal = random.choice(domain["journals_t3"])
+        else:
+            journal = "arXiv preprint"
+        sq_map = {"primary_experimental": 1.0, "in_vitro": 0.8, "simulation": 0.6,
+                  "review_non_systematic": 0.4, "meta_analysis": 1.0, "case_study": 0.3}
+        jt_map = {1: 1.0, 2: 0.85, 3: 0.7, "preprint": 0.5}
+        evidence_strength = round(random.uniform(0.3, 1.0), 3)
+        sq_weight = sq_map[study_type]
+        jt_weight = jt_map[journal_tier]
+        has_missing = random.random() < 0.3
+        completeness = 0.7 if has_missing else 1.0
+        # Fixed-point calculation
+        raw = evidence_strength * sq_weight * jt_weight * completeness
+        confidence = round(int(raw * 1000) / 1000, 3)
+        claim_text = f"Claim from {journal} ({study_type.replace('_', ' ')} study): {random.choice(domain['topics'])} measured {random.choice(domain['parameters'])} at {round(random.uniform(0.1, 100), 2)}."
+        if has_missing:
+            claim_text += " [Note: sample size and error margins not reported]"
+        reasoning = f"Study type '{study_type}' gets weight {sq_weight}. Journal '{journal}' is tier {journal_tier} (weight {jt_weight}). Evidence strength assessed at {evidence_strength} based on directness of measurement. {'Missing key parameters reduce completeness penalty to 0.7.' if has_missing else 'All parameters reported, completeness 1.0.'} Final: {evidence_strength} × {sq_weight} × {jt_weight} × {completeness} = {confidence}."
+        response = json.dumps({
+            "confidence": confidence,
+            "evidence_strength": evidence_strength,
+            "study_quality_weight": sq_weight,
+            "journal_tier_weight": jt_weight,
+            "completeness_penalty": completeness,
+            "reasoning": reasoning
+        }, indent=2)
+        examples.append({
+            "messages": [
+                {"role": "system", "content": SYSTEM_CONFIDENCE_SCORER},
+                {"role": "user", "content": f"Score the confidence of this claim:\n\n{claim_text}\n\nSource: {journal}\nStudy type: {study_type}\nJournal tier: {journal_tier}"},
+                {"role": "assistant", "content": response}
+            ]
+        })
+    return examples
def generate_conflict_detection_examples(n=300):
    """Produce ``n`` conversational samples for the contradiction-detection task.

    Each sample is built from one entry of a fixed, hand-written pool of claim
    pairs, picked with ``random.choice`` (so entries repeat once ``n`` exceeds
    the pool size). The assistant turn is the entry's analysis serialized as
    indented JSON.

    Args:
        n: Number of samples to generate.

    Returns:
        A list of ``{"messages": [system, user, assistant]}`` dicts.
    """
    # Hand-written pool: genuine contradictions mixed with look-alike pairs
    # that only differ in scope (those carry "conflict": False).
    pair_pool = [
        {
            "claim_a": "Graphene FET sensitivity increases monotonically with decreasing ionic strength (measured range: 0.1-100 mM PBS).",
            "claim_b": "Below 1 mM ionic strength, GFET sensitivity plateaus due to parasitic charge screening from surface-adsorbed species.",
            "conflict": True,
            "conflict_type": "value_mismatch",
            "hypothesis": "The discrepancy likely arises from different surface functionalization protocols. Claim A used bare graphene while Claim B used PEG-passivated surfaces, which would accumulate different surface charges at very low ionic strength.",
            "key_diffs": ["surface treatment protocol", "ionic strength range tested", "measurement technique"],
        },
        {
            "claim_a": "The Li-ion conductivity of LLZO was 1.2 × 10⁻³ S/cm at room temperature.",
            "claim_b": "LLZO pellets sintered under identical conditions showed conductivity of 2.8 × 10⁻⁴ S/cm at 25°C.",
            "conflict": True,
            "conflict_type": "value_mismatch",
            "hypothesis": "The order-of-magnitude difference may stem from different grain boundary densities. Claim A likely reports single-crystal or highly-densified samples, while Claim B may include significant grain boundary resistance.",
            "key_diffs": ["sample preparation method", "densification level", "measurement geometry"],
        },
        {
            "claim_a": "MoS2 monolayers grown by CVD at 700°C show exclusively 2H phase.",
            "claim_b": "CVD-grown MoS2 at 700°C contains 30-40% 1T phase, as confirmed by XPS peak deconvolution.",
            "conflict": True,
            "conflict_type": "methodology_difference",
            "hypothesis": "The 1T phase in Claim B may be induced during the transfer process (common with wet-chemical transfer) or by the choice of sulfur precursor. Claim A may have used a different transfer method or in-situ characterization.",
            "key_diffs": ["sulfur precursor", "transfer method", "characterization timing (in-situ vs ex-situ)"],
        },
        {
            "claim_a": "The protein binding affinity (Kd) for aptamer X was 2.3 nM in buffer.",
            "claim_b": "Aptamer X showed Kd of 180 nM when tested in 50% human serum.",
            "conflict": False,
            "conflict_type": "scope_difference",
            "hypothesis": "These measurements are not contradictory — they reflect different measurement conditions. The 78× decrease in affinity in serum is expected due to non-specific protein interactions, ionic strength differences, and potential aptamer degradation by nucleases.",
            "key_diffs": ["measurement medium (buffer vs serum)", "matrix effects"],
        },
        {
            "claim_a": "Quantum dot fluorescence quenching follows Stern-Volmer kinetics with KSV = 4.5 × 10⁴ M⁻¹.",
            "claim_b": "The same QD-quencher system shows non-linear Stern-Volmer behavior above 10 μM quencher concentration.",
            "conflict": False,
            "conflict_type": "scope_difference",
            "hypothesis": "Both claims can be simultaneously true. The Stern-Volmer relationship is linear at low quencher concentrations (Claim A's measurement range) but deviates at higher concentrations (Claim B) due to static quenching or ground-state complex formation.",
            "key_diffs": ["concentration range", "quenching mechanism regime"],
        },
        {
            "claim_a": "Sonication for 30 minutes produces graphene flakes with average lateral size of 500 nm.",
            "claim_b": "Extended sonication (30 min) yields graphene fragments predominantly below 100 nm with significant edge defects.",
            "conflict": True,
            "conflict_type": "value_mismatch",
            "hypothesis": "The 5× difference in reported flake size likely stems from different sonication power/frequency settings, solvent choice (NMP vs water/surfactant), or measurement method (DLS vs AFM). Additionally, definition of 'average' may differ (number-averaged vs volume-averaged).",
            "key_diffs": ["sonication parameters (power, frequency)", "solvent system", "size measurement technique", "averaging method"],
        },
        {
            "claim_a": "The neural network achieved 96.3% accuracy on the protein structure prediction benchmark.",
            "claim_b": "On the same benchmark, the identical architecture achieved only 84.1% accuracy when trained with a different random seed.",
            "conflict": True,
            "conflict_type": "methodology_difference",
            "hypothesis": "The 12.2% accuracy gap from random seed variation alone suggests the model is highly sensitive to initialization. This may indicate overfitting to specific training data partitions or instability in the optimization landscape.",
            "key_diffs": ["random seed", "potentially different train/test splits", "convergence criteria"],
        },
        {
            "claim_a": "The catalyst achieves 99% conversion at 80°C.",
            "claim_b": "The catalyst achieves 99% conversion at 80°C with fresh reagents, but only 45% with recycled catalyst after 3 cycles.",
            "conflict": False,
            "conflict_type": "scope_difference",
            "hypothesis": "These are not contradictory. Claim A reports initial performance while Claim B adds information about durability. The 54% drop after 3 cycles reveals catalyst deactivation, possibly from active site poisoning or structural degradation.",
            "key_diffs": ["catalyst reuse cycle", "implicit freshness assumption in Claim A"],
        },
    ]

    # The follow-up advice depends only on whether the pair truly conflicts.
    action_by_verdict = {
        True: "Investigate the specific methodological differences identified. Request raw data from both groups if possible.",
        False: "No conflict resolution needed. Both claims are valid within their respective scopes. Document the scope boundary.",
    }

    def _as_sample(entry):
        """Turn one pool entry into a system/user/assistant conversation."""
        analysis = {
            "conflict_detected": entry["conflict"],
            "conflict_type": entry["conflict_type"],
            "generated_hypothesis": entry["hypothesis"],
            "hypothesis_confidence": "low",
            "resolution_status": "Unresolved",
            "key_differences": entry["key_diffs"],
            "recommended_action": action_by_verdict[entry["conflict"]],
        }
        question = (
            "Analyze these two claims for contradictions:\n\n"
            f'Claim A: "{entry["claim_a"]}"\n\n'
            f'Claim B: "{entry["claim_b"]}"'
        )
        return {
            "messages": [
                {"role": "system", "content": SYSTEM_CONFLICT_DETECTOR},
                {"role": "user", "content": question},
                {"role": "assistant", "content": json.dumps(analysis, indent=2)},
            ]
        }

    # One random.choice draw per sample, in order, so the seeded sequence of
    # picks is the same as with an explicit loop.
    return [_as_sample(random.choice(pair_pool)) for _ in range(n)]
def generate_query_decomposition_examples(n=200):
    """Produce ``n`` conversational samples for the query-decomposition task.

    A fixed pool of broad research questions, each with its sub-queries and a
    short rationale, is sampled with ``random.choice`` once per output sample.
    The assistant turn is that entry serialized as indented JSON.

    Args:
        n: Number of samples to generate.

    Returns:
        A list of ``{"messages": [system, user, assistant]}`` dicts.
    """
    question_bank = [
        {
            "query": "What is the current state of graphene-based biosensors for cancer biomarker detection?",
            "sub_queries": [
                "What cancer biomarkers have been detected using graphene FET sensors?",
                "What are the reported limits of detection for graphene biosensors targeting cancer markers?",
                "How does graphene biosensor performance compare to ELISA and other standard methods?",
                "What are the main challenges preventing clinical translation of graphene biosensors?",
            ],
            "reasoning": "This broad question spans detection targets, performance metrics, comparative assessment, and translational barriers. Each sub-query targets a specific knowledge domain that maps to different claim types in the database.",
        },
        {
            "query": "How does ionic strength affect the performance of field-effect transistor biosensors?",
            "sub_queries": [
                "What is the Debye screening length at different ionic strength values?",
                "How does sensitivity change as a function of buffer ionic strength for FET biosensors?",
                "What strategies have been developed to overcome Debye screening limitations?",
            ],
            "reasoning": "The question involves fundamental physics (Debye length), empirical relationships (sensitivity vs ionic strength), and engineering solutions (overcoming limitations).",
        },
        {
            "query": "What are the best solid-state electrolytes for next-generation lithium batteries?",
            "sub_queries": [
                "What ionic conductivities have been achieved in garnet-type, sulfide, and polymer electrolytes?",
                "What are the interfacial stability challenges between solid electrolytes and lithium metal anodes?",
                "How do manufacturing scalability and cost compare across solid electrolyte families?",
                "What degradation mechanisms limit cycle life in solid-state batteries?",
            ],
            "reasoning": "This question spans material performance (conductivity), interface engineering, practical considerations (cost/scale), and durability. Each is a distinct research sub-domain.",
        },
        {
            "query": "Can CRISPR-Cas9 efficiency be predicted computationally?",
            "sub_queries": [
                "What machine learning models have been developed for CRISPR guide RNA efficiency prediction?",
                "What features (sequence, structure, chromatin) are most predictive of CRISPR cutting efficiency?",
                "How do computational predictions compare to experimental validation data?",
            ],
            "reasoning": "The question bridges computational methods, feature engineering, and experimental validation — three distinct knowledge areas.",
        },
        {
            "query": "What determines qubit coherence time in superconducting quantum processors?",
            "sub_queries": [
                "What are the dominant decoherence mechanisms (T1, T2) in transmon qubits?",
                "How do materials and fabrication choices affect qubit coherence?",
                "What is the current state-of-the-art for superconducting qubit coherence times?",
            ],
            "reasoning": "This question involves fundamental physics (decoherence mechanisms), engineering (fabrication), and benchmarking (state-of-the-art values).",
        },
        {
            "query": "How effective are 2D material heterostructures for photocatalytic water splitting?",
            "sub_queries": [
                "What 2D material combinations have been tested for photocatalytic hydrogen evolution?",
                "What are the reported solar-to-hydrogen conversion efficiencies for 2D heterostructure photocatalysts?",
                "How does band alignment in 2D heterostructures affect charge separation and catalytic activity?",
                "What stability and durability challenges exist for 2D photocatalysts?",
            ],
            "reasoning": "Covers materials (combinations), performance metrics (efficiency), mechanism (band alignment), and practical challenges (stability) — four distinct research angles.",
        },
    ]

    def _as_sample(entry):
        """Turn one bank entry into a system/user/assistant conversation."""
        answer = {
            "original_query": entry["query"],
            "sub_queries": entry["sub_queries"],
            "reasoning": entry["reasoning"],
        }
        request = (
            "Decompose this research question into specific sub-queries:\n\n"
            f'"{entry["query"]}"'
        )
        return {
            "messages": [
                {"role": "system", "content": SYSTEM_QUERY_DECOMPOSER},
                {"role": "user", "content": request},
                {"role": "assistant", "content": json.dumps(answer, indent=2)},
            ]
        }

    # One random.choice draw per sample, in order.
    return [_as_sample(random.choice(question_bank)) for _ in range(n)]
def generate_decision_object_examples(n=200):
    """Produce ``n`` conversational samples for the Decision Object task.

    Each sample pairs a prompt (research goal, knowledge gaps, low-confidence
    claims) with the matching pre-written decision, serialized as indented
    JSON. Scenarios are picked from a fixed pool with ``random.choice``.

    Args:
        n: Number of samples to generate.

    Returns:
        A list of ``{"messages": [system, user, assistant]}`` dicts.
    """
    scenario_pool = [
        {
            "goal": "Achieve sub-femtomolar detection limit for cardiac troponin I using GFET biosensor",
            "gaps": ["Optimal aptamer sequence for cTnI not determined", "Debye screening at physiological ionic strength limits sensitivity", "No data on sensor-to-sensor reproducibility"],
            "low_confidence_claims": ["CLM_0042: PEG spacer length of 5 kDa optimal (confidence: 0.35)", "CLM_0089: Desalting step preserves >90% of target protein (confidence: 0.41)"],
            "decision": {
                "decision_id": "DEC_0001",
                "recommended_action": "experiment",
                "action_description": "Systematic optimization of aptamer surface density and PEG spacer length on GFET. Test 3 aptamer concentrations × 3 PEG lengths × 3 ionic strength conditions. Include negative controls.",
                "expected_information_gain": 0.72,
                "linked_goal_id": "GOAL_001",
                "linked_claim_ids": ["CLM_0042", "CLM_0089"],
                "priority": "high",
                "estimated_effort": "2-3 weeks of lab work + 1 week analysis",
            },
        },
        {
            "goal": "Understand degradation mechanism in solid-state lithium battery at >100 cycles",
            "gaps": ["Interface evolution not characterized in-situ", "Role of grain boundary resistance vs bulk unclear", "Temperature dependence of degradation unknown"],
            "low_confidence_claims": ["CLM_0156: Lithium dendrite penetration through grain boundaries (confidence: 0.28)", "CLM_0201: SEI formation at cathode interface dominates early degradation (confidence: 0.39)"],
            "decision": {
                "decision_id": "DEC_0002",
                "recommended_action": "literature_search",
                "action_description": "Comprehensive search for in-situ/operando characterization studies of LLZO-lithium interfaces during cycling. Focus on synchrotron XRD and cryo-TEM studies from 2022-2024.",
                "expected_information_gain": 0.58,
                "linked_goal_id": "GOAL_003",
                "linked_claim_ids": ["CLM_0156", "CLM_0201"],
                "priority": "medium",
                "estimated_effort": "1 week literature search + synthesis",
            },
        },
        {
            "goal": "Validate computational model of protein-aptamer binding",
            "gaps": ["MD simulation parameters not benchmarked against experimental Kd", "Force field choice may introduce systematic bias", "Solvent model effects unexplored"],
            "low_confidence_claims": ["CLM_0312: AMBER ff14SB adequate for aptamer-protein complexes (confidence: 0.32)"],
            "decision": {
                "decision_id": "DEC_0003",
                "recommended_action": "collaboration",
                "action_description": "Contact the computational chemistry group (Prof. Martinez) for force field parameterization expertise. Their recent paper on RNA-protein interactions used an optimized force field that may apply here.",
                "expected_information_gain": 0.65,
                "linked_goal_id": "GOAL_005",
                "linked_claim_ids": ["CLM_0312"],
                "priority": "medium",
                "estimated_effort": "Initial meeting + 2-4 weeks of collaborative work",
            },
        },
        {
            "goal": "Establish reproducibility of the nanofabrication process",
            "gaps": ["Batch-to-batch variation not quantified", "Critical process parameters not identified", "No statistical process control"],
            "low_confidence_claims": ["CLM_0089: Yield >80% for device fabrication (confidence: 0.25, based on single batch)"],
            "decision": {
                "decision_id": "DEC_0004",
                "recommended_action": "replication",
                "action_description": "Fabricate 5 independent batches of 20 devices each over 5 separate days. Measure key performance metrics. Perform ANOVA to identify day-to-day and within-batch variation. Calculate Cpk for critical parameters.",
                "expected_information_gain": 0.81,
                "linked_goal_id": "GOAL_002",
                "linked_claim_ids": ["CLM_0089"],
                "priority": "high",
                "estimated_effort": "3-4 weeks fabrication + 1 week characterization + 1 week analysis",
            },
        },
        {
            "goal": "Resolve contradictory claims about qubit decoherence mechanism",
            "gaps": ["Two competing theories with similar explanatory power", "No experiment designed to distinguish between mechanisms"],
            "low_confidence_claims": ["CLM_0445: TLS-dominated decoherence (confidence: 0.42)", "CLM_0446: Quasiparticle tunneling dominates above 50mK (confidence: 0.38)"],
            "decision": {
                "decision_id": "DEC_0005",
                "recommended_action": "methodology_review",
                "action_description": "Design a discriminating experiment: measure T1 as a function of temperature (10-200 mK) and applied magnetic field (0-100 mT). TLS model predicts logarithmic T1(T) while quasiparticle model predicts exponential. This single experiment can resolve the contradiction.",
                "expected_information_gain": 0.89,
                "linked_goal_id": "GOAL_007",
                "linked_claim_ids": ["CLM_0445", "CLM_0446"],
                "priority": "high",
                "estimated_effort": "1 week experiment design + 2 weeks measurement + 1 week analysis",
            },
        },
    ]

    def _bulleted(items):
        """Render ``items`` as newline-separated '- item' bullet lines."""
        return "\n".join("- " + item for item in items)

    def _as_sample(entry):
        """Turn one scenario into a system/user/assistant conversation."""
        # Each bullet block is joined in as a single element, so an empty
        # list still leaves an empty line under its heading.
        prompt = "\n".join([
            f"Current research goal: {entry['goal']}",
            "Knowledge gaps:",
            _bulleted(entry["gaps"]),
            "Low-confidence claims requiring resolution:",
            _bulleted(entry["low_confidence_claims"]),
            "Propose a Decision Object with the highest expected information gain.",
        ])
        return {
            "messages": [
                {"role": "system", "content": SYSTEM_DECISION_GENERATOR},
                {"role": "user", "content": prompt},
                {"role": "assistant", "content": json.dumps(entry["decision"], indent=2)},
            ]
        }

    # One random.choice draw per sample, in order.
    return [_as_sample(random.choice(scenario_pool)) for _ in range(n)]
+# ============================================================
+# MAIN: Generate and combine all task datasets
+# ============================================================
def main(output_dir="/app"):
    """Generate every task's examples, split them 90/10, and write them to disk.

    Args:
        output_dir: Directory that receives the ``phd_research_os_dataset``
            folder and ``sample_examples.json``. Defaults to ``/app``, the
            location that was previously hard-coded, so ``main()`` behaves
            as before.

    Returns:
        The ``DatasetDict`` holding the "train" and "test" splits.
    """
    # Function-scope imports keep this block self-contained.
    import os
    from collections import Counter

    print("Generating PhD Research OS training dataset...")

    # (label, generator, example count). Keeping the count in one place stops
    # the progress message and the actual call from drifting apart. Order is
    # significant: examples are concatenated in this order, and the generators
    # visible in this file draw from the shared module-level `random` state.
    task_plan = [
        ("Task 1: Claim Extraction", generate_claim_extraction_examples, 500),
        ("Task 2: Epistemic Classification", generate_epistemic_classification_examples, 400),
        ("Task 3: Confidence Scoring", generate_confidence_scoring_examples, 300),
        ("Task 4: Conflict Detection", generate_conflict_detection_examples, 300),
        ("Task 5: Query Decomposition", generate_query_decomposition_examples, 200),
        ("Task 6: Decision Objects", generate_decision_object_examples, 200),
    ]
    all_examples = []
    for label, generator, count in task_plan:
        print(f"  {label} ({count} examples)...")
        all_examples.extend(generator(count))

    random.shuffle(all_examples)
    print(f"\n  Total examples: {len(all_examples)}")

    # Split into train/eval (90/10).
    # NOTE(review): the generators for tasks 4-6 sample with replacement from
    # pools of only 5-8 templates, so identical conversations end up on both
    # sides of this split; eval loss on those tasks measures memorization.
    # Consider de-duplicating or splitting per template before training.
    split_idx = int(len(all_examples) * 0.9)
    train_data = all_examples[:split_idx]
    eval_data = all_examples[split_idx:]
    print(f"  Train: {len(train_data)}, Eval: {len(eval_data)}")

    dataset_dict = DatasetDict({
        "train": Dataset.from_list(train_data),
        "test": Dataset.from_list(eval_data),
    })

    os.makedirs(output_dir, exist_ok=True)
    dataset_path = os.path.join(output_dir, "phd_research_os_dataset")
    sample_path = os.path.join(output_dir, "sample_examples.json")

    dataset_dict.save_to_disk(dataset_path)
    print(f"\n  Dataset saved to {dataset_path}")

    # Also dump the first few shuffled examples as JSON for manual inspection.
    with open(sample_path, "w", encoding="utf-8") as f:
        json.dump(all_examples[:10], f, indent=2)
    print(f"  Sample examples saved to {sample_path}")

    # Distribution stats, keyed by the first 50 characters of each example's
    # system prompt. NOTE(review): two tasks whose prompts share that prefix
    # would be merged into one bucket — confirm the prompts differ early on.
    task_counts = Counter(ex["messages"][0]["content"][:50] for ex in all_examples)
    print("\n  Task distribution:")
    for task, count in task_counts.items():
        print(f"    {task}... : {count}")

    return dataset_dict


# Run the generator only when executed as a script, not on import.
if __name__ == "__main__":
    main()