"""
PhD Research OS — Synthetic Training Dataset Generator
=====================================================
Generates a multi-task SFT dataset in TRL conversational format for:
Task 1: Scientific Claim Extraction (text → structured claims JSON)
Task 2: Epistemic Classification (Fact / Interpretation / Hypothesis / Conflict_Hypothesis)
Task 3: Confidence Scoring (evidence_strength × study_quality × journal_tier × completeness)
Task 4: Contradiction Detection (claim pair → conflict analysis)
Task 5: Query Decomposition (broad question → sub-queries)
Task 6: Decision Object Generation (gaps + goals → proposed actions with info gain)

Output: HF Dataset with a "messages" column in conversational ChatML format.
"""
|
|
| import json |
| import random |
| import hashlib |
| from datasets import Dataset, DatasetDict |
|
|
# Seed the module-level RNG once at import time; the generator functions in
# this file draw from `random`, so a fixed seed makes their output repeatable
# for a given sequence of calls.
random.seed(42)
|
|
| |
| |
| |
|
|
# System prompt for the claim-extraction task. It is emitted verbatim as the
# "system" message of every example built by generate_claim_extraction_examples,
# so any edit to this text changes the generated dataset.
SYSTEM_CLAIM_EXTRACTION = """You are the Researcher Agent of a PhD Research OS. Your role is to extract structured scientific claims from research paper text.

For each claim, output a JSON object with these fields:
- claim_id: string (CLM_XXXX format)
- text: the claim text as stated in the paper
- epistemic_tag: one of "Fact", "Interpretation", "Hypothesis", "Conflict_Hypothesis"
- confidence: float [0,1] computed as evidence_strength × study_quality_weight × journal_tier_weight × completeness_penalty
- evidence_strength: float [0,1] based on directness of evidence
- study_type: one of "primary_experimental", "in_vitro", "simulation", "review_non_systematic", "meta_analysis", "case_study"
- missing_fields: list of field names that could not be determined from the text
- status: "Complete" if no missing fields, else "Incomplete"
- parameters: dict of key experimental parameters mentioned (concentrations, temperatures, etc.)

Output must be valid JSON: {"claims": [...]}
Always classify epistemic tags conservatively. When uncertain, prefer "Interpretation" over "Fact"."""
|
|
# System prompt for the epistemic-classification task. It defines the four tags
# and the "mixed statement => Interpretation" tie-break, and is emitted verbatim
# as the "system" message by generate_epistemic_classification_examples.
SYSTEM_EPISTEMIC_CLASSIFIER = """You are the Epistemic Classifier of a PhD Research OS. Given a scientific statement, classify it into exactly one category:

- Fact: Directly supported by experimental data with quantitative evidence. Reproducible measurements.
- Interpretation: Author's explanation of data. Goes beyond what the numbers strictly show. Often uses words like "suggests", "indicates", "consistent with".
- Hypothesis: Proposed mechanism or prediction not yet tested. Uses "may", "could", "we propose", "it is possible".
- Conflict_Hypothesis: A claim that explicitly contradicts another established claim in the field. Evidence exists on both sides.

Output JSON: {"epistemic_tag": "...", "reasoning": "...", "confidence_in_classification": float}
Be conservative: if a statement mixes fact and interpretation, classify as Interpretation."""
|
|
# System prompt for the confidence-scoring task, emitted verbatim as the
# "system" message by generate_confidence_scoring_examples.
# The weight tables spelled out in this text are duplicated as dict literals
# inside the generator functions; keep the numbers in sync when editing either.
SYSTEM_CONFIDENCE_SCORER = """You are the Confidence Scorer of a PhD Research OS. Score the confidence of a scientific claim using this formula:

confidence = evidence_strength × study_quality_weight × journal_tier_weight × completeness_penalty

Where:
- evidence_strength [0,1]: How directly the evidence supports the claim
- study_quality_weight: primary_experimental=1.0, in_vitro=0.8, simulation=0.6, review_non_systematic=0.4, meta_analysis=1.0, case_study=0.3
- journal_tier_weight: tier1=1.0, tier2=0.85, tier3=0.7, preprint=0.5
- completeness_penalty: 1.0 if all parameters reported, 0.7 if missing key parameters

Output JSON: {"confidence": float, "evidence_strength": float, "study_quality_weight": float, "journal_tier_weight": float, "completeness_penalty": float, "reasoning": "..."}

Use fixed-point scaled integers internally (multiply by 1000, round, divide by 1000) to avoid floating-point drift."""
|
|
# System prompt for the contradiction-detection task. It describes the fields
# of the Conflict Resolution Object and is emitted verbatim as the "system"
# message by generate_conflict_detection_examples.
SYSTEM_CONFLICT_DETECTOR = """You are the Verifier Agent of a PhD Research OS. Given two scientific claims, determine if they contradict each other.

Analyze the claims and output a Conflict Resolution Object:
- conflict_detected: boolean
- conflict_type: one of "value_mismatch", "methodology_difference", "scope_difference", "no_conflict"
- generated_hypothesis: text explaining the possible cause of the conflict
- hypothesis_confidence: always "low" (never auto-set to high — human review required)
- resolution_status: "Unresolved"
- key_differences: list of specific parameter/methodology differences
- recommended_action: what the researcher should investigate to resolve this

Output valid JSON. Be thorough but conservative — flag real conflicts, not superficial differences."""
|
|
# System prompt for the query-decomposition task (Task 5 in the module
# docstring): asks for 2-4 sub-queries plus a reasoning string as JSON.
SYSTEM_QUERY_DECOMPOSER = """You are the Query Planner of a PhD Research OS. Given a broad research question, decompose it into 2-4 specific sub-queries that can be independently searched in a scientific knowledge base.

Each sub-query should:
- Target a specific aspect of the question
- Be answerable from individual paper claims
- Together, cover the full scope of the original question

Output JSON: {"original_query": "...", "sub_queries": ["...", "..."], "reasoning": "..."}"""
|
|
# System prompt for the decision-object task (Task 6 in the module docstring):
# lists the fields of a Decision Object and the allowed recommended_action values.
SYSTEM_DECISION_GENERATOR = """You are the Decision Agent of a PhD Research OS. Given the current research goals, knowledge gaps, and incomplete/low-confidence claims, propose a Decision Object.

A Decision Object includes:
- decision_id: string (DEC_XXXX)
- recommended_action: one of "experiment", "literature_search", "collaboration", "replication", "methodology_review"
- action_description: specific description of what to do
- expected_information_gain: float [0,1] = uncertainty_of_claim × impact_on_goal
- linked_goal_id: which research goal this addresses
- linked_claim_ids: which claims this would resolve
- priority: "high", "medium", "low"
- estimated_effort: rough time estimate

Output valid JSON. Prioritize actions with highest information gain per unit effort."""
|
|
| |
| |
| |
|
|
# Vocabulary the generators sample from, keyed by domain name.
# Every domain entry has the same five keys:
#   "topics"      - topic phrases inserted into excerpts and claim sentences
#   "parameters"  - parameter names inserted into text and used as dict keys
#   "journals_t1" / "journals_t2" / "journals_t3"
#                 - journal names picked when the sampled journal tier is
#                   1, 2 or 3 respectively
STEM_DOMAINS = {
    "biosensors": {
        "topics": ["graphene FET sensors", "Debye length screening", "aptamer functionalization",
                   "limit of detection", "signal-to-noise ratio", "ionic strength effects",
                   "surface chemistry", "biomarker detection", "point-of-care diagnostics"],
        "parameters": ["concentration", "ionic_strength_mM", "temperature_C", "pH",
                       "incubation_time_min", "gate_voltage_V", "drain_current_uA",
                       "sensitivity_mV_per_decade", "LOD_fM", "selectivity_ratio"],
        "journals_t1": ["Nature Biotechnology", "ACS Nano", "Nano Letters", "Biosensors and Bioelectronics"],
        "journals_t2": ["Analytical Chemistry", "Lab on a Chip", "Sensors and Actuators B"],
        "journals_t3": ["IEEE Sensors Journal", "Microchimica Acta"]
    },
    "materials_science": {
        "topics": ["2D materials", "MoS2 synthesis", "CVD growth", "defect engineering",
                   "band gap tuning", "heterostructures", "strain engineering"],
        "parameters": ["thickness_nm", "growth_temperature_C", "pressure_torr", "carrier_gas_flow_sccm",
                       "grain_size_um", "mobility_cm2_Vs", "bandgap_eV", "defect_density_cm2"],
        "journals_t1": ["Nature Materials", "Advanced Materials", "ACS Nano"],
        "journals_t2": ["Chemistry of Materials", "2D Materials", "Nanoscale"],
        "journals_t3": ["Materials Research Express", "Journal of Materials Science"]
    },
    "electrochemistry": {
        "topics": ["battery electrolytes", "solid-state batteries", "lithium-ion transport",
                   "electrode-electrolyte interface", "impedance spectroscopy", "cycling stability"],
        "parameters": ["ionic_conductivity_S_cm", "activation_energy_eV", "cycle_number",
                       "capacity_retention_pct", "voltage_window_V", "current_density_mA_cm2",
                       "coulombic_efficiency_pct", "electrode_thickness_um"],
        "journals_t1": ["Nature Energy", "Joule", "Advanced Energy Materials"],
        "journals_t2": ["Journal of the Electrochemical Society", "Electrochimica Acta"],
        "journals_t3": ["Batteries", "Journal of Power Sources"]
    },
    "computational_biology": {
        "topics": ["protein folding", "molecular dynamics", "drug-target interaction",
                   "genomic analysis", "CRISPR efficiency prediction", "gene regulatory networks"],
        "parameters": ["RMSD_angstrom", "binding_affinity_kcal_mol", "simulation_time_ns",
                       "accuracy_pct", "AUC_ROC", "precision", "recall", "F1_score"],
        "journals_t1": ["Nature Methods", "Nature Biotechnology", "Cell Systems"],
        "journals_t2": ["Bioinformatics", "PLOS Computational Biology", "BMC Genomics"],
        "journals_t3": ["Journal of Computational Biology", "Computational Biology and Chemistry"]
    },
    "quantum_computing": {
        "topics": ["qubit coherence", "quantum error correction", "superconducting circuits",
                   "quantum algorithms", "quantum supremacy benchmarks", "topological qubits"],
        "parameters": ["T1_us", "T2_us", "gate_fidelity_pct", "qubit_count",
                       "error_rate", "circuit_depth", "quantum_volume"],
        "journals_t1": ["Nature", "Science", "Physical Review Letters"],
        "journals_t2": ["Physical Review A", "Quantum", "npj Quantum Information"],
        "journals_t3": ["Quantum Science and Technology", "Journal of Physics A"]
    }
}
|
|
| |
| |
| |
|
|
def _render_claim_text(tag, topic, params):
    """Return one claim sentence whose wording matches the epistemic ``tag``.

    Args:
        tag: One of "Fact", "Interpretation", "Hypothesis", "Conflict_Hypothesis".
        topic: Topic phrase inserted into the Interpretation sentence.
        params: Non-empty list of parameter names; the first is always used,
            the second (when present) is used by the correlation sentence.

    Returns:
        The rendered sentence.

    Raises:
        ValueError: If ``tag`` is not one of the four known tags.
    """
    primary = params[0]
    secondary = params[1] if len(params) > 1 else 'output'

    if tag == "Fact":
        # Two factual phrasings: a direct measurement or an observed correlation.
        if random.choice(("measurement", "correlation")) == "measurement":
            return f"The {primary} was measured at {round(random.uniform(0.1, 100), 2)} under standard conditions."
        return f"A {random.choice(['positive', 'negative', 'non-linear'])} correlation was observed between {primary} and {secondary}."
    if tag == "Interpretation":
        return f"The proposed mechanism suggests that {topic} is primarily governed by {random.choice(['surface effects', 'bulk properties', 'interfacial phenomena'])}."
    if tag == "Hypothesis":
        return f"We hypothesize that optimizing {primary} beyond {round(random.uniform(50, 200), 1)} could yield {random.choice(['enhanced', 'diminished', 'qualitatively different'])} results."
    if tag == "Conflict_Hypothesis":
        return f"These findings contradict the established model by {random.choice(['Smith et al.', 'Zhang et al.', 'Lee et al.'])}, who reported {random.choice(['opposite', 'significantly different', 'null'])} effects."
    raise ValueError(f"unknown epistemic tag: {tag!r}")


def generate_claim_extraction_examples(n=500):
    """Generate claim extraction training examples.

    Each example is a three-message conversation: the claim-extraction system
    prompt, a user turn containing a synthetic paper excerpt, and an assistant
    turn containing the extraction result serialized as indented JSON.

    The claim sentence is rendered from the sampled epistemic tag, so the
    label and the wording of every claim agree (a sentence starting with
    "We hypothesize" is always tagged "Hypothesis", and so on).

    Args:
        n: Number of conversations to generate.

    Returns:
        A list of ``{"messages": [...]}`` dicts, one per conversation.
    """
    epistemic_tags = ["Fact", "Interpretation", "Hypothesis", "Conflict_Hypothesis"]
    epistemic_weights = [0.4, 0.3, 0.2, 0.1]

    # "case_study" has a weight in the table below but is not sampled here.
    study_types = ["primary_experimental", "in_vitro", "simulation", "review_non_systematic", "meta_analysis"]
    study_type_weights = [0.4, 0.2, 0.15, 0.1, 0.15]

    # Same numbers as the tables spelled out in SYSTEM_CONFIDENCE_SCORER.
    study_quality_map = {
        "primary_experimental": 1.0, "in_vitro": 0.8, "simulation": 0.6,
        "review_non_systematic": 0.4, "meta_analysis": 1.0, "case_study": 0.3
    }
    journal_tier_map = {1: 1.0, 2: 0.85, 3: 0.7}

    domain_names = list(STEM_DOMAINS)
    examples = []

    for i in range(n):
        domain_name = random.choice(domain_names)
        domain = STEM_DOMAINS[domain_name]
        topic = random.choice(domain["topics"])
        params = random.sample(domain["parameters"], min(4, len(domain["parameters"])))

        journal_tier = random.choices([1, 2, 3], weights=[0.3, 0.4, 0.3])[0]
        journal = random.choice(domain[f"journals_t{journal_tier}"])
        study_type = random.choices(study_types, weights=study_type_weights)[0]

        excerpt = f"""[Excerpt from: "{topic}: Recent Advances" — Published in {journal}, 2024]

In this study, we examined {topic} with particular focus on the relationship between {params[0]} and {params[1] if len(params) > 1 else 'system performance'}. Using {study_type.replace('_', ' ')} methodology, we measured {params[0]} under varying conditions of {params[1] if len(params) > 1 else 'standard parameters'}.

Our primary finding is that {params[0]} exhibits a {random.choice(['linear', 'exponential', 'logarithmic', 'sigmoidal'])} dependence on {params[1] if len(params) > 1 else 'the control variable'}, with a correlation coefficient of {round(random.uniform(0.7, 0.99), 3)}. The optimal value of {params[0]} was found to be {round(random.uniform(0.1, 100), 2)} under conditions where {params[1] if len(params) > 1 else 'temperature'} = {round(random.uniform(20, 200), 1)}.

{random.choice(['Statistical analysis confirmed', 'ANOVA testing revealed', 'Mann-Whitney U test showed'])} significance at p < {random.choice(['0.001', '0.01', '0.05'])} (n = {random.choice([3, 5, 10, 20, 50])}). These results {'align with' if random.random() > 0.3 else 'contradict'} previous reports by {random.choice(['Smith et al.', 'Zhang et al.', 'Kumar et al.', 'Johnson et al.'])} who found {params[0]} values of {round(random.uniform(0.1, 100), 2)}.

We interpret these findings as evidence that {random.choice(['the proposed mechanism involves', 'the dominant factor is', 'surface interactions govern'])} the observed behavior. We hypothesize that {random.choice(['further optimization could achieve', 'a threshold effect exists at', 'competing mechanisms dominate above'])} {round(random.uniform(10, 500), 1)}."""

        # Both weights depend only on the paper, not on the individual claim.
        sq_weight = study_quality_map[study_type]
        jt_weight = journal_tier_map[journal_tier]

        claims = []
        for j in range(random.randint(3, 7)):
            epistemic = random.choices(epistemic_tags, weights=epistemic_weights)[0]
            evidence_strength = round(random.uniform(0.5, 1.0), 3)

            has_missing = random.random() < 0.25
            completeness = 0.7 if has_missing else 1.0
            if has_missing:
                # Cap at len(params) so a domain with one parameter cannot
                # make random.sample raise.
                missing = random.sample(params, min(len(params), random.randint(1, 2)))
            else:
                missing = []

            # Scale by 1000, ROUND to the nearest integer, scale back. int()
            # would truncate and under-report, contradicting the "multiply by
            # 1000, round, divide by 1000" rule in the scorer prompt.
            conf_raw = evidence_strength * sq_weight * jt_weight * completeness
            confidence = round(conf_raw * 1000) / 1000

            claims.append({
                # j is at most 6, so i*10+j is unique across the whole run.
                "claim_id": f"CLM_{i*10+j:04d}",
                "text": _render_claim_text(epistemic, topic, params),
                "epistemic_tag": epistemic,
                "confidence": confidence,
                "evidence_strength": evidence_strength,
                "study_type": study_type,
                "study_quality_weight": sq_weight,
                "journal_tier_weight": jt_weight,
                "completeness_penalty": completeness,
                "missing_fields": missing,
                "status": "Incomplete" if has_missing else "Complete",
                "parameters": {p: round(random.uniform(0.1, 100), 2) for p in random.sample(params, min(2, len(params)))}
            })

        # NOTE(review): "source_doi" and "paper_domain" are not part of the
        # {"claims": [...]} schema described in SYSTEM_CLAIM_EXTRACTION, and the
        # DOI does not appear in the excerpt - confirm this is intended.
        response = json.dumps({"claims": claims, "source_doi": f"10.1234/example.{i:04d}", "paper_domain": domain_name}, indent=2)

        examples.append({
            "messages": [
                {"role": "system", "content": SYSTEM_CLAIM_EXTRACTION},
                {"role": "user", "content": f"Extract all scientific claims from the following paper excerpt:\n\n{excerpt}"},
                {"role": "assistant", "content": response}
            ]
        })

    return examples
|
|
|
|
def generate_epistemic_classification_examples(n=400):
    """Generate epistemic tag classification examples.

    Each example is a three-message conversation: the classifier system
    prompt, a user turn quoting one scientific statement, and an assistant
    turn with the tag, a fixed per-tag reasoning string and a sampled
    ``confidence_in_classification`` in [0.75, 0.98].

    Statements that pair a measurement with a "consistent with ..." reading
    are filed under "Interpretation": the classifier prompt names that phrase
    as an Interpretation marker and requires mixed statements to be labelled
    Interpretation.

    Args:
        n: Number of conversations to generate.

    Returns:
        A list of ``{"messages": [...]}`` dicts, one per conversation.
    """
    statements = {
        "Fact": [
            "The measured ionic conductivity of the LLZO pellet was 4.2 × 10⁻⁴ S/cm at 25°C.",
            "Graphene field-effect transistors showed a Dirac point shift of 45 mV upon target binding.",
            "The crystal structure was confirmed as tetragonal by XRD analysis (JCPDS 00-024-0867).",
            "Cell viability remained above 95% after 48 hours of exposure to nanoparticle concentrations up to 100 μg/mL.",
            "The reaction yield increased from 32% to 87% when the catalyst loading was doubled from 5 mol% to 10 mol%.",
            "The bandgap energy was determined to be 1.85 eV from UV-Vis absorption spectroscopy.",
            "Atomic force microscopy confirmed a film thickness of 12.3 ± 0.4 nm across 20 measurement points.",
            "The protein folding simulation converged after 850 ns with RMSD < 2.0 Å.",
            "HPLC analysis showed 99.2% purity of the synthesized compound.",
            "The qubit T1 relaxation time was measured at 152 ± 8 μs at 15 mK.",
            "The transistor exhibited an on/off current ratio of 10⁶ with a subthreshold swing of 68 mV/decade.",
            "Flow cytometry analysis revealed 78.3% of cells were in the G1 phase after treatment.",
            "The electrode maintained 94.2% capacity retention after 500 charge-discharge cycles at 1C rate."
        ],
        "Interpretation": [
            "The observed Dirac point shift suggests successful functionalization of the graphene surface.",
            "These results indicate that the ion transport mechanism is primarily governed by grain boundary diffusion.",
            "The non-linear dose-response curve is consistent with a cooperative binding model.",
            "The improved performance at elevated temperatures points to thermally activated charge transport.",
            "Our data support the hypothesis that surface defects play a critical role in catalytic activity.",
            "The correlation between particle size and reactivity implies surface-area-dependent kinetics.",
            "These findings are consistent with a two-step nucleation mechanism rather than classical nucleation theory.",
            "The asymmetric peak broadening in XRD patterns suggests the presence of microstrain.",
            "The enhanced fluorescence lifetime indicates reduced non-radiative recombination pathways.",
            "The inverse relationship between ionic strength and sensitivity aligns with Debye screening predictions.",
            "Based on the activation energy of 0.32 eV, we conclude that lithium diffusion occurs via an interstitial mechanism.",
            "The observed blue shift in photoluminescence is attributable to quantum confinement effects.",
            "The saturation behavior above 100 nM concentration reflects receptor site limitation.",
            # Measurement + "consistent with" reading: mixed fact/interpretation,
            # which the classifier prompt says to label as Interpretation.
            "Raman spectroscopy revealed a G/2D peak ratio of 0.35, consistent with monolayer graphene.",
            "Mass spectrometry confirmed the molecular ion peak at m/z = 342.18, consistent with the expected product."
        ],
        "Hypothesis": [
            "We propose that the anomalous conductivity enhancement arises from a percolation network of amorphous regions.",
            "It is possible that the observed bistability originates from competing ferroelectric and antiferroelectric phases.",
            "Future experiments with isotope labeling could determine whether proton hopping or vehicle mechanism dominates.",
            "We hypothesize that introducing tensile strain into the MoS2 lattice will reduce the bandgap below 1.5 eV.",
            "A possible explanation is that the protein undergoes a conformational change upon ligand binding that exposes a hidden epitope.",
            "We speculate that the unexpected catalytic activity may arise from edge-site defects not captured in bulk characterization.",
            "If the proposed mechanism is correct, replacing the counter-ion should produce a measurable shift in the voltammetric response.",
            "The anomalous transport behavior could potentially be explained by a polaron hopping model.",
            "We conjecture that the system exhibits a quantum phase transition at a critical doping concentration of approximately 0.15.",
            "A theoretical framework based on Marcus theory predicts that electron transfer rates should increase by 10× at these reorganization energies.",
            "It remains to be tested whether this enhancement persists under physiological buffer conditions."
        ],
        "Conflict_Hypothesis": [
            "Our observation of decreasing sensitivity at high ionic strength directly contradicts Chen et al. (2022), who reported sensitivity enhancement under similar conditions.",
            "While the established model predicts a linear relationship between film thickness and resistance, our data show clear deviation above 20 nm.",
            "These results challenge the widely accepted Langmuir adsorption model, suggesting that multilayer formation occurs at concentrations previously considered sub-monolayer.",
            "Contrary to the predictions of density functional theory calculations by Park et al., we observe metallic rather than semiconducting behavior in this phase.",
            "The measured activation energy of 0.52 eV is significantly higher than the 0.28 eV reported by three independent groups, suggesting a fundamentally different transport mechanism.",
            "Our finding that the reaction proceeds without the proposed intermediate contradicts the established mechanism.",
            "The negative correlation we observe between grain size and conductivity opposes the conventional understanding based on brick-layer model predictions.",
            "While Johnson et al. reported complete stability over 1000 cycles, our replication attempt shows measurable degradation beginning at cycle 300."
        ]
    }

    # One canned justification per tag; built once instead of on every iteration.
    reasoning_templates = {
        "Fact": "This statement reports a direct measurement/observation with specific quantitative data. It does not include author interpretation or speculation.",
        "Interpretation": "This statement goes beyond raw data to explain or attribute meaning to observations. The author draws conclusions that are not strictly contained in the measurements alone.",
        "Hypothesis": "This statement proposes an untested mechanism or prediction. Key indicators: uses speculative language (may, could, hypothesize, propose, possible).",
        "Conflict_Hypothesis": "This statement explicitly contradicts an established finding or widely accepted model, with evidence supporting both positions."
    }

    tags = ["Fact", "Interpretation", "Hypothesis", "Conflict_Hypothesis"]
    tag_weights = [0.35, 0.30, 0.20, 0.15]

    examples = []
    for _ in range(n):
        tag = random.choices(tags, weights=tag_weights)[0]
        statement = random.choice(statements[tag])
        conf_in_class = round(random.uniform(0.75, 0.98), 2)

        response = json.dumps({
            "epistemic_tag": tag,
            "reasoning": reasoning_templates[tag],
            "confidence_in_classification": conf_in_class
        }, indent=2)

        examples.append({
            "messages": [
                {"role": "system", "content": SYSTEM_EPISTEMIC_CLASSIFIER},
                {"role": "user", "content": f"Classify the epistemic status of this scientific statement:\n\n\"{statement}\""},
                {"role": "assistant", "content": response}
            ]
        })

    return examples
|
|
|
|
def generate_confidence_scoring_examples(n=300):
    """Generate confidence scoring training examples.

    Each example is a three-message conversation: the confidence-scorer system
    prompt, a user turn describing one claim with its source, study type and
    journal tier, and an assistant turn with the four score components, the
    resulting confidence and a reasoning string showing the multiplication.

    The confidence is the product of the four components rounded to three
    decimals, so the number in the JSON and the "Final: ... = ..." sentence in
    the reasoning both match the rounding rule stated in the system prompt.

    Args:
        n: Number of conversations to generate.

    Returns:
        A list of ``{"messages": [...]}`` dicts, one per conversation.
    """
    study_types = ["primary_experimental", "in_vitro", "simulation", "review_non_systematic", "meta_analysis", "case_study"]
    study_type_weights = [0.35, 0.20, 0.15, 0.10, 0.10, 0.10]

    # Same numbers as the tables spelled out in SYSTEM_CONFIDENCE_SCORER.
    sq_map = {"primary_experimental": 1.0, "in_vitro": 0.8, "simulation": 0.6,
              "review_non_systematic": 0.4, "meta_analysis": 1.0, "case_study": 0.3}
    jt_map = {1: 1.0, 2: 0.85, 3: 0.7, "preprint": 0.5}

    domain_names = list(STEM_DOMAINS)
    examples = []

    for _ in range(n):
        domain = STEM_DOMAINS[random.choice(domain_names)]
        study_type = random.choices(study_types, weights=study_type_weights)[0]
        journal_tier = random.choices([1, 2, 3, "preprint"], weights=[0.25, 0.35, 0.25, 0.15])[0]

        if journal_tier == "preprint":
            # Preprints have no journal list; use a fixed source label.
            journal = "arXiv preprint"
        else:
            journal = random.choice(domain[f"journals_t{journal_tier}"])

        evidence_strength = round(random.uniform(0.3, 1.0), 3)
        sq_weight = sq_map[study_type]
        jt_weight = jt_map[journal_tier]

        has_missing = random.random() < 0.3
        completeness = 0.7 if has_missing else 1.0

        # Scale by 1000, ROUND to the nearest integer, scale back. int() would
        # truncate, so e.g. a product of 0.5996 would be reported as 0.599
        # instead of 0.6 and the equation in `reasoning` would be wrong.
        raw = evidence_strength * sq_weight * jt_weight * completeness
        confidence = round(raw * 1000) / 1000

        topic = random.choice(domain['topics'])
        parameter = random.choice(domain['parameters'])
        measured_value = round(random.uniform(0.1, 100), 2)
        claim_text = f"Claim from {journal} ({study_type.replace('_', ' ')} study): {topic} measured {parameter} at {measured_value}."
        if has_missing:
            claim_text += " [Note: sample size and error margins not reported]"

        if has_missing:
            completeness_note = 'Missing key parameters reduce completeness penalty to 0.7.'
        else:
            completeness_note = 'All parameters reported, completeness 1.0.'
        reasoning = f"Study type '{study_type}' gets weight {sq_weight}. Journal '{journal}' is tier {journal_tier} (weight {jt_weight}). Evidence strength assessed at {evidence_strength} based on directness of measurement. {completeness_note} Final: {evidence_strength} × {sq_weight} × {jt_weight} × {completeness} = {confidence}."

        response = json.dumps({
            "confidence": confidence,
            "evidence_strength": evidence_strength,
            "study_quality_weight": sq_weight,
            "journal_tier_weight": jt_weight,
            "completeness_penalty": completeness,
            "reasoning": reasoning
        }, indent=2)

        examples.append({
            "messages": [
                {"role": "system", "content": SYSTEM_CONFIDENCE_SCORER},
                {"role": "user", "content": f"Score the confidence of this claim:\n\n{claim_text}\n\nSource: {journal}\nStudy type: {study_type}\nJournal tier: {journal_tier}"},
                {"role": "assistant", "content": response}
            ]
        })

    return examples
|
|
|
|
| def generate_conflict_detection_examples(n=300): |
| """Generate contradiction detection training examples.""" |
| examples = [] |
| |
| conflict_pairs = [ |
| |
| { |
| "claim_a": "Graphene FET sensitivity increases monotonically with decreasing ionic strength (measured range: 0.1-100 mM PBS).", |
| "claim_b": "Below 1 mM ionic strength, GFET sensitivity plateaus due to parasitic charge screening from surface-adsorbed species.", |
| "conflict": True, |
| "conflict_type": "value_mismatch", |
| "hypothesis": "The discrepancy likely arises from different surface functionalization protocols. Claim A used bare graphene while Claim B used PEG-passivated surfaces, which would accumulate different surface charges at very low ionic strength.", |
| "key_diffs": ["surface treatment protocol", "ionic strength range tested", "measurement technique"] |
| }, |
| { |
| "claim_a": "The Li-ion conductivity of LLZO was 1.2 × 10⁻³ S/cm at room temperature.", |
| "claim_b": "LLZO pellets sintered under identical conditions showed conductivity of 2.8 × 10⁻⁴ S/cm at 25°C.", |
| "conflict": True, |
| "conflict_type": "value_mismatch", |
| "hypothesis": "The order-of-magnitude difference may stem from different grain boundary densities. Claim A likely reports single-crystal or highly-densified samples, while Claim B may include significant grain boundary resistance.", |
| "key_diffs": ["sample preparation method", "densification level", "measurement geometry"] |
| }, |
| { |
| "claim_a": "MoS2 monolayers grown by CVD at 700°C show exclusively 2H phase.", |
| "claim_b": "CVD-grown MoS2 at 700°C contains 30-40% 1T phase, as confirmed by XPS peak deconvolution.", |
| "conflict": True, |
| "conflict_type": "methodology_difference", |
| "hypothesis": "The 1T phase in Claim B may be induced during the transfer process (common with wet-chemical transfer) or by the choice of sulfur precursor. Claim A may have used a different transfer method or in-situ characterization.", |
| "key_diffs": ["sulfur precursor", "transfer method", "characterization timing (in-situ vs ex-situ)"] |
| }, |
| { |
| "claim_a": "The protein binding affinity (Kd) for aptamer X was 2.3 nM in buffer.", |
| "claim_b": "Aptamer X showed Kd of 180 nM when tested in 50% human serum.", |
| "conflict": False, |
| "conflict_type": "scope_difference", |
| "hypothesis": "These measurements are not contradictory — they reflect different measurement conditions. The 78× decrease in affinity in serum is expected due to non-specific protein interactions, ionic strength differences, and potential aptamer degradation by nucleases.", |
| "key_diffs": ["measurement medium (buffer vs serum)", "matrix effects"] |
| }, |
| { |
| "claim_a": "Quantum dot fluorescence quenching follows Stern-Volmer kinetics with KSV = 4.5 × 10⁴ M⁻¹.", |
| "claim_b": "The same QD-quencher system shows non-linear Stern-Volmer behavior above 10 μM quencher concentration.", |
| "conflict": False, |
| "conflict_type": "scope_difference", |
| "hypothesis": "Both claims can be simultaneously true. The Stern-Volmer relationship is linear at low quencher concentrations (Claim A's measurement range) but deviates at higher concentrations (Claim B) due to static quenching or ground-state complex formation.", |
| "key_diffs": ["concentration range", "quenching mechanism regime"] |
| }, |
| |
| { |
| "claim_a": "Sonication for 30 minutes produces graphene flakes with average lateral size of 500 nm.", |
| "claim_b": "Extended sonication (30 min) yields graphene fragments predominantly below 100 nm with significant edge defects.", |
| "conflict": True, |
| "conflict_type": "value_mismatch", |
| "hypothesis": "The 5× difference in reported flake size likely stems from different sonication power/frequency settings, solvent choice (NMP vs water/surfactant), or measurement method (DLS vs AFM). Additionally, definition of 'average' may differ (number-averaged vs volume-averaged).", |
| "key_diffs": ["sonication parameters (power, frequency)", "solvent system", "size measurement technique", "averaging method"] |
| }, |
| { |
| "claim_a": "The neural network achieved 96.3% accuracy on the protein structure prediction benchmark.", |
| "claim_b": "On the same benchmark, the identical architecture achieved only 84.1% accuracy when trained with a different random seed.", |
| "conflict": True, |
| "conflict_type": "methodology_difference", |
| "hypothesis": "The 12.2% accuracy gap from random seed variation alone suggests the model is highly sensitive to initialization. This may indicate overfitting to specific training data partitions or instability in the optimization landscape.", |
| "key_diffs": ["random seed", "potentially different train/test splits", "convergence criteria"] |
| }, |
| { |
| "claim_a": "The catalyst achieves 99% conversion at 80°C.", |
| "claim_b": "The catalyst achieves 99% conversion at 80°C with fresh reagents, but only 45% with recycled catalyst after 3 cycles.", |
| "conflict": False, |
| "conflict_type": "scope_difference", |
| "hypothesis": "These are not contradictory. Claim A reports initial performance while Claim B adds information about durability. The 54% drop after 3 cycles reveals catalyst deactivation, possibly from active site poisoning or structural degradation.", |
| "key_diffs": ["catalyst reuse cycle", "implicit freshness assumption in Claim A"] |
| } |
| ] |
| |
| for i in range(n): |
| pair = random.choice(conflict_pairs) |
| |
| response = json.dumps({ |
| "conflict_detected": pair["conflict"], |
| "conflict_type": pair["conflict_type"], |
| "generated_hypothesis": pair["hypothesis"], |
| "hypothesis_confidence": "low", |
| "resolution_status": "Unresolved", |
| "key_differences": pair["key_diffs"], |
| "recommended_action": f"{'Investigate the specific methodological differences identified. Request raw data from both groups if possible.' if pair['conflict'] else 'No conflict resolution needed. Both claims are valid within their respective scopes. Document the scope boundary.'}" |
| }, indent=2) |
| |
| examples.append({ |
| "messages": [ |
| {"role": "system", "content": SYSTEM_CONFLICT_DETECTOR}, |
| {"role": "user", "content": f"Analyze these two claims for contradictions:\n\nClaim A: \"{pair['claim_a']}\"\n\nClaim B: \"{pair['claim_b']}\""}, |
| {"role": "assistant", "content": response} |
| ] |
| }) |
| |
| return examples |
|
|
|
|
def generate_query_decomposition_examples(n=200):
    """Build ``n`` chat-format samples for the query-decomposition task.

    Every sample is drawn with ``random.choice`` (module-level RNG state,
    sampling with replacement) from a fixed pool of six research questions.
    The assistant turn is a JSON document, pretty-printed with ``indent=2``,
    holding the keys ``original_query``, ``sub_queries`` and ``reasoning``.

    Args:
        n: Number of samples to produce.

    Returns:
        A list of ``{"messages": [system, user, assistant]}`` dicts.
    """
    # Each entry is (broad question, focused sub-questions, rationale).
    template_pool = (
        (
            "What is the current state of graphene-based biosensors for cancer biomarker detection?",
            [
                "What cancer biomarkers have been detected using graphene FET sensors?",
                "What are the reported limits of detection for graphene biosensors targeting cancer markers?",
                "How does graphene biosensor performance compare to ELISA and other standard methods?",
                "What are the main challenges preventing clinical translation of graphene biosensors?",
            ],
            "This broad question spans detection targets, performance metrics, comparative assessment, and translational barriers. Each sub-query targets a specific knowledge domain that maps to different claim types in the database.",
        ),
        (
            "How does ionic strength affect the performance of field-effect transistor biosensors?",
            [
                "What is the Debye screening length at different ionic strength values?",
                "How does sensitivity change as a function of buffer ionic strength for FET biosensors?",
                "What strategies have been developed to overcome Debye screening limitations?",
            ],
            "The question involves fundamental physics (Debye length), empirical relationships (sensitivity vs ionic strength), and engineering solutions (overcoming limitations).",
        ),
        (
            "What are the best solid-state electrolytes for next-generation lithium batteries?",
            [
                "What ionic conductivities have been achieved in garnet-type, sulfide, and polymer electrolytes?",
                "What are the interfacial stability challenges between solid electrolytes and lithium metal anodes?",
                "How do manufacturing scalability and cost compare across solid electrolyte families?",
                "What degradation mechanisms limit cycle life in solid-state batteries?",
            ],
            "This question spans material performance (conductivity), interface engineering, practical considerations (cost/scale), and durability. Each is a distinct research sub-domain.",
        ),
        (
            "Can CRISPR-Cas9 efficiency be predicted computationally?",
            [
                "What machine learning models have been developed for CRISPR guide RNA efficiency prediction?",
                "What features (sequence, structure, chromatin) are most predictive of CRISPR cutting efficiency?",
                "How do computational predictions compare to experimental validation data?",
            ],
            "The question bridges computational methods, feature engineering, and experimental validation — three distinct knowledge areas.",
        ),
        (
            "What determines qubit coherence time in superconducting quantum processors?",
            [
                "What are the dominant decoherence mechanisms (T1, T2) in transmon qubits?",
                "How do materials and fabrication choices affect qubit coherence?",
                "What is the current state-of-the-art for superconducting qubit coherence times?",
            ],
            "This question involves fundamental physics (decoherence mechanisms), engineering (fabrication), and benchmarking (state-of-the-art values).",
        ),
        (
            "How effective are 2D material heterostructures for photocatalytic water splitting?",
            [
                "What 2D material combinations have been tested for photocatalytic hydrogen evolution?",
                "What are the reported solar-to-hydrogen conversion efficiencies for 2D heterostructure photocatalysts?",
                "How does band alignment in 2D heterostructures affect charge separation and catalytic activity?",
                "What stability and durability challenges exist for 2D photocatalysts?",
            ],
            "Covers materials (combinations), performance metrics (efficiency), mechanism (band alignment), and practical challenges (stability) — four distinct research angles.",
        ),
    )

    def _as_conversation(question, sub_questions, rationale):
        # Key order is part of the serialized target text, so keep it fixed.
        answer = json.dumps(
            {
                "original_query": question,
                "sub_queries": sub_questions,
                "reasoning": rationale,
            },
            indent=2,
        )
        prompt = f'Decompose this research question into specific sub-queries:\n\n"{question}"'
        return {
            "messages": [
                {"role": "system", "content": SYSTEM_QUERY_DECOMPOSER},
                {"role": "user", "content": prompt},
                {"role": "assistant", "content": answer},
            ]
        }

    # One random.choice call per sample, in order, so the RNG stream is
    # consumed exactly once per generated example.
    return [_as_conversation(*random.choice(template_pool)) for _ in range(n)]
|
|
|
|
def generate_decision_object_examples(n=200):
    """Build ``n`` chat-format samples for the Decision Object task.

    Every sample is drawn with ``random.choice`` (module-level RNG state,
    sampling with replacement) from five fixed scenarios. The user turn lists
    the research goal, the knowledge gaps and the low-confidence claims as
    bullet points; the assistant turn is the scenario's decision dict
    serialized with ``json.dumps(..., indent=2)``.

    Args:
        n: Number of samples to produce.

    Returns:
        A list of ``{"messages": [system, user, assistant]}`` dicts.
    """
    # Each entry is (goal, knowledge gaps, low-confidence claims, decision).
    scenario_pool = (
        (
            "Achieve sub-femtomolar detection limit for cardiac troponin I using GFET biosensor",
            ["Optimal aptamer sequence for cTnI not determined", "Debye screening at physiological ionic strength limits sensitivity", "No data on sensor-to-sensor reproducibility"],
            ["CLM_0042: PEG spacer length of 5 kDa optimal (confidence: 0.35)", "CLM_0089: Desalting step preserves >90% of target protein (confidence: 0.41)"],
            {
                "decision_id": "DEC_0001",
                "recommended_action": "experiment",
                "action_description": "Systematic optimization of aptamer surface density and PEG spacer length on GFET. Test 3 aptamer concentrations × 3 PEG lengths × 3 ionic strength conditions. Include negative controls.",
                "expected_information_gain": 0.72,
                "linked_goal_id": "GOAL_001",
                "linked_claim_ids": ["CLM_0042", "CLM_0089"],
                "priority": "high",
                "estimated_effort": "2-3 weeks of lab work + 1 week analysis",
            },
        ),
        (
            "Understand degradation mechanism in solid-state lithium battery at >100 cycles",
            ["Interface evolution not characterized in-situ", "Role of grain boundary resistance vs bulk unclear", "Temperature dependence of degradation unknown"],
            ["CLM_0156: Lithium dendrite penetration through grain boundaries (confidence: 0.28)", "CLM_0201: SEI formation at cathode interface dominates early degradation (confidence: 0.39)"],
            {
                "decision_id": "DEC_0002",
                "recommended_action": "literature_search",
                "action_description": "Comprehensive search for in-situ/operando characterization studies of LLZO-lithium interfaces during cycling. Focus on synchrotron XRD and cryo-TEM studies from 2022-2024.",
                "expected_information_gain": 0.58,
                "linked_goal_id": "GOAL_003",
                "linked_claim_ids": ["CLM_0156", "CLM_0201"],
                "priority": "medium",
                "estimated_effort": "1 week literature search + synthesis",
            },
        ),
        (
            "Validate computational model of protein-aptamer binding",
            ["MD simulation parameters not benchmarked against experimental Kd", "Force field choice may introduce systematic bias", "Solvent model effects unexplored"],
            ["CLM_0312: AMBER ff14SB adequate for aptamer-protein complexes (confidence: 0.32)"],
            {
                "decision_id": "DEC_0003",
                "recommended_action": "collaboration",
                "action_description": "Contact the computational chemistry group (Prof. Martinez) for force field parameterization expertise. Their recent paper on RNA-protein interactions used an optimized force field that may apply here.",
                "expected_information_gain": 0.65,
                "linked_goal_id": "GOAL_005",
                "linked_claim_ids": ["CLM_0312"],
                "priority": "medium",
                "estimated_effort": "Initial meeting + 2-4 weeks of collaborative work",
            },
        ),
        (
            "Establish reproducibility of the nanofabrication process",
            ["Batch-to-batch variation not quantified", "Critical process parameters not identified", "No statistical process control"],
            ["CLM_0089: Yield >80% for device fabrication (confidence: 0.25, based on single batch)"],
            {
                "decision_id": "DEC_0004",
                "recommended_action": "replication",
                "action_description": "Fabricate 5 independent batches of 20 devices each over 5 separate days. Measure key performance metrics. Perform ANOVA to identify day-to-day and within-batch variation. Calculate Cpk for critical parameters.",
                "expected_information_gain": 0.81,
                "linked_goal_id": "GOAL_002",
                "linked_claim_ids": ["CLM_0089"],
                "priority": "high",
                "estimated_effort": "3-4 weeks fabrication + 1 week characterization + 1 week analysis",
            },
        ),
        (
            "Resolve contradictory claims about qubit decoherence mechanism",
            ["Two competing theories with similar explanatory power", "No experiment designed to distinguish between mechanisms"],
            ["CLM_0445: TLS-dominated decoherence (confidence: 0.42)", "CLM_0446: Quasiparticle tunneling dominates above 50mK (confidence: 0.38)"],
            {
                "decision_id": "DEC_0005",
                "recommended_action": "methodology_review",
                "action_description": "Design a discriminating experiment: measure T1 as a function of temperature (10-200 mK) and applied magnetic field (0-100 mT). TLS model predicts logarithmic T1(T) while quasiparticle model predicts exponential. This single experiment can resolve the contradiction.",
                "expected_information_gain": 0.89,
                "linked_goal_id": "GOAL_007",
                "linked_claim_ids": ["CLM_0445", "CLM_0446"],
                "priority": "high",
                "estimated_effort": "1 week experiment design + 2 weeks measurement + 1 week analysis",
            },
        ),
    )

    def _bulleted(items):
        # One "- item" line per entry; an empty list yields an empty string.
        return "\n".join("- " + item for item in items)

    def _as_conversation(goal, gaps, shaky_claims, decision):
        # Each bullet block is a single element so the blank-line layout of
        # the prompt does not depend on how many bullets there are.
        prompt = "\n".join(
            [
                f"Current research goal: {goal}",
                "",
                "Knowledge gaps:",
                _bulleted(gaps),
                "",
                "Low-confidence claims requiring resolution:",
                _bulleted(shaky_claims),
                "",
                "Propose a Decision Object with the highest expected information gain.",
            ]
        )
        answer = json.dumps(decision, indent=2)
        return {
            "messages": [
                {"role": "system", "content": SYSTEM_DECISION_GENERATOR},
                {"role": "user", "content": prompt},
                {"role": "assistant", "content": answer},
            ]
        }

    # One random.choice call per sample, in order, so the RNG stream is
    # consumed exactly once per generated example.
    return [_as_conversation(*random.choice(scenario_pool)) for _ in range(n)]
|
|
|
|
| |
| |
| |
|
|
def main(output_dir="/app"):
    """Generate all six task datasets, split them 90/10 and write them out.

    The per-task generators are called in task order (1 to 6) with fixed
    sizes, their outputs are concatenated and shuffled with the module-level
    ``random`` state, and the first 90% becomes the ``train`` split while the
    remainder becomes ``test``. The ``DatasetDict`` is saved under
    ``<output_dir>/phd_research_os_dataset`` and the first ten shuffled
    examples are dumped to ``<output_dir>/sample_examples.json``.

    Args:
        output_dir: Directory that receives both outputs. Defaults to
            ``/app``, the location this script previously hard-coded, so
            calling ``main()`` with no arguments writes to the same paths.

    Returns:
        The ``DatasetDict`` holding the ``train`` and ``test`` splits.
    """
    # Function-scope imports keep this block self-contained.
    from collections import Counter
    from pathlib import Path

    output_root = Path(output_dir)
    dataset_path = output_root / "phd_research_os_dataset"
    samples_path = output_root / "sample_examples.json"

    print("Generating PhD Research OS training dataset...")

    # Single source of truth for label, generator and size, so the progress
    # message can never disagree with the number of examples requested.
    task_plan = (
        ("Claim Extraction", generate_claim_extraction_examples, 500),
        ("Epistemic Classification", generate_epistemic_classification_examples, 400),
        ("Confidence Scoring", generate_confidence_scoring_examples, 300),
        ("Conflict Detection", generate_conflict_detection_examples, 300),
        ("Query Decomposition", generate_query_decomposition_examples, 200),
        ("Decision Objects", generate_decision_object_examples, 200),
    )

    all_examples = []
    for task_number, (label, generator, count) in enumerate(task_plan, start=1):
        print(f" Task {task_number}: {label} ({count} examples)...")
        all_examples.extend(generator(count))

    random.shuffle(all_examples)

    print(f"\n Total examples: {len(all_examples)}")

    # NOTE(review): generate_query_decomposition_examples and
    # generate_decision_object_examples sample with replacement from 6 and 5
    # fixed templates respectively, so identical examples will appear on both
    # sides of this split. Eval metrics for those tasks measure memorization,
    # not generalization. TODO: split by template, or dedupe before splitting.
    split_idx = int(len(all_examples) * 0.9)
    train_data = all_examples[:split_idx]
    eval_data = all_examples[split_idx:]

    print(f" Train: {len(train_data)}, Eval: {len(eval_data)}")

    train_dataset = Dataset.from_list(train_data)
    eval_dataset = Dataset.from_list(eval_data)

    dataset_dict = DatasetDict({
        "train": train_dataset,
        "test": eval_dataset,
    })

    # Create the target directory up front so the later open() does not depend
    # on save_to_disk having created it.
    output_root.mkdir(parents=True, exist_ok=True)

    dataset_dict.save_to_disk(str(dataset_path))
    print(f"\n Dataset saved to {dataset_path}")

    # Explicit encoding so the file does not depend on the platform locale.
    with open(samples_path, "w", encoding="utf-8") as f:
        json.dump(all_examples[:10], f, indent=2)
    print(f" Sample examples saved to {samples_path}")

    # Tasks are identified by the first 50 characters of their system prompt.
    # NOTE(review): if two system prompts share that prefix their counts are
    # merged here — confirm the prompts differ within the first 50 characters.
    task_counts = Counter(ex["messages"][0]["content"][:50] for ex in all_examples)

    print("\n Task distribution:")
    for task, count in task_counts.items():
        print(f" {task}... : {count}")

    return dataset_dict
|
|
|
|
# Run the full generation pipeline only when this file is executed as a
# script; importing the module does not trigger dataset generation.
if __name__ == "__main__":
    main()
|
|