File size: 51,592 Bytes
b44c1ed | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 
525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 | """
PhD Research OS — Synthetic Training Dataset Generator
=====================================================
Generates multi-task SFT dataset in TRL conversational format for:
Task 1: Scientific Claim Extraction (text → structured claims JSON)
Task 2: Epistemic Classification (Fact / Interpretation / Hypothesis / Conflict_Hypothesis)
Task 3: Confidence Scoring (evidence_strength × study_quality × journal_tier × completeness)
Task 4: Contradiction Detection (claim pair → conflict analysis)
Task 5: Query Decomposition (broad question → sub-queries)
Task 6: Decision Object Generation (gaps + goals → proposed actions with info gain)
Output: HF Dataset with "messages" column in conversational ChatML format.
"""
import json
import random
import hashlib
from datasets import Dataset, DatasetDict
random.seed(42)
# ============================================================
# SYSTEM PROMPTS (one per task — stored in /config/prompts/)
# ============================================================
SYSTEM_CLAIM_EXTRACTION = """You are the Researcher Agent of a PhD Research OS. Your role is to extract structured scientific claims from research paper text.
For each claim, output a JSON object with these fields:
- claim_id: string (CLM_XXXX format)
- text: the claim text as stated in the paper
- epistemic_tag: one of "Fact", "Interpretation", "Hypothesis", "Conflict_Hypothesis"
- confidence: float [0,1] computed as evidence_strength × study_quality_weight × journal_tier_weight × completeness_penalty
- evidence_strength: float [0,1] based on directness of evidence
- study_type: one of "primary_experimental", "in_vitro", "simulation", "review_non_systematic", "meta_analysis", "case_study"
- missing_fields: list of field names that could not be determined from the text
- status: "Complete" if no missing fields, else "Incomplete"
- parameters: dict of key experimental parameters mentioned (concentrations, temperatures, etc.)
Output must be valid JSON: {"claims": [...]}
Always classify epistemic tags conservatively. When uncertain, prefer "Interpretation" over "Fact"."""
SYSTEM_EPISTEMIC_CLASSIFIER = """You are the Epistemic Classifier of a PhD Research OS. Given a scientific statement, classify it into exactly one category:
- Fact: Directly supported by experimental data with quantitative evidence. Reproducible measurements.
- Interpretation: Author's explanation of data. Goes beyond what the numbers strictly show. Often uses words like "suggests", "indicates", "consistent with".
- Hypothesis: Proposed mechanism or prediction not yet tested. Uses "may", "could", "we propose", "it is possible".
- Conflict_Hypothesis: A claim that explicitly contradicts another established claim in the field. Evidence exists on both sides.
Output JSON: {"epistemic_tag": "...", "reasoning": "...", "confidence_in_classification": float}
Be conservative: if a statement mixes fact and interpretation, classify as Interpretation."""
SYSTEM_CONFIDENCE_SCORER = """You are the Confidence Scorer of a PhD Research OS. Score the confidence of a scientific claim using this formula:
confidence = evidence_strength × study_quality_weight × journal_tier_weight × completeness_penalty
Where:
- evidence_strength [0,1]: How directly the evidence supports the claim
- study_quality_weight: primary_experimental=1.0, in_vitro=0.8, simulation=0.6, review_non_systematic=0.4, meta_analysis=1.0, case_study=0.3
- journal_tier_weight: tier1=1.0, tier2=0.85, tier3=0.7, preprint=0.5
- completeness_penalty: 1.0 if all parameters reported, 0.7 if missing key parameters
Output JSON: {"confidence": float, "evidence_strength": float, "study_quality_weight": float, "journal_tier_weight": float, "completeness_penalty": float, "reasoning": "..."}
Use fixed-point scaled integers internally (multiply by 1000, round, divide by 1000) to avoid floating-point drift."""
SYSTEM_CONFLICT_DETECTOR = """You are the Verifier Agent of a PhD Research OS. Given two scientific claims, determine if they contradict each other.
Analyze the claims and output a Conflict Resolution Object:
- conflict_detected: boolean
- conflict_type: one of "value_mismatch", "methodology_difference", "scope_difference", "no_conflict"
- generated_hypothesis: text explaining the possible cause of the conflict
- hypothesis_confidence: always "low" (never auto-set to high — human review required)
- resolution_status: "Unresolved"
- key_differences: list of specific parameter/methodology differences
- recommended_action: what the researcher should investigate to resolve this
Output valid JSON. Be thorough but conservative — flag real conflicts, not superficial differences."""
SYSTEM_QUERY_DECOMPOSER = """You are the Query Planner of a PhD Research OS. Given a broad research question, decompose it into 2-4 specific sub-queries that can be independently searched in a scientific knowledge base.
Each sub-query should:
- Target a specific aspect of the question
- Be answerable from individual paper claims
- Together, cover the full scope of the original question
Output JSON: {"original_query": "...", "sub_queries": ["...", "..."], "reasoning": "..."}"""
SYSTEM_DECISION_GENERATOR = """You are the Decision Agent of a PhD Research OS. Given the current research goals, knowledge gaps, and incomplete/low-confidence claims, propose a Decision Object.
A Decision Object includes:
- decision_id: string (DEC_XXXX)
- recommended_action: one of "experiment", "literature_search", "collaboration", "replication", "methodology_review"
- action_description: specific description of what to do
- expected_information_gain: float [0,1] = uncertainty_of_claim × impact_on_goal
- linked_goal_id: which research goal this addresses
- linked_claim_ids: which claims this would resolve
- priority: "high", "medium", "low"
- estimated_effort: rough time estimate
Output valid JSON. Prioritize actions with highest information gain per unit effort."""
# ============================================================
# STEM DOMAIN KNOWLEDGE BASE (for generating realistic examples)
# ============================================================
# Per-domain vocabulary used by the example generators. Each entry provides:
#   topics            — subject phrases sampled into synthetic excerpts/claims
#   parameters        — measurable-quantity names (also used as claim params)
#   journals_t1/t2/t3 — journal pools by tier; the tier sampled at generation
#                       time selects the pool and the journal_tier_weight
STEM_DOMAINS = {
    "biosensors": {
        "topics": ["graphene FET sensors", "Debye length screening", "aptamer functionalization",
                   "limit of detection", "signal-to-noise ratio", "ionic strength effects",
                   "surface chemistry", "biomarker detection", "point-of-care diagnostics"],
        "parameters": ["concentration", "ionic_strength_mM", "temperature_C", "pH",
                       "incubation_time_min", "gate_voltage_V", "drain_current_uA",
                       "sensitivity_mV_per_decade", "LOD_fM", "selectivity_ratio"],
        "journals_t1": ["Nature Biotechnology", "ACS Nano", "Nano Letters", "Biosensors and Bioelectronics"],
        "journals_t2": ["Analytical Chemistry", "Lab on a Chip", "Sensors and Actuators B"],
        "journals_t3": ["IEEE Sensors Journal", "Microchimica Acta"]
    },
    "materials_science": {
        "topics": ["2D materials", "MoS2 synthesis", "CVD growth", "defect engineering",
                   "band gap tuning", "heterostructures", "strain engineering"],
        "parameters": ["thickness_nm", "growth_temperature_C", "pressure_torr", "carrier_gas_flow_sccm",
                       "grain_size_um", "mobility_cm2_Vs", "bandgap_eV", "defect_density_cm2"],
        "journals_t1": ["Nature Materials", "Advanced Materials", "ACS Nano"],
        "journals_t2": ["Chemistry of Materials", "2D Materials", "Nanoscale"],
        "journals_t3": ["Materials Research Express", "Journal of Materials Science"]
    },
    "electrochemistry": {
        "topics": ["battery electrolytes", "solid-state batteries", "lithium-ion transport",
                   "electrode-electrolyte interface", "impedance spectroscopy", "cycling stability"],
        "parameters": ["ionic_conductivity_S_cm", "activation_energy_eV", "cycle_number",
                       "capacity_retention_pct", "voltage_window_V", "current_density_mA_cm2",
                       "coulombic_efficiency_pct", "electrode_thickness_um"],
        "journals_t1": ["Nature Energy", "Joule", "Advanced Energy Materials"],
        "journals_t2": ["Journal of the Electrochemical Society", "Electrochimica Acta"],
        "journals_t3": ["Batteries", "Journal of Power Sources"]
    },
    "computational_biology": {
        "topics": ["protein folding", "molecular dynamics", "drug-target interaction",
                   "genomic analysis", "CRISPR efficiency prediction", "gene regulatory networks"],
        "parameters": ["RMSD_angstrom", "binding_affinity_kcal_mol", "simulation_time_ns",
                       "accuracy_pct", "AUC_ROC", "precision", "recall", "F1_score"],
        "journals_t1": ["Nature Methods", "Nature Biotechnology", "Cell Systems"],
        "journals_t2": ["Bioinformatics", "PLOS Computational Biology", "BMC Genomics"],
        "journals_t3": ["Journal of Computational Biology", "Computational Biology and Chemistry"]
    },
    "quantum_computing": {
        "topics": ["qubit coherence", "quantum error correction", "superconducting circuits",
                   "quantum algorithms", "quantum supremacy benchmarks", "topological qubits"],
        "parameters": ["T1_us", "T2_us", "gate_fidelity_pct", "qubit_count",
                       "error_rate", "circuit_depth", "quantum_volume"],
        "journals_t1": ["Nature", "Science", "Physical Review Letters"],
        "journals_t2": ["Physical Review A", "Quantum", "npj Quantum Information"],
        "journals_t3": ["Quantum Science and Technology", "Journal of Physics A"]
    }
}
# ============================================================
# EXAMPLE GENERATORS
# ============================================================
def generate_claim_extraction_examples(n=500):
    """Generate claim-extraction training examples (Task 1).

    Builds a synthetic paper excerpt from a random STEM_DOMAINS entry and
    pairs it with a structured JSON claims object, packaged as a ChatML
    conversation (system / user / assistant) under the "messages" key.

    Args:
        n: number of examples to generate (default 500).

    Returns:
        list[dict]: TRL conversational records, one per synthetic paper.

    Fixes vs previous version:
    - confidence now uses round() in the fixed-point step, as the
      SYSTEM_CONFIDENCE_SCORER spec says ("multiply by 1000, round, divide
      by 1000"); int() truncated and could drop a thousandth to binary
      float error (e.g. 0.595 * 1000 -> 594.999... -> 594).
    - claim text is now chosen consistently with the sampled epistemic_tag;
      previously text and tag were sampled independently, so a
      "We hypothesize..." sentence could be labelled "Fact", creating
      contradictory supervision.
    - removed the dead `paper_templates` local (its text/num_claims/
      claim_types fields were never consumed by the generation logic).
    """
    examples = []
    # Loop-invariant weight tables; values mirror SYSTEM_CONFIDENCE_SCORER.
    study_quality_map = {
        "primary_experimental": 1.0, "in_vitro": 0.8, "simulation": 0.6,
        "review_non_systematic": 0.4, "meta_analysis": 1.0, "case_study": 0.3
    }
    journal_tier_map = {1: 1.0, 2: 0.85, 3: 0.7}
    study_types = ["primary_experimental", "in_vitro", "simulation", "review_non_systematic", "meta_analysis"]
    # Indices into claim_text_options that are stylistically consistent with
    # each epistemic tag (0/1 measurements, 2 mechanism, 3 hypothesis, 4 conflict).
    tag_text_indices = {
        "Fact": [0, 1],
        "Interpretation": [2],
        "Hypothesis": [3],
        "Conflict_Hypothesis": [4],
    }
    for i in range(n):
        domain_name = random.choice(list(STEM_DOMAINS.keys()))
        domain = STEM_DOMAINS[domain_name]
        topic = random.choice(domain["topics"])
        params = random.sample(domain["parameters"], min(4, len(domain["parameters"])))
        # Journal tier drives both the journal pool and the tier weight below.
        journal_tier = random.choices([1, 2, 3], weights=[0.3, 0.4, 0.3])[0]
        if journal_tier == 1:
            journal = random.choice(domain["journals_t1"])
        elif journal_tier == 2:
            journal = random.choice(domain["journals_t2"])
        else:
            journal = random.choice(domain["journals_t3"])
        study_type = random.choices(study_types, weights=[0.4, 0.2, 0.15, 0.1, 0.15])[0]
        # Build paper excerpt (simplified — real version would use LLM)
        excerpt = f"""[Excerpt from: "{topic}: Recent Advances" — Published in {journal}, 2024]
In this study, we examined {topic} with particular focus on the relationship between {params[0]} and {params[1] if len(params) > 1 else 'system performance'}. Using {study_type.replace('_', ' ')} methodology, we measured {params[0]} under varying conditions of {params[1] if len(params) > 1 else 'standard parameters'}.
Our primary finding is that {params[0]} exhibits a {random.choice(['linear', 'exponential', 'logarithmic', 'sigmoidal'])} dependence on {params[1] if len(params) > 1 else 'the control variable'}, with a correlation coefficient of {round(random.uniform(0.7, 0.99), 3)}. The optimal value of {params[0]} was found to be {round(random.uniform(0.1, 100), 2)} under conditions where {params[1] if len(params) > 1 else 'temperature'} = {round(random.uniform(20, 200), 1)}.
{random.choice(['Statistical analysis confirmed', 'ANOVA testing revealed', 'Mann-Whitney U test showed'])} significance at p < {random.choice(['0.001', '0.01', '0.05'])} (n = {random.choice([3, 5, 10, 20, 50])}). These results {'align with' if random.random() > 0.3 else 'contradict'} previous reports by {random.choice(['Smith et al.', 'Zhang et al.', 'Kumar et al.', 'Johnson et al.'])} who found {params[0]} values of {round(random.uniform(0.1, 100), 2)}.
We interpret these findings as evidence that {random.choice(['the proposed mechanism involves', 'the dominant factor is', 'surface interactions govern'])} the observed behavior. We hypothesize that {random.choice(['further optimization could achieve', 'a threshold effect exists at', 'competing mechanisms dominate above'])} {round(random.uniform(10, 500), 1)}."""
        # Generate the structured claims that form the assistant target.
        num_claims = random.randint(3, 7)
        claims = []
        for j in range(num_claims):
            epistemic = random.choices(
                ["Fact", "Interpretation", "Hypothesis", "Conflict_Hypothesis"],
                weights=[0.4, 0.3, 0.2, 0.1]
            )[0]
            evidence_strength = round(random.uniform(0.5, 1.0), 3)
            sq_weight = study_quality_map.get(study_type, 0.6)
            jt_weight = journal_tier_map.get(journal_tier, 0.7)
            has_missing = random.random() < 0.25
            completeness = 0.7 if has_missing else 1.0
            missing = random.sample(params, random.randint(1, 2)) if has_missing else []
            # Fixed-point confidence per spec: scale by 1000, round, divide.
            conf_raw = evidence_strength * sq_weight * jt_weight * completeness
            confidence = round(conf_raw * 1000) / 1000
            claim_text_options = [
                f"The {params[0]} was measured at {round(random.uniform(0.1, 100), 2)} under standard conditions.",
                f"A {random.choice(['positive', 'negative', 'non-linear'])} correlation was observed between {params[0]} and {params[1] if len(params) > 1 else 'output'}.",
                f"The proposed mechanism suggests that {topic} is primarily governed by {random.choice(['surface effects', 'bulk properties', 'interfacial phenomena'])}.",
                f"We hypothesize that optimizing {params[0]} beyond {round(random.uniform(50, 200), 1)} could yield {random.choice(['enhanced', 'diminished', 'qualitatively different'])} results.",
                f"These findings contradict the established model by {random.choice(['Smith et al.', 'Zhang et al.', 'Lee et al.'])}, who reported {random.choice(['opposite', 'significantly different', 'null'])} effects.",
            ]
            # Select text that matches the label so tag supervision is coherent.
            claim_text = claim_text_options[random.choice(tag_text_indices[epistemic])]
            claim_obj = {
                # j < 7 < 10, so i*10+j is unique across the whole dataset.
                "claim_id": f"CLM_{i*10+j:04d}",
                "text": claim_text,
                "epistemic_tag": epistemic,
                "confidence": confidence,
                "evidence_strength": evidence_strength,
                "study_type": study_type,
                "study_quality_weight": sq_weight,
                "journal_tier_weight": jt_weight,
                "completeness_penalty": completeness,
                "missing_fields": missing,
                "status": "Incomplete" if has_missing else "Complete",
                "parameters": {p: round(random.uniform(0.1, 100), 2) for p in random.sample(params, min(2, len(params)))}
            }
            claims.append(claim_obj)
        response = json.dumps({"claims": claims, "source_doi": f"10.1234/example.{i:04d}", "paper_domain": domain_name}, indent=2)
        examples.append({
            "messages": [
                {"role": "system", "content": SYSTEM_CLAIM_EXTRACTION},
                {"role": "user", "content": f"Extract all scientific claims from the following paper excerpt:\n\n{excerpt}"},
                {"role": "assistant", "content": response}
            ]
        })
    return examples
def generate_epistemic_classification_examples(n=400):
    """Generate epistemic-tag classification examples (Task 2).

    Samples a ground-truth tag, picks a canned statement of that kind from
    the statement bank, and emits the expected classifier JSON (tag +
    canned reasoning + sampled classification confidence) as the assistant
    turn of a ChatML conversation.

    Args:
        n: number of examples to generate (default 400).

    Returns:
        list[dict]: TRL conversational records.

    Fix vs previous version: `reasoning_templates` was rebuilt inside the
    loop on every iteration although it is loop-invariant; it is now built
    once. The unused loop index was also replaced with `_`.
    """
    examples = []
    # Statement bank keyed by the ground-truth epistemic tag.
    statements = {
        "Fact": [
            "The measured ionic conductivity of the LLZO pellet was 4.2 × 10⁻⁴ S/cm at 25°C.",
            "Graphene field-effect transistors showed a Dirac point shift of 45 mV upon target binding.",
            "The crystal structure was confirmed as tetragonal by XRD analysis (JCPDS 00-024-0867).",
            "Cell viability remained above 95% after 48 hours of exposure to nanoparticle concentrations up to 100 μg/mL.",
            "The reaction yield increased from 32% to 87% when the catalyst loading was doubled from 5 mol% to 10 mol%.",
            "Raman spectroscopy revealed a G/2D peak ratio of 0.35, consistent with monolayer graphene.",
            "The bandgap energy was determined to be 1.85 eV from UV-Vis absorption spectroscopy.",
            "Atomic force microscopy confirmed a film thickness of 12.3 ± 0.4 nm across 20 measurement points.",
            "The protein folding simulation converged after 850 ns with RMSD < 2.0 Å.",
            "HPLC analysis showed 99.2% purity of the synthesized compound.",
            "The qubit T1 relaxation time was measured at 152 ± 8 μs at 15 mK.",
            "Mass spectrometry confirmed the molecular ion peak at m/z = 342.18, consistent with the expected product.",
            "The transistor exhibited an on/off current ratio of 10⁶ with a subthreshold swing of 68 mV/decade.",
            "Flow cytometry analysis revealed 78.3% of cells were in the G1 phase after treatment.",
            "The electrode maintained 94.2% capacity retention after 500 charge-discharge cycles at 1C rate."
        ],
        "Interpretation": [
            "The observed Dirac point shift suggests successful functionalization of the graphene surface.",
            "These results indicate that the ion transport mechanism is primarily governed by grain boundary diffusion.",
            "The non-linear dose-response curve is consistent with a cooperative binding model.",
            "The improved performance at elevated temperatures points to thermally activated charge transport.",
            "Our data support the hypothesis that surface defects play a critical role in catalytic activity.",
            "The correlation between particle size and reactivity implies surface-area-dependent kinetics.",
            "These findings are consistent with a two-step nucleation mechanism rather than classical nucleation theory.",
            "The asymmetric peak broadening in XRD patterns suggests the presence of microstrain.",
            "The enhanced fluorescence lifetime indicates reduced non-radiative recombination pathways.",
            "The inverse relationship between ionic strength and sensitivity aligns with Debye screening predictions.",
            "Based on the activation energy of 0.32 eV, we conclude that lithium diffusion occurs via an interstitial mechanism.",
            "The observed blue shift in photoluminescence is attributable to quantum confinement effects.",
            "The saturation behavior above 100 nM concentration reflects receptor site limitation."
        ],
        "Hypothesis": [
            "We propose that the anomalous conductivity enhancement arises from a percolation network of amorphous regions.",
            "It is possible that the observed bistability originates from competing ferroelectric and antiferroelectric phases.",
            "Future experiments with isotope labeling could determine whether proton hopping or vehicle mechanism dominates.",
            "We hypothesize that introducing tensile strain into the MoS2 lattice will reduce the bandgap below 1.5 eV.",
            "A possible explanation is that the protein undergoes a conformational change upon ligand binding that exposes a hidden epitope.",
            "We speculate that the unexpected catalytic activity may arise from edge-site defects not captured in bulk characterization.",
            "If the proposed mechanism is correct, replacing the counter-ion should produce a measurable shift in the voltammetric response.",
            "The anomalous transport behavior could potentially be explained by a polaron hopping model.",
            "We conjecture that the system exhibits a quantum phase transition at a critical doping concentration of approximately 0.15.",
            "A theoretical framework based on Marcus theory predicts that electron transfer rates should increase by 10× at these reorganization energies.",
            "It remains to be tested whether this enhancement persists under physiological buffer conditions."
        ],
        "Conflict_Hypothesis": [
            "Our observation of decreasing sensitivity at high ionic strength directly contradicts Chen et al. (2022), who reported sensitivity enhancement under similar conditions.",
            "While the established model predicts a linear relationship between film thickness and resistance, our data show clear deviation above 20 nm.",
            "These results challenge the widely accepted Langmuir adsorption model, suggesting that multilayer formation occurs at concentrations previously considered sub-monolayer.",
            "Contrary to the predictions of density functional theory calculations by Park et al., we observe metallic rather than semiconducting behavior in this phase.",
            "The measured activation energy of 0.52 eV is significantly higher than the 0.28 eV reported by three independent groups, suggesting a fundamentally different transport mechanism.",
            "Our finding that the reaction proceeds without the proposed intermediate contradicts the established mechanism.",
            "The negative correlation we observe between grain size and conductivity opposes the conventional understanding based on brick-layer model predictions.",
            "While Johnson et al. reported complete stability over 1000 cycles, our replication attempt shows measurable degradation beginning at cycle 300."
        ]
    }
    # Canned per-tag reasoning; loop-invariant, built once (previously this
    # dict was reconstructed on every loop iteration).
    reasoning_templates = {
        "Fact": "This statement reports a direct measurement/observation with specific quantitative data. It does not include author interpretation or speculation.",
        "Interpretation": "This statement goes beyond raw data to explain or attribute meaning to observations. The author draws conclusions that are not strictly contained in the measurements alone.",
        "Hypothesis": "This statement proposes an untested mechanism or prediction. Key indicators: uses speculative language (may, could, hypothesize, propose, possible).",
        "Conflict_Hypothesis": "This statement explicitly contradicts an established finding or widely accepted model, with evidence supporting both positions."
    }
    for _ in range(n):
        tag = random.choices(
            ["Fact", "Interpretation", "Hypothesis", "Conflict_Hypothesis"],
            weights=[0.35, 0.30, 0.20, 0.15]
        )[0]
        statement = random.choice(statements[tag])
        conf_in_class = round(random.uniform(0.75, 0.98), 2)
        response = json.dumps({
            "epistemic_tag": tag,
            "reasoning": reasoning_templates[tag],
            "confidence_in_classification": conf_in_class
        }, indent=2)
        examples.append({
            "messages": [
                {"role": "system", "content": SYSTEM_EPISTEMIC_CLASSIFIER},
                {"role": "user", "content": f"Classify the epistemic status of this scientific statement:\n\n\"{statement}\""},
                {"role": "assistant", "content": response}
            ]
        })
    return examples
def generate_confidence_scoring_examples(n=300):
    """Generate confidence-scoring training examples (Task 3).

    Draws a study type, journal tier, and evidence strength, computes the
    multiplicative confidence per the SYSTEM_CONFIDENCE_SCORER spec, and
    emits the component breakdown plus a worked-reasoning string as the
    assistant JSON of a ChatML conversation.

    Args:
        n: number of examples to generate (default 300).

    Returns:
        list[dict]: TRL conversational records.

    Fixes vs previous version:
    - fixed-point step now uses round() as the spec says ("multiply by 1000,
      round, divide by 1000"); int() truncated and could lose a thousandth
      to binary float representation error.
    - the sq_map/jt_map weight tables were rebuilt on every loop iteration;
      they are loop-invariant and are now built once.
    """
    examples = []
    # Weight tables mirror SYSTEM_CONFIDENCE_SCORER; hoisted out of the loop.
    sq_map = {"primary_experimental": 1.0, "in_vitro": 0.8, "simulation": 0.6,
              "review_non_systematic": 0.4, "meta_analysis": 1.0, "case_study": 0.3}
    jt_map = {1: 1.0, 2: 0.85, 3: 0.7, "preprint": 0.5}
    for _ in range(n):
        domain_name = random.choice(list(STEM_DOMAINS.keys()))
        domain = STEM_DOMAINS[domain_name]
        study_type = random.choices(
            ["primary_experimental", "in_vitro", "simulation", "review_non_systematic", "meta_analysis", "case_study"],
            weights=[0.35, 0.20, 0.15, 0.10, 0.10, 0.10]
        )[0]
        # journal_tier is an int (1-3) or the string "preprint".
        journal_tier = random.choices([1, 2, 3, "preprint"], weights=[0.25, 0.35, 0.25, 0.15])[0]
        if journal_tier == 1:
            journal = random.choice(domain["journals_t1"])
        elif journal_tier == 2:
            journal = random.choice(domain["journals_t2"])
        elif journal_tier == 3:
            journal = random.choice(domain["journals_t3"])
        else:
            journal = "arXiv preprint"
        evidence_strength = round(random.uniform(0.3, 1.0), 3)
        sq_weight = sq_map[study_type]
        jt_weight = jt_map[journal_tier]
        has_missing = random.random() < 0.3
        completeness = 0.7 if has_missing else 1.0
        # Fixed-point confidence per spec: scale by 1000, round, divide.
        raw = evidence_strength * sq_weight * jt_weight * completeness
        confidence = round(raw * 1000) / 1000
        claim_text = f"Claim from {journal} ({study_type.replace('_', ' ')} study): {random.choice(domain['topics'])} measured {random.choice(domain['parameters'])} at {round(random.uniform(0.1, 100), 2)}."
        if has_missing:
            claim_text += " [Note: sample size and error margins not reported]"
        reasoning = f"Study type '{study_type}' gets weight {sq_weight}. Journal '{journal}' is tier {journal_tier} (weight {jt_weight}). Evidence strength assessed at {evidence_strength} based on directness of measurement. {'Missing key parameters reduce completeness penalty to 0.7.' if has_missing else 'All parameters reported, completeness 1.0.'} Final: {evidence_strength} × {sq_weight} × {jt_weight} × {completeness} = {confidence}."
        response = json.dumps({
            "confidence": confidence,
            "evidence_strength": evidence_strength,
            "study_quality_weight": sq_weight,
            "journal_tier_weight": jt_weight,
            "completeness_penalty": completeness,
            "reasoning": reasoning
        }, indent=2)
        examples.append({
            "messages": [
                {"role": "system", "content": SYSTEM_CONFIDENCE_SCORER},
                {"role": "user", "content": f"Score the confidence of this claim:\n\n{claim_text}\n\nSource: {journal}\nStudy type: {study_type}\nJournal tier: {journal_tier}"},
                {"role": "assistant", "content": response}
            ]
        })
    return examples
def generate_conflict_detection_examples(n=300):
    """Generate contradiction detection training examples.

    Samples (with replacement) from a hand-curated set of claim pairs —
    some genuinely conflicting, some merely scope-different — and emits
    chat-format examples whose assistant turn is a JSON conflict analysis.

    Args:
        n: Number of examples to generate.

    Returns:
        List of dicts, each with a "messages" list (system/user/assistant).
    """
    examples = []
    conflict_pairs = [
        # Real conflicts
        {
            "claim_a": "Graphene FET sensitivity increases monotonically with decreasing ionic strength (measured range: 0.1-100 mM PBS).",
            "claim_b": "Below 1 mM ionic strength, GFET sensitivity plateaus due to parasitic charge screening from surface-adsorbed species.",
            "conflict": True,
            "conflict_type": "value_mismatch",
            "hypothesis": "The discrepancy likely arises from different surface functionalization protocols. Claim A used bare graphene while Claim B used PEG-passivated surfaces, which would accumulate different surface charges at very low ionic strength.",
            "key_diffs": ["surface treatment protocol", "ionic strength range tested", "measurement technique"]
        },
        {
            "claim_a": "The Li-ion conductivity of LLZO was 1.2 × 10⁻³ S/cm at room temperature.",
            "claim_b": "LLZO pellets sintered under identical conditions showed conductivity of 2.8 × 10⁻⁴ S/cm at 25°C.",
            "conflict": True,
            "conflict_type": "value_mismatch",
            "hypothesis": "The order-of-magnitude difference may stem from different grain boundary densities. Claim A likely reports single-crystal or highly-densified samples, while Claim B may include significant grain boundary resistance.",
            "key_diffs": ["sample preparation method", "densification level", "measurement geometry"]
        },
        {
            "claim_a": "MoS2 monolayers grown by CVD at 700°C show exclusively 2H phase.",
            "claim_b": "CVD-grown MoS2 at 700°C contains 30-40% 1T phase, as confirmed by XPS peak deconvolution.",
            "conflict": True,
            "conflict_type": "methodology_difference",
            "hypothesis": "The 1T phase in Claim B may be induced during the transfer process (common with wet-chemical transfer) or by the choice of sulfur precursor. Claim A may have used a different transfer method or in-situ characterization.",
            "key_diffs": ["sulfur precursor", "transfer method", "characterization timing (in-situ vs ex-situ)"]
        },
        {
            "claim_a": "The protein binding affinity (Kd) for aptamer X was 2.3 nM in buffer.",
            "claim_b": "Aptamer X showed Kd of 180 nM when tested in 50% human serum.",
            "conflict": False,
            "conflict_type": "scope_difference",
            "hypothesis": "These measurements are not contradictory — they reflect different measurement conditions. The 78× decrease in affinity in serum is expected due to non-specific protein interactions, ionic strength differences, and potential aptamer degradation by nucleases.",
            "key_diffs": ["measurement medium (buffer vs serum)", "matrix effects"]
        },
        {
            "claim_a": "Quantum dot fluorescence quenching follows Stern-Volmer kinetics with KSV = 4.5 × 10⁴ M⁻¹.",
            "claim_b": "The same QD-quencher system shows non-linear Stern-Volmer behavior above 10 μM quencher concentration.",
            "conflict": False,
            "conflict_type": "scope_difference",
            "hypothesis": "Both claims can be simultaneously true. The Stern-Volmer relationship is linear at low quencher concentrations (Claim A's measurement range) but deviates at higher concentrations (Claim B) due to static quenching or ground-state complex formation.",
            "key_diffs": ["concentration range", "quenching mechanism regime"]
        },
        # More conflicts
        {
            "claim_a": "Sonication for 30 minutes produces graphene flakes with average lateral size of 500 nm.",
            "claim_b": "Extended sonication (30 min) yields graphene fragments predominantly below 100 nm with significant edge defects.",
            "conflict": True,
            "conflict_type": "value_mismatch",
            "hypothesis": "The 5× difference in reported flake size likely stems from different sonication power/frequency settings, solvent choice (NMP vs water/surfactant), or measurement method (DLS vs AFM). Additionally, definition of 'average' may differ (number-averaged vs volume-averaged).",
            "key_diffs": ["sonication parameters (power, frequency)", "solvent system", "size measurement technique", "averaging method"]
        },
        {
            "claim_a": "The neural network achieved 96.3% accuracy on the protein structure prediction benchmark.",
            "claim_b": "On the same benchmark, the identical architecture achieved only 84.1% accuracy when trained with a different random seed.",
            "conflict": True,
            "conflict_type": "methodology_difference",
            "hypothesis": "The 12.2% accuracy gap from random seed variation alone suggests the model is highly sensitive to initialization. This may indicate overfitting to specific training data partitions or instability in the optimization landscape.",
            "key_diffs": ["random seed", "potentially different train/test splits", "convergence criteria"]
        },
        {
            "claim_a": "The catalyst achieves 99% conversion at 80°C.",
            "claim_b": "The catalyst achieves 99% conversion at 80°C with fresh reagents, but only 45% with recycled catalyst after 3 cycles.",
            "conflict": False,
            "conflict_type": "scope_difference",
            "hypothesis": "These are not contradictory. Claim A reports initial performance while Claim B adds information about durability. The 54% drop after 3 cycles reveals catalyst deactivation, possibly from active site poisoning or structural degradation.",
            "key_diffs": ["catalyst reuse cycle", "implicit freshness assumption in Claim A"]
        }
    ]
    for _ in range(n):
        pair = random.choice(conflict_pairs)
        # Recommended follow-up depends on whether a genuine conflict exists.
        # (The original wrapped this conditional in a redundant f-string.)
        if pair["conflict"]:
            recommended_action = "Investigate the specific methodological differences identified. Request raw data from both groups if possible."
        else:
            recommended_action = "No conflict resolution needed. Both claims are valid within their respective scopes. Document the scope boundary."
        response = json.dumps({
            "conflict_detected": pair["conflict"],
            "conflict_type": pair["conflict_type"],
            "generated_hypothesis": pair["hypothesis"],
            "hypothesis_confidence": "low",
            "resolution_status": "Unresolved",
            "key_differences": pair["key_diffs"],
            "recommended_action": recommended_action
        }, indent=2)
        examples.append({
            "messages": [
                {"role": "system", "content": SYSTEM_CONFLICT_DETECTOR},
                {"role": "user", "content": f"Analyze these two claims for contradictions:\n\nClaim A: \"{pair['claim_a']}\"\n\nClaim B: \"{pair['claim_b']}\""},
                {"role": "assistant", "content": response}
            ]
        })
    return examples
def generate_query_decomposition_examples(n=200):
    """Generate query decomposition examples.

    Draws n templates (with replacement) from a fixed pool of research
    questions, each paired with hand-written sub-queries and reasoning,
    and packages them as system/user/assistant chat examples.
    """
    templates = [
        {
            "query": "What is the current state of graphene-based biosensors for cancer biomarker detection?",
            "sub_queries": [
                "What cancer biomarkers have been detected using graphene FET sensors?",
                "What are the reported limits of detection for graphene biosensors targeting cancer markers?",
                "How does graphene biosensor performance compare to ELISA and other standard methods?",
                "What are the main challenges preventing clinical translation of graphene biosensors?"
            ],
            "reasoning": "This broad question spans detection targets, performance metrics, comparative assessment, and translational barriers. Each sub-query targets a specific knowledge domain that maps to different claim types in the database."
        },
        {
            "query": "How does ionic strength affect the performance of field-effect transistor biosensors?",
            "sub_queries": [
                "What is the Debye screening length at different ionic strength values?",
                "How does sensitivity change as a function of buffer ionic strength for FET biosensors?",
                "What strategies have been developed to overcome Debye screening limitations?"
            ],
            "reasoning": "The question involves fundamental physics (Debye length), empirical relationships (sensitivity vs ionic strength), and engineering solutions (overcoming limitations)."
        },
        {
            "query": "What are the best solid-state electrolytes for next-generation lithium batteries?",
            "sub_queries": [
                "What ionic conductivities have been achieved in garnet-type, sulfide, and polymer electrolytes?",
                "What are the interfacial stability challenges between solid electrolytes and lithium metal anodes?",
                "How do manufacturing scalability and cost compare across solid electrolyte families?",
                "What degradation mechanisms limit cycle life in solid-state batteries?"
            ],
            "reasoning": "This question spans material performance (conductivity), interface engineering, practical considerations (cost/scale), and durability. Each is a distinct research sub-domain."
        },
        {
            "query": "Can CRISPR-Cas9 efficiency be predicted computationally?",
            "sub_queries": [
                "What machine learning models have been developed for CRISPR guide RNA efficiency prediction?",
                "What features (sequence, structure, chromatin) are most predictive of CRISPR cutting efficiency?",
                "How do computational predictions compare to experimental validation data?"
            ],
            "reasoning": "The question bridges computational methods, feature engineering, and experimental validation — three distinct knowledge areas."
        },
        {
            "query": "What determines qubit coherence time in superconducting quantum processors?",
            "sub_queries": [
                "What are the dominant decoherence mechanisms (T1, T2) in transmon qubits?",
                "How do materials and fabrication choices affect qubit coherence?",
                "What is the current state-of-the-art for superconducting qubit coherence times?"
            ],
            "reasoning": "This question involves fundamental physics (decoherence mechanisms), engineering (fabrication), and benchmarking (state-of-the-art values)."
        },
        {
            "query": "How effective are 2D material heterostructures for photocatalytic water splitting?",
            "sub_queries": [
                "What 2D material combinations have been tested for photocatalytic hydrogen evolution?",
                "What are the reported solar-to-hydrogen conversion efficiencies for 2D heterostructure photocatalysts?",
                "How does band alignment in 2D heterostructures affect charge separation and catalytic activity?",
                "What stability and durability challenges exist for 2D photocatalysts?"
            ],
            "reasoning": "Covers materials (combinations), performance metrics (efficiency), mechanism (band alignment), and practical challenges (stability) — four distinct research angles."
        }
    ]
    examples = []
    for _ in range(n):
        chosen = random.choice(templates)
        # Assistant turn: the decomposition serialized as pretty-printed JSON.
        payload = json.dumps({
            "original_query": chosen["query"],
            "sub_queries": chosen["sub_queries"],
            "reasoning": chosen["reasoning"]
        }, indent=2)
        user_turn = f"Decompose this research question into specific sub-queries:\n\n\"{chosen['query']}\""
        examples.append({
            "messages": [
                {"role": "system", "content": SYSTEM_QUERY_DECOMPOSER},
                {"role": "user", "content": user_turn},
                {"role": "assistant", "content": payload}
            ]
        })
    return examples
def generate_decision_object_examples(n=200):
    """Generate Decision Object training examples.

    Samples (with replacement) from fixed research scenarios — each with a
    goal, knowledge gaps, low-confidence claims, and a pre-written Decision
    Object — and renders them as system/user/assistant chat examples.

    Args:
        n: Number of examples to generate.

    Returns:
        List of dicts, each with a "messages" list (system/user/assistant).
    """
    examples = []
    scenarios = [
        {
            "goal": "Achieve sub-femtomolar detection limit for cardiac troponin I using GFET biosensor",
            "gaps": ["Optimal aptamer sequence for cTnI not determined", "Debye screening at physiological ionic strength limits sensitivity", "No data on sensor-to-sensor reproducibility"],
            "low_confidence_claims": ["CLM_0042: PEG spacer length of 5 kDa optimal (confidence: 0.35)", "CLM_0089: Desalting step preserves >90% of target protein (confidence: 0.41)"],
            "decision": {
                "decision_id": "DEC_0001",
                "recommended_action": "experiment",
                "action_description": "Systematic optimization of aptamer surface density and PEG spacer length on GFET. Test 3 aptamer concentrations × 3 PEG lengths × 3 ionic strength conditions. Include negative controls.",
                "expected_information_gain": 0.72,
                "linked_goal_id": "GOAL_001",
                "linked_claim_ids": ["CLM_0042", "CLM_0089"],
                "priority": "high",
                "estimated_effort": "2-3 weeks of lab work + 1 week analysis"
            }
        },
        {
            "goal": "Understand degradation mechanism in solid-state lithium battery at >100 cycles",
            "gaps": ["Interface evolution not characterized in-situ", "Role of grain boundary resistance vs bulk unclear", "Temperature dependence of degradation unknown"],
            "low_confidence_claims": ["CLM_0156: Lithium dendrite penetration through grain boundaries (confidence: 0.28)", "CLM_0201: SEI formation at cathode interface dominates early degradation (confidence: 0.39)"],
            "decision": {
                "decision_id": "DEC_0002",
                "recommended_action": "literature_search",
                "action_description": "Comprehensive search for in-situ/operando characterization studies of LLZO-lithium interfaces during cycling. Focus on synchrotron XRD and cryo-TEM studies from 2022-2024.",
                "expected_information_gain": 0.58,
                "linked_goal_id": "GOAL_003",
                "linked_claim_ids": ["CLM_0156", "CLM_0201"],
                "priority": "medium",
                "estimated_effort": "1 week literature search + synthesis"
            }
        },
        {
            "goal": "Validate computational model of protein-aptamer binding",
            "gaps": ["MD simulation parameters not benchmarked against experimental Kd", "Force field choice may introduce systematic bias", "Solvent model effects unexplored"],
            "low_confidence_claims": ["CLM_0312: AMBER ff14SB adequate for aptamer-protein complexes (confidence: 0.32)"],
            "decision": {
                "decision_id": "DEC_0003",
                "recommended_action": "collaboration",
                "action_description": "Contact the computational chemistry group (Prof. Martinez) for force field parameterization expertise. Their recent paper on RNA-protein interactions used an optimized force field that may apply here.",
                "expected_information_gain": 0.65,
                "linked_goal_id": "GOAL_005",
                "linked_claim_ids": ["CLM_0312"],
                "priority": "medium",
                "estimated_effort": "Initial meeting + 2-4 weeks of collaborative work"
            }
        },
        {
            "goal": "Establish reproducibility of the nanofabrication process",
            "gaps": ["Batch-to-batch variation not quantified", "Critical process parameters not identified", "No statistical process control"],
            "low_confidence_claims": ["CLM_0089: Yield >80% for device fabrication (confidence: 0.25, based on single batch)"],
            "decision": {
                "decision_id": "DEC_0004",
                "recommended_action": "replication",
                "action_description": "Fabricate 5 independent batches of 20 devices each over 5 separate days. Measure key performance metrics. Perform ANOVA to identify day-to-day and within-batch variation. Calculate Cpk for critical parameters.",
                "expected_information_gain": 0.81,
                "linked_goal_id": "GOAL_002",
                "linked_claim_ids": ["CLM_0089"],
                "priority": "high",
                "estimated_effort": "3-4 weeks fabrication + 1 week characterization + 1 week analysis"
            }
        },
        {
            "goal": "Resolve contradictory claims about qubit decoherence mechanism",
            "gaps": ["Two competing theories with similar explanatory power", "No experiment designed to distinguish between mechanisms"],
            "low_confidence_claims": ["CLM_0445: TLS-dominated decoherence (confidence: 0.42)", "CLM_0446: Quasiparticle tunneling dominates above 50mK (confidence: 0.38)"],
            "decision": {
                "decision_id": "DEC_0005",
                "recommended_action": "methodology_review",
                "action_description": "Design a discriminating experiment: measure T1 as a function of temperature (10-200 mK) and applied magnetic field (0-100 mT). TLS model predicts logarithmic T1(T) while quasiparticle model predicts exponential. This single experiment can resolve the contradiction.",
                "expected_information_gain": 0.89,
                "linked_goal_id": "GOAL_007",
                "linked_claim_ids": ["CLM_0445", "CLM_0446"],
                "priority": "high",
                "estimated_effort": "1 week experiment design + 2 weeks measurement + 1 week analysis"
            }
        }
    ]
    for _ in range(n):
        scenario = random.choice(scenarios)
        # Pre-join the bullet lists so the f-string below stays readable.
        # (The original used chr(10).join(...) inside the f-string to dodge
        # the pre-3.12 ban on backslashes in f-string expressions.)
        gaps_block = "\n".join("- " + g for g in scenario["gaps"])
        claims_block = "\n".join("- " + c for c in scenario["low_confidence_claims"])
        user_prompt = f"""Current research goal: {scenario['goal']}
Knowledge gaps:
{gaps_block}
Low-confidence claims requiring resolution:
{claims_block}
Propose a Decision Object with the highest expected information gain."""
        response = json.dumps(scenario["decision"], indent=2)
        examples.append({
            "messages": [
                {"role": "system", "content": SYSTEM_DECISION_GENERATOR},
                {"role": "user", "content": user_prompt},
                {"role": "assistant", "content": response}
            ]
        })
    return examples
# ============================================================
# MAIN: Generate and combine all task datasets
# ============================================================
def main():
    """Generate all six task datasets, shuffle, split 90/10, and persist.

    Writes a HuggingFace DatasetDict to /app/phd_research_os_dataset and a
    JSON sample of 10 examples to /app/sample_examples.json, then prints
    the per-task distribution.

    Returns:
        The saved DatasetDict with "train" and "test" splits.
    """
    from collections import Counter  # stdlib; local import keeps top of file unchanged

    print("Generating PhD Research OS training dataset...")
    print(" Task 1: Claim Extraction (500 examples)...")
    claim_examples = generate_claim_extraction_examples(500)
    print(" Task 2: Epistemic Classification (400 examples)...")
    epistemic_examples = generate_epistemic_classification_examples(400)
    print(" Task 3: Confidence Scoring (300 examples)...")
    confidence_examples = generate_confidence_scoring_examples(300)
    print(" Task 4: Conflict Detection (300 examples)...")
    conflict_examples = generate_conflict_detection_examples(300)
    print(" Task 5: Query Decomposition (200 examples)...")
    query_examples = generate_query_decomposition_examples(200)
    print(" Task 6: Decision Objects (200 examples)...")
    decision_examples = generate_decision_object_examples(200)
    # Combine all examples
    all_examples = (claim_examples + epistemic_examples + confidence_examples +
                    conflict_examples + query_examples + decision_examples)
    # Shuffle so tasks are interleaved in both splits
    random.shuffle(all_examples)
    print(f"\n Total examples: {len(all_examples)}")
    # Split into train/eval (90/10)
    split_idx = int(len(all_examples) * 0.9)
    train_data = all_examples[:split_idx]
    eval_data = all_examples[split_idx:]
    print(f" Train: {len(train_data)}, Eval: {len(eval_data)}")
    # Create HF Dataset
    train_dataset = Dataset.from_list(train_data)
    eval_dataset = Dataset.from_list(eval_data)
    dataset_dict = DatasetDict({
        "train": train_dataset,
        "test": eval_dataset
    })
    # Save locally
    dataset_dict.save_to_disk("/app/phd_research_os_dataset")
    print("\n Dataset saved to /app/phd_research_os_dataset")
    # Also save as JSON for inspection
    with open("/app/sample_examples.json", "w") as f:
        json.dump(all_examples[:10], f, indent=2)
    print(" Sample examples saved to /app/sample_examples.json")
    # Print distribution stats, keyed by the first 50 chars of the system
    # prompt (a cheap proxy for the task type)
    task_counts = Counter(ex["messages"][0]["content"][:50] for ex in all_examples)
    print("\n Task distribution:")
    for task, count in task_counts.items():
        print(f" {task}... : {count}")
    return dataset_dict
# Script entry point: build and save the dataset when run directly.
if __name__ == "__main__":
    main()
|