nkshirsa committed (verified)
Commit: f6c2b19
Parent: a4f0eec

Add SciRIFF training data integration script (72x more data for training)

phd_research_os_v2/training/sciriff_integration.py ADDED
"""
SciRIFF Training Data Integration
=================================
Converts AllenAI's SciRIFF dataset (137K expert-written examples across
54 scientific tasks) into the PhD Research OS ChatML format.

Filters for tasks relevant to our pipeline:
- Claim verification (SciFact tasks)
- Information extraction (SciERC tasks)
- NER and entity recognition
- Summarization (faithful compression)

Addresses blindspots: D-1, D-6, PA-3
Source: SYSTEM_INSPIRATIONS.md DA-3

Dependencies:
    pip install datasets
"""

import hashlib
import logging
from typing import Optional

logger = logging.getLogger(__name__)

# Tasks from SciRIFF that map to our pipeline
RELEVANT_TASK_FAMILIES = {
    "ie",              # Information extraction → Layer 2
    "classification",  # Classification → epistemic tagging
    "summarization",   # Summarization → faithful claim compression
    "qa",              # Question answering → query decomposition
    "entailment",      # Entailment → claim verification
}

# Specific task prefixes that are highly relevant
HIGH_PRIORITY_TASKS = {
    "scifact",             # Claim verification (SUPPORT/CONTRADICT)
    "scierc",              # Scientific entity + relation extraction
    "evidence_inference",  # RCT outcome extraction
    "biosses",             # Biomedical sentence similarity
    "chemprot",            # Chemical-protein interaction extraction
    "ncbi_disease",        # Disease NER
    "pubmedqa",            # Biomedical QA
    "qasper",              # Full-text scientific QA
}

# System prompts to wrap SciRIFF examples in our format
SYSTEM_PROMPTS = {
    "ie": (
        "You are the Claim Extractor of a PhD Research OS. "
        "Extract structured information from scientific text. "
        "Be precise, preserve qualifiers, and output valid JSON."
    ),
    "classification": (
        "You are the Epistemic Classifier of a PhD Research OS. "
        "Classify the given scientific text according to the specified taxonomy. "
        "Consider context, hedging language, and evidence strength."
    ),
    "summarization": (
        "You are the Synthesis Agent of a PhD Research OS. "
        "Summarize scientific text faithfully. Never add information "
        "not present in the source. Preserve all qualifiers and hedging."
    ),
    "qa": (
        "You are the Query Planner of a PhD Research OS. "
        "Answer questions about scientific papers using evidence from the text. "
        "Cite specific passages. Say 'insufficient evidence' when appropriate."
    ),
    "entailment": (
        "You are the Claim Verifier of a PhD Research OS. "
        "Given a claim and evidence, determine if the evidence SUPPORTS, "
        "CONTRADICTS, or provides NOT_ENOUGH_INFO about the claim."
    ),
}


def load_sciriff(config: str = "4096", split: str = "train",
                 max_examples: Optional[int] = None) -> list[dict]:
    """
    Load SciRIFF from HuggingFace and convert to ChatML format.

    Args:
        config: Token-length config ("4096", "8192", "16384")
        split: Dataset split
        max_examples: Limit for quick testing

    Returns:
        List of {"messages": [{"role": "system", ...}, {"role": "user", ...},
        {"role": "assistant", ...}]} records.
    """
    from datasets import load_dataset

    logger.info(f"Loading SciRIFF ({config}/{split})...")
    ds = load_dataset("allenai/SciRIFF", config, split=split, trust_remote_code=True)

    if max_examples:
        ds = ds.select(range(min(max_examples, len(ds))))

    converted = []
    skipped = 0
    task_counts = {}

    for row in ds:
        input_text = row.get("input", "")
        output_text = row.get("output", "")
        metadata = row.get("metadata", {})
        instance_id = row.get("_instance_id", "")

        if not input_text or not output_text:
            skipped += 1
            continue

        # Determine task family from metadata or instance_id
        task_family = None
        if isinstance(metadata, dict):
            task_family = metadata.get("task_family", "")

        # Also check instance_id for task identification
        task_name = instance_id.split(":")[0] if ":" in instance_id else ""

        # Filter for relevant tasks
        is_relevant = False
        if task_family and task_family.lower() in RELEVANT_TASK_FAMILIES:
            is_relevant = True
        for prefix in HIGH_PRIORITY_TASKS:
            if task_name.lower().startswith(prefix):
                is_relevant = True
                break

        if not is_relevant:
            # Still include non-priority tasks (all scientific tasks help),
            # but keep only ~20% of them to maintain focus. Hashing the
            # instance_id makes the subsample deterministic across runs.
            h = int(hashlib.md5(instance_id.encode()).hexdigest(), 16)
            if h % 5 != 0:  # keep ~20%
                skipped += 1
                continue

        # Select system prompt based on task family, defaulting to "ie"
        system_prompt = SYSTEM_PROMPTS.get(
            task_family.lower() if task_family else "ie",
            SYSTEM_PROMPTS["ie"],
        )

        # Build ChatML messages
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": input_text},
            {"role": "assistant", "content": output_text},
        ]

        converted.append({"messages": messages})

        # Track task distribution
        task_key = task_name or task_family or "unknown"
        task_counts[task_key] = task_counts.get(task_key, 0) + 1

    logger.info(
        f"Converted {len(converted)} SciRIFF examples "
        f"(skipped {skipped}, {len(task_counts)} task types)"
    )

    # Log task distribution
    sorted_tasks = sorted(task_counts.items(), key=lambda x: -x[1])
    for task, count in sorted_tasks[:15]:
        logger.info(f"  {task}: {count} examples")

    return converted
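

# Quick smoke test for the converter (illustrative only; requires network
# access to the HuggingFace Hub, and the size 100 is arbitrary):
#
#   examples = load_sciriff(config="4096", max_examples=100)
#   print(examples[0]["messages"][0]["content"])  # the selected system prompt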


def merge_datasets(existing_path: str = "nkshirsa/phd-research-os-sft-data",
                   sciriff_config: str = "4096",
                   sciriff_max: int = 10000,
                   existing_max: Optional[int] = None) -> dict:
    """
    Merge existing PhD Research OS training data with SciRIFF.

    Returns:
        {
            "merged": list of ChatML examples,
            "stats": {
                "existing_count": int,
                "sciriff_count": int,
                "total": int,
                "expansion_factor": float,
            },
        }
    """
    from datasets import load_dataset

    # Load existing data
    logger.info(f"Loading existing data from {existing_path}...")
    existing_ds = load_dataset(existing_path, split="train", trust_remote_code=True)
    existing_examples = [{"messages": row["messages"]} for row in existing_ds]
    if existing_max:
        existing_examples = existing_examples[:existing_max]

    # Load SciRIFF
    sciriff_examples = load_sciriff(config=sciriff_config, max_examples=sciriff_max)

    # Merge
    merged = existing_examples + sciriff_examples

    stats = {
        "existing_count": len(existing_examples),
        "sciriff_count": len(sciriff_examples),
        "total": len(merged),
        "expansion_factor": round(len(merged) / max(len(existing_examples), 1), 1),
    }

    logger.info(
        f"Merged dataset: {stats['existing_count']} existing + "
        f"{stats['sciriff_count']} SciRIFF = {stats['total']} total "
        f"({stats['expansion_factor']}× expansion)"
    )

    return {"merged": merged, "stats": stats}


def create_merged_hf_dataset(output_path: str = "data/merged_sft",
                             sciriff_max: int = 10000,
                             test_ratio: float = 0.1) -> dict:
    """
    Create a merged HuggingFace dataset on disk, ready for training.

    Args:
        output_path: Where to save the dataset
        sciriff_max: Maximum SciRIFF examples to include
        test_ratio: Fraction held out for the test split
    """
    import random

    from datasets import Dataset, DatasetDict

    result = merge_datasets(sciriff_max=sciriff_max)
    all_examples = result["merged"]

    # Shuffle with a fixed seed so the split is reproducible
    random.seed(42)
    random.shuffle(all_examples)

    # Split
    n_test = int(len(all_examples) * test_ratio)
    test_examples = all_examples[:n_test]
    train_examples = all_examples[n_test:]

    # Create HF dataset
    train_ds = Dataset.from_list(train_examples)
    test_ds = Dataset.from_list(test_examples)

    ds_dict = DatasetDict({"train": train_ds, "test": test_ds})
    ds_dict.save_to_disk(output_path)

    logger.info(
        f"Saved merged dataset to {output_path}: "
        f"{len(train_examples)} train, {len(test_examples)} test"
    )

    return {
        "path": output_path,
        "train_count": len(train_examples),
        "test_count": len(test_examples),
        "stats": result["stats"],
    }
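
To tie it together, a minimal end-to-end sketch (the script itself does not
ship this entry point; it assumes the module is importable as
sciriff_integration, the default dataset paths above are reachable, and the
datasets library is installed):

    import logging
    from datasets import load_from_disk
    from sciriff_integration import create_merged_hf_dataset

    logging.basicConfig(level=logging.INFO)
    info = create_merged_hf_dataset(output_path="data/merged_sft", sciriff_max=10000)
    ds = load_from_disk(info["path"])   # DatasetDict with "train" and "test"
    print(info["stats"])                # counts plus expansion_factor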