Spaces:

Babajaan
/

bioinformatics-bb-tutor

Sleeping

File size: 50,085 Bytes

fe2b396

"""
Bioinformatics with BB Tutor — Domain Knowledge Base
Comprehensive bioinformatics content for RAG retrieval and context injection.
"""

# Domain taxonomy: one entry per bioinformatics domain, keyed by a short slug.
# Every entry holds a display "name" and an ordered "subtopics" list of
# lowercase, underscore-separated topic slugs.
DOMAIN_TAXONOMY = {
    # --- Bulk transcriptomics -------------------------------------------------
    "rna_seq": dict(
        name="RNA-seq",
        subtopics=[
            "experimental_design", "library_preparation", "quality_control", "read_alignment",
            "transcript_quantification", "differential_expression", "deseq2", "edger",
            "limma_voom", "gene_set_enrichment", "pathway_analysis", "visualization",
            "normalization", "batch_effects", "multifactor_designs",
        ],
    ),
    # --- DNA sequencing / variants --------------------------------------------
    "exome_sequencing": dict(
        name="Exome Sequencing",
        subtopics=[
            "capture_design", "library_preparation", "quality_control", "read_alignment",
            "duplicate_marking", "base_quality_recalibration", "variant_calling",
            "variant_filtering", "annotation", "coverage_analysis",
            "clinical_interpretation", "acmg_classification",
        ],
    ),
    "genome_sequencing": dict(
        name="Genome Sequencing",
        subtopics=[
            "library_preparation", "sequencing_platforms", "quality_control", "read_alignment",
            "variant_calling_germline", "variant_calling_somatic",
            "structural_variant_detection", "copy_number_analysis", "de_novo_assembly",
            "reference_genomes", "phasing",
        ],
    ),
    # --- Microbial communities ------------------------------------------------
    "microbiome": dict(
        name="Microbiome Analysis",
        subtopics=[
            "16s_amplicon", "shotgun_metagenomics", "library_preparation", "quality_control",
            "otu_clustering", "asv_denoising", "taxonomic_classification", "alpha_diversity",
            "beta_diversity", "differential_abundance", "functional_profiling", "qiime2",
            "dada2", "kraken2", "metaphlan",
        ],
    ),
    # --- Interpretation and background genetics -------------------------------
    "variant_interpretation": dict(
        name="Variant Interpretation",
        subtopics=[
            "variant_types", "vcf_format", "annotation_tools", "population_databases",
            "functional_prediction", "acmg_guidelines", "clinical_significance",
            "pharmacogenomics", "somatic_vs_germline", "variant_effect_predictor", "snpeff",
            "gnomad", "clinvar", "cosmic",
        ],
    ),
    "molecular_genetics": dict(
        name="Molecular Genetics",
        subtopics=[
            "dna_structure", "gene_expression", "transcription", "translation", "splicing",
            "epigenetics", "mutations", "repair_mechanisms", "pcr", "cloning", "crispr",
            "restriction_enzymes", "inheritance_patterns", "genetic_disorders",
            "gene_regulation",
        ],
    ),
    # --- Single-cell and epigenomic assays ------------------------------------
    "single_cell": dict(
        name="Single-cell RNA-seq",
        subtopics=[
            "cell_isolation", "library_preparation", "droplet_based", "plate_based",
            "quality_control", "normalization", "dimensionality_reduction", "clustering",
            "cell_type_annotation", "trajectory_analysis", "differential_expression",
            "batch_correction", "seurat", "scanpy",
        ],
    ),
    "atac_seq": dict(
        name="ATAC-seq",
        subtopics=[
            "chromatin_accessibility", "library_preparation", "quality_control", "alignment",
            "peak_calling", "differential_accessibility", "motif_enrichment", "footprinting",
            "integration_with_rnaseq", "macs2", "macs3",
        ],
    ),
    "chip_seq": dict(
        name="ChIP-seq",
        subtopics=[
            "chromatin_immunoprecipitation", "library_preparation", "quality_control",
            "alignment", "peak_calling", "differential_binding", "motif_analysis",
            "histone_modifications", "transcription_factor_binding", "input_control", "macs2",
        ],
    ),
    "methylation_seq": dict(
        name="Methylation Sequencing",
        subtopics=[
            "bisulfite_sequencing", "library_preparation", "quality_control", "alignment",
            "methylation_calling", "differential_methylation", "cpg_islands",
            "dmr_detection", "bismark", "nanopore_methylation",
        ],
    ),
    # --- Specialised sequencing assays ----------------------------------------
    "small_rna_seq": dict(
        name="Small RNA-seq",
        subtopics=[
            "mirna", "sirna", "pirna", "library_preparation", "adapter_trimming",
            "alignment", "quantification", "differential_expression", "target_prediction",
            "mirbase",
        ],
    ),
    "targeted_sequencing": dict(
        name="Targeted Sequencing Panels",
        subtopics=[
            "panel_design", "amplicon_vs_capture", "library_preparation", "quality_control",
            "variant_calling", "coverage_uniformity", "clinical_panels", "oncology_panels",
            "pharmacogenomics_panels",
        ],
    ),
    "long_read_sequencing": dict(
        name="Long-read Sequencing",
        subtopics=[
            "pacbio", "oxford_nanopore", "library_preparation", "base_calling",
            "error_correction", "assembly", "structural_variant_detection",
            "full_length_transcripts", "epigenetic_modifications", "hifi_reads",
        ],
    ),
    "spatial_transcriptomics": dict(
        name="Spatial Transcriptomics",
        subtopics=[
            "visium", "slide_seq", "merfish", "seqfish", "spatial_clustering",
            "deconvolution", "ligand_receptor_analysis", "tissue_architecture",
        ],
    ),
    # --- Cross-assay integration ----------------------------------------------
    "multi_omics": dict(
        name="Multi-omics Integration",
        subtopics=[
            "data_integration_strategies", "multi_omics_factor_analysis",
            "weighted_correlation_network_analysis", "joint_dimensionality_reduction",
            "regulatory_network_inference", "mofa", "wgcna",
        ],
    ),
}


# ============================================================================
# WORKFLOW KNOWLEDGE — Step-by-step pipeline descriptions
# ============================================================================

# Analysis pipelines, keyed by workflow id. Each workflow holds:
#   "name"       - display title
#   "domain"     - a key of DOMAIN_TAXONOMY
#   "difficulty" - level label ("intermediate" / "advanced" in the entries below)
#   "steps"      - ordered list of step dicts, numbered from 1, each with
#                  "step", "name", "description", "tools" (may be empty) and
#                  "common_mistakes" (every entry is phrased as a mistake to avoid)
WORKFLOWS = {
    # Bulk RNA-seq: raw reads -> counts -> differential expression -> enrichment.
    "rna_seq_bulk": {
        "name": "Bulk RNA-seq Differential Expression Workflow",
        "domain": "rna_seq",
        "difficulty": "intermediate",
        "steps": [
            {
                "step": 1,
                "name": "Experimental Design",
                "description": "Plan biological replicates (minimum 3 per condition, ideally 5+), randomize sample processing, consider batch effects. Define clear hypotheses and contrasts.",
                "tools": [],
                "common_mistakes": [
                    "Using fewer than 3 biological replicates per condition",
                    "Confusing biological replicates with technical replicates",
                    "Not planning for batch effects in sample processing"
                ]
            },
            {
                "step": 2,
                "name": "Quality Control of Raw Reads",
                "description": "Assess raw FASTQ quality using FastQC. Check per-base quality scores, GC content, adapter contamination, sequence duplication levels, and overrepresented sequences. Use MultiQC to aggregate results across samples.",
                "tools": ["FastQC", "MultiQC"],
                "common_mistakes": [
                    "Skipping QC entirely and going straight to alignment",
                    "Not checking for adapter contamination",
                    "Ignoring GC content bias which may indicate contamination"
                ]
            },
            {
                "step": 3,
                "name": "Read Trimming and Filtering",
                "description": "Remove adapter sequences and low-quality bases. Trim reads with quality score < 20 from 3' end. Remove reads shorter than 36bp after trimming.",
                "tools": ["Trimmomatic", "Cutadapt", "fastp"],
                "common_mistakes": [
                    "Over-trimming reads, reducing mappability",
                    "Not trimming adapters when they are present",
                    "Using wrong adapter sequences for the library kit"
                ]
            },
            {
                "step": 4,
                "name": "Read Alignment / Quantification",
                "description": "Two approaches: (A) Alignment-based: align reads to reference genome with STAR or HISAT2, then count with featureCounts/HTSeq. (B) Pseudo-alignment: quantify directly with Salmon or kallisto (faster, alignment-free). Both require a reference genome/transcriptome and gene annotation (GTF/GFF).",
                "tools": ["STAR", "HISAT2", "Salmon", "kallisto", "featureCounts", "HTSeq"],
                "common_mistakes": [
                    "Using wrong genome build (e.g., hg19 vs hg38)",
                    # Phrased as a mistake (not as advice) so it reads like its siblings.
                    "Setting STAR --sjdbOverhang incorrectly — it should equal read_length - 1",
                    "Forgetting to build the genome index before alignment",
                    "Not providing gene annotation file to the aligner"
                ]
            },
            {
                "step": 5,
                "name": "Alignment Quality Control",
                "description": "Check mapping rate (expect >70-80% for well-prepared libraries), check strandedness, assess gene body coverage, check for 3'/5' bias, verify expected insert size distribution.",
                "tools": ["RSeQC", "Picard", "samtools flagstat", "MultiQC"],
                "common_mistakes": [
                    "Accepting low mapping rates (<60%) without investigation",
                    "Not checking strandedness — wrong strand setting causes ~50% count loss",
                    "Ignoring high duplicate rates which may indicate low library complexity"
                ]
            },
            {
                "step": 6,
                "name": "Count Matrix Generation",
                "description": "Generate a gene-by-sample count matrix from BAM files (if alignment-based) or from Salmon/kallisto quantification files. Use raw integer counts — not normalized values. Import Salmon/kallisto results using tximeta or tximport.",
                "tools": ["featureCounts", "HTSeq-count", "tximeta", "tximport"],
                "common_mistakes": [
                    "Using TPM/FPKM/RPKM as input to DESeq2 — it requires raw counts",
                    "Not using tximeta/tximport to properly import pseudo-alignment results",
                    "Double-counting multi-mapped reads"
                ]
            },
            {
                "step": 7,
                "name": "Differential Expression Analysis",
                "description": "Use DESeq2, edgeR, or limma-voom to identify differentially expressed genes. Include all relevant covariates in the design formula (e.g., ~batch + condition). Filter lowly expressed genes before analysis. Use adjusted p-values (BH/FDR) for multiple testing correction.",
                "tools": ["DESeq2", "edgeR", "limma-voom"],
                "common_mistakes": [
                    "Using raw p-values instead of adjusted p-values (padj/FDR)",
                    "Not including batch effects in the design formula",
                    "Feeding normalized counts (TPM) to DESeq2 instead of raw counts",
                    # Threshold tests use results(lfcThreshold=...); lfcShrink only
                    # shrinks effect-size estimates.
                    "Setting log2FC threshold without statistical testing (use lfcThreshold in DESeq2 results())",
                    "Not filtering lowly expressed genes before analysis"
                ]
            },
            {
                "step": 8,
                "name": "Visualization and Interpretation",
                "description": "Create MA plots, volcano plots, PCA plots, heatmaps of top DE genes. Interpret results in biological context. Perform gene set enrichment analysis (GSEA) or over-representation analysis (ORA).",
                "tools": ["ggplot2", "EnhancedVolcano", "pheatmap", "clusterProfiler", "fgsea", "GSEA"],
                "common_mistakes": [
                    "Interpreting volcano plot points with high FC but non-significant p-value as important",
                    "Not using shrunken LFC estimates for ranking in GSEA",
                    "Cherry-picking individual genes without considering global patterns",
                    "Using arbitrary FC cutoffs without biological justification"
                ]
            }
        ]
    },
    # Exome germline variant calling: QC -> alignment -> calling -> annotation.
    "exome_seq_variant_calling": {
        "name": "Exome Sequencing Variant Calling Workflow",
        "domain": "exome_sequencing",
        "difficulty": "intermediate",
        "steps": [
            {
                "step": 1,
                "name": "Quality Control of Raw Reads",
                "description": "Assess FASTQ quality with FastQC. Check for adapter contamination, base quality, and GC content. For exome data, also verify expected insert size for the capture kit.",
                "tools": ["FastQC", "MultiQC"],
                "common_mistakes": [
                    "Not checking for adapter read-through in short inserts"
                ]
            },
            {
                "step": 2,
                "name": "Read Trimming",
                "description": "Trim adapters and low-quality bases. Less aggressive trimming than RNA-seq since variant callers account for base quality.",
                "tools": ["fastp", "Trimmomatic", "Cutadapt"],
                "common_mistakes": [
                    "Over-aggressive quality trimming can remove true variant-supporting reads"
                ]
            },
            {
                "step": 3,
                "name": "Read Alignment",
                "description": "Align reads to the reference genome (GRCh37/hg19 or GRCh38/hg38) using BWA-MEM. Sort the output BAM file by coordinate.",
                "tools": ["BWA-MEM", "BWA-MEM2", "samtools sort"],
                "common_mistakes": [
                    "Using wrong reference genome version — coordinates are NOT interchangeable",
                    "Not adding read group information (@RG tags) — GATK requires this",
                    "Using a genome build inconsistent with the capture kit BED file"
                ]
            },
            {
                "step": 4,
                "name": "Post-alignment Processing",
                "description": "Mark PCR duplicates (do NOT remove them for variant calling — just flag them). Perform Base Quality Score Recalibration (BQSR) using known variant sites (dbSNP, Mills indels).",
                "tools": ["Picard MarkDuplicates", "GATK BaseRecalibrator", "GATK ApplyBQSR", "samtools markdup"],
                "common_mistakes": [
                    "Removing duplicates instead of marking them",
                    "Skipping BQSR — significantly impacts variant quality scores",
                    "Using wrong known-sites VCF version for BQSR"
                ]
            },
            {
                "step": 5,
                "name": "Coverage Analysis",
                "description": "Assess on-target coverage, mean depth, and uniformity. For clinical exome sequencing, ensure >20x coverage over >95% of target regions. Check for regions with no coverage.",
                "tools": ["Picard CollectHsMetrics", "mosdepth", "bedtools coverage", "samtools depth"],
                "common_mistakes": [
                    "Not restricting analysis to the capture target BED file",
                    "Confusing mean coverage with median — mean is inflated by high-coverage regions",
                    "Accepting results with large uncovered regions without investigation"
                ]
            },
            {
                "step": 6,
                "name": "Variant Calling",
                "description": "Call germline variants using GATK HaplotypeCaller in GVCF mode. For cohort analysis, use GenomicsDBImport and GenotypeGVCFs. Restrict calling to the exome target regions using -L flag with the BED file.",
                "tools": ["GATK HaplotypeCaller", "GATK GenotypeGVCFs", "DeepVariant"],
                "common_mistakes": [
                    "Not restricting variant calling to target regions — wastes compute and introduces noise",
                    "Not using GVCF mode for cohort calling",
                    "Ignoring the distinction between GVCF (per-sample) and final VCF (joint-called)"
                ]
            },
            {
                "step": 7,
                "name": "Variant Filtering",
                "description": "Apply variant quality filters. For small cohorts use hard filtering (QD<2, FS>60 for SNPs; QD<2, FS>200 for indels). For large cohorts (>30 samples) use VQSR. Filter separately for SNPs and indels.",
                "tools": ["GATK VariantFiltration", "GATK VQSR", "bcftools filter"],
                "common_mistakes": [
                    "Applying the same filters to SNPs and indels",
                    "Using VQSR with too few samples (<30)",
                    "Not understanding that FILTER=PASS means different things in hard-filtered vs VQSR results"
                ]
            },
            {
                "step": 8,
                "name": "Variant Annotation and Interpretation",
                "description": "Annotate variants with gene names, consequence (missense, nonsense, splice, etc.), population frequencies (gnomAD), clinical significance (ClinVar), and functional predictions (SIFT, PolyPhen, CADD). Apply ACMG guidelines for clinical interpretation.",
                "tools": ["VEP (Ensembl)", "SnpEff", "ANNOVAR", "ClinVar", "gnomAD", "InterVar"],
                "common_mistakes": [
                    "Using outdated annotation databases",
                    "Confusing pathogenic predictions with clinical pathogenicity",
                    "Not checking population frequency — common variants are rarely pathogenic",
                    "Over-relying on in-silico predictions without considering other evidence"
                ]
            }
        ]
    },
    # 16S amplicon microbiome analysis, described in terms of QIIME2 commands.
    "microbiome_16s": {
        "name": "16S Amplicon Microbiome Analysis Workflow",
        "domain": "microbiome",
        "difficulty": "intermediate",
        "steps": [
            {
                "step": 1,
                "name": "Import and Demultiplex",
                "description": "Import raw sequencing data into QIIME2. Demultiplex if not already done by the sequencing facility. Verify sample metadata file format (TSV with #SampleID as first column).",
                "tools": ["QIIME2 import", "QIIME2 demux"],
                "common_mistakes": [
                    "Wrong import format — QIIME2 has many import types (Casava, EMP, manifest)",
                    "Metadata file formatting errors (wrong column names, missing #SampleID header)",
                    "Mixing different sequencing runs without tracking batch information"
                ]
            },
            {
                "step": 2,
                "name": "Quality Control and Denoising",
                "description": "Use DADA2 (preferred) or Deblur for denoising. DADA2 corrects sequencing errors and generates Amplicon Sequence Variants (ASVs). Set truncation parameters based on quality score plots (truncate where median quality drops below 25-30).",
                "tools": ["DADA2", "Deblur", "QIIME2 dada2 denoise-paired"],
                "common_mistakes": [
                    "Setting truncation length too aggressively — paired reads must still overlap",
                    "Using OTU clustering (97%) instead of ASV denoising — ASVs are now standard",
                    "Not checking the denoising stats — high percentage of reads lost indicates problems"
                ]
            },
            {
                "step": 3,
                "name": "Taxonomic Classification",
                "description": "Classify ASVs using a pre-trained classifier (e.g., Silva 138, Greengenes2) or BLAST-based methods. The classifier must match the primer pair and target region (V3-V4, V4, etc.).",
                "tools": ["QIIME2 classify-sklearn", "QIIME2 classify-consensus-blast", "Silva", "Greengenes2"],
                "common_mistakes": [
                    "Using a classifier not trained on the same region as your primers",
                    "Expecting species-level resolution from 16S — most regions resolve to genus at best",
                    "Not removing chloroplast and mitochondrial sequences"
                ]
            },
            {
                "step": 4,
                "name": "Diversity Analysis",
                "description": "Calculate alpha diversity (within-sample richness: Shannon, observed features, Faith's PD) and beta diversity (between-sample dissimilarity: Bray-Curtis, UniFrac). Rarefy to even sampling depth first. Choose rarefaction depth that retains most samples while maximizing reads.",
                "tools": ["QIIME2 diversity core-metrics-phylogenetic"],
                "common_mistakes": [
                    "Confusing alpha diversity (within-sample) with beta diversity (between-sample)",
                    "Not rarefying before diversity analysis — sequencing depth confounds results",
                    "Setting rarefaction depth too low (losing information) or too high (losing samples)",
                    "Using Bray-Curtis when phylogenetic distance matters — use UniFrac instead"
                ]
            },
            {
                "step": 5,
                "name": "Statistical Testing and Visualization",
                "description": "Test alpha diversity differences with Kruskal-Wallis. Test beta diversity with PERMANOVA (adonis). Create PCoA plots, taxa barplots, and heatmaps. For differential abundance, use ANCOM-BC2 or ALDEx2 (compositional-aware methods).",
                # Tool name kept in sync with the method named in the description.
                "tools": ["QIIME2 diversity alpha-group-significance", "QIIME2 diversity beta-group-significance", "ANCOM-BC2", "ALDEx2", "LEfSe"],
                "common_mistakes": [
                    "Using standard t-tests on relative abundances — compositional data requires special methods",
                    "Not accounting for compositionality (CLR transform or Dirichlet models needed)",
                    "Over-interpreting low-abundance taxa differences",
                    "Not correcting for multiple testing across many taxa"
                ]
            }
        ]
    },
    # Single-cell RNA-seq: cell calling -> QC -> normalization -> clustering -> annotation.
    "single_cell_rnaseq": {
        "name": "Single-cell RNA-seq Analysis Workflow",
        "domain": "single_cell",
        "difficulty": "advanced",
        "steps": [
            {
                "step": 1,
                "name": "Pre-processing and Cell Calling",
                "description": "Process raw BCL/FASTQ files with Cell Ranger (10x) or STARsolo. Generates a cell-by-gene count matrix. Uses UMI (Unique Molecular Identifier) counts to remove PCR duplicates.",
                "tools": ["Cell Ranger", "STARsolo", "alevin-fry"],
                "common_mistakes": [
                    "Not checking the knee plot — incorrect cell calling inflates empty droplets or loses real cells",
                    "Using wrong reference transcriptome version"
                ]
            },
            {
                "step": 2,
                "name": "Quality Control and Filtering",
                "description": "Filter cells based on: (1) number of detected genes (typically 200-5000), (2) total UMI counts, (3) mitochondrial gene percentage (<10-20% depending on tissue). Remove doublets using scrublet or DoubletFinder.",
                "tools": ["Seurat", "Scanpy", "scrublet", "DoubletFinder"],
                "common_mistakes": [
                    "Using fixed QC thresholds across different tissues — liver cells have higher mito% than blood",
                    "Not removing doublets — they form artificial intermediate clusters",
                    "Being too strict or too lenient with filtering — check distributions"
                ]
            },
            {
                "step": 3,
                "name": "Normalization and Feature Selection",
                "description": "Normalize counts (e.g., SCTransform in Seurat or scran normalization). Select highly variable genes (HVGs, typically 2000-3000) for downstream analysis.",
                "tools": ["Seurat SCTransform", "Scanpy normalize_total + log1p", "scran"],
                "common_mistakes": [
                    "Using bulk RNA-seq normalization methods (TPM, FPKM) on single-cell data",
                    "Selecting too few or too many HVGs — affects clustering resolution"
                ]
            },
            {
                "step": 4,
                "name": "Dimensionality Reduction and Clustering",
                "description": "PCA on HVGs → select significant PCs (elbow plot) → build k-nearest-neighbor graph → Leiden/Louvain clustering → UMAP/t-SNE for visualization.",
                "tools": ["Seurat", "Scanpy"],
                "common_mistakes": [
                    "Over-interpreting UMAP distances — UMAP preserves local, not global structure",
                    "Not testing multiple clustering resolutions — too low merges cell types, too high splits them",
                    "Using t-SNE for large datasets (>50k cells) — too slow, use UMAP"
                ]
            },
            {
                "step": 5,
                "name": "Cell Type Annotation",
                "description": "Annotate clusters using marker genes, reference datasets (SingleR, CellTypist), or manual curation. Verify with known canonical markers for expected cell types.",
                "tools": ["SingleR", "CellTypist", "Azimuth", "scType"],
                "common_mistakes": [
                    "Relying solely on automated annotation without checking marker genes",
                    "Expecting automated tools to identify rare or novel cell types",
                    "Not considering tissue context — same marker may mean different things in different tissues"
                ]
            }
        ]
    }
}


# ============================================================================
# GLOSSARY — Key bioinformatics terms
# ============================================================================

# Term -> one-paragraph definition. Keys are display strings (some name a
# comparison such as "TPM vs FPKM vs CPM"); values are plain-text explanations.
GLOSSARY = {
    "FASTQ": "Text-based format for storing nucleotide sequences and their quality scores. Each read has 4 lines: header (@), sequence, separator (+), quality (Phred+33 ASCII encoded).",
    "BAM": "Binary Alignment Map — compressed binary version of SAM format storing aligned sequencing reads with quality scores, mapping positions, and flags.",
    "VCF": "Variant Call Format — standard text file for storing genetic variant information including position, reference/alternate alleles, quality, filter status, and sample genotypes.",
    "FASTQ Quality Scores (Phred)": "Q = -10 * log10(P_error). Q20 = 1% error rate, Q30 = 0.1% error rate, Q40 = 0.01%. Most Illumina data is Phred+33 encoded.",
    "Read Depth / Coverage": "Number of sequencing reads aligned to a given position. Mean coverage = total aligned bases / genome size. Clinical WES typically requires >100x mean target coverage.",
    "GRCh37/hg19 vs GRCh38/hg38": "Human reference genome assemblies. hg38 (2013) is current standard with better representation of centromeres, alt loci, and decoy sequences. Coordinates are NOT interchangeable — use liftOver to convert.",
    "FDR / BH Correction": "False Discovery Rate / Benjamini-Hochberg method for multiple testing correction. Controls the expected proportion of false positives among rejected hypotheses. Standard threshold: FDR < 0.05.",
    "TPM vs FPKM vs CPM": "TPM (Transcripts Per Million): gene length + library size normalized, comparable across samples. FPKM (Fragments Per Kilobase per Million): similar but NOT directly comparable across samples. CPM: only library size normalized, not gene length.",
    "Log2 Fold Change": "log2(condition/control). log2FC=1 means 2x increase, log2FC=-1 means 2x decrease, log2FC=0 means no change. Used in DE analysis because it makes up/down regulation symmetric.",
    "DESeq2": "R/Bioconductor package for differential gene expression analysis. Uses negative binomial distribution. Requires RAW integer counts as input (NOT TPM/FPKM). Performs internal normalization using median-of-ratios method.",
    "GATK": "Genome Analysis Toolkit by Broad Institute. Gold-standard pipeline for germline and somatic variant calling. Key tools: HaplotypeCaller, Mutect2, BaseRecalibrator.",
    "UMI": "Unique Molecular Identifier — short random barcode attached during library preparation. Allows identification and removal of PCR duplicates. Essential for accurate quantification in single-cell and low-input protocols.",
    "ASV vs OTU": "ASV (Amplicon Sequence Variant): exact sequences resolved by DADA2, single-nucleotide resolution. OTU (Operational Taxonomic Unit): clustered at 97% similarity, now considered outdated. ASVs are reproducible across studies.",
    "Alpha Diversity": "Within-sample diversity. Measures: Shannon (richness + evenness), Observed Features (richness only), Faith's PD (phylogenetic diversity). Higher = more diverse community.",
    "Beta Diversity": "Between-sample diversity/dissimilarity. Measures: Bray-Curtis (abundance-based), Jaccard (presence/absence), UniFrac (phylogenetic, weighted or unweighted). Visualized with PCoA/NMDS.",
    # Full organisation name; the 2015 standard is a joint ACMG/AMP recommendation.
    "ACMG Guidelines": "American College of Medical Genetics and Genomics (ACMG/AMP 2015) classification system for variant interpretation: Pathogenic, Likely Pathogenic, VUS (Variant of Uncertain Significance), Likely Benign, Benign. Based on population data, computational predictions, functional data, segregation, and literature.",
    "Batch Effect": "Systematic technical variation between groups of samples processed at different times, locations, or by different personnel. Must be accounted for in experimental design and statistical analysis. Can be addressed with ComBat, limma removeBatchEffect, or inclusion in statistical model.",
    "UMAP": "Uniform Manifold Approximation and Projection — dimensionality reduction for visualization. Preserves local structure (nearby points stay near) but NOT global structure (distant clusters may be arbitrarily placed). Common in single-cell analysis.",
    "Rarefaction": "Subsampling reads to equal depth across samples for fair diversity comparison. Necessary because deeper sequencing artificially inflates observed diversity. Controversial — some prefer alternative normalization methods.",
    "BQSR": "Base Quality Score Recalibration (GATK). Machine-learns systematic errors in base quality scores using known variant sites. Critical for accurate variant calling — skipping it can increase false positive/negative rates.",
    "Structural Variant (SV)": "Large genomic rearrangement: deletion, duplication, inversion, translocation, or insertion >50bp. Detected with specialized callers (Manta, Delly, LUMPY) that use split reads, discordant pairs, and read depth.",
    "Bisulfite Sequencing": "Gold standard for single-base resolution DNA methylation detection. Bisulfite converts unmethylated cytosines to uracil, methylated cytosines remain. Specialized aligners (Bismark) needed.",
    "PCR": "Polymerase Chain Reaction — amplifies specific DNA sequences using primers, DNA polymerase, and thermal cycling (denature 95°C → anneal 50-65°C → extend 72°C). Foundation of molecular biology.",
    "CRISPR-Cas9": "Genome editing system using guide RNA to direct Cas9 nuclease to specific DNA sequences. Creates double-strand breaks that can knock out genes (NHEJ) or insert sequences (HDR). Off-target effects are a major concern.",
    "SNP vs SNV vs Indel": "SNP (Single Nucleotide Polymorphism): germline variant present in population. SNV (Single Nucleotide Variant): any single-base change, often used for somatic variants. Indel: insertion or deletion of 1+ bases.",
    # Sample counts are release-specific, so each figure is tied to its release
    # rather than mixing totals from different versions.
    "gnomAD": "Genome Aggregation Database — population allele frequencies aggregated from large-scale sequencing (v2.1: 125,748 exomes and 15,708 genomes on GRCh37; v4: 730,947 exomes and 76,215 genomes on GRCh38). Essential for filtering common variants in clinical analysis. Variants with gnomAD AF >1% are almost never pathogenic for rare Mendelian diseases.",
    "ClinVar": "NCBI database of variant-disease relationships with clinical significance classifications submitted by labs worldwide. Key resource for variant interpretation but submissions vary in quality and evidence level.",
}


# ============================================================================
# COMMON MISCONCEPTIONS — For quiz generation and misconception correction
# ============================================================================

# Each entry is a dict with exactly these keys:
#   "domain"        - topic key the misconception belongs to (e.g. "rna_seq")
#   "misconception" - the incorrect belief, stated as a student might say it
#   "correction"    - the accurate statement, with the reasoning behind it
#   "severity"      - "critical" or "moderate" (the only values used here)
#   "level"         - "beginner" or "intermediate" (the only values used here)
COMMON_MISCONCEPTIONS = [
    {
        "domain": "rna_seq",
        "misconception": "DESeq2 can accept TPM or FPKM values as input.",
        "correction": "DESeq2 requires raw, unnormalized integer counts. It performs its own internal normalization using the median-of-ratios method. Providing pre-normalized values (TPM, FPKM, RPKM) will produce incorrect results.",
        "severity": "critical",
        "level": "beginner"
    },
    {
        "domain": "rna_seq",
        "misconception": "A p-value of 0.001 means there is a 99.9% probability that the gene is differentially expressed.",
        # A p-value is a tail probability: "at least this extreme", not "this extreme".
        "correction": "A p-value of 0.001 means: if the gene is NOT differentially expressed (null hypothesis true), there is a 0.1% probability of observing data at least this extreme. It does NOT tell you the probability of the hypothesis being true (that would be Bayesian posterior probability).",
        "severity": "critical",
        "level": "beginner"
    },
    {
        "domain": "rna_seq",
        "misconception": "You can use raw p-values to determine significance when testing thousands of genes.",
        "correction": "When testing thousands of genes simultaneously, multiple testing correction is MANDATORY. Use adjusted p-values (padj/FDR) from Benjamini-Hochberg correction. With 20,000 genes tested at p<0.05, you'd expect ~1,000 false positives by chance alone.",
        "severity": "critical",
        "level": "beginner"
    },
    {
        "domain": "exome_sequencing",
        "misconception": "Removing PCR duplicates is better than marking them for variant calling.",
        # GATK callers drop flagged duplicates via a default read filter
        # (NotDuplicateReadFilter); they are not down-weighted.
        "correction": "For variant calling, duplicates should be MARKED (flagged in the BAM) rather than physically removed. GATK tools such as HaplotypeCaller skip reads carrying the duplicate flag by default, so marking yields the same calls as removal while keeping the alignment file complete and the decision reversible. Physically removing the reads discards data permanently and prevents re-analysis with different duplicate handling.",
        "severity": "moderate",
        "level": "intermediate"
    },
    {
        "domain": "variant_interpretation",
        "misconception": "A variant predicted as 'damaging' by SIFT/PolyPhen is clinically pathogenic.",
        "correction": "In-silico prediction tools (SIFT, PolyPhen, CADD) predict functional EFFECT, not clinical pathogenicity. Many 'damaging' variants are benign in clinical context. ACMG guidelines use these as one piece of evidence (PP3 criterion) among many required for pathogenicity classification.",
        "severity": "critical",
        "level": "intermediate"
    },
    {
        "domain": "microbiome",
        "misconception": "16S rRNA sequencing can reliably identify bacteria to the species level.",
        "correction": "16S rRNA variable regions (V3-V4, V4) typically provide reliable classification to the GENUS level. Species-level classification requires full-length 16S or whole-genome approaches. Many closely related species have identical V4 sequences.",
        "severity": "moderate",
        "level": "beginner"
    },
    {
        "domain": "microbiome",
        "misconception": "You can compare raw ASV/OTU counts directly between samples without normalization.",
        "correction": "Different samples have different sequencing depths, making raw counts incomparable. Use rarefaction (subsampling to even depth) for diversity analysis, or compositional-aware methods (CLR transformation, ANCOM-BC) for differential abundance testing.",
        "severity": "critical",
        "level": "beginner"
    },
    {
        "domain": "single_cell",
        "misconception": "Distances between clusters on a UMAP plot reflect biological similarity.",
        "correction": "UMAP preserves LOCAL structure (nearby points in high dimensions remain nearby) but does NOT preserve GLOBAL structure. The distance between two clusters on a UMAP plot has no biological meaning — two clusters could appear far apart but be transcriptionally similar.",
        "severity": "moderate",
        "level": "intermediate"
    },
    {
        "domain": "molecular_genetics",
        "misconception": "mRNA expression levels directly predict protein abundance.",
        "correction": "mRNA and protein levels are only weakly correlated (R² ≈ 0.4 in many studies). Post-transcriptional regulation (miRNA, RNA-binding proteins), translation efficiency, and protein degradation rates all contribute to the discrepancy.",
        "severity": "moderate",
        "level": "beginner"
    },
    {
        "domain": "genome_sequencing",
        "misconception": "hg19 and hg38 coordinates are interchangeable.",
        "correction": "GRCh37/hg19 and GRCh38/hg38 have different coordinate systems due to sequence additions, corrections, and rearrangements. You MUST use liftOver (UCSC) or CrossMap to convert coordinates between assemblies. Mixing them leads to incorrect variant positions.",
        "severity": "critical",
        "level": "beginner"
    }
]


# ============================================================================
# SYSTEM PROMPTS — Per-module tutor personas
# ============================================================================

# Maps a tutor module key to the system prompt (persona plus rules) for that
# module. Every value is a plain multi-line string with no placeholders.
SYSTEM_PROMPTS = {
    # General teaching persona: numbered core rules, safety boundaries, and the
    # list of covered domains.
    "ask_tutor": """You are BB Tutor, an expert bioinformatics teaching assistant. Your role is to teach bioinformatics concepts clearly and accurately.

CORE RULES:
1. TEACH in layers: start with the high-level concept, then go deeper when asked.
2. ALWAYS distinguish between: established facts, common practices, and your interpretations.
3. CITE specific tools, databases, and methods by name — never be vague.
4. When discussing statistical concepts, be PRECISE about definitions (p-values, FDR, fold change).
5. When discussing variants, NEVER provide clinical interpretations — you are a teaching tool, not a clinical system.
6. If uncertain about a fact, SAY SO explicitly. Use phrases like "I'm not confident about this specific detail" rather than guessing.
7. CORRECT common misconceptions when you encounter them — explain WHY the misconception is wrong.
8. Use concrete examples: specific tools, specific parameter values, specific thresholds.
9. Format responses with clear headers, bullet points, and code blocks where appropriate.
10. For workflow questions, present steps in order with tool names and key parameters.

SAFETY BOUNDARIES:
- You are an EDUCATIONAL assistant. Never provide clinical diagnoses or treatment recommendations.
- Never interpret specific patient variants as pathogenic/benign without massive caveats.
- Always recommend consulting clinical genetics professionals for patient data.
- Flag when a question moves from educational territory into clinical territory.

DOMAINS: RNA-seq, exome sequencing, genome sequencing, microbiome analysis, variant interpretation, molecular genetics, single-cell RNA-seq, ATAC-seq, ChIP-seq, methylation sequencing, small RNA-seq, targeted panels, long-read sequencing, spatial transcriptomics, multi-omics integration.""",

    # Explains an uploaded document or figure; forbids inventing details that
    # are not in the upload.
    "upload_explain": """You are BB Tutor analyzing an uploaded document or figure. Your role is to:
1. Identify what type of bioinformatics content this is (paper, results table, QC report, plot, etc.)
2. Explain the key findings or content in clear, educational language
3. Point out important patterns, potential issues, or notable features
4. Relate the content to relevant bioinformatics concepts and workflows
5. Suggest follow-up questions the student might want to explore

Be specific about what you observe. If it's a figure, describe the axes, trends, and what they mean biologically. If it's a table, highlight the important columns and values. If it's a paper abstract, identify the key methods and findings.

NEVER fabricate details that aren't in the uploaded content. If you can't read something clearly, say so.""",

    # Quiz generator persona: question types and per-format rules (MCQ,
    # True/False, Short Answer).
    "quiz_me": """You are BB Tutor generating a bioinformatics quiz. Generate questions that test understanding, not just recall.

QUESTION TYPES:
- Conceptual understanding (what does this mean?)
- Workflow reasoning (what comes next? what went wrong?)
- Tool selection (which tool for this task?)
- Result interpretation (what does this output tell us?)
- Misconception traps (common wrong answers should be plausible distractors)
- Troubleshooting (given this error/result, diagnose the problem)

FORMAT RULES:
- For MCQs: provide 4 options labeled A-D, with exactly one correct answer
- For True/False: provide a statement that tests a common misconception
- For Short Answer: ask specific questions with concrete expected answers
- Always include a brief explanation of why the correct answer is correct
- Include the difficulty level: Beginner, Intermediate, or Advanced

Make distractors plausible — they should represent real misconceptions students have.""",

    # Lesson author persona: required lesson components, sized for a
    # 30-60 minute session.
    "build_lesson": """You are BB Tutor creating structured educational content. Generate well-organized lessons with:
1. Clear learning objectives (3-5 per lesson)
2. Prerequisite knowledge listed
3. Concept explanations with examples
4. Workflow steps with tool names and parameters
5. Common pitfalls and how to avoid them
6. Practice exercises
7. Key takeaways / summary

Structure content for a 30-60 minute teaching session. Use progressive disclosure — start simple, build complexity. Include both conceptual knowledge and practical skills.""",

    # Pipeline coach persona: step-by-step guidance with WHAT/WHY/HOW per step.
    "workflow_coach": """You are BB Tutor acting as a bioinformatics workflow coach. Guide the student through analysis pipelines step by step.

COACHING APPROACH:
1. Ask what the student is trying to achieve (research question, data type, organism)
2. Present the appropriate workflow with numbered steps
3. For each step: explain WHAT, WHY, and HOW (specific tools and key parameters)
4. Highlight decision points where the student needs to make choices
5. Point out common mistakes at each step
6. Suggest quality checkpoints between steps

Be specific about commands, parameters, and expected outputs. Use code blocks for command-line examples. Always mention what the expected output should look like so the student can verify they're on track.""",

    # Turns a research paper into teaching material, focused on its methods.
    "paper_to_lesson": """You are BB Tutor converting a research paper into teaching material. Your role is to:
1. Identify the key methods and bioinformatics techniques used
2. Break down the analysis pipeline into teachable steps
3. Extract the important concepts a student should learn
4. Create learning objectives based on the paper's methods
5. Generate practice questions inspired by the paper
6. Identify what background knowledge is needed to understand the paper

Focus on the METHODS — what bioinformatics was done, what tools were used, what design decisions were made, and why. Don't just summarize the biological findings.""",

    # Oral-exam examiner persona: one question at a time, probing deeper based
    # on the student's answers.
    "viva_practice": """You are BB Tutor conducting a viva voce (oral examination) practice session. You are the examiner.

VIVA APPROACH:
1. Start with a broad question about the topic
2. Based on the student's answer, probe deeper into specific areas
3. If the student's answer is correct, push to the next level of complexity
4. If the student's answer is wrong or incomplete, guide them with follow-up questions (don't just give the answer)
5. Test both conceptual understanding and practical knowledge
6. Periodically summarize what the student has demonstrated well and areas to improve

Be encouraging but rigorous. A good viva examiner helps the student show what they know while probing the boundaries of their understanding. Ask ONE question at a time and wait for the response before continuing."""
}


# ============================================================================
# QUIZ TEMPLATES — Structured templates for quiz generation
# ============================================================================

# Prompt templates keyed by question format ("mcq", "true_false",
# "short_answer"). Each template has three placeholders: {n}, {topic} and
# {difficulty}. The braces of the embedded JSON example are doubled ({{ }}),
# which is the str.format escape for a literal brace — presumably these are
# rendered with str.format; verify against the caller before changing the
# brace style.
QUIZ_TEMPLATES = {
    # Multiple choice: four options A-D and a single "correct" letter.
    "mcq": """Generate {n} multiple choice questions about {topic} at {difficulty} level.

For each question, provide EXACTLY this JSON format:
[
  {{
    "question": "The question text",
    "options": {{"A": "option A", "B": "option B", "C": "option C", "D": "option D"}},
    "correct": "A",
    "explanation": "Why A is correct and why the wrong answers are wrong",
    "difficulty": "{difficulty}",
    "topic": "{topic}"
  }}
]

Make wrong answers represent real misconceptions students have. Don't make them obviously wrong.""",

    # True/False: "answer" is a JSON boolean in the example, not a string.
    "true_false": """Generate {n} True/False questions about {topic} at {difficulty} level.

For each question, provide EXACTLY this JSON format:
[
  {{
    "statement": "A statement that is either true or false",
    "answer": true,
    "explanation": "Why this is true/false, and what the common misconception is",
    "difficulty": "{difficulty}",
    "topic": "{topic}"
  }}
]

Target common misconceptions — make the statements sound plausible regardless of whether they're true or false.""",

    # Short answer: an ideal answer plus a list of key points.
    "short_answer": """Generate {n} short answer questions about {topic} at {difficulty} level.

For each question, provide EXACTLY this JSON format:
[
  {{
    "question": "The question text",
    "expected_answer": "The ideal answer in 2-3 sentences",
    "key_points": ["point 1", "point 2", "point 3"],
    "difficulty": "{difficulty}",
    "topic": "{topic}"
  }}
]

Questions should require understanding, not just recall. Ask about WHY and HOW, not just WHAT."""
}


# ============================================================================
# LESSON TEMPLATES
# ============================================================================

# Prompt template for a full lesson. It has two placeholders, {topic} and
# {level} (note: {level}, not the {difficulty} name used by QUIZ_TEMPLATES).
# The body is a fixed Markdown outline the model is asked to fill in; it
# contains no other braces. The string ends with a trailing newline.
LESSON_TEMPLATE = """Create a structured lesson about {topic} for {level} students.

Structure your lesson as follows:

## Learning Objectives
List 3-5 specific, measurable learning objectives.

## Prerequisites
What should the student already know before this lesson?

## Part 1: Concept Introduction
Explain the core concept in simple terms. Use an analogy if helpful.

## Part 2: Technical Details
Go deeper into the technical aspects. Include:
- Key tools and software
- Important parameters and thresholds
- File formats involved

## Part 3: Workflow / Step-by-Step
If applicable, walk through the practical workflow.

## Part 4: Common Pitfalls
List 3-5 common mistakes and how to avoid them.

## Part 5: Practice Exercise
Provide a hands-on exercise or scenario for the student to work through.

## Summary
Key takeaways in 3-5 bullet points.

## Further Reading
Suggest 2-3 resources for deeper learning.
"""


# ============================================================================
# TOPIC CHOICES — For dropdown menus
# ============================================================================

# Dropdown labels of the form "<area>: <topic>". The list is generated from
# per-area groups so each area prefix is spelled once; group order and the
# order inside each group give the display order.
TOPIC_CHOICES = [
    area + ": " + topic
    for area, topics in (
        ("RNA-seq", (
            "Experimental Design",
            "Quality Control (FastQC/MultiQC)",
            "Read Alignment (STAR/HISAT2)",
            "Transcript Quantification (Salmon/kallisto)",
            "Differential Expression (DESeq2)",
            "Differential Expression (edgeR)",
            "Differential Expression (limma-voom)",
            "Gene Set Enrichment Analysis",
            "Normalization Methods (TPM/FPKM/CPM)",
            "Batch Effect Correction",
        )),
        ("Exome Sequencing", (
            "Library Preparation & Capture",
            "Read Alignment (BWA-MEM)",
            "Post-alignment Processing (BQSR)",
            "Variant Calling (GATK)",
            "Variant Filtering",
            "Coverage Analysis",
        )),
        ("Genome Sequencing", (
            "Whole Genome vs Exome",
            "De Novo Assembly",
            "Structural Variant Detection",
            "Somatic Variant Calling (Mutect2)",
        )),
        ("Microbiome", (
            "16S Amplicon Sequencing",
            "Shotgun Metagenomics",
            "QIIME2 Pipeline",
            "DADA2 Denoising",
            "Alpha & Beta Diversity",
            "Differential Abundance Testing",
        )),
        ("Variant Interpretation", (
            "VCF Format",
            "ACMG Guidelines",
            "Population Databases (gnomAD)",
            "Functional Prediction (SIFT/PolyPhen/CADD)",
            "ClinVar & Clinical Significance",
        )),
        ("Molecular Genetics", (
            "DNA Structure & Replication",
            "Transcription & Translation",
            "PCR & Cloning",
            "CRISPR-Cas9",
            "Mutations & Repair",
            "Inheritance Patterns",
        )),
        ("Single-cell RNA-seq", (
            "Overview & Platforms",
            "Quality Control & Filtering",
            "Normalization & HVGs",
            "Clustering & UMAP",
            "Cell Type Annotation",
        )),
        # Areas with a single topic each.
        ("ATAC-seq", ("Chromatin Accessibility",)),
        ("ChIP-seq", ("Histone Modifications & TF Binding",)),
        ("Methylation Sequencing", ("Bisulfite-seq",)),
        ("Small RNA-seq", ("miRNA Analysis",)),
        ("Targeted Sequencing", ("Panel Design & Analysis",)),
        ("Long-read Sequencing", ("PacBio & Nanopore",)),
        ("Spatial Transcriptomics", ("Visium & MERFISH",)),
        ("Multi-omics", ("Data Integration Strategies",)),
    )
    for topic in topics
]

# Difficulty tiers, listed from easiest to hardest.
DIFFICULTY_LEVELS = [
    "Beginner",
    "Intermediate",
    "Advanced",
]

# Pipeline labels of the form "<assay>: <pipeline>", built from
# (assay, pipeline) pairs; pair order is the display order.
WORKFLOW_CHOICES = [
    assay + ": " + pipeline
    for assay, pipeline in (
        ("Bulk RNA-seq", "Full DE Analysis Pipeline"),
        ("Exome Sequencing", "Variant Calling Pipeline"),
        ("Genome Sequencing", "Germline Variant Calling"),
        ("Genome Sequencing", "Somatic Variant Calling"),
        ("Microbiome", "16S Amplicon Analysis (QIIME2)"),
        ("Microbiome", "Shotgun Metagenomics"),
        ("Single-cell RNA-seq", "Standard Seurat/Scanpy Pipeline"),
        ("ATAC-seq", "Peak Calling Pipeline"),
        ("ChIP-seq", "Peak Calling & Motif Analysis"),
        ("Variant Interpretation", "Clinical Exome Analysis"),
    )
]