File size: 50,085 Bytes
fe2b396
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
"""
Bioinformatics with BB Tutor — Domain Knowledge Base
Comprehensive bioinformatics content for RAG retrieval and context injection.
"""

# Taxonomy of the bioinformatics domains covered by this knowledge base.
# Every row of the table below is (domain key, display name, subtopic slugs);
# the comprehension expands each row into a {"name": ..., "subtopics": [...]}
# record, preserving the row order as the dict's key order.
DOMAIN_TAXONOMY = {
    domain_key: {"name": display_name, "subtopics": list(subtopic_slugs)}
    for domain_key, display_name, subtopic_slugs in (
        (
            "rna_seq",
            "RNA-seq",
            (
                "experimental_design",
                "library_preparation",
                "quality_control",
                "read_alignment",
                "transcript_quantification",
                "differential_expression",
                "deseq2",
                "edger",
                "limma_voom",
                "gene_set_enrichment",
                "pathway_analysis",
                "visualization",
                "normalization",
                "batch_effects",
                "multifactor_designs",
            ),
        ),
        (
            "exome_sequencing",
            "Exome Sequencing",
            (
                "capture_design",
                "library_preparation",
                "quality_control",
                "read_alignment",
                "duplicate_marking",
                "base_quality_recalibration",
                "variant_calling",
                "variant_filtering",
                "annotation",
                "coverage_analysis",
                "clinical_interpretation",
                "acmg_classification",
            ),
        ),
        (
            "genome_sequencing",
            "Genome Sequencing",
            (
                "library_preparation",
                "sequencing_platforms",
                "quality_control",
                "read_alignment",
                "variant_calling_germline",
                "variant_calling_somatic",
                "structural_variant_detection",
                "copy_number_analysis",
                "de_novo_assembly",
                "reference_genomes",
                "phasing",
            ),
        ),
        (
            "microbiome",
            "Microbiome Analysis",
            (
                "16s_amplicon",
                "shotgun_metagenomics",
                "library_preparation",
                "quality_control",
                "otu_clustering",
                "asv_denoising",
                "taxonomic_classification",
                "alpha_diversity",
                "beta_diversity",
                "differential_abundance",
                "functional_profiling",
                "qiime2",
                "dada2",
                "kraken2",
                "metaphlan",
            ),
        ),
        (
            "variant_interpretation",
            "Variant Interpretation",
            (
                "variant_types",
                "vcf_format",
                "annotation_tools",
                "population_databases",
                "functional_prediction",
                "acmg_guidelines",
                "clinical_significance",
                "pharmacogenomics",
                "somatic_vs_germline",
                "variant_effect_predictor",
                "snpeff",
                "gnomad",
                "clinvar",
                "cosmic",
            ),
        ),
        (
            "molecular_genetics",
            "Molecular Genetics",
            (
                "dna_structure",
                "gene_expression",
                "transcription",
                "translation",
                "splicing",
                "epigenetics",
                "mutations",
                "repair_mechanisms",
                "pcr",
                "cloning",
                "crispr",
                "restriction_enzymes",
                "inheritance_patterns",
                "genetic_disorders",
                "gene_regulation",
            ),
        ),
        (
            "single_cell",
            "Single-cell RNA-seq",
            (
                "cell_isolation",
                "library_preparation",
                "droplet_based",
                "plate_based",
                "quality_control",
                "normalization",
                "dimensionality_reduction",
                "clustering",
                "cell_type_annotation",
                "trajectory_analysis",
                "differential_expression",
                "batch_correction",
                "seurat",
                "scanpy",
            ),
        ),
        (
            "atac_seq",
            "ATAC-seq",
            (
                "chromatin_accessibility",
                "library_preparation",
                "quality_control",
                "alignment",
                "peak_calling",
                "differential_accessibility",
                "motif_enrichment",
                "footprinting",
                "integration_with_rnaseq",
                "macs2",
                "macs3",
            ),
        ),
        (
            "chip_seq",
            "ChIP-seq",
            (
                "chromatin_immunoprecipitation",
                "library_preparation",
                "quality_control",
                "alignment",
                "peak_calling",
                "differential_binding",
                "motif_analysis",
                "histone_modifications",
                "transcription_factor_binding",
                "input_control",
                "macs2",
            ),
        ),
        (
            "methylation_seq",
            "Methylation Sequencing",
            (
                "bisulfite_sequencing",
                "library_preparation",
                "quality_control",
                "alignment",
                "methylation_calling",
                "differential_methylation",
                "cpg_islands",
                "dmr_detection",
                "bismark",
                "nanopore_methylation",
            ),
        ),
        (
            "small_rna_seq",
            "Small RNA-seq",
            (
                "mirna",
                "sirna",
                "pirna",
                "library_preparation",
                "adapter_trimming",
                "alignment",
                "quantification",
                "differential_expression",
                "target_prediction",
                "mirbase",
            ),
        ),
        (
            "targeted_sequencing",
            "Targeted Sequencing Panels",
            (
                "panel_design",
                "amplicon_vs_capture",
                "library_preparation",
                "quality_control",
                "variant_calling",
                "coverage_uniformity",
                "clinical_panels",
                "oncology_panels",
                "pharmacogenomics_panels",
            ),
        ),
        (
            "long_read_sequencing",
            "Long-read Sequencing",
            (
                "pacbio",
                "oxford_nanopore",
                "library_preparation",
                "base_calling",
                "error_correction",
                "assembly",
                "structural_variant_detection",
                "full_length_transcripts",
                "epigenetic_modifications",
                "hifi_reads",
            ),
        ),
        (
            "spatial_transcriptomics",
            "Spatial Transcriptomics",
            (
                "visium",
                "slide_seq",
                "merfish",
                "seqfish",
                "spatial_clustering",
                "deconvolution",
                "ligand_receptor_analysis",
                "tissue_architecture",
            ),
        ),
        (
            "multi_omics",
            "Multi-omics Integration",
            (
                "data_integration_strategies",
                "multi_omics_factor_analysis",
                "weighted_correlation_network_analysis",
                "joint_dimensionality_reduction",
                "regulatory_network_inference",
                "mofa",
                "wgcna",
            ),
        ),
    )
}


# ============================================================================
# WORKFLOW KNOWLEDGE — Step-by-step pipeline descriptions
# ============================================================================

# Step-by-step analysis pipelines, keyed by workflow identifier.
# Each workflow record holds:
#   "name"       - human-readable workflow title
#   "domain"     - a key of DOMAIN_TAXONOMY that the workflow belongs to
#   "difficulty" - difficulty label (e.g. "intermediate", "advanced")
#   "steps"      - ordered list of step records, each with "step" (1-based
#                  index), "name", "description", "tools" (possibly empty)
#                  and "common_mistakes"
# All text is plain UTF-8; dashes and arrows are real Unicode characters
# (U+2014, U+2192) rather than mis-decoded byte sequences.
WORKFLOWS = {
    "rna_seq_bulk": {
        "name": "Bulk RNA-seq Differential Expression Workflow",
        "domain": "rna_seq",
        "difficulty": "intermediate",
        "steps": [
            {
                "step": 1,
                "name": "Experimental Design",
                "description": "Plan biological replicates (minimum 3 per condition, ideally 5+), randomize sample processing, consider batch effects. Define clear hypotheses and contrasts.",
                "tools": [],
                "common_mistakes": [
                    "Using fewer than 3 biological replicates per condition",
                    "Confusing biological replicates with technical replicates",
                    "Not planning for batch effects in sample processing"
                ]
            },
            {
                "step": 2,
                "name": "Quality Control of Raw Reads",
                "description": "Assess raw FASTQ quality using FastQC. Check per-base quality scores, GC content, adapter contamination, sequence duplication levels, and overrepresented sequences. Use MultiQC to aggregate results across samples.",
                "tools": ["FastQC", "MultiQC"],
                "common_mistakes": [
                    "Skipping QC entirely and going straight to alignment",
                    "Not checking for adapter contamination",
                    "Ignoring GC content bias which may indicate contamination"
                ]
            },
            {
                "step": 3,
                "name": "Read Trimming and Filtering",
                "description": "Remove adapter sequences and low-quality bases. Trim reads with quality score < 20 from 3' end. Remove reads shorter than 36bp after trimming.",
                "tools": ["Trimmomatic", "Cutadapt", "fastp"],
                "common_mistakes": [
                    "Over-trimming reads, reducing mappability",
                    "Not trimming adapters when they are present",
                    "Using wrong adapter sequences for the library kit"
                ]
            },
            {
                "step": 4,
                "name": "Read Alignment / Quantification",
                "description": "Two approaches: (A) Alignment-based: align reads to reference genome with STAR or HISAT2, then count with featureCounts/HTSeq. (B) Pseudo-alignment: quantify directly with Salmon or kallisto (faster, alignment-free). Both require a reference genome/transcriptome and gene annotation (GTF/GFF).",
                "tools": ["STAR", "HISAT2", "Salmon", "kallisto", "featureCounts", "HTSeq"],
                "common_mistakes": [
                    "Using wrong genome build (e.g., hg19 vs hg38)",
                    "STAR --sjdbOverhang should equal read_length - 1",
                    "Forgetting to build the genome index before alignment",
                    "Not providing gene annotation file to the aligner"
                ]
            },
            {
                "step": 5,
                "name": "Alignment Quality Control",
                "description": "Check mapping rate (expect >70-80% for well-prepared libraries), check strandedness, assess gene body coverage, check for 3'/5' bias, verify expected insert size distribution.",
                "tools": ["RSeQC", "Picard", "samtools flagstat", "MultiQC"],
                "common_mistakes": [
                    "Accepting low mapping rates (<60%) without investigation",
                    "Not checking strandedness — wrong strand setting causes ~50% count loss",
                    "Ignoring high duplicate rates which may indicate low library complexity"
                ]
            },
            {
                "step": 6,
                "name": "Count Matrix Generation",
                "description": "Generate a gene-by-sample count matrix from BAM files (if alignment-based) or from Salmon/kallisto quantification files. Use raw integer counts — not normalized values. Import Salmon/kallisto results using tximeta or tximport.",
                "tools": ["featureCounts", "HTSeq-count", "tximeta", "tximport"],
                "common_mistakes": [
                    "Using TPM/FPKM/RPKM as input to DESeq2 — it requires raw counts",
                    "Not using tximeta/tximport to properly import pseudo-alignment results",
                    "Double-counting multi-mapped reads"
                ]
            },
            {
                "step": 7,
                "name": "Differential Expression Analysis",
                "description": "Use DESeq2, edgeR, or limma-voom to identify differentially expressed genes. Include all relevant covariates in the design formula (e.g., ~batch + condition). Filter lowly expressed genes before analysis. Use adjusted p-values (BH/FDR) for multiple testing correction.",
                "tools": ["DESeq2", "edgeR", "limma-voom"],
                "common_mistakes": [
                    "Using raw p-values instead of adjusted p-values (padj/FDR)",
                    "Not including batch effects in the design formula",
                    "Feeding normalized counts (TPM) to DESeq2 instead of raw counts",
                    "Setting log2FC threshold without statistical testing (use lfcShrink in DESeq2)",
                    "Not filtering lowly expressed genes before analysis"
                ]
            },
            {
                "step": 8,
                "name": "Visualization and Interpretation",
                "description": "Create MA plots, volcano plots, PCA plots, heatmaps of top DE genes. Interpret results in biological context. Perform gene set enrichment analysis (GSEA) or over-representation analysis (ORA).",
                "tools": ["ggplot2", "EnhancedVolcano", "pheatmap", "clusterProfiler", "fgsea", "GSEA"],
                "common_mistakes": [
                    "Interpreting volcano plot points with high FC but non-significant p-value as important",
                    "Not using shrunken LFC estimates for ranking in GSEA",
                    "Cherry-picking individual genes without considering global patterns",
                    "Using arbitrary FC cutoffs without biological justification"
                ]
            }
        ]
    },
    "exome_seq_variant_calling": {
        "name": "Exome Sequencing Variant Calling Workflow",
        "domain": "exome_sequencing",
        "difficulty": "intermediate",
        "steps": [
            {
                "step": 1,
                "name": "Quality Control of Raw Reads",
                "description": "Assess FASTQ quality with FastQC. Check for adapter contamination, base quality, and GC content. For exome data, also verify expected insert size for the capture kit.",
                "tools": ["FastQC", "MultiQC"],
                "common_mistakes": [
                    "Not checking for adapter read-through in short inserts"
                ]
            },
            {
                "step": 2,
                "name": "Read Trimming",
                "description": "Trim adapters and low-quality bases. Less aggressive trimming than RNA-seq since variant callers account for base quality.",
                "tools": ["fastp", "Trimmomatic", "Cutadapt"],
                "common_mistakes": [
                    "Over-aggressive quality trimming can remove true variant-supporting reads"
                ]
            },
            {
                "step": 3,
                "name": "Read Alignment",
                "description": "Align reads to the reference genome (GRCh37/hg19 or GRCh38/hg38) using BWA-MEM. Sort the output BAM file by coordinate.",
                "tools": ["BWA-MEM", "BWA-MEM2", "samtools sort"],
                "common_mistakes": [
                    "Using wrong reference genome version — coordinates are NOT interchangeable",
                    "Not adding read group information (@RG tags) — GATK requires this",
                    "Using a genome build inconsistent with the capture kit BED file"
                ]
            },
            {
                "step": 4,
                "name": "Post-alignment Processing",
                "description": "Mark PCR duplicates (do NOT remove them for variant calling — just flag them). Perform Base Quality Score Recalibration (BQSR) using known variant sites (dbSNP, Mills indels).",
                "tools": ["Picard MarkDuplicates", "GATK BaseRecalibrator", "GATK ApplyBQSR", "samtools markdup"],
                "common_mistakes": [
                    "Removing duplicates instead of marking them",
                    "Skipping BQSR — significantly impacts variant quality scores",
                    "Using wrong known-sites VCF version for BQSR"
                ]
            },
            {
                "step": 5,
                "name": "Coverage Analysis",
                "description": "Assess on-target coverage, mean depth, and uniformity. For clinical exome sequencing, ensure >20x coverage over >95% of target regions. Check for regions with no coverage.",
                "tools": ["Picard CollectHsMetrics", "mosdepth", "bedtools coverage", "samtools depth"],
                "common_mistakes": [
                    "Not restricting analysis to the capture target BED file",
                    "Confusing mean coverage with median — mean is inflated by high-coverage regions",
                    "Accepting results with large uncovered regions without investigation"
                ]
            },
            {
                "step": 6,
                "name": "Variant Calling",
                "description": "Call germline variants using GATK HaplotypeCaller in GVCF mode. For cohort analysis, use GenomicsDBImport and GenotypeGVCFs. Restrict calling to the exome target regions using -L flag with the BED file.",
                "tools": ["GATK HaplotypeCaller", "GATK GenotypeGVCFs", "DeepVariant"],
                "common_mistakes": [
                    "Not restricting variant calling to target regions — wastes compute and introduces noise",
                    "Not using GVCF mode for cohort calling",
                    "Ignoring the distinction between GVCF (per-sample) and final VCF (joint-called)"
                ]
            },
            {
                "step": 7,
                "name": "Variant Filtering",
                "description": "Apply variant quality filters. For small cohorts use hard filtering (QD<2, FS>60 for SNPs; QD<2, FS>200 for indels). For large cohorts (>30 samples) use VQSR. Filter separately for SNPs and indels.",
                "tools": ["GATK VariantFiltration", "GATK VQSR", "bcftools filter"],
                "common_mistakes": [
                    "Applying the same filters to SNPs and indels",
                    "Using VQSR with too few samples (<30)",
                    "Not understanding that FILTER=PASS means different things in hard-filtered vs VQSR results"
                ]
            },
            {
                "step": 8,
                "name": "Variant Annotation and Interpretation",
                "description": "Annotate variants with gene names, consequence (missense, nonsense, splice, etc.), population frequencies (gnomAD), clinical significance (ClinVar), and functional predictions (SIFT, PolyPhen, CADD). Apply ACMG guidelines for clinical interpretation.",
                "tools": ["VEP (Ensembl)", "SnpEff", "ANNOVAR", "ClinVar", "gnomAD", "InterVar"],
                "common_mistakes": [
                    "Using outdated annotation databases",
                    "Confusing pathogenic predictions with clinical pathogenicity",
                    "Not checking population frequency — common variants are rarely pathogenic",
                    "Over-relying on in-silico predictions without considering other evidence"
                ]
            }
        ]
    },
    "microbiome_16s": {
        "name": "16S Amplicon Microbiome Analysis Workflow",
        "domain": "microbiome",
        "difficulty": "intermediate",
        "steps": [
            {
                "step": 1,
                "name": "Import and Demultiplex",
                "description": "Import raw sequencing data into QIIME2. Demultiplex if not already done by the sequencing facility. Verify sample metadata file format (TSV with #SampleID as first column).",
                "tools": ["QIIME2 import", "QIIME2 demux"],
                "common_mistakes": [
                    "Wrong import format — QIIME2 has many import types (Casava, EMP, manifest)",
                    "Metadata file formatting errors (wrong column names, missing #SampleID header)",
                    "Mixing different sequencing runs without tracking batch information"
                ]
            },
            {
                "step": 2,
                "name": "Quality Control and Denoising",
                "description": "Use DADA2 (preferred) or Deblur for denoising. DADA2 corrects sequencing errors and generates Amplicon Sequence Variants (ASVs). Set truncation parameters based on quality score plots (truncate where median quality drops below 25-30).",
                "tools": ["DADA2", "Deblur", "QIIME2 dada2 denoise-paired"],
                "common_mistakes": [
                    "Setting truncation length too aggressively — paired reads must still overlap",
                    "Using OTU clustering (97%) instead of ASV denoising — ASVs are now standard",
                    "Not checking the denoising stats — high percentage of reads lost indicates problems"
                ]
            },
            {
                "step": 3,
                "name": "Taxonomic Classification",
                "description": "Classify ASVs using a pre-trained classifier (e.g., Silva 138, Greengenes2) or BLAST-based methods. The classifier must match the primer pair and target region (V3-V4, V4, etc.).",
                "tools": ["QIIME2 classify-sklearn", "QIIME2 classify-consensus-blast", "Silva", "Greengenes2"],
                "common_mistakes": [
                    "Using a classifier not trained on the same region as your primers",
                    "Expecting species-level resolution from 16S — most regions resolve to genus at best",
                    "Not removing chloroplast and mitochondrial sequences"
                ]
            },
            {
                "step": 4,
                "name": "Diversity Analysis",
                "description": "Calculate alpha diversity (within-sample richness: Shannon, observed features, Faith's PD) and beta diversity (between-sample dissimilarity: Bray-Curtis, UniFrac). Rarefy to even sampling depth first. Choose rarefaction depth that retains most samples while maximizing reads.",
                "tools": ["QIIME2 diversity core-metrics-phylogenetic"],
                "common_mistakes": [
                    "Confusing alpha diversity (within-sample) with beta diversity (between-sample)",
                    "Not rarefying before diversity analysis — sequencing depth confounds results",
                    "Setting rarefaction depth too low (losing information) or too high (losing samples)",
                    "Using Bray-Curtis when phylogenetic distance matters — use UniFrac instead"
                ]
            },
            {
                "step": 5,
                "name": "Statistical Testing and Visualization",
                "description": "Test alpha diversity differences with Kruskal-Wallis. Test beta diversity with PERMANOVA (adonis). Create PCoA plots, taxa barplots, and heatmaps. For differential abundance, use ANCOM-BC2 or ALDEx2 (compositional-aware methods).",
                "tools": ["QIIME2 diversity alpha-group-significance", "QIIME2 diversity beta-group-significance", "ANCOM-BC", "ALDEx2", "LEfSe"],
                "common_mistakes": [
                    "Using standard t-tests on relative abundances — compositional data requires special methods",
                    "Not accounting for compositionality (CLR transform or Dirichlet models needed)",
                    "Over-interpreting low-abundance taxa differences",
                    "Not correcting for multiple testing across many taxa"
                ]
            }
        ]
    },
    "single_cell_rnaseq": {
        "name": "Single-cell RNA-seq Analysis Workflow",
        "domain": "single_cell",
        "difficulty": "advanced",
        "steps": [
            {
                "step": 1,
                "name": "Pre-processing and Cell Calling",
                "description": "Process raw BCL/FASTQ files with Cell Ranger (10x) or STARsolo. Generates a cell-by-gene count matrix. Uses UMI (Unique Molecular Identifier) counts to remove PCR duplicates.",
                "tools": ["Cell Ranger", "STARsolo", "alevin-fry"],
                "common_mistakes": [
                    "Not checking the knee plot — incorrect cell calling inflates empty droplets or loses real cells",
                    "Using wrong reference transcriptome version"
                ]
            },
            {
                "step": 2,
                "name": "Quality Control and Filtering",
                "description": "Filter cells based on: (1) number of detected genes (typically 200-5000), (2) total UMI counts, (3) mitochondrial gene percentage (<10-20% depending on tissue). Remove doublets using scrublet or DoubletFinder.",
                "tools": ["Seurat", "Scanpy", "scrublet", "DoubletFinder"],
                "common_mistakes": [
                    "Using fixed QC thresholds across different tissues — liver cells have higher mito% than blood",
                    "Not removing doublets — they form artificial intermediate clusters",
                    "Being too strict or too lenient with filtering — check distributions"
                ]
            },
            {
                "step": 3,
                "name": "Normalization and Feature Selection",
                "description": "Normalize counts (e.g., SCTransform in Seurat or scran normalization). Select highly variable genes (HVGs, typically 2000-3000) for downstream analysis.",
                "tools": ["Seurat SCTransform", "Scanpy normalize_total + log1p", "scran"],
                "common_mistakes": [
                    "Using bulk RNA-seq normalization methods (TPM, FPKM) on single-cell data",
                    "Selecting too few or too many HVGs — affects clustering resolution"
                ]
            },
            {
                "step": 4,
                "name": "Dimensionality Reduction and Clustering",
                "description": "PCA on HVGs → select significant PCs (elbow plot) → build k-nearest-neighbor graph → Leiden/Louvain clustering → UMAP/t-SNE for visualization.",
                "tools": ["Seurat", "Scanpy"],
                "common_mistakes": [
                    "Over-interpreting UMAP distances — UMAP preserves local, not global structure",
                    "Not testing multiple clustering resolutions — too low merges cell types, too high splits them",
                    "Using t-SNE for large datasets (>50k cells) — too slow, use UMAP"
                ]
            },
            {
                "step": 5,
                "name": "Cell Type Annotation",
                "description": "Annotate clusters using marker genes, reference datasets (SingleR, CellTypist), or manual curation. Verify with known canonical markers for expected cell types.",
                "tools": ["SingleR", "CellTypist", "Azimuth", "scType"],
                "common_mistakes": [
                    "Relying solely on automated annotation without checking marker genes",
                    "Expecting automated tools to identify rare or novel cell types",
                    "Not considering tissue context — same marker may mean different things in different tissues"
                ]
            }
        ]
    }
}


# ============================================================================
# GLOSSARY — Key bioinformatics terms
# ============================================================================

# Maps a term (str) to a short plain-text definition (str).
# Punctuation and symbols (em dash, degree sign, arrow) are stored as real
# Unicode characters; the file must be read/written as UTF-8.
GLOSSARY = {
    "FASTQ": "Text-based format for storing nucleotide sequences and their quality scores. Each read has 4 lines: header (@), sequence, separator (+), quality (Phred+33 ASCII encoded).",
    "BAM": "Binary Alignment Map — compressed binary version of SAM format storing aligned sequencing reads with quality scores, mapping positions, and flags.",
    "VCF": "Variant Call Format — standard text file for storing genetic variant information including position, reference/alternate alleles, quality, filter status, and sample genotypes.",
    "FASTQ Quality Scores (Phred)": "Q = -10 * log10(P_error). Q20 = 1% error rate, Q30 = 0.1% error rate, Q40 = 0.01%. Most Illumina data is Phred+33 encoded.",
    "Read Depth / Coverage": "Number of sequencing reads aligned to a given position. Mean coverage = total aligned bases / genome size. Clinical WES typically requires >100x mean target coverage.",
    "GRCh37/hg19 vs GRCh38/hg38": "Human reference genome assemblies. hg38 (2013) is current standard with better representation of centromeres, alt loci, and decoy sequences. Coordinates are NOT interchangeable — use liftOver to convert.",
    "FDR / BH Correction": "False Discovery Rate / Benjamini-Hochberg method for multiple testing correction. Controls the expected proportion of false positives among rejected hypotheses. Standard threshold: FDR < 0.05.",
    "TPM vs FPKM vs CPM": "TPM (Transcripts Per Million): gene length + library size normalized, comparable across samples. FPKM (Fragments Per Kilobase per Million): similar but NOT directly comparable across samples. CPM: only library size normalized, not gene length.",
    "Log2 Fold Change": "log2(condition/control). log2FC=1 means 2x increase, log2FC=-1 means 2x decrease, log2FC=0 means no change. Used in DE analysis because it makes up/down regulation symmetric.",
    "DESeq2": "R/Bioconductor package for differential gene expression analysis. Uses negative binomial distribution. Requires RAW integer counts as input (NOT TPM/FPKM). Performs internal normalization using median-of-ratios method.",
    "GATK": "Genome Analysis Toolkit by Broad Institute. Gold-standard pipeline for germline and somatic variant calling. Key tools: HaplotypeCaller, Mutect2, BaseRecalibrator.",
    "UMI": "Unique Molecular Identifier — short random barcode attached during library preparation. Allows identification and removal of PCR duplicates. Essential for accurate quantification in single-cell and low-input protocols.",
    "ASV vs OTU": "ASV (Amplicon Sequence Variant): exact sequences resolved by DADA2, single-nucleotide resolution. OTU (Operational Taxonomic Unit): clustered at 97% similarity, now considered outdated. ASVs are reproducible across studies.",
    "Alpha Diversity": "Within-sample diversity. Measures: Shannon (richness + evenness), Observed Features (richness only), Faith's PD (phylogenetic diversity). Higher = more diverse community.",
    "Beta Diversity": "Between-sample diversity/dissimilarity. Measures: Bray-Curtis (abundance-based), Jaccard (presence/absence), UniFrac (phylogenetic, weighted or unweighted). Visualized with PCoA/NMDS.",
    "ACMG Guidelines": "American College of Medical Genetics classification system for variant interpretation: Pathogenic, Likely Pathogenic, VUS (Variant of Uncertain Significance), Likely Benign, Benign. Based on population data, computational predictions, functional data, segregation, and literature.",
    "Batch Effect": "Systematic technical variation between groups of samples processed at different times, locations, or by different personnel. Must be accounted for in experimental design and statistical analysis. Can be addressed with ComBat, limma removeBatchEffect, or inclusion in statistical model.",
    "UMAP": "Uniform Manifold Approximation and Projection — dimensionality reduction for visualization. Preserves local structure (nearby points stay near) but NOT global structure (distant clusters may be arbitrarily placed). Common in single-cell analysis.",
    "Rarefaction": "Subsampling reads to equal depth across samples for fair diversity comparison. Necessary because deeper sequencing artificially inflates observed diversity. Controversial — some prefer alternative normalization methods.",
    "BQSR": "Base Quality Score Recalibration (GATK). Machine-learns systematic errors in base quality scores using known variant sites. Critical for accurate variant calling — skipping it can increase false positive/negative rates.",
    "Structural Variant (SV)": "Large genomic rearrangement: deletion, duplication, inversion, translocation, or insertion >50bp. Detected with specialized callers (Manta, Delly, LUMPY) that use split reads, discordant pairs, and read depth.",
    "Bisulfite Sequencing": "Gold standard for single-base resolution DNA methylation detection. Bisulfite converts unmethylated cytosines to uracil, methylated cytosines remain. Specialized aligners (Bismark) needed.",
    "PCR": "Polymerase Chain Reaction — amplifies specific DNA sequences using primers, DNA polymerase, and thermal cycling (denature 95°C → anneal 50-65°C → extend 72°C). Foundation of molecular biology.",
    "CRISPR-Cas9": "Genome editing system using guide RNA to direct Cas9 nuclease to specific DNA sequences. Creates double-strand breaks that can knock out genes (NHEJ) or insert sequences (HDR). Off-target effects are a major concern.",
    "SNP vs SNV vs Indel": "SNP (Single Nucleotide Polymorphism): germline variant present in population. SNV (Single Nucleotide Variant): any single-base change, often used for somatic variants. Indel: insertion or deletion of 1+ bases.",
    "gnomAD": "Genome Aggregation Database — population allele frequencies from >140,000 exomes and >76,000 genomes. Essential for filtering common variants in clinical analysis. Variants with gnomAD AF >1% are almost never pathogenic for rare Mendelian diseases.",
    "ClinVar": "NCBI database of variant-disease relationships with clinical significance classifications submitted by labs worldwide. Key resource for variant interpretation but submissions vary in quality and evidence level.",
}


# ============================================================================
# COMMON MISCONCEPTIONS — For quiz generation and misconception correction
# ============================================================================

# Each entry is a dict with the same five string keys:
#   domain        - topic area identifier (e.g. "rna_seq", "microbiome")
#   misconception - the incorrect statement as a student might phrase it
#   correction    - the accurate explanation
#   severity      - "critical" or "moderate"
#   level         - "beginner" or "intermediate"
# Symbols such as the em dash, superscript two and "approximately equal" are
# stored as real Unicode characters; the file must be read/written as UTF-8.
COMMON_MISCONCEPTIONS = [
    {
        "domain": "rna_seq",
        "misconception": "DESeq2 can accept TPM or FPKM values as input.",
        "correction": "DESeq2 requires raw, unnormalized integer counts. It performs its own internal normalization using the median-of-ratios method. Providing pre-normalized values (TPM, FPKM, RPKM) will produce incorrect results.",
        "severity": "critical",
        "level": "beginner"
    },
    {
        "domain": "rna_seq",
        "misconception": "A p-value of 0.001 means there is a 99.9% probability that the gene is differentially expressed.",
        "correction": "A p-value of 0.001 means: if the gene is NOT differentially expressed (null hypothesis true), there is a 0.1% probability of observing data this extreme. It does NOT tell you the probability of the hypothesis being true (that would be Bayesian posterior probability).",
        "severity": "critical",
        "level": "beginner"
    },
    {
        "domain": "rna_seq",
        "misconception": "You can use raw p-values to determine significance when testing thousands of genes.",
        "correction": "When testing thousands of genes simultaneously, multiple testing correction is MANDATORY. Use adjusted p-values (padj/FDR) from Benjamini-Hochberg correction. With 20,000 genes tested at p<0.05, you'd expect ~1,000 false positives by chance alone.",
        "severity": "critical",
        "level": "beginner"
    },
    {
        "domain": "exome_sequencing",
        "misconception": "Removing PCR duplicates is better than marking them for variant calling.",
        # GATK callers do not "downweight" duplicates: their default read
        # filters skip duplicate-flagged reads entirely, so the benefit of
        # marking over removing is data retention, not extra evidence.
        "correction": "For variant calling, duplicates should be MARKED (flagged) but NOT removed. GATK tools such as HaplotypeCaller apply a default read filter that skips reads carrying the duplicate flag, so marking gives the caller the same de-duplicated evidence while keeping every read in the BAM for QC metrics, auditing, and reprocessing. Physically removing duplicates discards that information irreversibly without improving the calls.",
        "severity": "moderate",
        "level": "intermediate"
    },
    {
        "domain": "variant_interpretation",
        "misconception": "A variant predicted as 'damaging' by SIFT/PolyPhen is clinically pathogenic.",
        "correction": "In-silico prediction tools (SIFT, PolyPhen, CADD) predict functional EFFECT, not clinical pathogenicity. Many 'damaging' variants are benign in clinical context. ACMG guidelines use these as one piece of evidence (PP3 criterion) among many required for pathogenicity classification.",
        "severity": "critical",
        "level": "intermediate"
    },
    {
        "domain": "microbiome",
        "misconception": "16S rRNA sequencing can reliably identify bacteria to the species level.",
        "correction": "16S rRNA variable regions (V3-V4, V4) typically provide reliable classification to the GENUS level. Species-level classification requires full-length 16S or whole-genome approaches. Many closely related species have identical V4 sequences.",
        "severity": "moderate",
        "level": "beginner"
    },
    {
        "domain": "microbiome",
        "misconception": "You can compare raw ASV/OTU counts directly between samples without normalization.",
        "correction": "Different samples have different sequencing depths, making raw counts incomparable. Use rarefaction (subsampling to even depth) for diversity analysis, or compositional-aware methods (CLR transformation, ANCOM-BC) for differential abundance testing.",
        "severity": "critical",
        "level": "beginner"
    },
    {
        "domain": "single_cell",
        "misconception": "Distances between clusters on a UMAP plot reflect biological similarity.",
        "correction": "UMAP preserves LOCAL structure (nearby points in high dimensions remain nearby) but does NOT preserve GLOBAL structure. The distance between two clusters on a UMAP plot has no biological meaning — two clusters could appear far apart but be transcriptionally similar.",
        "severity": "moderate",
        "level": "intermediate"
    },
    {
        "domain": "molecular_genetics",
        "misconception": "mRNA expression levels directly predict protein abundance.",
        "correction": "mRNA and protein levels are only weakly correlated (R² ≈ 0.4 in many studies). Post-transcriptional regulation (miRNA, RNA-binding proteins), translation efficiency, and protein degradation rates all contribute to the discrepancy.",
        "severity": "moderate",
        "level": "beginner"
    },
    {
        "domain": "genome_sequencing",
        "misconception": "hg19 and hg38 coordinates are interchangeable.",
        "correction": "GRCh37/hg19 and GRCh38/hg38 have different coordinate systems due to sequence additions, corrections, and rearrangements. You MUST use liftOver (UCSC) or CrossMap to convert coordinates between assemblies. Mixing them leads to incorrect variant positions.",
        "severity": "critical",
        "level": "beginner"
    }
]


# ============================================================================
# SYSTEM PROMPTS — Per-module tutor personas
# ============================================================================

# Maps a tutor mode key (str) to the system prompt text (str) for that mode.
# Em dashes in the prompts are real Unicode characters (U+2014); the file must
# be read/written as UTF-8 so they are not corrupted into multi-character junk.
SYSTEM_PROMPTS = {
    "ask_tutor": """You are BB Tutor, an expert bioinformatics teaching assistant. Your role is to teach bioinformatics concepts clearly and accurately.

CORE RULES:
1. TEACH in layers: start with the high-level concept, then go deeper when asked.
2. ALWAYS distinguish between: established facts, common practices, and your interpretations.
3. CITE specific tools, databases, and methods by name — never be vague.
4. When discussing statistical concepts, be PRECISE about definitions (p-values, FDR, fold change).
5. When discussing variants, NEVER provide clinical interpretations — you are a teaching tool, not a clinical system.
6. If uncertain about a fact, SAY SO explicitly. Use phrases like "I'm not confident about this specific detail" rather than guessing.
7. CORRECT common misconceptions when you encounter them — explain WHY the misconception is wrong.
8. Use concrete examples: specific tools, specific parameter values, specific thresholds.
9. Format responses with clear headers, bullet points, and code blocks where appropriate.
10. For workflow questions, present steps in order with tool names and key parameters.

SAFETY BOUNDARIES:
- You are an EDUCATIONAL assistant. Never provide clinical diagnoses or treatment recommendations.
- Never interpret specific patient variants as pathogenic/benign without massive caveats.
- Always recommend consulting clinical genetics professionals for patient data.
- Flag when a question moves from educational territory into clinical territory.

DOMAINS: RNA-seq, exome sequencing, genome sequencing, microbiome analysis, variant interpretation, molecular genetics, single-cell RNA-seq, ATAC-seq, ChIP-seq, methylation sequencing, small RNA-seq, targeted panels, long-read sequencing, spatial transcriptomics, multi-omics integration.""",

    "upload_explain": """You are BB Tutor analyzing an uploaded document or figure. Your role is to:
1. Identify what type of bioinformatics content this is (paper, results table, QC report, plot, etc.)
2. Explain the key findings or content in clear, educational language
3. Point out important patterns, potential issues, or notable features
4. Relate the content to relevant bioinformatics concepts and workflows
5. Suggest follow-up questions the student might want to explore

Be specific about what you observe. If it's a figure, describe the axes, trends, and what they mean biologically. If it's a table, highlight the important columns and values. If it's a paper abstract, identify the key methods and findings.

NEVER fabricate details that aren't in the uploaded content. If you can't read something clearly, say so.""",

    "quiz_me": """You are BB Tutor generating a bioinformatics quiz. Generate questions that test understanding, not just recall.

QUESTION TYPES:
- Conceptual understanding (what does this mean?)
- Workflow reasoning (what comes next? what went wrong?)
- Tool selection (which tool for this task?)
- Result interpretation (what does this output tell us?)
- Misconception traps (common wrong answers should be plausible distractors)
- Troubleshooting (given this error/result, diagnose the problem)

FORMAT RULES:
- For MCQs: provide 4 options labeled A-D, with exactly one correct answer
- For True/False: provide a statement that tests a common misconception
- For Short Answer: ask specific questions with concrete expected answers
- Always include a brief explanation of why the correct answer is correct
- Include the difficulty level: Beginner, Intermediate, or Advanced

Make distractors plausible — they should represent real misconceptions students have.""",

    "build_lesson": """You are BB Tutor creating structured educational content. Generate well-organized lessons with:
1. Clear learning objectives (3-5 per lesson)
2. Prerequisite knowledge listed
3. Concept explanations with examples
4. Workflow steps with tool names and parameters
5. Common pitfalls and how to avoid them
6. Practice exercises
7. Key takeaways / summary

Structure content for a 30-60 minute teaching session. Use progressive disclosure — start simple, build complexity. Include both conceptual knowledge and practical skills.""",

    "workflow_coach": """You are BB Tutor acting as a bioinformatics workflow coach. Guide the student through analysis pipelines step by step.

COACHING APPROACH:
1. Ask what the student is trying to achieve (research question, data type, organism)
2. Present the appropriate workflow with numbered steps
3. For each step: explain WHAT, WHY, and HOW (specific tools and key parameters)
4. Highlight decision points where the student needs to make choices
5. Point out common mistakes at each step
6. Suggest quality checkpoints between steps

Be specific about commands, parameters, and expected outputs. Use code blocks for command-line examples. Always mention what the expected output should look like so the student can verify they're on track.""",

    "paper_to_lesson": """You are BB Tutor converting a research paper into teaching material. Your role is to:
1. Identify the key methods and bioinformatics techniques used
2. Break down the analysis pipeline into teachable steps
3. Extract the important concepts a student should learn
4. Create learning objectives based on the paper's methods
5. Generate practice questions inspired by the paper
6. Identify what background knowledge is needed to understand the paper

Focus on the METHODS — what bioinformatics was done, what tools were used, what design decisions were made, and why. Don't just summarize the biological findings.""",

    "viva_practice": """You are BB Tutor conducting a viva voce (oral examination) practice session. You are the examiner.

VIVA APPROACH:
1. Start with a broad question about the topic
2. Based on the student's answer, probe deeper into specific areas
3. If the student's answer is correct, push to the next level of complexity
4. If the student's answer is wrong or incomplete, guide them with follow-up questions (don't just give the answer)
5. Test both conceptual understanding and practical knowledge
6. Periodically summarize what the student has demonstrated well and areas to improve

Be encouraging but rigorous. A good viva examiner helps the student show what they know while probing the boundaries of their understanding. Ask ONE question at a time and wait for the response before continuing."""
}


# ============================================================================
# QUIZ TEMPLATES — Structured templates for quiz generation
# ============================================================================

# Maps a question type ("mcq", "true_false", "short_answer") to a
# str.format-style prompt template with {n}, {topic} and {difficulty}
# placeholders. Literal JSON braces in the examples are doubled ({{ }}) so that
# .format() emits single braces. The em dash below is a real U+2014 character;
# the file must be read/written as UTF-8.
QUIZ_TEMPLATES = {
    "mcq": """Generate {n} multiple choice questions about {topic} at {difficulty} level.

For each question, provide EXACTLY this JSON format:
[
  {{
    "question": "The question text",
    "options": {{"A": "option A", "B": "option B", "C": "option C", "D": "option D"}},
    "correct": "A",
    "explanation": "Why A is correct and why the wrong answers are wrong",
    "difficulty": "{difficulty}",
    "topic": "{topic}"
  }}
]

Make wrong answers represent real misconceptions students have. Don't make them obviously wrong.""",

    "true_false": """Generate {n} True/False questions about {topic} at {difficulty} level.

For each question, provide EXACTLY this JSON format:
[
  {{
    "statement": "A statement that is either true or false",
    "answer": true,
    "explanation": "Why this is true/false, and what the common misconception is",
    "difficulty": "{difficulty}",
    "topic": "{topic}"
  }}
]

Target common misconceptions — make the statements sound plausible regardless of whether they're true or false.""",

    "short_answer": """Generate {n} short answer questions about {topic} at {difficulty} level.

For each question, provide EXACTLY this JSON format:
[
  {{
    "question": "The question text",
    "expected_answer": "The ideal answer in 2-3 sentences",
    "key_points": ["point 1", "point 2", "point 3"],
    "difficulty": "{difficulty}",
    "topic": "{topic}"
  }}
]

Questions should require understanding, not just recall. Ask about WHY and HOW, not just WHAT."""
}


# ============================================================================
# LESSON TEMPLATES
# ============================================================================

# Lesson-generation prompt with str.format-style {topic} and {level}
# placeholders. Written one list element per output line; the trailing empty
# element makes the joined text end with a newline.
LESSON_TEMPLATE = "\n".join([
    "Create a structured lesson about {topic} for {level} students.",
    "",
    "Structure your lesson as follows:",
    "",
    "## Learning Objectives",
    "List 3-5 specific, measurable learning objectives.",
    "",
    "## Prerequisites",
    "What should the student already know before this lesson?",
    "",
    "## Part 1: Concept Introduction",
    "Explain the core concept in simple terms. Use an analogy if helpful.",
    "",
    "## Part 2: Technical Details",
    "Go deeper into the technical aspects. Include:",
    "- Key tools and software",
    "- Important parameters and thresholds",
    "- File formats involved",
    "",
    "## Part 3: Workflow / Step-by-Step",
    "If applicable, walk through the practical workflow.",
    "",
    "## Part 4: Common Pitfalls",
    "List 3-5 common mistakes and how to avoid them.",
    "",
    "## Part 5: Practice Exercise",
    "Provide a hands-on exercise or scenario for the student to work through.",
    "",
    "## Summary",
    "Key takeaways in 3-5 bullet points.",
    "",
    "## Further Reading",
    "Suggest 2-3 resources for deeper learning.",
    "",
])


# ============================================================================
# TOPIC CHOICES — For dropdown menus
# ============================================================================

# Flat list of "<area>: <subtopic>" labels. The labels are generated from
# per-area groups so each area prefix is written once; the resulting order is
# area by area, subtopics in the order listed.
TOPIC_CHOICES = [
    area + ": " + subtopic
    for area, subtopics in (
        ("RNA-seq", [
            "Experimental Design",
            "Quality Control (FastQC/MultiQC)",
            "Read Alignment (STAR/HISAT2)",
            "Transcript Quantification (Salmon/kallisto)",
            "Differential Expression (DESeq2)",
            "Differential Expression (edgeR)",
            "Differential Expression (limma-voom)",
            "Gene Set Enrichment Analysis",
            "Normalization Methods (TPM/FPKM/CPM)",
            "Batch Effect Correction",
        ]),
        ("Exome Sequencing", [
            "Library Preparation & Capture",
            "Read Alignment (BWA-MEM)",
            "Post-alignment Processing (BQSR)",
            "Variant Calling (GATK)",
            "Variant Filtering",
            "Coverage Analysis",
        ]),
        ("Genome Sequencing", [
            "Whole Genome vs Exome",
            "De Novo Assembly",
            "Structural Variant Detection",
            "Somatic Variant Calling (Mutect2)",
        ]),
        ("Microbiome", [
            "16S Amplicon Sequencing",
            "Shotgun Metagenomics",
            "QIIME2 Pipeline",
            "DADA2 Denoising",
            "Alpha & Beta Diversity",
            "Differential Abundance Testing",
        ]),
        ("Variant Interpretation", [
            "VCF Format",
            "ACMG Guidelines",
            "Population Databases (gnomAD)",
            "Functional Prediction (SIFT/PolyPhen/CADD)",
            "ClinVar & Clinical Significance",
        ]),
        ("Molecular Genetics", [
            "DNA Structure & Replication",
            "Transcription & Translation",
            "PCR & Cloning",
            "CRISPR-Cas9",
            "Mutations & Repair",
            "Inheritance Patterns",
        ]),
        ("Single-cell RNA-seq", [
            "Overview & Platforms",
            "Quality Control & Filtering",
            "Normalization & HVGs",
            "Clustering & UMAP",
            "Cell Type Annotation",
        ]),
        ("ATAC-seq", ["Chromatin Accessibility"]),
        ("ChIP-seq", ["Histone Modifications & TF Binding"]),
        ("Methylation Sequencing", ["Bisulfite-seq"]),
        ("Small RNA-seq", ["miRNA Analysis"]),
        ("Targeted Sequencing", ["Panel Design & Analysis"]),
        ("Long-read Sequencing", ["PacBio & Nanopore"]),
        ("Spatial Transcriptomics", ["Visium & MERFISH"]),
        ("Multi-omics", ["Data Integration Strategies"]),
    )
    for subtopic in subtopics
]

# Difficulty labels, listed from "Beginner" up to "Advanced".
DIFFICULTY_LEVELS = [
    "Beginner",
    "Intermediate",
    "Advanced",
]

# Flat list of "<area>: <workflow>" labels, built from (area, workflow) pairs
# in the order given.
WORKFLOW_CHOICES = [
    ": ".join(pair)
    for pair in (
        ("Bulk RNA-seq", "Full DE Analysis Pipeline"),
        ("Exome Sequencing", "Variant Calling Pipeline"),
        ("Genome Sequencing", "Germline Variant Calling"),
        ("Genome Sequencing", "Somatic Variant Calling"),
        ("Microbiome", "16S Amplicon Analysis (QIIME2)"),
        ("Microbiome", "Shotgun Metagenomics"),
        ("Single-cell RNA-seq", "Standard Seurat/Scanpy Pipeline"),
        ("ATAC-seq", "Peak Calling Pipeline"),
        ("ChIP-seq", "Peak Calling & Motif Analysis"),
        ("Variant Interpretation", "Clinical Exome Analysis"),
    )
]