""" Bioinformatics with BB Tutor — Complete Application A production bioinformatics teaching assistant with 7 modules. Architecture: - Backend: LLMService (HuggingFace InferenceClient), RAGService (sentence-transformers), DocumentParser (PyMuPDF + text), knowledge_base (domain content) - Frontend: 7 Gradio tabs with ChatInterface, file upload, quiz generation, lesson building - Data flow: User query → RAG retrieval → LLM with context → streaming response - Shared state: rag_store (gr.State) holds uploaded document chunks + embeddings across tabs """ import gradio as gr import numpy as np import os from pathlib import Path # ── Conditional imports with fallbacks ──────────────────────────────────────── try: import fitz # PyMuPDF HAS_FITZ = True except ImportError: HAS_FITZ = False print("Warning: PyMuPDF not available. PDF parsing disabled.") try: from sentence_transformers import SentenceTransformer HAS_ST = True except ImportError: HAS_ST = False print("Warning: sentence-transformers not available. Embedding search disabled.") try: from huggingface_hub import InferenceClient HAS_HF = True except ImportError: HAS_HF = False print("Warning: huggingface_hub not available. LLM service disabled.") # ── Import knowledge base ──────────────────────────────────────────────────── from knowledge_base import ( DOMAIN_TAXONOMY, WORKFLOWS, GLOSSARY, COMMON_MISCONCEPTIONS, SYSTEM_PROMPTS, QUIZ_TEMPLATES, LESSON_TEMPLATE, TOPIC_CHOICES, DIFFICULTY_LEVELS, WORKFLOW_CHOICES ) # ============================================================================ # CONFIGURATION # ============================================================================ LLM_MODEL = os.environ.get("LLM_MODEL", "mistralai/Mistral-7B-Instruct-v0.3") EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2" HF_TOKEN = os.environ.get("HF_TOKEN", None) CHUNK_SIZE = 400 CHUNK_OVERLAP = 60 TOP_K_RETRIEVAL = 3 DEFAULT_SYSTEM_PROMPT = SYSTEM_PROMPTS["ask_tutor"] DEFAULT_RAG_STATE = {"chunks": [], "embeddings": None} # ============================================================================ # BACKEND SERVICES — Singleton Pattern # ============================================================================ class LLMService: """Lazy-initialized LLM inference service.""" _instance = None _initialized = False def __new__(cls): if cls._instance is None: cls._instance = super().__new__(cls) return cls._instance def __init__(self): if LLMService._initialized: return LLMService._initialized = True self.client = None self._try_init() def _try_init(self): if not HAS_HF: print("LLMService: huggingface_hub not available") return if not HF_TOKEN: print("LLMService: HF_TOKEN not set in environment") return try: self.client = InferenceClient( model=LLM_MODEL, token=HF_TOKEN, timeout=120, ) print("LLMService: Initialized successfully") except Exception as e: print(f"LLMService: Failed to initialize: {e}") self.client = None def is_available(self): return self.client is not None def stream_chat(self, messages, temperature=0.7, max_tokens=1024): """Stream chat completion. 
Yields partial response strings.""" if not self.is_available(): yield self._fallback_response(messages) return try: partial = "" for chunk in self.client.chat_completion( messages=messages, max_tokens=max_tokens, temperature=temperature, top_p=0.9, stream=True, ): token = "" if hasattr(chunk, 'choices') and chunk.choices: choice = chunk.choices[0] if hasattr(choice, 'delta') and hasattr(choice.delta, 'content'): token = choice.delta.content or "" partial += token yield partial except Exception as e: print(f"LLM stream error: {e}") yield f"⚠️ LLM API error: {str(e)}\n\nPlease check your HF_TOKEN in Space settings and ensure the model '{LLM_MODEL}' is accessible.\n\nThe tutor is still functional using its knowledge base for many questions — try asking about specific bioinformatics topics!" def generate(self, messages, temperature=0.7, max_tokens=1024): """Non-streaming generation. Returns complete response.""" if not self.is_available(): return self._fallback_response(messages) try: response = self.client.chat_completion( messages=messages, max_tokens=max_tokens, temperature=temperature, top_p=0.9, stream=False, ) return response.choices[0].message.content except Exception as e: print(f"LLM generate error: {e}") return f"⚠️ LLM API error: {str(e)}\n\nThe tutor can still answer from its knowledge base. Try asking about specific concepts!" def _fallback_response(self, messages): """Knowledge-base fallback when LLM unavailable.""" user_msg = "" for m in reversed(messages): if isinstance(m, dict) and m.get("role") == "user": user_msg = m.get("content", "").lower() break if not user_msg: return "⚠️ **LLM not available.** Add HF_TOKEN in Space settings to enable AI responses.\n\nMeanwhile, the knowledge base covers: DESeq2, variant calling, microbiome diversity, scRNA-seq clustering, and more. Try asking a specific question!" response_parts = [] for term, definition in GLOSSARY.items(): if term.lower() in user_msg: response_parts.append(f"**{term}**: {definition}") if len(response_parts) >= 3: break for wf_key, wf in WORKFLOWS.items(): if any(kw in user_msg for kw in wf["name"].lower().split()): response_parts.append(f"\n### {wf['name']}") for step in wf["steps"][:3]: response_parts.append(f"**Step {step['step']}: {step['name']}**\n{step['description']}") break for misc in COMMON_MISCONCEPTIONS: if misc["domain"].replace("_", " ") in user_msg or any(w in user_msg for w in misc["misconception"].lower().split()[:5]): response_parts.append(f"\n⚠️ **Common Misconception**: {misc['misconception']}\n\n✅ **Correction**: {misc['correction']}") break if response_parts: return "📚 *Responding from knowledge base (LLM not configured):*\n\n" + "\n\n".join(response_parts) return ( "⚠️ **AI responses require HF_TOKEN.**\n\n" "To enable full AI-powered responses:\n" "1. Go to your HuggingFace account → Settings → Access Tokens\n" "2. Create a token with 'inference-api' scope\n" "3. Add it as a Secret named `HF_TOKEN` in this Space's Settings\n\n" "The knowledge base can still answer many questions. Try asking about 'RNA-seq workflow', 'variant calling', or 'microbiome diversity'!" 
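
# Usage sketch (illustrative only, not executed at import time): how a tab handler
# consumes LLMService.stream_chat with an OpenAI-style messages list. Each yield is
# the accumulated text so far, so the last value received is the complete response.
#
#   svc = LLMService()
#   messages = [
#       {"role": "system", "content": DEFAULT_SYSTEM_PROMPT},
#       {"role": "user", "content": "What does DESeq2 model?"},
#   ]
#   for partial in svc.stream_chat(messages, temperature=0.7, max_tokens=512):
#       print(partial)  # grows with each chunk; not a per-token delta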


class RAGService:
    """Document retrieval with lazy embedding model loading."""

    _instance = None
    _initialized = False

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self):
        if RAGService._initialized:
            return
        RAGService._initialized = True
        self.embedder = None
        self.kb_chunks = []
        self.kb_metadata = []
        self.kb_embeddings = None
        self._build_kb_index()

    def _ensure_embedder(self):
        if self.embedder is not None:
            return True
        if not HAS_ST:
            return False
        try:
            print("RAGService: Loading embedding model...")
            self.embedder = SentenceTransformer(EMBED_MODEL)
            print("RAGService: Embedding model loaded")
            if self.kb_chunks:
                self.kb_embeddings = self.embedder.encode(
                    self.kb_chunks,
                    convert_to_numpy=True,
                    normalize_embeddings=True,
                    show_progress_bar=False,
                    batch_size=32,
                )
                print(f"RAGService: KB embedded ({len(self.kb_chunks)} chunks)")
            return True
        except Exception as e:
            print(f"RAGService: Failed to load embedder: {e}")
            return False

    def _build_kb_index(self):
        chunks = []
        metadata = []
        for term, definition in GLOSSARY.items():
            chunks.append(f"{term}: {definition}")
            metadata.append({"source": "glossary", "topic": term, "type": "definition"})
        for wf_key, wf in WORKFLOWS.items():
            for step in wf["steps"]:
                step_text = f"{wf['name']} - Step {step['step']}: {step['name']}. {step['description']}"
                if step.get("tools"):
                    step_text += f" Tools: {', '.join(step['tools'])}."
                if step.get("common_mistakes"):
                    step_text += " Common mistakes: " + "; ".join(step["common_mistakes"])
                chunks.append(step_text)
                metadata.append({
                    "source": "workflow",
                    "topic": wf["domain"],
                    "type": "workflow_step",
                    "step": step["step"],
                    "workflow": wf_key
                })
        for misc in COMMON_MISCONCEPTIONS:
            text = f"Misconception: {misc['misconception']} Correction: {misc['correction']}"
            chunks.append(text)
            metadata.append({
                "source": "misconception",
                "topic": misc["domain"],
                "type": "misconception",
                "severity": misc["severity"]
            })
        for key, domain in DOMAIN_TAXONOMY.items():
            text = f"{domain['name']} covers: {', '.join(domain['subtopics'][:10])}"
            chunks.append(text)
            metadata.append({"source": "taxonomy", "topic": key, "type": "domain_overview"})
        self.kb_chunks = chunks
        self.kb_metadata = metadata
        print(f"RAGService: Built KB with {len(chunks)} chunks")

    def search(self, query, top_k=TOP_K_RETRIEVAL, user_chunks=None, user_embeddings=None):
        if not self._ensure_embedder():
            return self._keyword_search(query, top_k)
        try:
            query_embedding = self.embedder.encode(
                [query],
                convert_to_numpy=True,
                normalize_embeddings=True,
            )
            results = []
            if self.kb_embeddings is not None and len(self.kb_embeddings) > 0:
                kb_scores = np.dot(query_embedding, self.kb_embeddings.T)[0]
                top_indices = np.argsort(kb_scores)[::-1][:top_k]
                for idx in top_indices:
                    if kb_scores[idx] > 0.15:
                        results.append({
                            "text": self.kb_chunks[idx],
                            "score": float(kb_scores[idx]),
                            "metadata": self.kb_metadata[idx]
                        })
            if user_chunks and user_embeddings is not None and len(user_embeddings) > 0:
                user_scores = np.dot(query_embedding, user_embeddings.T)[0]
                top_user = np.argsort(user_scores)[::-1][:top_k]
                for idx in top_user:
                    if user_scores[idx] > 0.15:
                        results.append({
                            "text": user_chunks[idx],
                            "score": float(user_scores[idx]),
                            "metadata": {"source": "uploaded", "type": "user_content"}
                        })
            results.sort(key=lambda x: x["score"], reverse=True)
            return results[:top_k]
        except Exception as e:
            print(f"RAG search error: {e}")
            return self._keyword_search(query, top_k)
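
    # Scoring sketch: embeddings are L2-normalized at encode time, so the plain dot
    # product above is cosine similarity, and 0.15 acts as a minimum-relevance floor.
    # A standalone approximation of the same ranking (assumes the module-level
    # singleton defined below has its embedder loaded):
    #
    #   q = rag_service.embedder.encode(["differential expression"], normalize_embeddings=True)
    #   scores = np.dot(q, rag_service.kb_embeddings.T)[0]      # cosine similarities
    #   best = np.argsort(scores)[::-1][:TOP_K_RETRIEVAL]       # top-k chunk indices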

    def _keyword_search(self, query, top_k=3):
        query_words = set(query.lower().split())
        scored = []
        for i, chunk in enumerate(self.kb_chunks):
            chunk_words = set(chunk.lower().split())
            overlap = len(query_words & chunk_words)
            if overlap > 0:
                scored.append({
                    "text": chunk,
                    "score": overlap / max(len(query_words), 1),
                    "metadata": self.kb_metadata[i]
                })
        scored.sort(key=lambda x: x["score"], reverse=True)
        return scored[:top_k]

    def embed_chunks(self, chunks):
        if not self._ensure_embedder() or not chunks:
            return None
        try:
            return self.embedder.encode(
                chunks,
                convert_to_numpy=True,
                normalize_embeddings=True,
                show_progress_bar=False,
                batch_size=16,
            )
        except Exception as e:
            print(f"Embed chunks error: {e}")
            return None


class DocumentParser:

    @staticmethod
    def parse_file(filepath):
        if filepath is None:
            return "", []
        filepath = str(filepath)
        ext = Path(filepath).suffix.lower()
        try:
            if ext == ".pdf" and HAS_FITZ:
                return DocumentParser._parse_pdf(filepath)
            elif ext in (".txt", ".md", ".csv", ".tsv", ".fasta", ".fa", ".fastq", ".fq", ".vcf", ".bed", ".gff", ".gtf", ".sam", ".bam"):
                return DocumentParser._parse_text(filepath)
            else:
                return f"Unsupported file type: {ext}", []
        except Exception as e:
            return f"Error parsing file: {str(e)}", []

    @staticmethod
    def _parse_pdf(filepath):
        doc = fitz.open(filepath)
        pages = []
        for page_num in range(len(doc)):
            text = doc[page_num].get_text()
            if text.strip():
                pages.append(text)
        doc.close()
        full_text = "\n\n".join(pages)
        chunks = DocumentParser._chunk_text(full_text)
        return full_text, chunks

    @staticmethod
    def _parse_text(filepath):
        with open(filepath, "r", encoding="utf-8", errors="replace") as f:
            text = f.read()
        chunks = DocumentParser._chunk_text(text)
        return text, chunks

    @staticmethod
    def _chunk_text(text, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
        words = text.split()
        if len(words) <= chunk_size:
            return [text] if text.strip() else []
        chunks = []
        for i in range(0, len(words), chunk_size - overlap):
            chunk = " ".join(words[i:i + chunk_size])
            if chunk.strip():
                chunks.append(chunk)
        return chunks


llm_service = LLMService()
rag_service = RAGService()
doc_parser = DocumentParser()
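
# Chunking/embedding sketch (assumes the CHUNK_SIZE/CHUNK_OVERLAP defaults above):
# _chunk_text slides a window of CHUNK_SIZE words with a stride of CHUNK_SIZE - CHUNK_OVERLAP,
# so a 1,000-word document yields chunks starting at words 0, 340, and 680. A typical
# upload round-trip ("notes.txt" is a hypothetical file) looks like:
#
#   full_text, chunks = doc_parser.parse_file("notes.txt")
#   embeddings = rag_service.embed_chunks(chunks)          # (n_chunks, 384) for all-MiniLM-L6-v2
#   state = {"chunks": chunks, "embeddings": embeddings}   # same shape as DEFAULT_RAG_STATE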

print(f"🧬 BB Tutor initialized. LLM: {llm_service.is_available()}, Embeddings: {rag_service.embedder is not None}")


# ============================================================================
# HELPER FUNCTIONS
# ============================================================================
def _rag_context(query, user_chunks=None, user_embeddings=None):
    results = rag_service.search(query, top_k=TOP_K_RETRIEVAL, user_chunks=user_chunks, user_embeddings=user_embeddings)
    if not results:
        return ""
    parts = ["RELEVANT KNOWLEDGE BASE CONTEXT:"]
    for r in results:
        source = r["metadata"].get("source", "kb")
        parts.append(f"[{source}] {r['text'][:800]}")
    return "\n".join(parts)


def _format_history(history):
    messages = []
    for h in history:
        if isinstance(h, dict):
            messages.append(h)
        elif isinstance(h, (list, tuple)):
            if len(h) >= 1 and h[0]:
                messages.append({"role": "user", "content": str(h[0])})
            if len(h) >= 2 and h[1]:
                messages.append({"role": "assistant", "content": str(h[1])})
    return messages
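
# History-normalization sketch: _format_history accepts either Gradio history format
# and produces the same OpenAI-style messages list that the handlers below prepend
# with a system prompt and optional RAG context.
#
#   _format_history([("What is BQSR?", "Base Quality Score Recalibration adjusts ...")])
#   _format_history([{"role": "user", "content": "What is BQSR?"},
#                    {"role": "assistant", "content": "Base Quality Score Recalibration adjusts ..."}])
#   # → both return [{"role": "user", ...}, {"role": "assistant", ...}]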

# ============================================================================
# MODULE HANDLERS
# ============================================================================
def tutor_respond(message, history, system_prompt, temperature, max_tokens, rag_state):
    if not message or not message.strip():
        yield ""
        return
    rag_state = rag_state or DEFAULT_RAG_STATE
    user_chunks = rag_state.get("chunks", [])
    user_embeddings = rag_state.get("embeddings")
    rag_ctx = _rag_context(message, user_chunks, user_embeddings)
    messages = [{"role": "system", "content": system_prompt}]
    if rag_ctx:
        messages.append({"role": "system", "content": rag_ctx})
    messages.extend(_format_history(history))
    messages.append({"role": "user", "content": message})
    for partial in llm_service.stream_chat(messages, temperature, max_tokens):
        yield partial


def process_upload(file, rag_state):
    rag_state = rag_state or DEFAULT_RAG_STATE
    if file is None:
        return "📁 Please upload a file first.", "", rag_state
    full_text, chunks = doc_parser.parse_file(file)
    if not chunks:
        return "⚠️ Could not extract text from the uploaded file.", full_text[:2000] if full_text else "", rag_state
    embeddings = rag_service.embed_chunks(chunks)
    new_state = {"chunks": chunks, "embeddings": embeddings}
    preview = full_text[:2500] if len(full_text) > 2500 else full_text
    msgs = [
        {"role": "system", "content": SYSTEM_PROMPTS["upload_explain"]},
        {"role": "user", "content": f"Analyze and explain this bioinformatics document:\n\n{preview}"}
    ]
    explanation = llm_service.generate(msgs, temperature=0.5, max_tokens=1500)
    stats = f"📊 **Document Stats:** {len(chunks)} chunks, ~{len(full_text.split())} words | "
    stats += f"File type: {Path(str(file)).suffix} | "
    stats += "🤖 AI-powered" if llm_service.is_available() else "📚 Knowledge-base mode"
    stats += "\n\n---\n\n"
    return stats + explanation, full_text[:5000], new_state


def upload_chat_respond(message, history, rag_state):
    if not message or not message.strip():
        yield ""
        return
    rag_state = rag_state or DEFAULT_RAG_STATE
    user_chunks = rag_state.get("chunks", [])
    user_embeddings = rag_state.get("embeddings")
    if not user_chunks:
        yield "📁 Please upload a document in the panel above, then ask questions about it.\n\nYour uploaded document will be indexed and searchable across all modules!"
        return
    rag_results = rag_service.search(message, top_k=4, user_chunks=user_chunks, user_embeddings=user_embeddings)
    ctx = "CONTEXT FROM UPLOADED DOCUMENT:\n"
    if rag_results:
        for r in rag_results:
            ctx += f"\n{r['text'][:600]}\n"
    else:
        ctx += "(No highly relevant passages found — answering from general knowledge)\n"
    messages = [
        {"role": "system", "content": SYSTEM_PROMPTS["upload_explain"]},
        {"role": "system", "content": ctx},
    ]
    messages.extend(_format_history(history))
    messages.append({"role": "user", "content": message})
    for partial in llm_service.stream_chat(messages, temperature=0.5, max_tokens=1024):
        yield partial


def generate_quiz(topic, quiz_type, num_questions, difficulty, rag_state):
    if not topic:
        return "❓ Please select or enter a topic first.", ""
    rag_results = rag_service.search(topic, top_k=3)
    context = ""
    if rag_results:
        context = "Reference material:\n" + "\n".join(r["text"][:500] for r in rag_results)
    template_key = {
        "Multiple Choice (MCQ)": "mcq",
        "True/False": "true_false",
        "Short Answer": "short_answer"
    }.get(quiz_type, "mcq")
    quiz_prompt = QUIZ_TEMPLATES[template_key].format(
        n=int(num_questions),
        topic=topic,
        difficulty=difficulty
    )
    messages = [{"role": "system", "content": SYSTEM_PROMPTS["quiz_me"]}]
    if context:
        messages.append({"role": "system", "content": context})
    messages.append({"role": "user", "content": quiz_prompt})
    response = llm_service.generate(messages, temperature=0.8, max_tokens=2000)
    formatted = f"## 🧠 {topic} Quiz — {difficulty}\n\n"
    formatted += f"*Format: {quiz_type} | Questions: {int(num_questions)}*\n\n---\n\n"
    formatted += response
    return formatted, response


def check_quiz_answers(user_answers, answer_key):
    if not user_answers or not user_answers.strip():
        return "✍️ Please enter your answers above before checking."
    if not answer_key:
        return "⚠️ Please generate a quiz first (use the panel above)."
    messages = [
        {"role": "system", "content": "You are a bioinformatics tutor grading a quiz. Compare student answers to correct answers. For each: mark ✅ or ❌, explain briefly, provide correct answer if wrong. Be encouraging. Give final score."},
        {"role": "user", "content": f"QUIZ AND ANSWERS:\n{answer_key}\n\nSTUDENT ANSWERS:\n{user_answers}\n\nGrade each:"}
    ]
    return llm_service.generate(messages, temperature=0.3, max_tokens=1500)


def generate_lesson(topic, level, include_exercises, include_quiz):
    if not topic:
        return "📚 Please select or enter a topic."
    rag_results = rag_service.search(topic, top_k=4)
    context = ""
    if rag_results:
        context = "Reference:\n" + "\n".join(r["text"][:500] for r in rag_results)
    prompt = LESSON_TEMPLATE.format(topic=topic, level=level)
    if include_exercises:
        prompt += "\n\nInclude 2-3 practical exercises with clear instructions."
    if include_quiz:
        prompt += "\n\nInclude a 5-question self-assessment quiz (with answers)."
    messages = [{"role": "system", "content": SYSTEM_PROMPTS["build_lesson"]}]
    if context:
        messages.append({"role": "system", "content": context})
    messages.append({"role": "user", "content": prompt})
    return llm_service.generate(messages, temperature=0.7, max_tokens=3000)
messages = [{"role": "system", "content": SYSTEM_PROMPTS["build_lesson"]}] if context: messages.append({"role": "system", "content": context}) messages.append({"role": "user", "content": prompt}) return llm_service.generate(messages, temperature=0.7, max_tokens=3000) def workflow_respond(message, history, selected_workflow, temperature): if not message or not message.strip(): yield "" return workflow_context = "" for wf_key, wf in WORKFLOWS.items(): if wf["name"] in selected_workflow or selected_workflow.lower() in wf["name"].lower(): workflow_context = f"WORKFLOW REFERENCE: {wf['name']}\n\n" for step in wf["steps"]: workflow_context += f"Step {step['step']}: {step['name']}\n" workflow_context += f" {step['description']}\n" if step.get("tools"): workflow_context += f" Tools: {', '.join(step['tools'])}\n" if step.get("common_mistakes"): workflow_context += f" ⚠️ Common mistakes: {'; '.join(step['common_mistakes'])}\n" workflow_context += "\n" break rag_results = rag_service.search(message, top_k=2) if rag_results: workflow_context += "\nADDITIONAL CONTEXT:\n" + "\n".join(r["text"][:500] for r in rag_results) messages = [{"role": "system", "content": SYSTEM_PROMPTS["workflow_coach"]}] if workflow_context: messages.append({"role": "system", "content": workflow_context}) messages.extend(_format_history(history)) messages.append({"role": "user", "content": message}) for partial in llm_service.stream_chat(messages, temperature, 1500): yield partial def paper_to_lesson_respond(message, history, output_format, rag_state): if not message or not message.strip(): yield "" return rag_state = rag_state or DEFAULT_RAG_STATE user_chunks = rag_state.get("chunks", []) user_embeddings = rag_state.get("embeddings") context = "" if user_chunks: rag_results = rag_service.search(message, top_k=4, user_chunks=user_chunks, user_embeddings=user_embeddings) if rag_results: context = "PAPER CONTENT:\n" + "\n".join(r["text"][:600] for r in rag_results) format_instruction = { "Lesson Plan": "Create a structured lesson plan with learning objectives, sections, and exercises.", "Slide Outline": "Create a slide-by-slide outline with key points for each slide.", "Study Notes": "Create concise study notes highlighting key methods, tools, and findings.", "Quiz Questions": "Generate 5-10 quiz questions based on the paper's methods and findings.", }.get(output_format, "Create a structured lesson plan.") messages = [{"role": "system", "content": SYSTEM_PROMPTS["paper_to_lesson"]}] if context: messages.append({"role": "system", "content": context}) messages.extend(_format_history(history)) full_msg = f"{message}\n\nOUTPUT FORMAT: {format_instruction}" messages.append({"role": "user", "content": full_msg}) for partial in llm_service.stream_chat(messages, temperature=0.7, max_tokens=2500): yield partial def viva_respond(message, history, topic, difficulty): if not message or not message.strip(): yield "" return rag_results = rag_service.search(f"{topic} {message}", top_k=3) context = "" if rag_results: context = "Reference:\n" + "\n".join(r["text"][:500] for r in rag_results) messages = [ {"role": "system", "content": SYSTEM_PROMPTS["viva_practice"]}, {"role": "system", "content": f"VIVA TOPIC: {topic}\nDIFFICULTY: {difficulty}\n\n{context}"}, ] messages.extend(_format_history(history)) messages.append({"role": "user", "content": message}) for partial in llm_service.stream_chat(messages, temperature=0.7, max_tokens=1000): yield partial # ============================================================================ # GRADIO APP 

# ============================================================================
# GRADIO APP ASSEMBLY
# ============================================================================
CUSTOM_CSS = """
.main-header {
    text-align: center;
    padding: 20px;
    background: linear-gradient(135deg, #1a5276 0%, #2e86c1 50%, #48c9b0 100%);
    border-radius: 12px;
    margin-bottom: 20px;
    color: white;
}
.main-header h1 { color: white; font-size: 2em; margin: 0; }
.main-header p { color: #ecf0f1; margin: 5px 0; }
.module-info {
    background: #f0f9ff;
    border-left: 4px solid #2e86c1;
    padding: 12px 16px;
    margin-bottom: 16px;
    border-radius: 0 8px 8px 0;
}
.safety-notice {
    background: #fff3e0;
    border-left: 4px solid #f39c12;
    padding: 10px 14px;
    margin-top: 10px;
    border-radius: 0 8px 8px 0;
    font-size: 0.9em;
}
.status-badge {
    display: inline-block;
    padding: 4px 12px;
    border-radius: 12px;
    font-size: 0.85em;
    font-weight: bold;
}
.status-on { background: #d4edda; color: #155724; }
.status-off { background: #f8d7da; color: #721c24; }
"""


def build_app():
    with gr.Blocks(title="Bioinformatics with BB Tutor", css=CUSTOM_CSS) as demo:
        # ── Global shared state ─────────────────────────────────────────
        rag_store = gr.State(DEFAULT_RAG_STATE)

        # ── Status indicator ────────────────────────────────────────────
        llm_status = "🟢 AI Enabled" if llm_service.is_available() else "🔴 AI Offline (Knowledge Base Active)"

        # ── Header ──────────────────────────────────────────────────────
        gr.HTML(f"""
        <div class="main-header">
            <h1>🧬 Bioinformatics with BB Tutor</h1>
            <p>AI-powered bioinformatics teaching assistant</p>
            <p>RNA-seq · Exome · Genome · Microbiome · Variants · Molecular Genetics · scRNA-seq · ATAC-seq · ChIP-seq · and more</p>
            <span class="status-badge {'status-on' if llm_service.is_available() else 'status-off'}">{llm_status}</span>
        </div>
        """)
""") with gr.Tabs(): # ══════════════════════════════════════════════════════════════ # TAB 1: ASK THE TUTOR # ══════════════════════════════════════════════════════════════ with gr.Tab("🧬 Ask the Tutor", id="ask"): gr.HTML('
💡 Ask any bioinformatics question. RAG-augmented responses from a curated knowledge base covering 15+ domains.

                # Examples must be list-of-lists matching the fn signature:
                # (message, history, system_prompt, temperature, max_tokens, rag_state)
                # Additional inputs: [system_prompt, temperature, max_tokens, rag_store]
                # → each example is the message plus 4 additional-input values
                ask_examples = [
                    ["What is the difference between DESeq2 and edgeR?", DEFAULT_SYSTEM_PROMPT, 0.7, 1024, DEFAULT_RAG_STATE],
                    ["Explain the GATK variant calling pipeline step by step.", DEFAULT_SYSTEM_PROMPT, 0.7, 1024, DEFAULT_RAG_STATE],
                    ["What is the difference between alpha and beta diversity?", DEFAULT_SYSTEM_PROMPT, 0.7, 1024, DEFAULT_RAG_STATE],
                    ["Why should I use adjusted p-values instead of raw p-values?", DEFAULT_SYSTEM_PROMPT, 0.7, 1024, DEFAULT_RAG_STATE],
                    ["Explain the single-cell RNA-seq analysis workflow.", DEFAULT_SYSTEM_PROMPT, 0.7, 1024, DEFAULT_RAG_STATE],
                    ["What is BQSR and why is it important?", DEFAULT_SYSTEM_PROMPT, 0.7, 1024, DEFAULT_RAG_STATE],
                    ["How do I choose between STAR and HISAT2 for alignment?", DEFAULT_SYSTEM_PROMPT, 0.7, 1024, DEFAULT_RAG_STATE],
                    ["What common mistakes do students make with DESeq2?", DEFAULT_SYSTEM_PROMPT, 0.7, 1024, DEFAULT_RAG_STATE],
                ]

                gr.ChatInterface(
                    fn=tutor_respond,
                    type="messages",
                    additional_inputs=[
                        gr.Textbox(value=DEFAULT_SYSTEM_PROMPT, label="System Prompt", lines=2, visible=False),
                        gr.Slider(0.1, 1.5, 0.7, step=0.1, label="Temperature", visible=False),
                        gr.Slider(256, 4096, 1024, step=256, label="Max Tokens", visible=False),
                        rag_store,
                    ],
                    additional_inputs_accordion=gr.Accordion("⚙️ Advanced", open=False, visible=False),
                    examples=ask_examples,
                )

                gr.HTML(
                    '<div class="safety-notice">⚠️ Educational use only. Not for clinical interpretation. '
                    'Always consult qualified professionals for clinical genomics.</div>'
                )

            # ══════════════════════════════════════════════════════════════
            # TAB 2: UPLOAD & EXPLAIN
            # ══════════════════════════════════════════════════════════════
            with gr.Tab("📄 Upload & Explain", id="upload"):
                gr.HTML(
                    '<div class="module-info">📄 Upload bioinformatics documents (PDF, TXT, FASTA, VCF, etc.) '
                    'and get AI-powered analysis. Content is indexed and searchable across all modules.</div>'
                )

                with gr.Row():
                    with gr.Column(scale=1):
                        file_input = gr.File(
                            label="Upload Document",
                            file_types=[".pdf", ".txt", ".md", ".csv", ".tsv", ".fasta", ".fa", ".fastq", ".vcf", ".bed", ".gff", ".gtf", ".sam", ".bam"],
                            file_count="single",
                            type="filepath",
                        )
                        process_btn = gr.Button("🔍 Analyze Document", variant="primary")
                        gr.Markdown("**Supported:** PDF, text, FASTA/FASTQ, VCF, BED, GFF/GTF, SAM/BAM, CSV/TSV")
                    with gr.Column(scale=2):
                        explanation_output = gr.Markdown(label="Analysis & Explanation")
                        with gr.Accordion("📝 Raw Extracted Text", open=False):
                            raw_text_output = gr.Textbox(label="Extracted Text", lines=10, show_copy_button=True)

                process_btn.click(
                    fn=process_upload,
                    inputs=[file_input, rag_store],
                    outputs=[explanation_output, raw_text_output, rag_store],
                )

                gr.Markdown("### 💬 Chat About Your Document")
                # fn signature: (message, history, rag_state) → 1 additional input
                upload_chat_examples = [
                    ["Summarize the key methods in this paper.", DEFAULT_RAG_STATE],
                    ["What bioinformatics tools are mentioned?", DEFAULT_RAG_STATE],
                    ["Explain the main findings in simple terms.", DEFAULT_RAG_STATE],
                    ["What are the limitations of this analysis?", DEFAULT_RAG_STATE],
                ]
                gr.ChatInterface(
                    fn=upload_chat_respond,
                    type="messages",
                    additional_inputs=[rag_store],
                    examples=upload_chat_examples,
                )

            # ══════════════════════════════════════════════════════════════
            # TAB 3: QUIZ ME
            # ══════════════════════════════════════════════════════════════
            with gr.Tab("❓ Quiz Me", id="quiz"):
                gr.HTML(
                    '<div class="module-info">🧠 Test your knowledge with AI-generated quizzes across all '
                    'bioinformatics domains.</div>'
                )
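
                # Quiz flow sketch: generate_quiz returns (formatted_markdown, raw_quiz_text);
                # the raw text is stashed in answer_key_state and later handed to
                # check_quiz_answers together with whatever the student typed, e.g.:
                #
                #   formatted, key = generate_quiz("RNA-seq: Differential Expression (DESeq2)",
                #                                  "Multiple Choice (MCQ)", 5, "Intermediate", None)
                #   feedback = check_quiz_answers("1: A, 2: C, 3: B, 4: D, 5: A", key)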

                with gr.Row():
                    quiz_topic = gr.Dropdown(
                        choices=TOPIC_CHOICES,
                        label="Select Topic",
                        allow_custom_value=True,
                        value="RNA-seq: Differential Expression (DESeq2)"
                    )
                    quiz_type = gr.Radio(
                        choices=["Multiple Choice (MCQ)", "True/False", "Short Answer"],
                        value="Multiple Choice (MCQ)",
                        label="Format"
                    )
                with gr.Row():
                    quiz_difficulty = gr.Radio(
                        choices=DIFFICULTY_LEVELS,
                        value="Intermediate",
                        label="Difficulty"
                    )
                    num_questions = gr.Slider(1, 10, 5, step=1, label="# Questions")

                generate_quiz_btn = gr.Button("🎲 Generate Quiz", variant="primary")
                quiz_output = gr.Markdown(label="Generated Quiz")
                answer_key_state = gr.State("")

                generate_quiz_btn.click(
                    fn=generate_quiz,
                    inputs=[quiz_topic, quiz_type, num_questions, quiz_difficulty, rag_store],
                    outputs=[quiz_output, answer_key_state],
                )

                gr.Markdown("---")
                gr.Markdown("### ✍️ Submit Your Answers")
                with gr.Row():
                    user_answers = gr.Textbox(
                        label="Your Answers (e.g., '1: A, 2: B')",
                        lines=5,
                        placeholder="Type your answers here...",
                        scale=3
                    )
                    check_btn = gr.Button("✅ Check", variant="primary", scale=1)
                feedback_output = gr.Markdown(label="Feedback")

                check_btn.click(
                    fn=check_quiz_answers,
                    inputs=[user_answers, answer_key_state],
                    outputs=[feedback_output],
                )

            # ══════════════════════════════════════════════════════════════
            # TAB 4: BUILD A LESSON
            # ══════════════════════════════════════════════════════════════
            with gr.Tab("📚 Build a Lesson", id="lesson"):
                gr.HTML(
                    '<div class="module-info">📚 Generate structured lessons with learning objectives, '
                    'explanations, exercises, and self-assessment quizzes.</div>'
                )

                with gr.Row():
                    lesson_topic = gr.Dropdown(
                        choices=TOPIC_CHOICES,
                        label="Lesson Topic",
                        allow_custom_value=True,
                        value="RNA-seq: Differential Expression (DESeq2)"
                    )
                    lesson_level = gr.Radio(
                        choices=DIFFICULTY_LEVELS,
                        value="Intermediate",
                        label="Level"
                    )
                with gr.Row():
                    include_exercises = gr.Checkbox(label="Include Exercises", value=True)
                    include_quiz = gr.Checkbox(label="Include Quiz", value=True)

                generate_lesson_btn = gr.Button("📝 Generate Lesson", variant="primary")
                lesson_output = gr.Markdown(label="Generated Lesson")

                generate_lesson_btn.click(
                    fn=generate_lesson,
                    inputs=[lesson_topic, lesson_level, include_exercises, include_quiz],
                    outputs=[lesson_output],
                )

            # ══════════════════════════════════════════════════════════════
            # TAB 5: WORKFLOW COACH
            # ══════════════════════════════════════════════════════════════
            with gr.Tab("🔬 Workflow Coach", id="workflow"):
                gr.HTML(
                    '<div class="module-info">🔬 Step-by-step guidance through bioinformatics analysis pipelines. '
                    'Select a workflow and ask about any step.</div>'
                )

                workflow_selector = gr.Dropdown(
                    choices=WORKFLOW_CHOICES,
                    label="Select Workflow",
                    value="Bulk RNA-seq: Full DE Analysis Pipeline",
                    allow_custom_value=True,
                )
                # fn signature: (message, history, selected_workflow, temperature) → 2 additional inputs
                workflow_examples = [
                    ["Walk me through the complete pipeline from raw FASTQ to DE results.", "Bulk RNA-seq: Full DE Analysis Pipeline", 0.7],
                    ["I'm at alignment. What should I check before counting?", "Bulk RNA-seq: Full DE Analysis Pipeline", 0.7],
                    ["My mapping rate is only 45%. What could be wrong?", "Bulk RNA-seq: Full DE Analysis Pipeline", 0.7],
                    ["How do I choose between STAR and HISAT2?", "Bulk RNA-seq: Full DE Analysis Pipeline", 0.7],
                    ["What parameters for GATK HaplotypeCaller on exome data?", "Exome Sequencing: Variant Calling Pipeline", 0.7],
                    ["How do I set DADA2 truncation parameters?", "Microbiome: 16S Amplicon Analysis (QIIME2)", 0.7],
                ]
                gr.ChatInterface(
                    fn=workflow_respond,
                    type="messages",
                    additional_inputs=[
                        workflow_selector,
                        gr.Slider(0.1, 1.5, 0.7, step=0.1, label="Temperature", visible=False),
                    ],
                    additional_inputs_accordion=gr.Accordion("⚙️", open=False, visible=False),
                    examples=workflow_examples,
                )

            # ══════════════════════════════════════════════════════════════
            # TAB 6: PAPER TO LESSON
            # ══════════════════════════════════════════════════════════════
            with gr.Tab("📰 Paper to Lesson", id="paper"):
                gr.HTML(
                    '<div class="module-info">📰 Convert research papers into teaching material. Upload a paper '
                    'in the Upload tab first, then generate lessons, slides, or quizzes from it.</div>'
                )

                output_format = gr.Radio(
                    choices=["Lesson Plan", "Slide Outline", "Study Notes", "Quiz Questions"],
                    value="Lesson Plan",
                    label="Output Format"
                )
                # fn signature: (message, history, output_format, rag_state) → 2 additional inputs
                paper_examples = [
                    ["Convert this paper into a 45-minute lecture plan.", "Lesson Plan", DEFAULT_RAG_STATE],
                    ["Create a slide outline covering the key methods.", "Slide Outline", DEFAULT_RAG_STATE],
                    ["Generate study notes on the bioinformatics methods.", "Study Notes", DEFAULT_RAG_STATE],
                    ["Create quiz questions on this paper's methodology.", "Quiz Questions", DEFAULT_RAG_STATE],
                ]
                gr.ChatInterface(
                    fn=paper_to_lesson_respond,
                    type="messages",
                    additional_inputs=[output_format, rag_store],
                    examples=paper_examples,
                )

            # ══════════════════════════════════════════════════════════════
            # TAB 7: VIVA PRACTICE
            # ══════════════════════════════════════════════════════════════
            with gr.Tab("🎓 Viva Practice", id="viva"):
                gr.HTML(
                    '<div class="module-info">🎓 Practice oral examinations. The AI examiner asks probing questions, '
                    'evaluates your answers, and pushes deeper understanding.</div>'
                )

                with gr.Row():
                    viva_topic = gr.Dropdown(
                        choices=TOPIC_CHOICES,
                        label="Viva Topic",
                        allow_custom_value=True,
                        value="RNA-seq: Differential Expression (DESeq2)"
                    )
                    viva_difficulty = gr.Radio(
                        choices=DIFFICULTY_LEVELS,
                        value="Intermediate",
                        label="Difficulty"
                    )
                # fn signature: (message, history, topic, difficulty) → 2 additional inputs
                viva_examples = [
                    ["I'm ready for my viva. Start with your first question.", "RNA-seq: Differential Expression (DESeq2)", "Intermediate"],
                    ["Focus on the statistical aspects of RNA-seq.", "RNA-seq: Differential Expression (DESeq2)", "Intermediate"],
                    ["Ask me about variant calling and interpretation.", "Variant Interpretation: ACMG Guidelines", "Intermediate"],
                    ["Test my understanding of microbiome diversity.", "Microbiome: Alpha & Beta Diversity", "Intermediate"],
                ]
                gr.ChatInterface(
                    fn=viva_respond,
                    type="messages",
                    additional_inputs=[viva_topic, viva_difficulty],
                    examples=viva_examples,
                )

        # ── Footer ──────────────────────────────────────────────────────
        gr.HTML("""
        <div style="text-align: center;">
            <p>Bioinformatics with BB Tutor — Educational AI Assistant</p>
            <p>⚠️ For educational purposes only. Not for clinical use.</p>
            <p>RNA-seq · Exome · Genome · Microbiome · Variants · Molecular Genetics · scRNA-seq · ATAC-seq · ChIP-seq · Methylation · Small RNA · Targeted Panels · Long-read · Spatial · Multi-omics</p>
        </div>
        """)
""") return demo if __name__ == "__main__": demo = build_app() demo.launch(server_name="0.0.0.0", server_port=7860, share=False)