| """ | |
| Bioinformatics with BB Tutor β Complete Application | |
| A production bioinformatics teaching assistant with 7 modules. | |
| Architecture: | |
| - Backend: LLMService (HuggingFace InferenceClient), RAGService (sentence-transformers), | |
| DocumentParser (PyMuPDF + text), knowledge_base (domain content) | |
| - Frontend: 7 Gradio tabs with ChatInterface, file upload, quiz generation, lesson building | |
| - Data flow: User query β RAG retrieval β LLM with context β streaming response | |
| - Shared state: rag_store (gr.State) holds uploaded document chunks + embeddings across tabs | |
| """ | |
import gradio as gr
import numpy as np
import os
from pathlib import Path

# ── Conditional imports with fallbacks ────────────────────────────────────────
try:
    import fitz  # PyMuPDF
    HAS_FITZ = True
except ImportError:
    HAS_FITZ = False
    print("Warning: PyMuPDF not available. PDF parsing disabled.")

try:
    from sentence_transformers import SentenceTransformer
    HAS_ST = True
except ImportError:
    HAS_ST = False
    print("Warning: sentence-transformers not available. Embedding search disabled.")

try:
    from huggingface_hub import InferenceClient
    HAS_HF = True
except ImportError:
    HAS_HF = False
    print("Warning: huggingface_hub not available. LLM service disabled.")

# ── Import knowledge base ─────────────────────────────────────────────────────
from knowledge_base import (
    DOMAIN_TAXONOMY, WORKFLOWS, GLOSSARY, COMMON_MISCONCEPTIONS,
    SYSTEM_PROMPTS, QUIZ_TEMPLATES, LESSON_TEMPLATE,
    TOPIC_CHOICES, DIFFICULTY_LEVELS, WORKFLOW_CHOICES
)

# ============================================================================
# CONFIGURATION
# ============================================================================
LLM_MODEL = os.environ.get("LLM_MODEL", "mistralai/Mistral-7B-Instruct-v0.3")
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
HF_TOKEN = os.environ.get("HF_TOKEN", None)
CHUNK_SIZE = 400
CHUNK_OVERLAP = 60
TOP_K_RETRIEVAL = 3
DEFAULT_SYSTEM_PROMPT = SYSTEM_PROMPTS["ask_tutor"]
DEFAULT_RAG_STATE = {"chunks": [], "embeddings": None}
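# rag_store schema shared across tabs (held in gr.State below):
#   {"chunks": list[str], "embeddings": np.ndarray | None}
# where embeddings row i is the normalized vector for chunks[i].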

# ============================================================================
# BACKEND SERVICES – Singleton Pattern
# ============================================================================
class LLMService:
    """Lazy-initialized LLM inference service."""
    _instance = None
    _initialized = False
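    # Classic singleton: __new__ always hands back the one shared instance,
    # and the class-level _initialized flag keeps __init__ from re-running
    # its setup when the constructor is called again.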
    def __new__(cls):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self):
        if LLMService._initialized:
            return
        LLMService._initialized = True
        self.client = None
        self._try_init()

    def _try_init(self):
        if not HAS_HF:
            print("LLMService: huggingface_hub not available")
            return
        if not HF_TOKEN:
            print("LLMService: HF_TOKEN not set in environment")
            return
        try:
            self.client = InferenceClient(
                model=LLM_MODEL,
                token=HF_TOKEN,
                timeout=120,
            )
            print("LLMService: Initialized successfully")
        except Exception as e:
            print(f"LLMService: Failed to initialize: {e}")
            self.client = None

    def is_available(self):
        return self.client is not None

    def stream_chat(self, messages, temperature=0.7, max_tokens=1024):
        """Stream chat completion. Yields partial response strings."""
        if not self.is_available():
            yield self._fallback_response(messages)
            return
        try:
            partial = ""
            for chunk in self.client.chat_completion(
                messages=messages,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=0.9,
                stream=True,
            ):
                token = ""
                if hasattr(chunk, 'choices') and chunk.choices:
                    choice = chunk.choices[0]
                    if hasattr(choice, 'delta') and hasattr(choice.delta, 'content'):
                        token = choice.delta.content or ""
                partial += token
                yield partial
        except Exception as e:
            print(f"LLM stream error: {e}")
            yield (
                f"⚠️ LLM API error: {str(e)}\n\n"
                f"Please check your HF_TOKEN in Space settings and ensure the model "
                f"'{LLM_MODEL}' is accessible.\n\n"
                "The tutor is still functional using its knowledge base for many "
                "questions – try asking about specific bioinformatics topics!"
            )

    def generate(self, messages, temperature=0.7, max_tokens=1024):
        """Non-streaming generation. Returns complete response."""
        if not self.is_available():
            return self._fallback_response(messages)
        try:
            response = self.client.chat_completion(
                messages=messages,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=0.9,
                stream=False,
            )
            return response.choices[0].message.content
        except Exception as e:
            print(f"LLM generate error: {e}")
            return (
                f"⚠️ LLM API error: {str(e)}\n\n"
                "The tutor can still answer from its knowledge base. "
                "Try asking about specific concepts!"
            )

    def _fallback_response(self, messages):
        """Knowledge-base fallback when LLM unavailable."""
        user_msg = ""
        for m in reversed(messages):
            if isinstance(m, dict) and m.get("role") == "user":
                user_msg = m.get("content", "").lower()
                break
        if not user_msg:
            return (
                "⚠️ **LLM not available.** Add HF_TOKEN in Space settings to enable AI responses.\n\n"
                "Meanwhile, the knowledge base covers: DESeq2, variant calling, microbiome "
                "diversity, scRNA-seq clustering, and more. Try asking a specific question!"
            )
        response_parts = []
        for term, definition in GLOSSARY.items():
            if term.lower() in user_msg:
                response_parts.append(f"**{term}**: {definition}")
                if len(response_parts) >= 3:
                    break
        for wf_key, wf in WORKFLOWS.items():
            if any(kw in user_msg for kw in wf["name"].lower().split()):
                response_parts.append(f"\n### {wf['name']}")
                for step in wf["steps"][:3]:
                    response_parts.append(f"**Step {step['step']}: {step['name']}**\n{step['description']}")
                break
        for misc in COMMON_MISCONCEPTIONS:
            if misc["domain"].replace("_", " ") in user_msg or any(w in user_msg for w in misc["misconception"].lower().split()[:5]):
                response_parts.append(
                    f"\n⚠️ **Common Misconception**: {misc['misconception']}\n\n"
                    f"✅ **Correction**: {misc['correction']}"
                )
                break
        if response_parts:
            return "📚 *Responding from knowledge base (LLM not configured):*\n\n" + "\n\n".join(response_parts)
        return (
            "⚠️ **AI responses require HF_TOKEN.**\n\n"
            "To enable full AI-powered responses:\n"
            "1. Go to your HuggingFace account → Settings → Access Tokens\n"
            "2. Create a token with 'inference-api' scope\n"
            "3. Add it as a Secret named `HF_TOKEN` in this Space's Settings\n\n"
            "The knowledge base can still answer many questions. Try asking about "
            "'RNA-seq workflow', 'variant calling', or 'microbiome diversity'!"
        )
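
# Usage sketch (illustrative; messages use the OpenAI-style chat format):
#   svc = LLMService()
#   for partial in svc.stream_chat([{"role": "user", "content": "What is FDR?"}]):
#       ...  # each yield is the accumulated response so far, not a single token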

class RAGService:
    """Document retrieval with lazy embedding model loading."""
    _instance = None
    _initialized = False

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self):
        if RAGService._initialized:
            return
        RAGService._initialized = True
        self.embedder = None
        self.kb_chunks = []
        self.kb_metadata = []
        self.kb_embeddings = None
        self._build_kb_index()

    def _ensure_embedder(self):
        if self.embedder is not None:
            return True
        if not HAS_ST:
            return False
        try:
            print("RAGService: Loading embedding model...")
            self.embedder = SentenceTransformer(EMBED_MODEL)
            print("RAGService: Embedding model loaded")
            if self.kb_chunks:
                self.kb_embeddings = self.embedder.encode(
                    self.kb_chunks,
                    convert_to_numpy=True,
                    normalize_embeddings=True,
                    show_progress_bar=False,
                    batch_size=32,
                )
                print(f"RAGService: KB embedded ({len(self.kb_chunks)} chunks)")
            return True
        except Exception as e:
            print(f"RAGService: Failed to load embedder: {e}")
            return False

    def _build_kb_index(self):
        chunks = []
        metadata = []
        for term, definition in GLOSSARY.items():
            chunks.append(f"{term}: {definition}")
            metadata.append({"source": "glossary", "topic": term, "type": "definition"})
        for wf_key, wf in WORKFLOWS.items():
            for step in wf["steps"]:
                step_text = f"{wf['name']} - Step {step['step']}: {step['name']}. {step['description']}"
                if step.get("tools"):
                    step_text += f" Tools: {', '.join(step['tools'])}."
                if step.get("common_mistakes"):
                    step_text += " Common mistakes: " + "; ".join(step["common_mistakes"])
                chunks.append(step_text)
                metadata.append({
                    "source": "workflow",
                    "topic": wf["domain"],
                    "type": "workflow_step",
                    "step": step["step"],
                    "workflow": wf_key
                })
        for misc in COMMON_MISCONCEPTIONS:
            text = f"Misconception: {misc['misconception']} Correction: {misc['correction']}"
            chunks.append(text)
            metadata.append({
                "source": "misconception",
                "topic": misc["domain"],
                "type": "misconception",
                "severity": misc["severity"]
            })
        for key, domain in DOMAIN_TAXONOMY.items():
            text = f"{domain['name']} covers: {', '.join(domain['subtopics'][:10])}"
            chunks.append(text)
            metadata.append({"source": "taxonomy", "topic": key, "type": "domain_overview"})
        self.kb_chunks = chunks
        self.kb_metadata = metadata
        print(f"RAGService: Built KB with {len(chunks)} chunks")

    def search(self, query, top_k=TOP_K_RETRIEVAL, user_chunks=None, user_embeddings=None):
        if not self._ensure_embedder():
            return self._keyword_search(query, top_k)
        try:
            query_embedding = self.embedder.encode(
                [query],
                convert_to_numpy=True,
                normalize_embeddings=True,
            )
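            # All vectors are L2-normalized at encode time, so the dot products
            # below are exactly cosine similarities; 0.15 acts as a loose
            # relevance floor that drops weak matches.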
            results = []
            if self.kb_embeddings is not None and len(self.kb_embeddings) > 0:
                kb_scores = np.dot(query_embedding, self.kb_embeddings.T)[0]
                top_indices = np.argsort(kb_scores)[::-1][:top_k]
                for idx in top_indices:
                    if kb_scores[idx] > 0.15:
                        results.append({
                            "text": self.kb_chunks[idx],
                            "score": float(kb_scores[idx]),
                            "metadata": self.kb_metadata[idx]
                        })
            if user_chunks and user_embeddings is not None and len(user_embeddings) > 0:
                user_scores = np.dot(query_embedding, user_embeddings.T)[0]
                top_user = np.argsort(user_scores)[::-1][:top_k]
                for idx in top_user:
                    if user_scores[idx] > 0.15:
                        results.append({
                            "text": user_chunks[idx],
                            "score": float(user_scores[idx]),
                            "metadata": {"source": "uploaded", "type": "user_content"}
                        })
            results.sort(key=lambda x: x["score"], reverse=True)
            return results[:top_k]
        except Exception as e:
            print(f"RAG search error: {e}")
            return self._keyword_search(query, top_k)

    def _keyword_search(self, query, top_k=3):
        query_words = set(query.lower().split())
        scored = []
        for i, chunk in enumerate(self.kb_chunks):
            chunk_words = set(chunk.lower().split())
            overlap = len(query_words & chunk_words)
            if overlap > 0:
                scored.append({
                    "text": chunk,
                    "score": overlap / max(len(query_words), 1),
                    "metadata": self.kb_metadata[i]
                })
        scored.sort(key=lambda x: x["score"], reverse=True)
        return scored[:top_k]

    def embed_chunks(self, chunks):
        if not self._ensure_embedder() or not chunks:
            return None
        try:
            return self.embedder.encode(
                chunks,
                convert_to_numpy=True,
                normalize_embeddings=True,
                show_progress_bar=False,
                batch_size=16,
            )
        except Exception as e:
            print(f"Embed chunks error: {e}")
            return None

class DocumentParser:

    @staticmethod
    def parse_file(filepath):
        if filepath is None:
            return "", []
        filepath = str(filepath)
        ext = Path(filepath).suffix.lower()
        try:
            if ext == ".pdf" and HAS_FITZ:
                return DocumentParser._parse_pdf(filepath)
            elif ext in (".txt", ".md", ".csv", ".tsv", ".fasta", ".fa", ".fastq",
                         ".fq", ".vcf", ".bed", ".gff", ".gtf", ".sam", ".bam"):
                return DocumentParser._parse_text(filepath)
            else:
                return f"Unsupported file type: {ext}", []
        except Exception as e:
            return f"Error parsing file: {str(e)}", []

    @staticmethod
    def _parse_pdf(filepath):
        doc = fitz.open(filepath)
        pages = []
        for page_num in range(len(doc)):
            text = doc[page_num].get_text()
            if text.strip():
                pages.append(text)
        doc.close()
        full_text = "\n\n".join(pages)
        chunks = DocumentParser._chunk_text(full_text)
        return full_text, chunks

    @staticmethod
    def _parse_text(filepath):
        with open(filepath, "r", encoding="utf-8", errors="replace") as f:
            text = f.read()
        chunks = DocumentParser._chunk_text(text)
        return text, chunks

    @staticmethod
    def _chunk_text(text, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
        words = text.split()
        if len(words) <= chunk_size:
            return [text] if text.strip() else []
        chunks = []
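        # Sliding window: advance by (chunk_size - overlap) words per step so
        # consecutive chunks share `overlap` words of context; with the module
        # defaults (400/60) the stride is 340 words.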
        for i in range(0, len(words), chunk_size - overlap):
            chunk = " ".join(words[i:i + chunk_size])
            if chunk.strip():
                chunks.append(chunk)
        return chunks

llm_service = LLMService()
rag_service = RAGService()
doc_parser = DocumentParser()
print(f"🧬 BB Tutor initialized. LLM: {llm_service.is_available()}, Embeddings: {rag_service.embedder is not None}")

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================
def _rag_context(query, user_chunks=None, user_embeddings=None):
    results = rag_service.search(query, top_k=TOP_K_RETRIEVAL,
                                 user_chunks=user_chunks, user_embeddings=user_embeddings)
    if not results:
        return ""
    parts = ["RELEVANT KNOWLEDGE BASE CONTEXT:"]
    for r in results:
        source = r["metadata"].get("source", "kb")
        parts.append(f"[{source}] {r['text'][:800]}")
    return "\n".join(parts)

def _format_history(history):
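    # Gradio can deliver chat history either as OpenAI-style dicts
    # ({"role": ..., "content": ...}) or as legacy (user, assistant) pairs;
    # normalize both into the dict message format the LLM client expects.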
    messages = []
    for h in history:
        if isinstance(h, dict):
            messages.append(h)
        elif isinstance(h, (list, tuple)):
            if len(h) >= 1 and h[0]:
                messages.append({"role": "user", "content": str(h[0])})
            if len(h) >= 2 and h[1]:
                messages.append({"role": "assistant", "content": str(h[1])})
    return messages

# ============================================================================
# MODULE HANDLERS
# ============================================================================
def tutor_respond(message, history, system_prompt, temperature, max_tokens, rag_state):
    if not message or not message.strip():
        yield ""
        return
    rag_state = rag_state or DEFAULT_RAG_STATE
    user_chunks = rag_state.get("chunks", [])
    user_embeddings = rag_state.get("embeddings")
    rag_ctx = _rag_context(message, user_chunks, user_embeddings)
    messages = [{"role": "system", "content": system_prompt}]
    if rag_ctx:
        messages.append({"role": "system", "content": rag_ctx})
    messages.extend(_format_history(history))
    messages.append({"role": "user", "content": message})
    for partial in llm_service.stream_chat(messages, temperature, max_tokens):
        yield partial

def process_upload(file, rag_state):
    rag_state = rag_state or DEFAULT_RAG_STATE
    if file is None:
        return "📄 Please upload a file first.", "", rag_state
    full_text, chunks = doc_parser.parse_file(file)
    if not chunks:
        return "⚠️ Could not extract text from the uploaded file.", full_text[:2000] if full_text else "", rag_state
    embeddings = rag_service.embed_chunks(chunks)
    new_state = {"chunks": chunks, "embeddings": embeddings}
    preview = full_text[:2500]
    msgs = [
        {"role": "system", "content": SYSTEM_PROMPTS["upload_explain"]},
        {"role": "user", "content": f"Analyze and explain this bioinformatics document:\n\n{preview}"}
    ]
    explanation = llm_service.generate(msgs, temperature=0.5, max_tokens=1500)
    stats = f"📊 **Document Stats:** {len(chunks)} chunks, ~{len(full_text.split())} words | "
    stats += f"File type: {Path(str(file)).suffix} | "
    stats += "🤖 AI-powered" if llm_service.is_available() else "📚 Knowledge-base mode"
    stats += "\n\n---\n\n"
    return stats + explanation, full_text[:5000], new_state

def upload_chat_respond(message, history, rag_state):
    if not message or not message.strip():
        yield ""
        return
    rag_state = rag_state or DEFAULT_RAG_STATE
    user_chunks = rag_state.get("chunks", [])
    user_embeddings = rag_state.get("embeddings")
    if not user_chunks:
        yield ("📄 Please upload a document in the panel above, then ask questions about it.\n\n"
               "Your uploaded document will be indexed and searchable across all modules!")
        return
    rag_results = rag_service.search(message, top_k=4, user_chunks=user_chunks, user_embeddings=user_embeddings)
    ctx = "CONTEXT FROM UPLOADED DOCUMENT:\n"
    if rag_results:
        for r in rag_results:
            ctx += f"\n{r['text'][:600]}\n"
    else:
        ctx += "(No highly relevant passages found – answering from general knowledge)\n"
    messages = [
        {"role": "system", "content": SYSTEM_PROMPTS["upload_explain"]},
        {"role": "system", "content": ctx},
    ]
    messages.extend(_format_history(history))
    messages.append({"role": "user", "content": message})
    for partial in llm_service.stream_chat(messages, temperature=0.5, max_tokens=1024):
        yield partial

def generate_quiz(topic, quiz_type, num_questions, difficulty, rag_state):
    if not topic:
        return "❌ Please select or enter a topic first.", ""
    rag_results = rag_service.search(topic, top_k=3)
    context = ""
    if rag_results:
        context = "Reference material:\n" + "\n".join(r["text"][:500] for r in rag_results)
    template_key = {
        "Multiple Choice (MCQ)": "mcq",
        "True/False": "true_false",
        "Short Answer": "short_answer"
    }.get(quiz_type, "mcq")
    quiz_prompt = QUIZ_TEMPLATES[template_key].format(
        n=int(num_questions), topic=topic, difficulty=difficulty
    )
    messages = [{"role": "system", "content": SYSTEM_PROMPTS["quiz_me"]}]
    if context:
        messages.append({"role": "system", "content": context})
    messages.append({"role": "user", "content": quiz_prompt})
    response = llm_service.generate(messages, temperature=0.8, max_tokens=2000)
    formatted = f"## 🧠 {topic} Quiz – {difficulty}\n\n"
    formatted += f"*Format: {quiz_type} | Questions: {int(num_questions)}*\n\n---\n\n"
    formatted += response
    return formatted, response

def check_quiz_answers(user_answers, answer_key):
    if not user_answers or not user_answers.strip():
        return "✏️ Please enter your answers above before checking."
    if not answer_key:
        return "⚠️ Please generate a quiz first (use the panel above)."
    messages = [
        {"role": "system", "content": (
            "You are a bioinformatics tutor grading a quiz. Compare student answers "
            "to the correct answers. For each: mark ✓ or ✗, explain briefly, and give "
            "the correct answer if wrong. Be encouraging. Give a final score."
        )},
        {"role": "user", "content": f"QUIZ AND ANSWERS:\n{answer_key}\n\nSTUDENT ANSWERS:\n{user_answers}\n\nGrade each:"}
    ]
    return llm_service.generate(messages, temperature=0.3, max_tokens=1500)

def generate_lesson(topic, level, include_exercises, include_quiz):
    if not topic:
        return "📚 Please select or enter a topic."
    rag_results = rag_service.search(topic, top_k=4)
    context = ""
    if rag_results:
        context = "Reference:\n" + "\n".join(r["text"][:500] for r in rag_results)
    prompt = LESSON_TEMPLATE.format(topic=topic, level=level)
    if include_exercises:
        prompt += "\n\nInclude 2-3 practical exercises with clear instructions."
    if include_quiz:
        prompt += "\n\nInclude a 5-question self-assessment quiz (with answers)."
    messages = [{"role": "system", "content": SYSTEM_PROMPTS["build_lesson"]}]
    if context:
        messages.append({"role": "system", "content": context})
    messages.append({"role": "user", "content": prompt})
    return llm_service.generate(messages, temperature=0.7, max_tokens=3000)

def workflow_respond(message, history, selected_workflow, temperature):
    if not message or not message.strip():
        yield ""
        return
    workflow_context = ""
    for wf_key, wf in WORKFLOWS.items():
        if wf["name"] in selected_workflow or selected_workflow.lower() in wf["name"].lower():
            workflow_context = f"WORKFLOW REFERENCE: {wf['name']}\n\n"
            for step in wf["steps"]:
                workflow_context += f"Step {step['step']}: {step['name']}\n"
                workflow_context += f"  {step['description']}\n"
                if step.get("tools"):
                    workflow_context += f"  Tools: {', '.join(step['tools'])}\n"
                if step.get("common_mistakes"):
                    workflow_context += f"  ⚠️ Common mistakes: {'; '.join(step['common_mistakes'])}\n"
                workflow_context += "\n"
            break
    rag_results = rag_service.search(message, top_k=2)
    if rag_results:
        workflow_context += "\nADDITIONAL CONTEXT:\n" + "\n".join(r["text"][:500] for r in rag_results)
    messages = [{"role": "system", "content": SYSTEM_PROMPTS["workflow_coach"]}]
    if workflow_context:
        messages.append({"role": "system", "content": workflow_context})
    messages.extend(_format_history(history))
    messages.append({"role": "user", "content": message})
    for partial in llm_service.stream_chat(messages, temperature, 1500):
        yield partial

def paper_to_lesson_respond(message, history, output_format, rag_state):
    if not message or not message.strip():
        yield ""
        return
    rag_state = rag_state or DEFAULT_RAG_STATE
    user_chunks = rag_state.get("chunks", [])
    user_embeddings = rag_state.get("embeddings")
    context = ""
    if user_chunks:
        rag_results = rag_service.search(message, top_k=4, user_chunks=user_chunks, user_embeddings=user_embeddings)
        if rag_results:
            context = "PAPER CONTENT:\n" + "\n".join(r["text"][:600] for r in rag_results)
    format_instruction = {
        "Lesson Plan": "Create a structured lesson plan with learning objectives, sections, and exercises.",
        "Slide Outline": "Create a slide-by-slide outline with key points for each slide.",
        "Study Notes": "Create concise study notes highlighting key methods, tools, and findings.",
        "Quiz Questions": "Generate 5-10 quiz questions based on the paper's methods and findings.",
    }.get(output_format, "Create a structured lesson plan.")
    messages = [{"role": "system", "content": SYSTEM_PROMPTS["paper_to_lesson"]}]
    if context:
        messages.append({"role": "system", "content": context})
    messages.extend(_format_history(history))
    full_msg = f"{message}\n\nOUTPUT FORMAT: {format_instruction}"
    messages.append({"role": "user", "content": full_msg})
    for partial in llm_service.stream_chat(messages, temperature=0.7, max_tokens=2500):
        yield partial

def viva_respond(message, history, topic, difficulty):
    if not message or not message.strip():
        yield ""
        return
    rag_results = rag_service.search(f"{topic} {message}", top_k=3)
    context = ""
    if rag_results:
        context = "Reference:\n" + "\n".join(r["text"][:500] for r in rag_results)
    messages = [
        {"role": "system", "content": SYSTEM_PROMPTS["viva_practice"]},
        {"role": "system", "content": f"VIVA TOPIC: {topic}\nDIFFICULTY: {difficulty}\n\n{context}"},
    ]
    messages.extend(_format_history(history))
    messages.append({"role": "user", "content": message})
    for partial in llm_service.stream_chat(messages, temperature=0.7, max_tokens=1000):
        yield partial

# ============================================================================
# GRADIO APP ASSEMBLY
# ============================================================================
CUSTOM_CSS = """
.main-header {
    text-align: center; padding: 20px;
    background: linear-gradient(135deg, #1a5276 0%, #2e86c1 50%, #48c9b0 100%);
    border-radius: 12px; margin-bottom: 20px; color: white;
}
.main-header h1 { color: white; font-size: 2em; margin: 0; }
.main-header p { color: #ecf0f1; margin: 5px 0; }
.module-info {
    background: #f0f9ff; border-left: 4px solid #2e86c1;
    padding: 12px 16px; margin-bottom: 16px; border-radius: 0 8px 8px 0;
}
.safety-notice {
    background: #fff3e0; border-left: 4px solid #f39c12;
    padding: 10px 14px; margin-top: 10px; border-radius: 0 8px 8px 0; font-size: 0.9em;
}
.status-badge {
    display: inline-block; padding: 4px 12px; border-radius: 12px;
    font-size: 0.85em; font-weight: bold;
}
.status-on { background: #d4edda; color: #155724; }
.status-off { background: #f8d7da; color: #721c24; }
"""

def build_app():
    with gr.Blocks(title="Bioinformatics with BB Tutor", css=CUSTOM_CSS) as demo:
        # ── Global shared state ─────────────────────────────────────────
        rag_store = gr.State(DEFAULT_RAG_STATE)
        # ── Status indicator ────────────────────────────────────────────
        llm_status = "🟢 AI Enabled" if llm_service.is_available() else "🔴 AI Offline (Knowledge Base Active)"
        # ── Header ──────────────────────────────────────────────────────
        gr.HTML(f"""
        <div class="main-header">
            <h1>🧬 Bioinformatics with BB Tutor</h1>
            <p>AI-powered bioinformatics teaching assistant</p>
            <p style="font-size: 0.85em; opacity: 0.9;">
                RNA-seq · Exome · Genome · Microbiome · Variants · Molecular Genetics · scRNA-seq · ATAC-seq · ChIP-seq · and more
            </p>
            <p style="font-size: 0.8em; margin-top: 8px;">
                <span class="status-badge {'status-on' if llm_service.is_available() else 'status-off'}">{llm_status}</span>
            </p>
        </div>
        """)
        with gr.Tabs():
            # ──────────────────────────────────────────────────────────────
            # TAB 1: ASK THE TUTOR
            # ──────────────────────────────────────────────────────────────
            with gr.Tab("🧬 Ask the Tutor", id="ask"):
                gr.HTML('<div class="module-info">💡 Ask any bioinformatics question. RAG-augmented responses from a curated knowledge base covering 15+ domains.</div>')
                # Examples must be list-of-lists matching the fn signature:
                # (message, history, system_prompt, temperature, max_tokens, rag_state).
                # Additional inputs are [system_prompt, temperature, max_tokens, rag_store]
                # → 4 values per example after the message.
                ask_examples = [
                    ["What is the difference between DESeq2 and edgeR?", DEFAULT_SYSTEM_PROMPT, 0.7, 1024, DEFAULT_RAG_STATE],
                    ["Explain the GATK variant calling pipeline step by step.", DEFAULT_SYSTEM_PROMPT, 0.7, 1024, DEFAULT_RAG_STATE],
                    ["What is the difference between alpha and beta diversity?", DEFAULT_SYSTEM_PROMPT, 0.7, 1024, DEFAULT_RAG_STATE],
                    ["Why should I use adjusted p-values instead of raw p-values?", DEFAULT_SYSTEM_PROMPT, 0.7, 1024, DEFAULT_RAG_STATE],
                    ["Explain the single-cell RNA-seq analysis workflow.", DEFAULT_SYSTEM_PROMPT, 0.7, 1024, DEFAULT_RAG_STATE],
                    ["What is BQSR and why is it important?", DEFAULT_SYSTEM_PROMPT, 0.7, 1024, DEFAULT_RAG_STATE],
                    ["How do I choose between STAR and HISAT2 for alignment?", DEFAULT_SYSTEM_PROMPT, 0.7, 1024, DEFAULT_RAG_STATE],
                    ["What common mistakes do students make with DESeq2?", DEFAULT_SYSTEM_PROMPT, 0.7, 1024, DEFAULT_RAG_STATE],
                ]
                gr.ChatInterface(
                    fn=tutor_respond,
                    type="messages",
                    additional_inputs=[
                        gr.Textbox(value=DEFAULT_SYSTEM_PROMPT, label="System Prompt", lines=2, visible=False),
                        gr.Slider(0.1, 1.5, 0.7, step=0.1, label="Temperature", visible=False),
                        gr.Slider(256, 4096, 1024, step=256, label="Max Tokens", visible=False),
                        rag_store,
                    ],
                    additional_inputs_accordion=gr.Accordion("⚙️ Advanced", open=False, visible=False),
                    examples=ask_examples,
                )
                gr.HTML('<div class="safety-notice">⚠️ <strong>Educational use only.</strong> Not for clinical interpretation. Always consult qualified professionals for clinical genomics.</div>')
            # ──────────────────────────────────────────────────────────────
            # TAB 2: UPLOAD & EXPLAIN
            # ──────────────────────────────────────────────────────────────
            with gr.Tab("📄 Upload & Explain", id="upload"):
                gr.HTML('<div class="module-info">📄 Upload bioinformatics documents (PDF, TXT, FASTA, VCF, etc.) and get AI-powered analysis. Content is indexed and searchable across all modules.</div>')
                with gr.Row():
                    with gr.Column(scale=1):
                        file_input = gr.File(
                            label="Upload Document",
                            file_types=[".pdf", ".txt", ".md", ".csv", ".tsv",
                                        ".fasta", ".fa", ".fastq", ".vcf", ".bed",
                                        ".gff", ".gtf", ".sam", ".bam"],
                            file_count="single", type="filepath",
                        )
                        process_btn = gr.Button("🔍 Analyze Document", variant="primary")
                        gr.Markdown("**Supported:** PDF, text, FASTA/FASTQ, VCF, BED, GFF/GTF, SAM/BAM, CSV/TSV")
                    with gr.Column(scale=2):
                        explanation_output = gr.Markdown(label="Analysis & Explanation")
                        with gr.Accordion("📄 Raw Extracted Text", open=False):
                            raw_text_output = gr.Textbox(label="Extracted Text", lines=10, show_copy_button=True)
                process_btn.click(
                    fn=process_upload,
                    inputs=[file_input, rag_store],
                    outputs=[explanation_output, raw_text_output, rag_store],
                )
                gr.Markdown("### 💬 Chat About Your Document")
                # fn signature: (message, history, rag_state) → 1 additional input
                upload_chat_examples = [
                    ["Summarize the key methods in this paper.", DEFAULT_RAG_STATE],
                    ["What bioinformatics tools are mentioned?", DEFAULT_RAG_STATE],
                    ["Explain the main findings in simple terms.", DEFAULT_RAG_STATE],
                    ["What are the limitations of this analysis?", DEFAULT_RAG_STATE],
                ]
                gr.ChatInterface(
                    fn=upload_chat_respond,
                    type="messages",
                    additional_inputs=[rag_store],
                    examples=upload_chat_examples,
                )
            # ──────────────────────────────────────────────────────────────
            # TAB 3: QUIZ ME
            # ──────────────────────────────────────────────────────────────
            with gr.Tab("❓ Quiz Me", id="quiz"):
                gr.HTML('<div class="module-info">🧠 Test your knowledge with AI-generated quizzes across all bioinformatics domains.</div>')
                with gr.Row():
                    quiz_topic = gr.Dropdown(
                        choices=TOPIC_CHOICES, label="Select Topic",
                        allow_custom_value=True, value="RNA-seq: Differential Expression (DESeq2)"
                    )
                    quiz_type = gr.Radio(
                        choices=["Multiple Choice (MCQ)", "True/False", "Short Answer"],
                        value="Multiple Choice (MCQ)", label="Format"
                    )
                with gr.Row():
                    quiz_difficulty = gr.Radio(
                        choices=DIFFICULTY_LEVELS, value="Intermediate", label="Difficulty"
                    )
                    num_questions = gr.Slider(1, 10, 5, step=1, label="# Questions")
                generate_quiz_btn = gr.Button("🎲 Generate Quiz", variant="primary")
                quiz_output = gr.Markdown(label="Generated Quiz")
                answer_key_state = gr.State("")
                generate_quiz_btn.click(
                    fn=generate_quiz,
                    inputs=[quiz_topic, quiz_type, num_questions, quiz_difficulty, rag_store],
                    outputs=[quiz_output, answer_key_state],
                )
                gr.Markdown("---")
                gr.Markdown("### ✏️ Submit Your Answers")
                with gr.Row():
                    user_answers = gr.Textbox(
                        label="Your Answers (e.g., '1: A, 2: B')",
                        lines=5, placeholder="Type your answers here...", scale=3
                    )
                    check_btn = gr.Button("✅ Check", variant="primary", scale=1)
                feedback_output = gr.Markdown(label="Feedback")
                check_btn.click(
                    fn=check_quiz_answers,
                    inputs=[user_answers, answer_key_state],
                    outputs=[feedback_output],
                )
            # ──────────────────────────────────────────────────────────────
            # TAB 4: BUILD A LESSON
            # ──────────────────────────────────────────────────────────────
            with gr.Tab("📚 Build a Lesson", id="lesson"):
                gr.HTML('<div class="module-info">📚 Generate structured lessons with learning objectives, explanations, exercises, and self-assessment quizzes.</div>')
                with gr.Row():
                    lesson_topic = gr.Dropdown(
                        choices=TOPIC_CHOICES, label="Lesson Topic",
                        allow_custom_value=True, value="RNA-seq: Differential Expression (DESeq2)"
                    )
                    lesson_level = gr.Radio(
                        choices=DIFFICULTY_LEVELS, value="Intermediate", label="Level"
                    )
                with gr.Row():
                    include_exercises = gr.Checkbox(label="Include Exercises", value=True)
                    include_quiz = gr.Checkbox(label="Include Quiz", value=True)
                generate_lesson_btn = gr.Button("📚 Generate Lesson", variant="primary")
                lesson_output = gr.Markdown(label="Generated Lesson")
                generate_lesson_btn.click(
                    fn=generate_lesson,
                    inputs=[lesson_topic, lesson_level, include_exercises, include_quiz],
                    outputs=[lesson_output],
                )
            # ──────────────────────────────────────────────────────────────
            # TAB 5: WORKFLOW COACH
            # ──────────────────────────────────────────────────────────────
            with gr.Tab("🔬 Workflow Coach", id="workflow"):
                gr.HTML('<div class="module-info">🔬 Step-by-step guidance through bioinformatics analysis pipelines. Select a workflow and ask about any step.</div>')
                workflow_selector = gr.Dropdown(
                    choices=WORKFLOW_CHOICES, label="Select Workflow",
                    value="Bulk RNA-seq: Full DE Analysis Pipeline", allow_custom_value=True,
                )
                # fn signature: (message, history, selected_workflow, temperature) → 2 additional inputs
                workflow_examples = [
                    ["Walk me through the complete pipeline from raw FASTQ to DE results.", "Bulk RNA-seq: Full DE Analysis Pipeline", 0.7],
                    ["I'm at alignment. What should I check before counting?", "Bulk RNA-seq: Full DE Analysis Pipeline", 0.7],
                    ["My mapping rate is only 45%. What could be wrong?", "Bulk RNA-seq: Full DE Analysis Pipeline", 0.7],
                    ["How do I choose between STAR and HISAT2?", "Bulk RNA-seq: Full DE Analysis Pipeline", 0.7],
                    ["What parameters for GATK HaplotypeCaller on exome data?", "Exome Sequencing: Variant Calling Pipeline", 0.7],
                    ["How do I set DADA2 truncation parameters?", "Microbiome: 16S Amplicon Analysis (QIIME2)", 0.7],
                ]
                gr.ChatInterface(
                    fn=workflow_respond,
                    type="messages",
                    additional_inputs=[
                        workflow_selector,
                        gr.Slider(0.1, 1.5, 0.7, step=0.1, label="Temperature", visible=False),
                    ],
                    additional_inputs_accordion=gr.Accordion("⚙️", open=False, visible=False),
                    examples=workflow_examples,
                )
            # ──────────────────────────────────────────────────────────────
            # TAB 6: PAPER TO LESSON
            # ──────────────────────────────────────────────────────────────
            with gr.Tab("📰 Paper to Lesson", id="paper"):
                gr.HTML('<div class="module-info">📰 Convert research papers into teaching material. Upload a paper in the Upload tab first, then generate lessons, slides, or quizzes from it.</div>')
                output_format = gr.Radio(
                    choices=["Lesson Plan", "Slide Outline", "Study Notes", "Quiz Questions"],
                    value="Lesson Plan", label="Output Format"
                )
                # fn signature: (message, history, output_format, rag_state) → 2 additional inputs
                paper_examples = [
                    ["Convert this paper into a 45-minute lecture plan.", "Lesson Plan", DEFAULT_RAG_STATE],
                    ["Create a slide outline covering the key methods.", "Slide Outline", DEFAULT_RAG_STATE],
                    ["Generate study notes on the bioinformatics methods.", "Study Notes", DEFAULT_RAG_STATE],
                    ["Create quiz questions on this paper's methodology.", "Quiz Questions", DEFAULT_RAG_STATE],
                ]
                gr.ChatInterface(
                    fn=paper_to_lesson_respond,
                    type="messages",
                    additional_inputs=[output_format, rag_store],
                    examples=paper_examples,
                )
            # ──────────────────────────────────────────────────────────────
            # TAB 7: VIVA PRACTICE
            # ──────────────────────────────────────────────────────────────
            with gr.Tab("🎓 Viva Practice", id="viva"):
                gr.HTML('<div class="module-info">🎓 Practice oral examinations. The AI examiner asks probing questions, evaluates your answers, and pushes you toward deeper understanding.</div>')
                with gr.Row():
                    viva_topic = gr.Dropdown(
                        choices=TOPIC_CHOICES, label="Viva Topic",
                        allow_custom_value=True, value="RNA-seq: Differential Expression (DESeq2)"
                    )
                    viva_difficulty = gr.Radio(
                        choices=DIFFICULTY_LEVELS, value="Intermediate", label="Difficulty"
                    )
                # fn signature: (message, history, topic, difficulty) → 2 additional inputs
                viva_examples = [
                    ["I'm ready for my viva. Start with your first question.", "RNA-seq: Differential Expression (DESeq2)", "Intermediate"],
                    ["Focus on the statistical aspects of RNA-seq.", "RNA-seq: Differential Expression (DESeq2)", "Intermediate"],
                    ["Ask me about variant calling and interpretation.", "Variant Interpretation: ACMG Guidelines", "Intermediate"],
                    ["Test my understanding of microbiome diversity.", "Microbiome: Alpha & Beta Diversity", "Intermediate"],
                ]
                gr.ChatInterface(
                    fn=viva_respond,
                    type="messages",
                    additional_inputs=[viva_topic, viva_difficulty],
                    examples=viva_examples,
                )
        # ── Footer ──────────────────────────────────────────────────────
        gr.HTML("""
        <div style="text-align: center; padding: 20px; margin-top: 20px; border-top: 1px solid #e0e0e0; color: #666; font-size: 0.85em;">
            <p><strong>Bioinformatics with BB Tutor</strong> – Educational AI Assistant</p>
            <p>⚠️ For educational purposes only. Not for clinical use.</p>
            <p>RNA-seq · Exome · Genome · Microbiome · Variants · Molecular Genetics · scRNA-seq · ATAC-seq · ChIP-seq · Methylation · Small RNA · Targeted Panels · Long-read · Spatial · Multi-omics</p>
        </div>
        """)

    return demo

if __name__ == "__main__":
    demo = build_app()
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)