"""
Bioinformatics with BB Tutor β€” Complete Application
A production-ready bioinformatics teaching assistant with 7 modules.
Architecture:
- Backend: LLMService (HuggingFace InferenceClient), RAGService (sentence-transformers),
DocumentParser (PyMuPDF + text), knowledge_base (domain content)
- Frontend: 7 Gradio tabs with ChatInterface, file upload, quiz generation, lesson building
- Data flow: User query β†’ RAG retrieval β†’ LLM with context β†’ streaming response
- Shared state: rag_store (gr.State) holds uploaded document chunks + embeddings across tabs
"""
import gradio as gr
import numpy as np
import os
from pathlib import Path
# ── Conditional imports with fallbacks ────────────────────────────────────────
try:
import fitz # PyMuPDF
HAS_FITZ = True
except ImportError:
HAS_FITZ = False
print("Warning: PyMuPDF not available. PDF parsing disabled.")
try:
from sentence_transformers import SentenceTransformer
HAS_ST = True
except ImportError:
HAS_ST = False
print("Warning: sentence-transformers not available. Embedding search disabled.")
try:
from huggingface_hub import InferenceClient
HAS_HF = True
except ImportError:
HAS_HF = False
print("Warning: huggingface_hub not available. LLM service disabled.")
# ── Import knowledge base ────────────────────────────────────────────────────
from knowledge_base import (
DOMAIN_TAXONOMY, WORKFLOWS, GLOSSARY, COMMON_MISCONCEPTIONS,
SYSTEM_PROMPTS, QUIZ_TEMPLATES, LESSON_TEMPLATE,
TOPIC_CHOICES, DIFFICULTY_LEVELS, WORKFLOW_CHOICES
)
# ============================================================================
# CONFIGURATION
# ============================================================================
LLM_MODEL = os.environ.get("LLM_MODEL", "mistralai/Mistral-7B-Instruct-v0.3")
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
HF_TOKEN = os.environ.get("HF_TOKEN", None)
CHUNK_SIZE = 400
CHUNK_OVERLAP = 60
TOP_K_RETRIEVAL = 3
DEFAULT_SYSTEM_PROMPT = SYSTEM_PROMPTS["ask_tutor"]
DEFAULT_RAG_STATE = {"chunks": [], "embeddings": None}
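# Gradio deep-copies a State's default value for each user session, so sharing
# this dict as the default is safe; handlers below build fresh state dicts
# rather than mutating this one in place.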
# ============================================================================
# BACKEND SERVICES β€” Singleton Pattern
# ============================================================================
class LLMService:
"""Lazy-initialized LLM inference service."""
_instance = None
_initialized = False
def __new__(cls):
if cls._instance is None:
cls._instance = super().__new__(cls)
return cls._instance
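    # Python calls __init__ on every LLMService() invocation even though
    # __new__ returns the cached instance, so the class-level _initialized
    # flag below makes repeat construction a no-op.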
def __init__(self):
if LLMService._initialized:
return
LLMService._initialized = True
self.client = None
self._try_init()
def _try_init(self):
if not HAS_HF:
print("LLMService: huggingface_hub not available")
return
if not HF_TOKEN:
print("LLMService: HF_TOKEN not set in environment")
return
try:
self.client = InferenceClient(
model=LLM_MODEL,
token=HF_TOKEN,
timeout=120,
)
print("LLMService: Initialized successfully")
except Exception as e:
print(f"LLMService: Failed to initialize: {e}")
self.client = None
def is_available(self):
return self.client is not None
def stream_chat(self, messages, temperature=0.7, max_tokens=1024):
"""Stream chat completion. Yields partial response strings."""
if not self.is_available():
yield self._fallback_response(messages)
return
try:
partial = ""
for chunk in self.client.chat_completion(
messages=messages,
max_tokens=max_tokens,
temperature=temperature,
top_p=0.9,
stream=True,
):
token = ""
if hasattr(chunk, 'choices') and chunk.choices:
choice = chunk.choices[0]
if hasattr(choice, 'delta') and hasattr(choice.delta, 'content'):
token = choice.delta.content or ""
partial += token
yield partial
except Exception as e:
print(f"LLM stream error: {e}")
yield f"⚠️ LLM API error: {str(e)}\n\nPlease check your HF_TOKEN in Space settings and ensure the model '{LLM_MODEL}' is accessible.\n\nThe tutor is still functional using its knowledge base for many questions β€” try asking about specific bioinformatics topics!"
def generate(self, messages, temperature=0.7, max_tokens=1024):
"""Non-streaming generation. Returns complete response."""
if not self.is_available():
return self._fallback_response(messages)
try:
response = self.client.chat_completion(
messages=messages,
max_tokens=max_tokens,
temperature=temperature,
top_p=0.9,
stream=False,
)
return response.choices[0].message.content
except Exception as e:
print(f"LLM generate error: {e}")
return f"⚠️ LLM API error: {str(e)}\n\nThe tutor can still answer from its knowledge base. Try asking about specific concepts!"
def _fallback_response(self, messages):
"""Knowledge-base fallback when LLM unavailable."""
user_msg = ""
for m in reversed(messages):
if isinstance(m, dict) and m.get("role") == "user":
user_msg = m.get("content", "").lower()
break
if not user_msg:
return "⚠️ **LLM not available.** Add HF_TOKEN in Space settings to enable AI responses.\n\nMeanwhile, the knowledge base covers: DESeq2, variant calling, microbiome diversity, scRNA-seq clustering, and more. Try asking a specific question!"
response_parts = []
for term, definition in GLOSSARY.items():
if term.lower() in user_msg:
response_parts.append(f"**{term}**: {definition}")
if len(response_parts) >= 3:
break
for wf_key, wf in WORKFLOWS.items():
            # strip punctuation so tokens like "rna-seq:" can match "rna-seq" in the query
            if any(kw.strip(":,()") in user_msg for kw in wf["name"].lower().split()):
response_parts.append(f"\n### {wf['name']}")
for step in wf["steps"][:3]:
response_parts.append(f"**Step {step['step']}: {step['name']}**\n{step['description']}")
break
for misc in COMMON_MISCONCEPTIONS:
if misc["domain"].replace("_", " ") in user_msg or any(w in user_msg for w in misc["misconception"].lower().split()[:5]):
response_parts.append(f"\n⚠️ **Common Misconception**: {misc['misconception']}\n\nβœ… **Correction**: {misc['correction']}")
break
if response_parts:
return "πŸ“š *Responding from knowledge base (LLM not configured):*\n\n" + "\n\n".join(response_parts)
return (
"⚠️ **AI responses require HF_TOKEN.**\n\n"
"To enable full AI-powered responses:\n"
"1. Go to your HuggingFace account β†’ Settings β†’ Access Tokens\n"
"2. Create a token with 'inference-api' scope\n"
"3. Add it as a Secret named `HF_TOKEN` in this Space's Settings\n\n"
"The knowledge base can still answer many questions. Try asking about 'RNA-seq workflow', 'variant calling', or 'microbiome diversity'!"
)
class RAGService:
"""Document retrieval with lazy embedding model loading."""
_instance = None
_initialized = False
def __new__(cls):
if cls._instance is None:
cls._instance = super().__new__(cls)
return cls._instance
def __init__(self):
if RAGService._initialized:
return
RAGService._initialized = True
self.embedder = None
self.kb_chunks = []
self.kb_metadata = []
self.kb_embeddings = None
self._build_kb_index()
def _ensure_embedder(self):
if self.embedder is not None:
return True
if not HAS_ST:
return False
try:
print("RAGService: Loading embedding model...")
self.embedder = SentenceTransformer(EMBED_MODEL)
print("RAGService: Embedding model loaded")
if self.kb_chunks:
self.kb_embeddings = self.embedder.encode(
self.kb_chunks,
convert_to_numpy=True,
normalize_embeddings=True,
show_progress_bar=False,
batch_size=32,
)
print(f"RAGService: KB embedded ({len(self.kb_chunks)} chunks)")
return True
except Exception as e:
print(f"RAGService: Failed to load embedder: {e}")
return False
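    # Note: the first search pays a one-time cost here (loading the embedding
    # model and encoding the full KB); later calls return immediately via the
    # cached self.embedder.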
def _build_kb_index(self):
chunks = []
metadata = []
for term, definition in GLOSSARY.items():
chunks.append(f"{term}: {definition}")
metadata.append({"source": "glossary", "topic": term, "type": "definition"})
for wf_key, wf in WORKFLOWS.items():
for step in wf["steps"]:
step_text = f"{wf['name']} - Step {step['step']}: {step['name']}. {step['description']}"
if step.get("tools"):
step_text += f" Tools: {', '.join(step['tools'])}."
if step.get("common_mistakes"):
step_text += " Common mistakes: " + "; ".join(step["common_mistakes"])
chunks.append(step_text)
metadata.append({
"source": "workflow",
"topic": wf["domain"],
"type": "workflow_step",
"step": step["step"],
"workflow": wf_key
})
for misc in COMMON_MISCONCEPTIONS:
text = f"Misconception: {misc['misconception']} Correction: {misc['correction']}"
chunks.append(text)
metadata.append({
"source": "misconception",
"topic": misc["domain"],
"type": "misconception",
"severity": misc["severity"]
})
for key, domain in DOMAIN_TAXONOMY.items():
text = f"{domain['name']} covers: {', '.join(domain['subtopics'][:10])}"
chunks.append(text)
metadata.append({"source": "taxonomy", "topic": key, "type": "domain_overview"})
self.kb_chunks = chunks
self.kb_metadata = metadata
print(f"RAGService: Built KB with {len(chunks)} chunks")
def search(self, query, top_k=TOP_K_RETRIEVAL, user_chunks=None, user_embeddings=None):
if not self._ensure_embedder():
return self._keyword_search(query, top_k)
try:
query_embedding = self.embedder.encode(
[query],
convert_to_numpy=True,
normalize_embeddings=True,
)
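            # Embeddings are L2-normalized, so the dot products below are cosine
            # similarities in [-1, 1]; the 0.15 cutoff is a loose relevance floor.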
results = []
if self.kb_embeddings is not None and len(self.kb_embeddings) > 0:
kb_scores = np.dot(query_embedding, self.kb_embeddings.T)[0]
top_indices = np.argsort(kb_scores)[::-1][:top_k]
for idx in top_indices:
if kb_scores[idx] > 0.15:
results.append({
"text": self.kb_chunks[idx],
"score": float(kb_scores[idx]),
"metadata": self.kb_metadata[idx]
})
if user_chunks and user_embeddings is not None and len(user_embeddings) > 0:
user_scores = np.dot(query_embedding, user_embeddings.T)[0]
top_user = np.argsort(user_scores)[::-1][:top_k]
for idx in top_user:
if user_scores[idx] > 0.15:
results.append({
"text": user_chunks[idx],
"score": float(user_scores[idx]),
"metadata": {"source": "uploaded", "type": "user_content"}
})
results.sort(key=lambda x: x["score"], reverse=True)
return results[:top_k]
except Exception as e:
print(f"RAG search error: {e}")
return self._keyword_search(query, top_k)
def _keyword_search(self, query, top_k=3):
query_words = set(query.lower().split())
scored = []
for i, chunk in enumerate(self.kb_chunks):
chunk_words = set(chunk.lower().split())
overlap = len(query_words & chunk_words)
if overlap > 0:
scored.append({
"text": chunk,
"score": overlap / max(len(query_words), 1),
"metadata": self.kb_metadata[i]
})
scored.sort(key=lambda x: x["score"], reverse=True)
return scored[:top_k]
def embed_chunks(self, chunks):
if not self._ensure_embedder() or not chunks:
return None
try:
return self.embedder.encode(
chunks,
convert_to_numpy=True,
normalize_embeddings=True,
show_progress_bar=False,
batch_size=16,
)
except Exception as e:
print(f"Embed chunks error: {e}")
return None
class DocumentParser:
@staticmethod
def parse_file(filepath):
if filepath is None:
return "", []
filepath = str(filepath)
ext = Path(filepath).suffix.lower()
try:
if ext == ".pdf" and HAS_FITZ:
return DocumentParser._parse_pdf(filepath)
elif ext in (".txt", ".md", ".csv", ".tsv", ".fasta", ".fa", ".fastq", ".fq", ".vcf", ".bed", ".gff", ".gtf", ".sam", ".bam"):
return DocumentParser._parse_text(filepath)
else:
return f"Unsupported file type: {ext}", []
except Exception as e:
return f"Error parsing file: {str(e)}", []
@staticmethod
def _parse_pdf(filepath):
doc = fitz.open(filepath)
pages = []
for page_num in range(len(doc)):
text = doc[page_num].get_text()
if text.strip():
pages.append(text)
doc.close()
full_text = "\n\n".join(pages)
chunks = DocumentParser._chunk_text(full_text)
return full_text, chunks
@staticmethod
def _parse_text(filepath):
with open(filepath, "r", encoding="utf-8", errors="replace") as f:
text = f.read()
chunks = DocumentParser._chunk_text(text)
return text, chunks
@staticmethod
def _chunk_text(text, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
words = text.split()
if len(words) <= chunk_size:
return [text] if text.strip() else []
chunks = []
for i in range(0, len(words), chunk_size - overlap):
chunk = " ".join(words[i:i + chunk_size])
if chunk.strip():
chunks.append(chunk)
return chunks
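    # Example: with chunk_size=400 and overlap=60, window starts are
    # range(0, n_words, 340); a 1000-word text yields chunks starting at words
    # 0, 340, and 680, with consecutive chunks sharing 60 words of context.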
llm_service = LLMService()
rag_service = RAGService()
doc_parser = DocumentParser()
print(f"🧬 BB Tutor initialized. LLM: {llm_service.is_available()}, Embeddings: {rag_service.embedder is not None}")
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================
def _rag_context(query, user_chunks=None, user_embeddings=None):
results = rag_service.search(query, top_k=TOP_K_RETRIEVAL,
user_chunks=user_chunks, user_embeddings=user_embeddings)
if not results:
return ""
parts = ["RELEVANT KNOWLEDGE BASE CONTEXT:"]
for r in results:
source = r["metadata"].get("source", "kb")
parts.append(f"[{source}] {r['text'][:800]}")
return "\n".join(parts)
def _format_history(history):
messages = []
for h in history:
if isinstance(h, dict):
messages.append(h)
elif isinstance(h, (list, tuple)):
if len(h) >= 1 and h[0]:
messages.append({"role": "user", "content": str(h[0])})
if len(h) >= 2 and h[1]:
messages.append({"role": "assistant", "content": str(h[1])})
return messages
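# Example: legacy tuple-format history [("What is BQSR?", "BQSR recalibrates ...")]
# becomes [{"role": "user", "content": "What is BQSR?"},
#          {"role": "assistant", "content": "BQSR recalibrates ..."}];
# messages-format history (a list of dicts) passes through unchanged.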
# ============================================================================
# MODULE HANDLERS
# ============================================================================
def tutor_respond(message, history, system_prompt, temperature, max_tokens, rag_state):
if not message or not message.strip():
yield ""
return
rag_state = rag_state or DEFAULT_RAG_STATE
user_chunks = rag_state.get("chunks", [])
user_embeddings = rag_state.get("embeddings")
rag_ctx = _rag_context(message, user_chunks, user_embeddings)
messages = [{"role": "system", "content": system_prompt}]
if rag_ctx:
messages.append({"role": "system", "content": rag_ctx})
messages.extend(_format_history(history))
messages.append({"role": "user", "content": message})
for partial in llm_service.stream_chat(messages, temperature, max_tokens):
yield partial
def process_upload(file, rag_state):
rag_state = rag_state or DEFAULT_RAG_STATE
if file is None:
return "πŸ“ Please upload a file first.", "", rag_state
full_text, chunks = doc_parser.parse_file(file)
if not chunks:
return "⚠️ Could not extract text from the uploaded file.", full_text[:2000] if full_text else "", rag_state
embeddings = rag_service.embed_chunks(chunks)
new_state = {"chunks": chunks, "embeddings": embeddings}
    preview = full_text[:2500]  # slicing is safe even when the text is shorter
msgs = [
{"role": "system", "content": SYSTEM_PROMPTS["upload_explain"]},
{"role": "user", "content": f"Analyze and explain this bioinformatics document:\n\n{preview}"}
]
explanation = llm_service.generate(msgs, temperature=0.5, max_tokens=1500)
stats = f"πŸ“Š **Document Stats:** {len(chunks)} chunks, ~{len(full_text.split())} words | "
stats += f"File type: {Path(str(file)).suffix} | "
stats += "πŸ€– AI-powered" if llm_service.is_available() else "πŸ“š Knowledge-base mode"
    stats += "\n\n---\n\n"
return stats + explanation, full_text[:5000], new_state
def upload_chat_respond(message, history, rag_state):
if not message or not message.strip():
yield ""
return
rag_state = rag_state or DEFAULT_RAG_STATE
user_chunks = rag_state.get("chunks", [])
user_embeddings = rag_state.get("embeddings")
if not user_chunks:
yield "πŸ“ Please upload a document in the panel above, then ask questions about it.\n\nYour uploaded document will be indexed and searchable across all modules!"
return
rag_results = rag_service.search(message, top_k=4, user_chunks=user_chunks, user_embeddings=user_embeddings)
ctx = "CONTEXT FROM UPLOADED DOCUMENT:\n"
if rag_results:
for r in rag_results:
ctx += f"\n{r['text'][:600]}\n"
else:
ctx += "(No highly relevant passages found β€” answering from general knowledge)\n"
messages = [
{"role": "system", "content": SYSTEM_PROMPTS["upload_explain"]},
{"role": "system", "content": ctx},
]
messages.extend(_format_history(history))
messages.append({"role": "user", "content": message})
for partial in llm_service.stream_chat(messages, temperature=0.5, max_tokens=1024):
yield partial
def generate_quiz(topic, quiz_type, num_questions, difficulty, rag_state):
if not topic:
return "❓ Please select or enter a topic first.", ""
rag_results = rag_service.search(topic, top_k=3)
context = ""
if rag_results:
context = "Reference material:\n" + "\n".join(r["text"][:500] for r in rag_results)
template_key = {
"Multiple Choice (MCQ)": "mcq",
"True/False": "true_false",
"Short Answer": "short_answer"
}.get(quiz_type, "mcq")
quiz_prompt = QUIZ_TEMPLATES[template_key].format(
n=int(num_questions), topic=topic, difficulty=difficulty
)
messages = [{"role": "system", "content": SYSTEM_PROMPTS["quiz_me"]}]
if context:
messages.append({"role": "system", "content": context})
messages.append({"role": "user", "content": quiz_prompt})
response = llm_service.generate(messages, temperature=0.8, max_tokens=2000)
formatted = f"## 🧠 {topic} Quiz β€” {difficulty}\n\n"
formatted += f"*Format: {quiz_type} | Questions: {int(num_questions)}*\n\n---\n\n"
formatted += response
return formatted, response
def check_quiz_answers(user_answers, answer_key):
if not user_answers or not user_answers.strip():
return "✍️ Please enter your answers above before checking."
if not answer_key:
return "⚠️ Please generate a quiz first (use the panel above)."
messages = [
{"role": "system", "content": "You are a bioinformatics tutor grading a quiz. Compare student answers to correct answers. For each: mark βœ… or ❌, explain briefly, provide correct answer if wrong. Be encouraging. Give final score."},
{"role": "user", "content": f"QUIZ AND ANSWERS:\n{answer_key}\n\nSTUDENT ANSWERS:\n{user_answers}\n\nGrade each:"}
]
return llm_service.generate(messages, temperature=0.3, max_tokens=1500)
def generate_lesson(topic, level, include_exercises, include_quiz):
if not topic:
return "πŸ“š Please select or enter a topic."
rag_results = rag_service.search(topic, top_k=4)
context = ""
if rag_results:
context = "Reference:\n" + "\n".join(r["text"][:500] for r in rag_results)
prompt = LESSON_TEMPLATE.format(topic=topic, level=level)
if include_exercises:
prompt += "\n\nInclude 2-3 practical exercises with clear instructions."
if include_quiz:
prompt += "\n\nInclude a 5-question self-assessment quiz (with answers)."
messages = [{"role": "system", "content": SYSTEM_PROMPTS["build_lesson"]}]
if context:
messages.append({"role": "system", "content": context})
messages.append({"role": "user", "content": prompt})
return llm_service.generate(messages, temperature=0.7, max_tokens=3000)
def workflow_respond(message, history, selected_workflow, temperature):
if not message or not message.strip():
yield ""
return
workflow_context = ""
for wf_key, wf in WORKFLOWS.items():
if wf["name"] in selected_workflow or selected_workflow.lower() in wf["name"].lower():
workflow_context = f"WORKFLOW REFERENCE: {wf['name']}\n\n"
for step in wf["steps"]:
workflow_context += f"Step {step['step']}: {step['name']}\n"
workflow_context += f" {step['description']}\n"
if step.get("tools"):
workflow_context += f" Tools: {', '.join(step['tools'])}\n"
if step.get("common_mistakes"):
workflow_context += f" ⚠️ Common mistakes: {'; '.join(step['common_mistakes'])}\n"
workflow_context += "\n"
break
rag_results = rag_service.search(message, top_k=2)
if rag_results:
workflow_context += "\nADDITIONAL CONTEXT:\n" + "\n".join(r["text"][:500] for r in rag_results)
messages = [{"role": "system", "content": SYSTEM_PROMPTS["workflow_coach"]}]
if workflow_context:
messages.append({"role": "system", "content": workflow_context})
messages.extend(_format_history(history))
messages.append({"role": "user", "content": message})
for partial in llm_service.stream_chat(messages, temperature, 1500):
yield partial
def paper_to_lesson_respond(message, history, output_format, rag_state):
if not message or not message.strip():
yield ""
return
rag_state = rag_state or DEFAULT_RAG_STATE
user_chunks = rag_state.get("chunks", [])
user_embeddings = rag_state.get("embeddings")
context = ""
if user_chunks:
rag_results = rag_service.search(message, top_k=4, user_chunks=user_chunks, user_embeddings=user_embeddings)
if rag_results:
context = "PAPER CONTENT:\n" + "\n".join(r["text"][:600] for r in rag_results)
format_instruction = {
"Lesson Plan": "Create a structured lesson plan with learning objectives, sections, and exercises.",
"Slide Outline": "Create a slide-by-slide outline with key points for each slide.",
"Study Notes": "Create concise study notes highlighting key methods, tools, and findings.",
"Quiz Questions": "Generate 5-10 quiz questions based on the paper's methods and findings.",
}.get(output_format, "Create a structured lesson plan.")
messages = [{"role": "system", "content": SYSTEM_PROMPTS["paper_to_lesson"]}]
if context:
messages.append({"role": "system", "content": context})
messages.extend(_format_history(history))
full_msg = f"{message}\n\nOUTPUT FORMAT: {format_instruction}"
messages.append({"role": "user", "content": full_msg})
for partial in llm_service.stream_chat(messages, temperature=0.7, max_tokens=2500):
yield partial
def viva_respond(message, history, topic, difficulty):
if not message or not message.strip():
yield ""
return
rag_results = rag_service.search(f"{topic} {message}", top_k=3)
context = ""
if rag_results:
context = "Reference:\n" + "\n".join(r["text"][:500] for r in rag_results)
messages = [
{"role": "system", "content": SYSTEM_PROMPTS["viva_practice"]},
{"role": "system", "content": f"VIVA TOPIC: {topic}\nDIFFICULTY: {difficulty}\n\n{context}"},
]
messages.extend(_format_history(history))
messages.append({"role": "user", "content": message})
for partial in llm_service.stream_chat(messages, temperature=0.7, max_tokens=1000):
yield partial
# ============================================================================
# GRADIO APP ASSEMBLY
# ============================================================================
CUSTOM_CSS = """
.main-header {
text-align: center; padding: 20px;
background: linear-gradient(135deg, #1a5276 0%, #2e86c1 50%, #48c9b0 100%);
border-radius: 12px; margin-bottom: 20px; color: white;
}
.main-header h1 { color: white; font-size: 2em; margin: 0; }
.main-header p { color: #ecf0f1; margin: 5px 0; }
.module-info {
background: #f0f9ff; border-left: 4px solid #2e86c1;
padding: 12px 16px; margin-bottom: 16px; border-radius: 0 8px 8px 0;
}
.safety-notice {
background: #fff3e0; border-left: 4px solid #f39c12;
padding: 10px 14px; margin-top: 10px; border-radius: 0 8px 8px 0; font-size: 0.9em;
}
.status-badge {
display: inline-block; padding: 4px 12px; border-radius: 12px;
font-size: 0.85em; font-weight: bold;
}
.status-on { background: #d4edda; color: #155724; }
.status-off { background: #f8d7da; color: #721c24; }
"""
def build_app():
with gr.Blocks(title="Bioinformatics with BB Tutor", css=CUSTOM_CSS) as demo:
# ── Global shared state ─────────────────────────────────────────
rag_store = gr.State(DEFAULT_RAG_STATE)
# ── Status indicator ────────────────────────────────────────────
llm_status = "🟒 AI Enabled" if llm_service.is_available() else "πŸ”΄ AI Offline (Knowledge Base Active)"
# ── Header ─────────────────────────────────────────────────────
gr.HTML(f"""
<div class="main-header">
<h1>🧬 Bioinformatics with BB Tutor</h1>
<p>AI-powered bioinformatics teaching assistant</p>
<p style="font-size: 0.85em; opacity: 0.9;">
RNA-seq Β· Exome Β· Genome Β· Microbiome Β· Variants Β· Molecular Genetics Β· scRNA-seq Β· ATAC-seq Β· ChIP-seq Β· and more
</p>
<p style="font-size: 0.8em; margin-top: 8px;">
<span class="status-badge {'status-on' if llm_service.is_available() else 'status-off'}">{llm_status}</span>
</p>
</div>
""")
with gr.Tabs():
# ══════════════════════════════════════════════════════════════
# TAB 1: ASK THE TUTOR
# ══════════════════════════════════════════════════════════════
with gr.Tab("🧬 Ask the Tutor", id="ask"):
gr.HTML('<div class="module-info">πŸ’‘ Ask any bioinformatics question. RAG-augmented responses from a curated knowledge base covering 15+ domains.</div>')
                # ChatInterface examples must be list-of-lists: each row is
                # [message, *additional_input_values]. With 4 additional inputs
                # (system_prompt, temperature, max_tokens, rag_store), each row
                # has 5 items: the message plus those 4 values.
ask_examples = [
["What is the difference between DESeq2 and edgeR?", DEFAULT_SYSTEM_PROMPT, 0.7, 1024, DEFAULT_RAG_STATE],
["Explain the GATK variant calling pipeline step by step.", DEFAULT_SYSTEM_PROMPT, 0.7, 1024, DEFAULT_RAG_STATE],
["What is the difference between alpha and beta diversity?", DEFAULT_SYSTEM_PROMPT, 0.7, 1024, DEFAULT_RAG_STATE],
["Why should I use adjusted p-values instead of raw p-values?", DEFAULT_SYSTEM_PROMPT, 0.7, 1024, DEFAULT_RAG_STATE],
["Explain the single-cell RNA-seq analysis workflow.", DEFAULT_SYSTEM_PROMPT, 0.7, 1024, DEFAULT_RAG_STATE],
["What is BQSR and why is it important?", DEFAULT_SYSTEM_PROMPT, 0.7, 1024, DEFAULT_RAG_STATE],
["How do I choose between STAR and HISAT2 for alignment?", DEFAULT_SYSTEM_PROMPT, 0.7, 1024, DEFAULT_RAG_STATE],
["What common mistakes do students make with DESeq2?", DEFAULT_SYSTEM_PROMPT, 0.7, 1024, DEFAULT_RAG_STATE],
]
gr.ChatInterface(
fn=tutor_respond,
type="messages",
additional_inputs=[
gr.Textbox(value=DEFAULT_SYSTEM_PROMPT, label="System Prompt", lines=2, visible=False),
gr.Slider(0.1, 1.5, 0.7, step=0.1, label="Temperature", visible=False),
gr.Slider(256, 4096, 1024, step=256, label="Max Tokens", visible=False),
rag_store,
],
additional_inputs_accordion=gr.Accordion("βš™οΈ Advanced", open=False, visible=False),
examples=ask_examples,
)
gr.HTML('<div class="safety-notice">⚠️ <strong>Educational use only.</strong> Not for clinical interpretation. Always consult qualified professionals for clinical genomics.</div>')
# ══════════════════════════════════════════════════════════════
# TAB 2: UPLOAD & EXPLAIN
# ══════════════════════════════════════════════════════════════
with gr.Tab("πŸ“„ Upload & Explain", id="upload"):
gr.HTML('<div class="module-info">πŸ“„ Upload bioinformatics documents (PDF, TXT, FASTA, VCF, etc.) and get AI-powered analysis. Content is indexed and searchable across all modules.</div>')
with gr.Row():
with gr.Column(scale=1):
file_input = gr.File(
label="Upload Document",
file_types=[".pdf", ".txt", ".md", ".csv", ".tsv",
".fasta", ".fa", ".fastq", ".vcf", ".bed",
".gff", ".gtf", ".sam", ".bam"],
file_count="single", type="filepath",
)
process_btn = gr.Button("πŸ” Analyze Document", variant="primary")
gr.Markdown("**Supported:** PDF, text, FASTA/FASTQ, VCF, BED, GFF/GTF, SAM/BAM, CSV/TSV")
with gr.Column(scale=2):
explanation_output = gr.Markdown(label="Analysis & Explanation")
with gr.Accordion("πŸ“ Raw Extracted Text", open=False):
raw_text_output = gr.Textbox(label="Extracted Text", lines=10, show_copy_button=True)
process_btn.click(
fn=process_upload,
inputs=[file_input, rag_store],
outputs=[explanation_output, raw_text_output, rag_store],
)
gr.Markdown("### πŸ’¬ Chat About Your Document")
# fn signature: (message, history, rag_state) β†’ 1 additional input
upload_chat_examples = [
["Summarize the key methods in this paper.", DEFAULT_RAG_STATE],
["What bioinformatics tools are mentioned?", DEFAULT_RAG_STATE],
["Explain the main findings in simple terms.", DEFAULT_RAG_STATE],
["What are the limitations of this analysis?", DEFAULT_RAG_STATE],
]
gr.ChatInterface(
fn=upload_chat_respond,
type="messages",
additional_inputs=[rag_store],
examples=upload_chat_examples,
)
# ══════════════════════════════════════════════════════════════
# TAB 3: QUIZ ME
# ══════════════════════════════════════════════════════════════
with gr.Tab("❓ Quiz Me", id="quiz"):
gr.HTML('<div class="module-info">🧠 Test your knowledge with AI-generated quizzes across all bioinformatics domains.</div>')
with gr.Row():
quiz_topic = gr.Dropdown(
choices=TOPIC_CHOICES, label="Select Topic",
allow_custom_value=True, value="RNA-seq: Differential Expression (DESeq2)"
)
quiz_type = gr.Radio(
choices=["Multiple Choice (MCQ)", "True/False", "Short Answer"],
value="Multiple Choice (MCQ)", label="Format"
)
with gr.Row():
quiz_difficulty = gr.Radio(
choices=DIFFICULTY_LEVELS, value="Intermediate", label="Difficulty"
)
num_questions = gr.Slider(1, 10, 5, step=1, label="# Questions")
generate_quiz_btn = gr.Button("🎲 Generate Quiz", variant="primary")
quiz_output = gr.Markdown(label="Generated Quiz")
answer_key_state = gr.State("")
generate_quiz_btn.click(
fn=generate_quiz,
inputs=[quiz_topic, quiz_type, num_questions, quiz_difficulty, rag_store],
outputs=[quiz_output, answer_key_state],
)
gr.Markdown("---")
gr.Markdown("### ✍️ Submit Your Answers")
with gr.Row():
user_answers = gr.Textbox(
label="Your Answers (e.g., '1: A, 2: B')",
lines=5, placeholder="Type your answers here...", scale=3
)
check_btn = gr.Button("βœ… Check", variant="primary", scale=1)
feedback_output = gr.Markdown(label="Feedback")
check_btn.click(
fn=check_quiz_answers,
inputs=[user_answers, answer_key_state],
outputs=[feedback_output],
)
# ══════════════════════════════════════════════════════════════
# TAB 4: BUILD A LESSON
# ══════════════════════════════════════════════════════════════
with gr.Tab("πŸ“š Build a Lesson", id="lesson"):
gr.HTML('<div class="module-info">πŸ“š Generate structured lessons with learning objectives, explanations, exercises, and self-assessment quizzes.</div>')
with gr.Row():
lesson_topic = gr.Dropdown(
choices=TOPIC_CHOICES, label="Lesson Topic",
allow_custom_value=True, value="RNA-seq: Differential Expression (DESeq2)"
)
lesson_level = gr.Radio(
choices=DIFFICULTY_LEVELS, value="Intermediate", label="Level"
)
with gr.Row():
include_exercises = gr.Checkbox(label="Include Exercises", value=True)
include_quiz = gr.Checkbox(label="Include Quiz", value=True)
generate_lesson_btn = gr.Button("πŸ“ Generate Lesson", variant="primary")
lesson_output = gr.Markdown(label="Generated Lesson")
generate_lesson_btn.click(
fn=generate_lesson,
inputs=[lesson_topic, lesson_level, include_exercises, include_quiz],
outputs=[lesson_output],
)
# ══════════════════════════════════════════════════════════════
# TAB 5: WORKFLOW COACH
# ══════════════════════════════════════════════════════════════
with gr.Tab("πŸ”¬ Workflow Coach", id="workflow"):
gr.HTML('<div class="module-info">πŸ”¬ Step-by-step guidance through bioinformatics analysis pipelines. Select a workflow and ask about any step.</div>')
workflow_selector = gr.Dropdown(
choices=WORKFLOW_CHOICES, label="Select Workflow",
value="Bulk RNA-seq: Full DE Analysis Pipeline", allow_custom_value=True,
)
# fn signature: (message, history, selected_workflow, temperature) β†’ 2 additional inputs
workflow_examples = [
["Walk me through the complete pipeline from raw FASTQ to DE results.", "Bulk RNA-seq: Full DE Analysis Pipeline", 0.7],
["I'm at alignment. What should I check before counting?", "Bulk RNA-seq: Full DE Analysis Pipeline", 0.7],
["My mapping rate is only 45%. What could be wrong?", "Bulk RNA-seq: Full DE Analysis Pipeline", 0.7],
["How do I choose between STAR and HISAT2?", "Bulk RNA-seq: Full DE Analysis Pipeline", 0.7],
["What parameters for GATK HaplotypeCaller on exome data?", "Exome Sequencing: Variant Calling Pipeline", 0.7],
["How do I set DADA2 truncation parameters?", "Microbiome: 16S Amplicon Analysis (QIIME2)", 0.7],
]
gr.ChatInterface(
fn=workflow_respond,
type="messages",
additional_inputs=[
workflow_selector,
gr.Slider(0.1, 1.5, 0.7, step=0.1, label="Temperature", visible=False),
],
additional_inputs_accordion=gr.Accordion("βš™οΈ", open=False, visible=False),
examples=workflow_examples,
)
# ══════════════════════════════════════════════════════════════
# TAB 6: PAPER TO LESSON
# ══════════════════════════════════════════════════════════════
with gr.Tab("πŸ“° Paper to Lesson", id="paper"):
gr.HTML('<div class="module-info">πŸ“° Convert research papers into teaching material. Upload a paper in the Upload tab first, then generate lessons, slides, or quizzes from it.</div>')
output_format = gr.Radio(
choices=["Lesson Plan", "Slide Outline", "Study Notes", "Quiz Questions"],
value="Lesson Plan", label="Output Format"
)
# fn signature: (message, history, output_format, rag_state) β†’ 2 additional inputs
paper_examples = [
["Convert this paper into a 45-minute lecture plan.", "Lesson Plan", DEFAULT_RAG_STATE],
["Create a slide outline covering the key methods.", "Slide Outline", DEFAULT_RAG_STATE],
["Generate study notes on the bioinformatics methods.", "Study Notes", DEFAULT_RAG_STATE],
["Create quiz questions on this paper's methodology.", "Quiz Questions", DEFAULT_RAG_STATE],
]
gr.ChatInterface(
fn=paper_to_lesson_respond,
type="messages",
additional_inputs=[output_format, rag_store],
examples=paper_examples,
)
# ══════════════════════════════════════════════════════════════
# TAB 7: VIVA PRACTICE
# ══════════════════════════════════════════════════════════════
with gr.Tab("πŸŽ“ Viva Practice", id="viva"):
gr.HTML('<div class="module-info">πŸŽ“ Practice oral examinations. The AI examiner asks probing questions, evaluates your answers, and pushes deeper understanding.</div>')
with gr.Row():
viva_topic = gr.Dropdown(
choices=TOPIC_CHOICES, label="Viva Topic",
allow_custom_value=True, value="RNA-seq: Differential Expression (DESeq2)"
)
viva_difficulty = gr.Radio(
choices=DIFFICULTY_LEVELS, value="Intermediate", label="Difficulty"
)
# fn signature: (message, history, topic, difficulty) β†’ 2 additional inputs
viva_examples = [
["I'm ready for my viva. Start with your first question.", "RNA-seq: Differential Expression (DESeq2)", "Intermediate"],
["Focus on the statistical aspects of RNA-seq.", "RNA-seq: Differential Expression (DESeq2)", "Intermediate"],
["Ask me about variant calling and interpretation.", "Variant Interpretation: ACMG Guidelines", "Intermediate"],
["Test my understanding of microbiome diversity.", "Microbiome: Alpha & Beta Diversity", "Intermediate"],
]
gr.ChatInterface(
fn=viva_respond,
type="messages",
additional_inputs=[viva_topic, viva_difficulty],
examples=viva_examples,
)
# ── Footer ─────────────────────────────────────────────────────
gr.HTML("""
<div style="text-align: center; padding: 20px; margin-top: 20px; border-top: 1px solid #e0e0e0; color: #666; font-size: 0.85em;">
<p><strong>Bioinformatics with BB Tutor</strong> β€” Educational AI Assistant</p>
<p>⚠️ For educational purposes only. Not for clinical use.</p>
<p>RNA-seq Β· Exome Β· Genome Β· Microbiome Β· Variants Β· Molecular Genetics Β· scRNA-seq Β· ATAC-seq Β· ChIP-seq Β· Methylation Β· Small RNA Β· Targeted Panels Β· Long-read Β· Spatial Β· Multi-omics</p>
</div>
""")
return demo
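# Hypothetical local smoke test (uses only names defined above; not intended
# for Spaces): exercises the RAG path without launching the UI. Without
# sentence-transformers installed, search() falls back to keyword matching.
#
#   hits = rag_service.search("DESeq2 dispersion estimation")
#   for h in hits:
#       print(f"{h['score']:.2f} [{h['metadata']['source']}] {h['text'][:80]}")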
if __name__ == "__main__":
demo = build_app()
demo.launch(server_name="0.0.0.0", server_port=7860, share=False)