Babajaan committed
Commit 855ef94 · verified · 1 Parent(s): f59aca4

Fix app.py: lazy loading, robust error handling, proper State usage
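The three fixes share one pattern; a minimal sketch for reviewers follows. This is an illustration of the gr.State handling the commit adopts, not an excerpt of app.py: the single-output wiring and placeholder chunk list are simplified from the real process_upload, which returns three values.

    import gradio as gr

    # "Proper State usage" as adopted in this commit: gr.State holds a plain
    # Python value; handlers receive the value (never the component), guard
    # against None, and return a NEW dict rather than mutating in place.
    def process_upload(file, rag_state):
        rag_state = rag_state or {"chunks": [], "embeddings": None}
        if file is None:
            return rag_state                    # nothing uploaded; state unchanged
        chunks = ["..."]                        # placeholder for parsed chunks
        return {"chunks": chunks, "embeddings": None}  # fresh value updates the State

    with gr.Blocks() as demo:
        rag_store = gr.State({"chunks": [], "embeddings": None})
        file_in = gr.File()
        gr.Button("Analyze").click(process_upload, inputs=[file_in, rag_store], outputs=[rag_store])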

Files changed (1)
  1. app.py +394 -397
app.py CHANGED
@@ -1,6 +1,13 @@
  """
  Bioinformatics with BB Tutor — Complete Application
- A production-oriented bioinformatics teaching assistant with 7 modules.
  """

  import gradio as gr
@@ -9,7 +16,6 @@ import json
  import os
  import re
  import time
- import hashlib
  from pathlib import Path

  # ── Conditional imports with fallbacks ────────────────────────────────────────
@@ -18,24 +24,21 @@ try:
      HAS_FITZ = True
  except ImportError:
      HAS_FITZ = False

  try:
      from sentence_transformers import SentenceTransformer
      HAS_ST = True
  except ImportError:
      HAS_ST = False

  try:
      from huggingface_hub import InferenceClient
      HAS_HF = True
  except ImportError:
      HAS_HF = False
-
- try:
-     import pandas as pd
-     HAS_PANDAS = True
- except ImportError:
-     HAS_PANDAS = False

  # ── Import knowledge base ────────────────────────────────────────────────────
  from knowledge_base import (
@@ -49,41 +52,78 @@ from knowledge_base import (
  # CONFIGURATION
  # ============================================================================

- # Model configuration - uses HF Inference API
  LLM_MODEL = os.environ.get("LLM_MODEL", "mistralai/Mistral-7B-Instruct-v0.3")
  EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
  HF_TOKEN = os.environ.get("HF_TOKEN", None)

- # RAG configuration
- CHUNK_SIZE = 400  # words per chunk
- CHUNK_OVERLAP = 60  # words overlap
  TOP_K_RETRIEVAL = 3


  # ============================================================================
- # BACKEND SERVICES
  # ============================================================================

  class LLMService:
-     """Singleton LLM inference service using HuggingFace Inference API."""

      def __init__(self):
          self.client = None
-         if HAS_HF and HF_TOKEN:
-             try:
-                 self.client = InferenceClient(
-                     model=LLM_MODEL,
-                     token=HF_TOKEN,
-                     timeout=120,
-                 )
-             except Exception as e:
-                 print(f"Warning: Could not initialize InferenceClient: {e}")

      def is_available(self):
          return self.client is not None

      def stream_chat(self, messages, temperature=0.7, max_tokens=1024):
-         """Stream a chat completion. Yields partial response strings."""
          if not self.is_available():
              yield self._fallback_response(messages)
              return
@@ -97,11 +137,16 @@ class LLMService:
                  top_p=0.9,
                  stream=True,
              ):
-                 token = chunk.choices[0].delta.content or ""
                  partial += token
                  yield partial
          except Exception as e:
-             yield f"⚠️ LLM API error: {str(e)}\n\nPlease check that HF_TOKEN is set correctly in the Space settings and the model {LLM_MODEL} is accessible."

      def generate(self, messages, temperature=0.7, max_tokens=1024):
          """Non-streaming generation. Returns complete response."""
@@ -118,68 +163,89 @@ class LLMService:
              )
              return response.choices[0].message.content
          except Exception as e:
-             return f"⚠️ LLM API error: {str(e)}"

      def _fallback_response(self, messages):
-         """Knowledge-base powered fallback when LLM is not available."""
          user_msg = ""
          for m in reversed(messages):
-             if m["role"] == "user":
-                 user_msg = m["content"].lower()
                  break

-         # Search knowledge base for relevant content
          response_parts = []

-         # Check glossary
          for term, definition in GLOSSARY.items():
-             if term.lower() in user_msg or any(w in user_msg for w in term.lower().split()):
                  response_parts.append(f"**{term}**: {definition}")

-         # Check workflows
          for wf_key, wf in WORKFLOWS.items():
-             if any(keyword in user_msg for keyword in wf["name"].lower().split()):
-                 response_parts.append(f"\n### {wf['name']}\n")
                  for step in wf["steps"][:3]:
                      response_parts.append(f"**Step {step['step']}: {step['name']}**\n{step['description']}")
                  break

-         # Check misconceptions
          for misc in COMMON_MISCONCEPTIONS:
-             keywords = misc["misconception"].lower().split()
-             if any(w in user_msg for w in keywords if len(w) > 4):
                  response_parts.append(f"\n⚠️ **Common Misconception**: {misc['misconception']}\n\n✅ **Correction**: {misc['correction']}")
                  break

          if response_parts:
              return "📚 *Responding from knowledge base (LLM not configured):*\n\n" + "\n\n".join(response_parts)
-         else:
-             return (
-                 "⚠️ **LLM is not configured.** To enable AI-powered responses:\n\n"
-                 "1. Go to Space Settings → Repository Secrets\n"
-                 "2. Add `HF_TOKEN` with your HuggingFace API token\n"
-                 "3. The token needs access to inference API\n\n"
-                 "Currently showing knowledge base results only. "
-                 "Try asking about specific topics like 'DESeq2', 'variant calling', or 'FASTQ quality'."
-             )


  class RAGService:
-     """Document retrieval service with embedding-based search."""

      def __init__(self):
          self.embedder = None
-         if HAS_ST:
-             try:
-                 self.embedder = SentenceTransformer(EMBED_MODEL)
-             except Exception as e:
-                 print(f"Warning: Could not load embedding model: {e}")
-
-         # Pre-build knowledge base index
-         self.kb_chunks, self.kb_metadata = self._build_kb_index()
          self.kb_embeddings = None
-         if self.embedder and self.kb_chunks:
-             try:
                  self.kb_embeddings = self.embedder.encode(
                      self.kb_chunks,
                      convert_to_numpy=True,
@@ -187,20 +253,23 @@ class RAGService:
                      show_progress_bar=False,
                      batch_size=32,
                  )
-             except Exception as e:
-                 print(f"Warning: Could not embed knowledge base: {e}")

      def _build_kb_index(self):
-         """Build searchable chunks from the knowledge base."""
          chunks = []
          metadata = []

-         # Index glossary terms
          for term, definition in GLOSSARY.items():
              chunks.append(f"{term}: {definition}")
              metadata.append({"source": "glossary", "topic": term, "type": "definition"})

-         # Index workflow steps
          for wf_key, wf in WORKFLOWS.items():
              for step in wf["steps"]:
                  step_text = f"{wf['name']} - Step {step['step']}: {step['name']}. {step['description']}"
@@ -217,7 +286,7 @@ class RAGService:
                      "workflow": wf_key
                  })

-         # Index misconceptions
          for misc in COMMON_MISCONCEPTIONS:
              text = f"Misconception: {misc['misconception']} Correction: {misc['correction']}"
              chunks.append(text)
@@ -228,17 +297,19 @@ class RAGService:
                  "severity": misc["severity"]
              })

-         # Index domain taxonomy
          for key, domain in DOMAIN_TAXONOMY.items():
-             text = f"{domain['name']} covers these subtopics: {', '.join(domain['subtopics'])}."
              chunks.append(text)
              metadata.append({"source": "taxonomy", "topic": key, "type": "domain_overview"})

-         return chunks, metadata

      def search(self, query, top_k=TOP_K_RETRIEVAL, user_chunks=None, user_embeddings=None):
-         """Search the knowledge base and optional user-uploaded content."""
-         if not self.embedder:
              return self._keyword_search(query, top_k)

          try:
@@ -250,40 +321,38 @@ class RAGService:

              results = []

-             # Search knowledge base
              if self.kb_embeddings is not None and len(self.kb_embeddings) > 0:
                  kb_scores = np.dot(query_embedding, self.kb_embeddings.T)[0]
                  top_indices = np.argsort(kb_scores)[::-1][:top_k]
                  for idx in top_indices:
-                     if kb_scores[idx] > 0.2:  # minimum relevance threshold
                          results.append({
                              "text": self.kb_chunks[idx],
                              "score": float(kb_scores[idx]),
                              "metadata": self.kb_metadata[idx]
                          })

-             # Search user-uploaded content
              if user_chunks and user_embeddings is not None and len(user_embeddings) > 0:
                  user_scores = np.dot(query_embedding, user_embeddings.T)[0]
                  top_user = np.argsort(user_scores)[::-1][:top_k]
                  for idx in top_user:
-                     if user_scores[idx] > 0.2:
                          results.append({
                              "text": user_chunks[idx],
                              "score": float(user_scores[idx]),
-                             "metadata": {"source": "uploaded_document", "type": "user_content"}
                          })

-             # Sort by score and return top_k
              results.sort(key=lambda x: x["score"], reverse=True)
              return results[:top_k]
-
          except Exception as e:
-             print(f"Embedding search error: {e}")
              return self._keyword_search(query, top_k)

      def _keyword_search(self, query, top_k=3):
-         """Fallback keyword-based search."""
          query_words = set(query.lower().split())
          scored = []
          for i, chunk in enumerate(self.kb_chunks):
@@ -299,8 +368,8 @@ class RAGService:
          return scored[:top_k]

      def embed_chunks(self, chunks):
-         """Embed a list of text chunks. Returns numpy array or None."""
-         if not self.embedder or not chunks:
              return None
          try:
              return self.embedder.encode(
@@ -308,28 +377,28 @@ class RAGService:
                  convert_to_numpy=True,
                  normalize_embeddings=True,
                  show_progress_bar=False,
-                 batch_size=32,
              )
-         except Exception:
              return None


  class DocumentParser:
-     """Parse uploaded documents into text chunks."""

      @staticmethod
      def parse_file(filepath):
          """Extract text from uploaded file."""
          if filepath is None:
              return "", []
-
          filepath = str(filepath)
          ext = Path(filepath).suffix.lower()

          try:
              if ext == ".pdf" and HAS_FITZ:
                  return DocumentParser._parse_pdf(filepath)
-             elif ext in (".txt", ".md", ".csv", ".tsv", ".fasta", ".fa", ".fastq", ".fq", ".vcf", ".bed", ".gff", ".gtf", ".sam"):
                  return DocumentParser._parse_text(filepath)
              else:
                  return f"Unsupported file type: {ext}", []
@@ -341,8 +410,7 @@ class DocumentParser:
          doc = fitz.open(filepath)
          pages = []
          for page_num in range(len(doc)):
-             page = doc[page_num]
-             text = page.get_text()
              if text.strip():
                  pages.append(text)
          doc.close()
@@ -370,155 +438,146 @@ class DocumentParser:
          return chunks


- # ============================================================================
- # INITIALIZE SERVICES
- # ============================================================================
-
- print("🧬 Initializing BB Tutor services...")
  llm_service = LLMService()
  rag_service = RAGService()
  doc_parser = DocumentParser()
- print(f"  LLM available: {llm_service.is_available()}")
- print(f"  RAG embedder available: {rag_service.embedder is not None}")
- print(f"  Knowledge base chunks: {len(rag_service.kb_chunks)}")
- print("✅ BB Tutor services initialized!")


  # ============================================================================
- # MODULE 1: ASK THE TUTOR
  # ============================================================================

- def tutor_respond(message, history, system_prompt, temperature, max_tokens, rag_store):
-     """Main tutor chat handler with RAG-augmented responses."""
-     if not message.strip():
-         yield ""
-         return

-     # Retrieve relevant context
-     user_chunks = rag_store.get("chunks", []) if isinstance(rag_store, dict) else []
-     user_embeddings = rag_store.get("embeddings") if isinstance(rag_store, dict) else None

-     rag_results = rag_service.search(
-         message,
-         top_k=TOP_K_RETRIEVAL,
-         user_chunks=user_chunks,
-         user_embeddings=user_embeddings
-     )

-     # Build context from retrieved chunks
-     context_parts = []
-     if rag_results:
-         context_parts.append("RELEVANT KNOWLEDGE BASE CONTEXT:")
-         for r in rag_results:
-             source = r["metadata"].get("source", "unknown")
-             context_parts.append(f"[Source: {source}] {r['text']}")

-     # Build messages
-     messages = [{"role": "system", "content": system_prompt}]
-     if context_parts:
-         messages.append({
-             "role": "system",
-             "content": "\n".join(context_parts)
-         })

-     # Add conversation history
-     for h in history:
-         messages.append(h)

      messages.append({"role": "user", "content": message})

-     # Stream response
-     for partial in llm_service.stream_chat(messages, temperature=temperature, max_tokens=max_tokens):
          yield partial

- # ============================================================================
- # MODULE 2: UPLOAD AND EXPLAIN
- # ============================================================================

- def process_upload(file, rag_store):
-     """Process an uploaded file: extract text, chunk, embed, explain."""
      if file is None:
-         return "Please upload a file first.", "", rag_store

      full_text, chunks = doc_parser.parse_file(file)

      if not chunks:
-         return "Could not extract text from the uploaded file.", full_text[:2000] if full_text else "", rag_store

-     # Embed the chunks
      embeddings = rag_service.embed_chunks(chunks)

-     # Update RAG store with uploaded content
-     new_store = dict(rag_store) if isinstance(rag_store, dict) else {"chunks": [], "embeddings": None}
-     new_store["chunks"] = chunks
-     if embeddings is not None:
-         new_store["embeddings"] = embeddings
-
-     # Generate explanation
-     preview = full_text[:3000] if len(full_text) > 3000 else full_text
-     messages = [
          {"role": "system", "content": SYSTEM_PROMPTS["upload_explain"]},
-         {"role": "user", "content": f"Please analyze and explain this uploaded content:\n\n{preview}"}
      ]
-     explanation = llm_service.generate(messages, temperature=0.5, max_tokens=1500)

-     # Add stats
-     stats = f"📊 **Document Stats:** {len(chunks)} chunks, ~{len(full_text.split())} words extracted\n\n---\n\n"

-     return stats + explanation, full_text[:5000], new_store

- def upload_chat_respond(message, history, rag_store):
-     """Chat about uploaded documents with RAG context."""
-     if not message.strip():
          yield ""
          return

-     user_chunks = rag_store.get("chunks", []) if isinstance(rag_store, dict) else []
-     user_embeddings = rag_store.get("embeddings") if isinstance(rag_store, dict) else None

      if not user_chunks:
-         yield "Please upload a document first using the upload panel above, then ask questions about it."
          return

-     # Retrieve relevant chunks from uploaded doc
-     rag_results = rag_service.search(
-         message, top_k=4,
-         user_chunks=user_chunks,
-         user_embeddings=user_embeddings
-     )

-     context = "CONTEXT FROM UPLOADED DOCUMENT:\n"
-     for r in rag_results:
-         context += f"\n{r['text']}\n"

      messages = [
          {"role": "system", "content": SYSTEM_PROMPTS["upload_explain"]},
-         {"role": "system", "content": context},
      ]
-     for h in history:
-         messages.append(h)
      messages.append({"role": "user", "content": message})

      for partial in llm_service.stream_chat(messages, temperature=0.5, max_tokens=1024):
          yield partial


- # ============================================================================
- # MODULE 3: QUIZ ME
- # ============================================================================

- def generate_quiz(topic, quiz_type, num_questions, difficulty, rag_store):
-     """Generate a quiz on a bioinformatics topic."""
      if not topic:
-         return "Please select or enter a topic first.", ""

-     # Get relevant context
      rag_results = rag_service.search(topic, top_k=3)
      context = ""
      if rag_results:
-         context = "Use this reference material:\n" + "\n".join(r["text"] for r in rag_results)

      template_key = {
          "Multiple Choice (MCQ)": "mcq",
@@ -527,71 +586,56 @@ def generate_quiz(topic, quiz_type, num_questions, difficulty, rag_store):
      }.get(quiz_type, "mcq")

      quiz_prompt = QUIZ_TEMPLATES[template_key].format(
-         n=int(num_questions),
-         topic=topic,
-         difficulty=difficulty
      )

-     messages = [
-         {"role": "system", "content": SYSTEM_PROMPTS["quiz_me"]},
-     ]
      if context:
          messages.append({"role": "system", "content": context})
      messages.append({"role": "user", "content": quiz_prompt})

      response = llm_service.generate(messages, temperature=0.8, max_tokens=2000)

-     # Format nicely
      formatted = f"## 🧠 {topic} Quiz — {difficulty}\n\n"
-     formatted += f"*Type: {quiz_type} | Questions: {int(num_questions)}*\n\n---\n\n"
      formatted += response

-     # Store answer key
-     answer_key = response
-
-     return formatted, answer_key


  def check_quiz_answers(user_answers, answer_key):
-     """Provide feedback on quiz answers."""
-     if not user_answers.strip():
-         return "Please enter your answers first."
      if not answer_key:
-         return "Please generate a quiz first."

      messages = [
-         {"role": "system", "content": "You are a bioinformatics tutor grading a quiz. Compare the student's answers to the correct answers. For each answer: mark it ✅ correct or ❌ incorrect, explain why, and provide the correct answer if wrong. Be encouraging but accurate. Give a final score."},
-         {"role": "user", "content": f"QUIZ AND ANSWER KEY:\n{answer_key}\n\nSTUDENT'S ANSWERS:\n{user_answers}\n\nPlease grade each answer:"}
      ]
-
      return llm_service.generate(messages, temperature=0.3, max_tokens=1500)


- # ============================================================================
- # MODULE 4: BUILD A LESSON
- # ============================================================================

  def generate_lesson(topic, level, include_exercises, include_quiz):
-     """Generate a structured lesson on a bioinformatics topic."""
      if not topic:
-         return "Please select or enter a topic."

-     # Get relevant context
      rag_results = rag_service.search(topic, top_k=4)
      context = ""
      if rag_results:
-         context = "Reference material:\n" + "\n".join(r["text"] for r in rag_results)

      prompt = LESSON_TEMPLATE.format(topic=topic, level=level)
-
      if include_exercises:
          prompt += "\n\nInclude 2-3 practical exercises with clear instructions."
      if include_quiz:
-         prompt += "\n\nInclude a 5-question self-assessment quiz at the end (with answers)."

-     messages = [
-         {"role": "system", "content": SYSTEM_PROMPTS["build_lesson"]},
-     ]
      if context:
          messages.append({"role": "system", "content": context})
      messages.append({"role": "user", "content": prompt})
@@ -599,148 +643,107 @@ def generate_lesson(topic, level, include_exercises, include_quiz):
      return llm_service.generate(messages, temperature=0.7, max_tokens=3000)


- # ============================================================================
- # MODULE 5: WORKFLOW COACH
- # ============================================================================

  def workflow_respond(message, history, selected_workflow, temperature):
-     """Workflow coaching chat handler."""
-     if not message.strip():
          yield ""
          return

-     # Get workflow context
      workflow_context = ""
      for wf_key, wf in WORKFLOWS.items():
          if wf["name"] in selected_workflow or selected_workflow.lower() in wf["name"].lower():
              workflow_context = f"WORKFLOW REFERENCE: {wf['name']}\n\n"
              for step in wf["steps"]:
                  workflow_context += f"Step {step['step']}: {step['name']}\n"
-                 workflow_context += f"  Description: {step['description']}\n"
-                 workflow_context += f"  Tools: {', '.join(step.get('tools', []))}\n"
                  if step.get("common_mistakes"):
-                     workflow_context += f"  Common mistakes: {'; '.join(step['common_mistakes'])}\n"
                  workflow_context += "\n"
              break

-     # Also search RAG
      rag_results = rag_service.search(message, top_k=2)
      if rag_results:
-         workflow_context += "\nADDITIONAL CONTEXT:\n" + "\n".join(r["text"] for r in rag_results)

-     messages = [
-         {"role": "system", "content": SYSTEM_PROMPTS["workflow_coach"]},
-     ]
      if workflow_context:
          messages.append({"role": "system", "content": workflow_context})
-
-     for h in history:
-         messages.append(h)
      messages.append({"role": "user", "content": message})

-     for partial in llm_service.stream_chat(messages, temperature=temperature, max_tokens=1500):
          yield partial


- # ============================================================================
- # MODULE 6: PAPER TO LESSON
- # ============================================================================

- def paper_to_lesson_respond(message, history, output_format, rag_store):
-     """Convert paper content into teaching material."""
-     if not message.strip():
          yield ""
          return

-     user_chunks = rag_store.get("chunks", []) if isinstance(rag_store, dict) else []
-     user_embeddings = rag_store.get("embeddings") if isinstance(rag_store, dict) else None

      context = ""
      if user_chunks:
-         rag_results = rag_service.search(
-             message, top_k=4,
-             user_chunks=user_chunks,
-             user_embeddings=user_embeddings
-         )
          if rag_results:
-             context = "PAPER CONTENT:\n" + "\n".join(r["text"] for r in rag_results)

      format_instruction = {
          "Lesson Plan": "Create a structured lesson plan with learning objectives, sections, and exercises.",
-         "Slide Outline": "Create a slide-by-slide outline with key points for each slide (title + 3-5 bullet points per slide).",
          "Study Notes": "Create concise study notes highlighting key methods, tools, and findings.",
          "Quiz Questions": "Generate 5-10 quiz questions based on the paper's methods and findings.",
      }.get(output_format, "Create a structured lesson plan.")

-     messages = [
-         {"role": "system", "content": SYSTEM_PROMPTS["paper_to_lesson"]},
-     ]
      if context:
          messages.append({"role": "system", "content": context})

-     for h in history:
-         messages.append(h)
-
-     full_message = f"{message}\n\nOUTPUT FORMAT: {format_instruction}"
-     messages.append({"role": "user", "content": full_message})

      for partial in llm_service.stream_chat(messages, temperature=0.7, max_tokens=2500):
          yield partial


- # ============================================================================
- # MODULE 7: VIVA PRACTICE
- # ============================================================================

  def viva_respond(message, history, topic, difficulty):
-     """Viva voce practice session handler."""
-     if not message.strip():
          yield ""
          return

-     # Get topic context
      rag_results = rag_service.search(f"{topic} {message}", top_k=3)
      context = ""
      if rag_results:
-         context = "REFERENCE MATERIAL:\n" + "\n".join(r["text"] for r in rag_results)

      messages = [
          {"role": "system", "content": SYSTEM_PROMPTS["viva_practice"]},
-         {"role": "system", "content": f"VIVA TOPIC: {topic}\nDIFFICULTY LEVEL: {difficulty}\n\n{context}"},
      ]
-
-     for h in history:
-         messages.append(h)
      messages.append({"role": "user", "content": message})

      for partial in llm_service.stream_chat(messages, temperature=0.7, max_tokens=1000):
          yield partial


- def start_viva(topic, difficulty):
-     """Generate the opening viva question."""
-     if not topic:
-         return "Please select a topic to begin the viva."
-
-     rag_results = rag_service.search(topic, top_k=2)
-     context = ""
-     if rag_results:
-         context = "\n".join(r["text"] for r in rag_results)
-
-     messages = [
-         {"role": "system", "content": SYSTEM_PROMPTS["viva_practice"]},
-         {"role": "system", "content": f"Topic: {topic}\nDifficulty: {difficulty}\n\nReference: {context}"},
-         {"role": "user", "content": f"I'm ready for my viva on {topic}. Please start with your first question."}
-     ]
-
-     return llm_service.generate(messages, temperature=0.7, max_tokens=500)
-
-
  # ============================================================================
  # GRADIO APP ASSEMBLY
  # ============================================================================

- # Custom CSS
  CUSTOM_CSS = """
  .main-header {
      text-align: center;
@@ -750,8 +753,8 @@ CUSTOM_CSS = """
      margin-bottom: 20px;
      color: white;
  }
- .main-header h1 { color: white; font-size: 2em; margin-bottom: 5px; }
- .main-header p { color: #ecf0f1; font-size: 1.1em; }
  .module-info {
      background: #f0f9ff;
      border-left: 4px solid #2e86c1;
@@ -767,22 +770,38 @@ CUSTOM_CSS = """
      border-radius: 0 8px 8px 0;
      font-size: 0.9em;
  }
  """


  def build_app():
-     with gr.Blocks(title="Bioinformatics with BB Tutor") as demo:

-         # Shared state across all tabs
          rag_store = gr.State({"chunks": [], "embeddings": None})

-         # ── Header ────────────────────────────────────────────────────────
-         gr.HTML("""
          <div class="main-header">
              <h1>🧬 Bioinformatics with BB Tutor</h1>
-             <p>Your AI-powered bioinformatics teaching assistant</p>
              <p style="font-size: 0.85em; opacity: 0.9;">
                  RNA-seq · Exome · Genome · Microbiome · Variants · Molecular Genetics · scRNA-seq · ATAC-seq · ChIP-seq · and more
              </p>
          </div>
          """)

@@ -792,7 +811,7 @@ def build_app():
          # TAB 1: ASK THE TUTOR
          # ══════════════════════════════════════════════════════════════
          with gr.Tab("🧬 Ask the Tutor", id="ask"):
-             gr.HTML('<div class="module-info">💡 Ask any bioinformatics question. The tutor uses a curated knowledge base to provide accurate, educational answers with proper context.</div>')

              gr.ChatInterface(
                  fn=tutor_respond,
@@ -801,38 +820,32 @@ def build_app():
                      gr.Textbox(
                          value=SYSTEM_PROMPTS["ask_tutor"],
                          label="System Prompt",
-                         lines=3,
-                         visible=True,
-                     ),
-                     gr.Slider(
-                         minimum=0.1, maximum=1.5, value=0.7, step=0.1,
-                         label="Temperature (lower = more focused, higher = more creative)"
-                     ),
-                     gr.Slider(
-                         minimum=256, maximum=4096, value=1024, step=256,
-                         label="Max Response Length (tokens)"
                      ),
                      rag_store,
                  ],
-                 additional_inputs_accordion=gr.Accordion("⚙️ Advanced Settings", open=False),
                  examples=[
-                     "What is the difference between DESeq2 and edgeR for differential expression analysis?",
-                     "Explain the GATK Best Practices variant calling pipeline step by step.",
-                     "What is the difference between alpha and beta diversity in microbiome analysis?",
                      "Why should I use adjusted p-values instead of raw p-values?",
-                     "Explain the single-cell RNA-seq analysis workflow from raw data to cell type annotation.",
-                     "What is BQSR and why is it important in variant calling?",
                  ],
-                 save_history=True,
              )
-
-             gr.HTML('<div class="safety-notice">⚠️ <strong>Educational use only.</strong> This tutor provides learning support, not clinical interpretations. Always consult qualified professionals for clinical genomics decisions.</div>')

          # ══════════════════════════════════════════════════════════════
-         # TAB 2: UPLOAD AND EXPLAIN
          # ══════════════════════════════════════════════════════════════
          with gr.Tab("📄 Upload & Explain", id="upload"):
-             gr.HTML('<div class="module-info">📄 Upload bioinformatics documents (PDFs, text files, VCFs, FASTA, etc.) and get AI-powered explanations. Uploaded content becomes available for Q&A across all modules.</div>')

              with gr.Row():
                  with gr.Column(scale=1):
@@ -840,11 +853,12 @@ def build_app():
                      label="Upload Document",
                      file_types=[".pdf", ".txt", ".md", ".csv", ".tsv",
                                  ".fasta", ".fa", ".fastq", ".vcf", ".bed",
-                                 ".gff", ".gtf", ".sam"],
                      file_count="single",
                      type="filepath",
                  )
-                 process_btn = gr.Button("🔍 Analyze Document", variant="primary", size="lg")

                  with gr.Column(scale=2):
                      explanation_output = gr.Markdown(label="Analysis & Explanation")
@@ -858,14 +872,14 @@ def build_app():
                  outputs=[explanation_output, raw_text_output, rag_store],
              )

-             gr.Markdown("### 💬 Ask Questions About Your Document")
              gr.ChatInterface(
                  fn=upload_chat_respond,
                  type="messages",
                  additional_inputs=[rag_store],
                  additional_inputs_accordion=gr.Accordion("", open=False, visible=False),
                  examples=[
-                     "Summarize the key methods used in this paper.",
                      "What bioinformatics tools are mentioned?",
                      "Explain the main findings in simple terms.",
                      "What are the limitations of this analysis?",
@@ -876,37 +890,29 @@ def build_app():
          # TAB 3: QUIZ ME
          # ══════════════════════════════════════════════════════════════
          with gr.Tab("❓ Quiz Me", id="quiz"):
-             gr.HTML('<div class="module-info">🧠 Test your knowledge with auto-generated quizzes. Choose a topic, format, and difficulty level.</div>')

              with gr.Row():
-                 with gr.Column(scale=2):
-                     quiz_topic = gr.Dropdown(
-                         choices=TOPIC_CHOICES,
-                         label="Select Topic",
-                         allow_custom_value=True,
-                         value="RNA-seq: Differential Expression (DESeq2)"
-                     )
-                 with gr.Column(scale=1):
-                     quiz_type = gr.Radio(
-                         choices=["Multiple Choice (MCQ)", "True/False", "Short Answer"],
-                         value="Multiple Choice (MCQ)",
-                         label="Question Format"
-                     )

              with gr.Row():
-                 with gr.Column(scale=1):
-                     quiz_difficulty = gr.Radio(
-                         choices=DIFFICULTY_LEVELS,
-                         value="Intermediate",
-                         label="Difficulty"
-                     )
-                 with gr.Column(scale=1):
-                     num_questions = gr.Slider(
-                         minimum=1, maximum=10, value=5, step=1,
-                         label="Number of Questions"
-                     )
-                 with gr.Column(scale=1):
-                     generate_quiz_btn = gr.Button("🎲 Generate Quiz", variant="primary", size="lg")

              quiz_output = gr.Markdown(label="Generated Quiz")
              answer_key_state = gr.State("")
@@ -919,12 +925,14 @@ def build_app():

              gr.Markdown("---")
              gr.Markdown("### ✍️ Submit Your Answers")
-             user_answers = gr.Textbox(
-                 label="Enter your answers (e.g., '1: A, 2: B, 3: True...')",
-                 lines=5,
-                 placeholder="Type your answers here..."
-             )
-             check_btn = gr.Button("✅ Check Answers", variant="primary")
              feedback_output = gr.Markdown(label="Feedback")

              check_btn.click(
@@ -937,27 +945,25 @@ def build_app():
          # TAB 4: BUILD A LESSON
          # ══════════════════════════════════════════════════════════════
          with gr.Tab("📚 Build a Lesson", id="lesson"):
-             gr.HTML('<div class="module-info">📚 Generate structured lessons with learning objectives, explanations, exercises, and quizzes for any bioinformatics topic.</div>')

              with gr.Row():
-                 with gr.Column(scale=2):
-                     lesson_topic = gr.Dropdown(
-                         choices=TOPIC_CHOICES,
-                         label="Lesson Topic",
-                         allow_custom_value=True,
-                         value="RNA-seq: Differential Expression (DESeq2)"
-                     )
-                 with gr.Column(scale=1):
-                     lesson_level = gr.Radio(
-                         choices=DIFFICULTY_LEVELS,
-                         value="Intermediate",
-                         label="Student Level"
-                     )

              with gr.Row():
-                 include_exercises = gr.Checkbox(label="Include Practical Exercises", value=True)
-                 include_quiz = gr.Checkbox(label="Include Self-Assessment Quiz", value=True)
-                 generate_lesson_btn = gr.Button("📝 Generate Lesson", variant="primary", size="lg")

              lesson_output = gr.Markdown(label="Generated Lesson")

@@ -971,7 +977,7 @@ def build_app():
          # TAB 5: WORKFLOW COACH
          # ══════════════════════════════════════════════════════════════
          with gr.Tab("🔬 Workflow Coach", id="workflow"):
-             gr.HTML('<div class="module-info">🔬 Get step-by-step guidance through bioinformatics analysis pipelines. Select a workflow and ask questions about any step.</div>')

              workflow_selector = gr.Dropdown(
                  choices=WORKFLOW_CHOICES,
@@ -985,19 +991,16 @@ def build_app():
                  type="messages",
                  additional_inputs=[
                      workflow_selector,
-                     gr.Slider(
-                         minimum=0.1, maximum=1.5, value=0.7, step=0.1,
-                         label="Temperature"
-                     ),
                  ],
-                 additional_inputs_accordion=gr.Accordion("⚙️ Settings", open=False),
                  examples=[
-                     "Walk me through the complete pipeline from raw FASTQ to differential expression results.",
-                     "I'm at the alignment step. What should I check before moving to counting?",
                      "My mapping rate is only 45%. What could be wrong?",
-                     "How do I choose between STAR and HISAT2 for RNA-seq alignment?",
-                     "What parameters should I use for GATK HaplotypeCaller on exome data?",
-                     "How do I set the truncation parameters for DADA2 in QIIME2?",
                  ],
              )

@@ -1005,7 +1008,7 @@ def build_app():
          # TAB 6: PAPER TO LESSON
          # ══════════════════════════════════════════════════════════════
          with gr.Tab("📰 Paper to Lesson", id="paper"):
-             gr.HTML('<div class="module-info">📰 Convert research papers into teaching material. Upload a paper first in the "Upload & Explain" tab, then use this module to generate lessons, slide outlines, and quiz questions from it.</div>')

              output_format = gr.Radio(
                  choices=["Lesson Plan", "Slide Outline", "Study Notes", "Quiz Questions"],
@@ -1016,16 +1019,13 @@ def build_app():
              gr.ChatInterface(
                  fn=paper_to_lesson_respond,
                  type="messages",
-                 additional_inputs=[
-                     output_format,
-                     rag_store,
-                 ],
                  additional_inputs_accordion=gr.Accordion("", open=False, visible=False),
                  examples=[
                      "Convert this paper into a 45-minute lecture plan.",
-                     "Create a slide outline covering the key methods in this paper.",
-                     "Generate study notes highlighting the bioinformatics methods used.",
-                     "Create quiz questions testing understanding of this paper's methodology.",
                  ],
              )

@@ -1033,7 +1033,7 @@ def build_app():
          # TAB 7: VIVA PRACTICE
          # ══════════════════════════════════════════════════════════════
          with gr.Tab("🎓 Viva Practice", id="viva"):
-             gr.HTML('<div class="module-info">🎓 Practice for oral examinations. The AI examiner asks probing questions, evaluates your answers, and pushes you to demonstrate deeper understanding.</div>')

              with gr.Row():
                  viva_topic = gr.Dropdown(
@@ -1045,31 +1045,28 @@ def build_app():
                  viva_difficulty = gr.Radio(
                      choices=DIFFICULTY_LEVELS,
                      value="Intermediate",
-                     label="Exam Difficulty"
                  )

              gr.ChatInterface(
                  fn=viva_respond,
                  type="messages",
-                 additional_inputs=[
-                     viva_topic,
-                     viva_difficulty,
-                 ],
-                 additional_inputs_accordion=gr.Accordion("⚙️ Settings", open=False),
                  examples=[
-                     "I'm ready for my viva. Please start with your first question.",
-                     "Can we focus on the statistical aspects of RNA-seq analysis?",
                      "Ask me about variant calling and interpretation.",
-                     "Test my understanding of microbiome diversity analysis.",
                  ],
              )

-         # ── Footer ────────────────────────────────────────────────────────
          gr.HTML("""
          <div style="text-align: center; padding: 20px; margin-top: 20px; border-top: 1px solid #e0e0e0; color: #666; font-size: 0.85em;">
              <p><strong>Bioinformatics with BB Tutor</strong> — Educational AI Assistant</p>
-             <p>⚠️ For educational purposes only. Not for clinical use. Always verify critical information with primary sources.</p>
-             <p>Domains: RNA-seq · Exome · Genome · Microbiome · Variants · Molecular Genetics · scRNA-seq · ATAC-seq · ChIP-seq · Methylation · Small RNA · Targeted Panels · Long-read · Spatial Transcriptomics · Multi-omics</p>
          </div>
          """)

  """
  Bioinformatics with BB Tutor — Complete Application
+ A production bioinformatics teaching assistant with 7 modules.
+
+ Architecture:
+ - Backend: LLMService (HuggingFace InferenceClient), RAGService (sentence-transformers),
+   DocumentParser (PyMuPDF + text), knowledge_base (domain content)
+ - Frontend: 7 Gradio tabs with ChatInterface, file upload, quiz generation, lesson building
+ - Data flow: User query → RAG retrieval → LLM with context → streaming response
+ - Shared state: rag_store (gr.State) holds uploaded document chunks + embeddings across tabs
  """

  import gradio as gr
  import os
  import re
  import time
  from pathlib import Path

  # ── Conditional imports with fallbacks ────────────────────────────────────────
      HAS_FITZ = True
  except ImportError:
      HAS_FITZ = False
+     print("Warning: PyMuPDF not available. PDF parsing disabled.")

  try:
      from sentence_transformers import SentenceTransformer
      HAS_ST = True
  except ImportError:
      HAS_ST = False
+     print("Warning: sentence-transformers not available. Embedding search disabled.")

  try:
      from huggingface_hub import InferenceClient
      HAS_HF = True
  except ImportError:
      HAS_HF = False
+     print("Warning: huggingface_hub not available. LLM service disabled.")

  # ── Import knowledge base ────────────────────────────────────────────────────
  from knowledge_base import (
 
  # CONFIGURATION
  # ============================================================================

  LLM_MODEL = os.environ.get("LLM_MODEL", "mistralai/Mistral-7B-Instruct-v0.3")
  EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
  HF_TOKEN = os.environ.get("HF_TOKEN", None)

+ CHUNK_SIZE = 400
+ CHUNK_OVERLAP = 60
  TOP_K_RETRIEVAL = 3


  # ============================================================================
+ # BACKEND SERVICES — Singleton Pattern
  # ============================================================================

  class LLMService:
+     """Lazy-initialized LLM inference service."""
+
+     _instance = None
+     _initialized = False
+
+     def __new__(cls):
+         if cls._instance is None:
+             cls._instance = super().__new__(cls)
+         return cls._instance

      def __init__(self):
+         if LLMService._initialized:
+             return
+         LLMService._initialized = True
          self.client = None
+         self._try_init()
+
+     def _try_init(self):
+         if not HAS_HF:
+             print("LLMService: huggingface_hub not available")
+             return
+         if not HF_TOKEN:
+             print("LLMService: HF_TOKEN not set in environment")
+             return
+         try:
+             self.client = InferenceClient(
+                 model=LLM_MODEL,
+                 token=HF_TOKEN,
+                 timeout=120,
+             )
+             print("LLMService: Initialized successfully")
+         except Exception as e:
+             print(f"LLMService: Failed to initialize: {e}")
+             self.client = None

      def is_available(self):
          return self.client is not None

+     def _format_messages(self, messages, system_prompt=None, rag_context=None):
+         """Build message list with optional system prompt and RAG context."""
+         formatted = []
+         if system_prompt:
+             formatted.append({"role": "system", "content": system_prompt})
+         if rag_context:
+             formatted.append({"role": "system", "content": rag_context})
+         # Add conversation history (already formatted)
+         for m in messages:
+             if isinstance(m, dict) and "role" in m:
+                 formatted.append(m)
+             elif isinstance(m, (list, tuple)) and len(m) >= 2:
+                 # Handle tuple format (text, response)
+                 formatted.append({"role": "user", "content": str(m[0])})
+                 if len(m) > 1 and m[1]:
+                     formatted.append({"role": "assistant", "content": str(m[1])})
+         return formatted
+
      def stream_chat(self, messages, temperature=0.7, max_tokens=1024):
+         """Stream chat completion. Yields partial response strings."""
          if not self.is_available():
              yield self._fallback_response(messages)
              return

                  top_p=0.9,
                  stream=True,
              ):
+                 token = ""
+                 if hasattr(chunk, 'choices') and chunk.choices:
+                     choice = chunk.choices[0]
+                     if hasattr(choice, 'delta') and hasattr(choice.delta, 'content'):
+                         token = choice.delta.content or ""
                  partial += token
                  yield partial
          except Exception as e:
+             print(f"LLM stream error: {e}")
+             yield f"⚠️ LLM API error: {str(e)}\n\nPlease check your HF_TOKEN in Space settings and ensure the model '{LLM_MODEL}' is accessible.\n\nThe tutor is still functional using its knowledge base for many questions — try asking about specific bioinformatics topics!"

      def generate(self, messages, temperature=0.7, max_tokens=1024):
          """Non-streaming generation. Returns complete response."""
              )
              return response.choices[0].message.content
          except Exception as e:
+             print(f"LLM generate error: {e}")
+             return f"⚠️ LLM API error: {str(e)}\n\nThe tutor can still answer from its knowledge base. Try asking about specific concepts like 'DESeq2 normalization' or 'variant calling pipeline'!"

      def _fallback_response(self, messages):
+         """Knowledge-base fallback when LLM unavailable."""
+         # Extract user query
          user_msg = ""
          for m in reversed(messages):
+             if isinstance(m, dict) and m.get("role") == "user":
+                 user_msg = m.get("content", "").lower()
                  break

+         if not user_msg:
+             return "⚠️ **LLM not available.** Add HF_TOKEN in Space settings to enable AI responses.\n\nMeanwhile, the knowledge base covers: DESeq2, variant calling, microbiome diversity, scRNA-seq clustering, and more. Try asking a specific question!"
+
          response_parts = []

+         # Glossary match
          for term, definition in GLOSSARY.items():
+             if term.lower() in user_msg:
                  response_parts.append(f"**{term}**: {definition}")
+                 if len(response_parts) >= 3:
+                     break

+         # Workflow match
          for wf_key, wf in WORKFLOWS.items():
+             if any(kw in user_msg for kw in wf["name"].lower().split()):
+                 response_parts.append(f"\n### {wf['name']}")
                  for step in wf["steps"][:3]:
                      response_parts.append(f"**Step {step['step']}: {step['name']}**\n{step['description']}")
                  break

+         # Misconception match
          for misc in COMMON_MISCONCEPTIONS:
+             if misc["domain"].replace("_", " ") in user_msg or any(w in user_msg for w in misc["misconception"].lower().split()[:5]):
                  response_parts.append(f"\n⚠️ **Common Misconception**: {misc['misconception']}\n\n✅ **Correction**: {misc['correction']}")
                  break

          if response_parts:
              return "📚 *Responding from knowledge base (LLM not configured):*\n\n" + "\n\n".join(response_parts)
+         return (
+             "⚠️ **AI responses require HF_TOKEN.**\n\n"
+             "To enable full AI-powered responses:\n"
+             "1. Go to your HuggingFace account → Settings → Access Tokens\n"
+             "2. Create a token with 'inference-api' scope\n"
+             "3. Add it as a Secret named `HF_TOKEN` in this Space's Settings\n\n"
+             "The knowledge base can still answer many questions. Try asking about 'RNA-seq workflow', 'variant calling', or 'microbiome diversity'!"
+         )

  class RAGService:
+     """Document retrieval with lazy embedding model loading."""
+
+     _instance = None
+     _initialized = False
+
+     def __new__(cls):
+         if cls._instance is None:
+             cls._instance = super().__new__(cls)
+         return cls._instance

      def __init__(self):
+         if RAGService._initialized:
+             return
+         RAGService._initialized = True
          self.embedder = None
+         self.kb_chunks = []
+         self.kb_metadata = []
          self.kb_embeddings = None
+         self._build_kb_index()
+
+     def _ensure_embedder(self):
+         """Lazy load the embedding model."""
+         if self.embedder is not None:
+             return True
+         if not HAS_ST:
+             return False
+         try:
+             print("RAGService: Loading embedding model (this may take a moment)...")
+             self.embedder = SentenceTransformer(EMBED_MODEL)
+             print("RAGService: Embedding model loaded")
+             # Now embed the KB
+             if self.kb_chunks:
                  self.kb_embeddings = self.embedder.encode(
                      self.kb_chunks,
                      convert_to_numpy=True,
                      show_progress_bar=False,
                      batch_size=32,
                  )
+                 print(f"RAGService: KB embedded ({len(self.kb_chunks)} chunks)")
+             return True
+         except Exception as e:
+             print(f"RAGService: Failed to load embedder: {e}")
+             return False

      def _build_kb_index(self):
+         """Build searchable chunks from knowledge base."""
          chunks = []
          metadata = []

+         # Glossary
          for term, definition in GLOSSARY.items():
              chunks.append(f"{term}: {definition}")
              metadata.append({"source": "glossary", "topic": term, "type": "definition"})

+         # Workflows
          for wf_key, wf in WORKFLOWS.items():
              for step in wf["steps"]:
                  step_text = f"{wf['name']} - Step {step['step']}: {step['name']}. {step['description']}"
                      "workflow": wf_key
                  })

+         # Misconceptions
          for misc in COMMON_MISCONCEPTIONS:
              text = f"Misconception: {misc['misconception']} Correction: {misc['correction']}"
              chunks.append(text)
                  "severity": misc["severity"]
              })

+         # Taxonomy
          for key, domain in DOMAIN_TAXONOMY.items():
+             text = f"{domain['name']} covers: {', '.join(domain['subtopics'][:10])}"
              chunks.append(text)
              metadata.append({"source": "taxonomy", "topic": key, "type": "domain_overview"})

+         self.kb_chunks = chunks
+         self.kb_metadata = metadata
+         print(f"RAGService: Built KB with {len(chunks)} chunks")

      def search(self, query, top_k=TOP_K_RETRIEVAL, user_chunks=None, user_embeddings=None):
+         """Search KB and optionally user-uploaded content."""
+         if not self._ensure_embedder():
              return self._keyword_search(query, top_k)

          try:

              results = []

+             # Search KB
              if self.kb_embeddings is not None and len(self.kb_embeddings) > 0:
                  kb_scores = np.dot(query_embedding, self.kb_embeddings.T)[0]
                  top_indices = np.argsort(kb_scores)[::-1][:top_k]
                  for idx in top_indices:
+                     if kb_scores[idx] > 0.15:
                          results.append({
                              "text": self.kb_chunks[idx],
                              "score": float(kb_scores[idx]),
                              "metadata": self.kb_metadata[idx]
                          })

+             # Search user content
              if user_chunks and user_embeddings is not None and len(user_embeddings) > 0:
                  user_scores = np.dot(query_embedding, user_embeddings.T)[0]
                  top_user = np.argsort(user_scores)[::-1][:top_k]
                  for idx in top_user:
+                     if user_scores[idx] > 0.15:
                          results.append({
                              "text": user_chunks[idx],
                              "score": float(user_scores[idx]),
+                             "metadata": {"source": "uploaded", "type": "user_content"}
                          })

              results.sort(key=lambda x: x["score"], reverse=True)
              return results[:top_k]
          except Exception as e:
+             print(f"RAG search error: {e}")
              return self._keyword_search(query, top_k)

      def _keyword_search(self, query, top_k=3):
+         """Fallback keyword search."""
          query_words = set(query.lower().split())
          scored = []
          for i, chunk in enumerate(self.kb_chunks):
          return scored[:top_k]

      def embed_chunks(self, chunks):
+         """Embed text chunks. Returns numpy array or None."""
+         if not self._ensure_embedder() or not chunks:
              return None
          try:
              return self.embedder.encode(
                  convert_to_numpy=True,
                  normalize_embeddings=True,
                  show_progress_bar=False,
+                 batch_size=16,
              )
+         except Exception as e:
+             print(f"Embed chunks error: {e}")
              return None

  class DocumentParser:
+     """Parse uploaded documents."""

      @staticmethod
      def parse_file(filepath):
          """Extract text from uploaded file."""
          if filepath is None:
              return "", []
          filepath = str(filepath)
          ext = Path(filepath).suffix.lower()

          try:
              if ext == ".pdf" and HAS_FITZ:
                  return DocumentParser._parse_pdf(filepath)
+             elif ext in (".txt", ".md", ".csv", ".tsv", ".fasta", ".fa", ".fastq", ".fq", ".vcf", ".bed", ".gff", ".gtf", ".sam", ".bam"):
                  return DocumentParser._parse_text(filepath)
              else:
                  return f"Unsupported file type: {ext}", []
          doc = fitz.open(filepath)
          pages = []
          for page_num in range(len(doc)):
+             text = doc[page_num].get_text()
              if text.strip():
                  pages.append(text)
          doc.close()
          return chunks


+ # ── Initialize services (lightweight, no heavy downloads) ────────────────────
  llm_service = LLMService()
  rag_service = RAGService()
  doc_parser = DocumentParser()
+ print(f"🧬 BB Tutor initialized. LLM: {llm_service.is_available()}, Embeddings: {rag_service.embedder is not None}")

  # ============================================================================
+ # HANDLER FUNCTIONS — All receive/return plain Python values (not components)
  # ============================================================================

+ def _rag_context(query, user_chunks=None, user_embeddings=None):
+     """Retrieve RAG context as formatted string."""
+     results = rag_service.search(query, top_k=TOP_K_RETRIEVAL,
+                                  user_chunks=user_chunks, user_embeddings=user_embeddings)
+     if not results:
+         return ""
+     parts = ["RELEVANT KNOWLEDGE BASE CONTEXT:"]
+     for r in results:
+         source = r["metadata"].get("source", "kb")
+         parts.append(f"[{source}] {r['text'][:800]}")
+     return "\n".join(parts)
+
+
+ def _format_history(history):
+     """Convert Gradio history to OpenAI-style messages."""
+     messages = []
+     for h in history:
+         if isinstance(h, dict):
+             messages.append(h)
+         elif isinstance(h, (list, tuple)):
+             if len(h) >= 1 and h[0]:
+                 messages.append({"role": "user", "content": str(h[0])})
+             if len(h) >= 2 and h[1]:
+                 messages.append({"role": "assistant", "content": str(h[1])})
+     return messages


+ # ── Module 1: Ask the Tutor ───────────────────────────────────────────────────

+ def tutor_respond(message, history, system_prompt, temperature, max_tokens, rag_state):
+     """Ask the Tutor handler — streaming."""
+     if not message or not message.strip():
+         yield ""
+         return

+     rag_state = rag_state or {"chunks": [], "embeddings": None}
+     user_chunks = rag_state.get("chunks", [])
+     user_embeddings = rag_state.get("embeddings")

+     rag_ctx = _rag_context(message, user_chunks, user_embeddings)

+     messages = [{"role": "system", "content": system_prompt}]
+     if rag_ctx:
+         messages.append({"role": "system", "content": rag_ctx})
+     messages.extend(_format_history(history))
      messages.append({"role": "user", "content": message})

+     for partial in llm_service.stream_chat(messages, temperature, max_tokens):
          yield partial


+ # ── Module 2: Upload & Explain ──────────────────────────────────────────────
+
+ def process_upload(file, rag_state):
+     """Process uploaded file. Returns (explanation_markdown, raw_text, new_rag_state)."""
+     rag_state = rag_state or {"chunks": [], "embeddings": None}

      if file is None:
+         return "📁 Please upload a file first.", "", rag_state

      full_text, chunks = doc_parser.parse_file(file)

      if not chunks:
+         return "⚠️ Could not extract text from the uploaded file.", full_text[:2000] if full_text else "", rag_state

      embeddings = rag_service.embed_chunks(chunks)
+     new_state = {"chunks": chunks, "embeddings": embeddings}

+     preview = full_text[:2500] if len(full_text) > 2500 else full_text
+     msgs = [
          {"role": "system", "content": SYSTEM_PROMPTS["upload_explain"]},
+         {"role": "user", "content": f"Analyze and explain this bioinformatics document:\n\n{preview}"}
      ]
+     explanation = llm_service.generate(msgs, temperature=0.5, max_tokens=1500)

+     stats = f"📊 **Document Stats:** {len(chunks)} chunks, ~{len(full_text.split())} words | "
+     stats += f"File type: {Path(str(file)).suffix} | "
+     stats += "🤖 AI-powered" if llm_service.is_available() else "📚 Knowledge-base mode"
+     stats += "\n\n---\n\n"

+     return stats + explanation, full_text[:5000], new_state

+ def upload_chat_respond(message, history, rag_state):
+     """Chat about uploaded documents."""
+     if not message or not message.strip():
          yield ""
          return

+     rag_state = rag_state or {"chunks": [], "embeddings": None}
+     user_chunks = rag_state.get("chunks", [])
+     user_embeddings = rag_state.get("embeddings")

      if not user_chunks:
+         yield "📁 Please upload a document in the panel above, then ask questions about it.\n\nYour uploaded document will be indexed and searchable across all modules!"
          return

+     rag_results = rag_service.search(message, top_k=4, user_chunks=user_chunks, user_embeddings=user_embeddings)

+     ctx = "CONTEXT FROM UPLOADED DOCUMENT:\n"
+     if rag_results:
+         for r in rag_results:
+             ctx += f"\n{r['text'][:600]}\n"
+     else:
+         ctx += "(No highly relevant passages found — answering from general knowledge)\n"

      messages = [
          {"role": "system", "content": SYSTEM_PROMPTS["upload_explain"]},
+         {"role": "system", "content": ctx},
      ]
+     messages.extend(_format_history(history))
      messages.append({"role": "user", "content": message})

      for partial in llm_service.stream_chat(messages, temperature=0.5, max_tokens=1024):
          yield partial

+ # ── Module 3: Quiz Me ─────────────────────────────────────────────────────────

+ def generate_quiz(topic, quiz_type, num_questions, difficulty, rag_state):
+     """Generate quiz. Returns (quiz_markdown, answer_key_string)."""
      if not topic:
+         return "❓ Please select or enter a topic first.", ""

+     rag_state = rag_state or {"chunks": [], "embeddings": None}
      rag_results = rag_service.search(topic, top_k=3)
      context = ""
      if rag_results:
+         context = "Reference material:\n" + "\n".join(r["text"][:500] for r in rag_results)

      template_key = {
          "Multiple Choice (MCQ)": "mcq",
      }.get(quiz_type, "mcq")

      quiz_prompt = QUIZ_TEMPLATES[template_key].format(
+         n=int(num_questions), topic=topic, difficulty=difficulty
      )

+     messages = [{"role": "system", "content": SYSTEM_PROMPTS["quiz_me"]}]
      if context:
          messages.append({"role": "system", "content": context})
      messages.append({"role": "user", "content": quiz_prompt})

      response = llm_service.generate(messages, temperature=0.8, max_tokens=2000)

      formatted = f"## 🧠 {topic} Quiz — {difficulty}\n\n"
+     formatted += f"*Format: {quiz_type} | Questions: {int(num_questions)}*\n\n---\n\n"
      formatted += response

+     return formatted, response


  def check_quiz_answers(user_answers, answer_key):
+     """Grade quiz answers."""
+     if not user_answers or not user_answers.strip():
+         return "✍️ Please enter your answers above before checking."
      if not answer_key:
+         return "⚠️ Please generate a quiz first (use the panel above)."

      messages = [
+         {"role": "system", "content": "You are a bioinformatics tutor grading a quiz. Compare student answers to correct answers. For each: mark ✅ or ❌, explain briefly, provide correct answer if wrong. Be encouraging. Give final score."},
+         {"role": "user", "content": f"QUIZ AND ANSWERS:\n{answer_key}\n\nSTUDENT ANSWERS:\n{user_answers}\n\nGrade each:"}
      ]
      return llm_service.generate(messages, temperature=0.3, max_tokens=1500)

 
 
621
 
622
  def generate_lesson(topic, level, include_exercises, include_quiz):
623
+ """Generate structured lesson."""
624
  if not topic:
625
+ return "πŸ“š Please select or enter a topic."

     rag_results = rag_service.search(topic, top_k=4)
     context = ""
     if rag_results:
+        context = "Reference:\n" + "\n".join(r["text"][:500] for r in rag_results)

     prompt = LESSON_TEMPLATE.format(topic=topic, level=level)
     if include_exercises:
         prompt += "\n\nInclude 2-3 practical exercises with clear instructions."
     if include_quiz:
+        prompt += "\n\nInclude a 5-question self-assessment quiz (with answers)."

+    messages = [{"role": "system", "content": SYSTEM_PROMPTS["build_lesson"]}]
     if context:
         messages.append({"role": "system", "content": context})
     messages.append({"role": "user", "content": prompt})

     return llm_service.generate(messages, temperature=0.7, max_tokens=3000)


+# ── Module 5: Workflow Coach ──────────────────────────────────────────────────


 def workflow_respond(message, history, selected_workflow, temperature):
+    """Workflow Coach handler."""
+    if not message or not message.strip():
         yield ""
         return

+    selected_workflow = selected_workflow or ""
     workflow_context = ""
     for wf_key, wf in WORKFLOWS.items():
         if wf["name"] in selected_workflow or selected_workflow.lower() in wf["name"].lower():
             workflow_context = f"WORKFLOW REFERENCE: {wf['name']}\n\n"
             for step in wf["steps"]:
                 workflow_context += f"Step {step['step']}: {step['name']}\n"
+                workflow_context += f"  {step['description']}\n"
+                if step.get("tools"):
+                    workflow_context += f"  Tools: {', '.join(step['tools'])}\n"
                 if step.get("common_mistakes"):
+                    workflow_context += f"  ⚠️ Common mistakes: {'; '.join(step['common_mistakes'])}\n"
                 workflow_context += "\n"
             break

     rag_results = rag_service.search(message, top_k=2)
     if rag_results:
+        workflow_context += "\nADDITIONAL CONTEXT:\n" + "\n".join(r["text"][:500] for r in rag_results)

+    messages = [{"role": "system", "content": SYSTEM_PROMPTS["workflow_coach"]}]
     if workflow_context:
         messages.append({"role": "system", "content": workflow_context})
+    messages.extend(_format_history(history))
     messages.append({"role": "user", "content": message})

+    for partial in llm_service.stream_chat(messages, temperature, 1500):
         yield partial

+# ── Module 6: Paper to Lesson ─────────────────────────────────────────────────


+def paper_to_lesson_respond(message, history, output_format, rag_state):
+    """Convert papers to teaching material."""
+    if not message or not message.strip():
         yield ""
         return

+    rag_state = rag_state or {"chunks": [], "embeddings": None}
+    user_chunks = rag_state.get("chunks", [])
+    user_embeddings = rag_state.get("embeddings")
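+    # Reuses whatever the Upload & Explain tab indexed into the shared store;
+    # with nothing uploaded, the handler answers without paper context.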

     context = ""
     if user_chunks:
+        rag_results = rag_service.search(message, top_k=4, user_chunks=user_chunks, user_embeddings=user_embeddings)
         if rag_results:
+            context = "PAPER CONTENT:\n" + "\n".join(r["text"][:600] for r in rag_results)

     format_instruction = {
         "Lesson Plan": "Create a structured lesson plan with learning objectives, sections, and exercises.",
+        "Slide Outline": "Create a slide-by-slide outline with key points for each slide.",
         "Study Notes": "Create concise study notes highlighting key methods, tools, and findings.",
         "Quiz Questions": "Generate 5-10 quiz questions based on the paper's methods and findings.",
     }.get(output_format, "Create a structured lesson plan.")

+    messages = [{"role": "system", "content": SYSTEM_PROMPTS["paper_to_lesson"]}]
     if context:
         messages.append({"role": "system", "content": context})
+    messages.extend(_format_history(history))

+    full_msg = f"{message}\n\nOUTPUT FORMAT: {format_instruction}"
+    messages.append({"role": "user", "content": full_msg})

     for partial in llm_service.stream_chat(messages, temperature=0.7, max_tokens=2500):
         yield partial

+# ── Module 7: Viva Practice ───────────────────────────────────────────────────


 def viva_respond(message, history, topic, difficulty):
+    """Viva practice handler."""
+    if not message or not message.strip():
         yield ""
         return

     rag_results = rag_service.search(f"{topic} {message}", top_k=3)
     context = ""
     if rag_results:
+        context = "Reference:\n" + "\n".join(r["text"][:500] for r in rag_results)

     messages = [
         {"role": "system", "content": SYSTEM_PROMPTS["viva_practice"]},
+        {"role": "system", "content": f"VIVA TOPIC: {topic}\nDIFFICULTY: {difficulty}\n\n{context}"},
     ]
+    messages.extend(_format_history(history))
     messages.append({"role": "user", "content": message})

     for partial in llm_service.stream_chat(messages, temperature=0.7, max_tokens=1000):
         yield partial


 # ============================================================================
 # GRADIO APP ASSEMBLY
 # ============================================================================

 CUSTOM_CSS = """
 .main-header {
     text-align: center;
     margin-bottom: 20px;
     color: white;
 }
+.main-header h1 { color: white; font-size: 2em; margin: 0; }
+.main-header p { color: #ecf0f1; margin: 5px 0; }
 .module-info {
     background: #f0f9ff;
     border-left: 4px solid #2e86c1;
     border-radius: 0 8px 8px 0;
     font-size: 0.9em;
 }
+.status-badge {
+    display: inline-block;
+    padding: 4px 12px;
+    border-radius: 12px;
+    font-size: 0.85em;
+    font-weight: bold;
+}
+.status-on { background: #d4edda; color: #155724; }
+.status-off { background: #f8d7da; color: #721c24; }
 """


 def build_app():
+    with gr.Blocks(title="Bioinformatics with BB Tutor", css=CUSTOM_CSS) as demo:

+        # ── Global shared state ─────────────────────────────────────────
         rag_store = gr.State({"chunks": [], "embeddings": None})
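+        # A single State instance is shared by every tab, so a document
+        # uploaded in Upload & Explain stays queryable from the other modules.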

+        # ── Status indicator ────────────────────────────────────────────
+        llm_status = "🟒 AI Enabled" if llm_service.is_available() else "πŸ”΄ AI Offline (Knowledge Base Active)"
+
+        # ── Header ──────────────────────────────────────────────────────
+        gr.HTML(f"""
         <div class="main-header">
             <h1>🧬 Bioinformatics with BB Tutor</h1>
+            <p>AI-powered bioinformatics teaching assistant</p>
             <p style="font-size: 0.85em; opacity: 0.9;">
                 RNA-seq Β· Exome Β· Genome Β· Microbiome Β· Variants Β· Molecular Genetics Β· scRNA-seq Β· ATAC-seq Β· ChIP-seq Β· and more
             </p>
+            <p style="font-size: 0.8em; margin-top: 8px;">
+                <span class="status-badge {'status-on' if llm_service.is_available() else 'status-off'}">{llm_status}</span>
+            </p>
         </div>
         """)
 
 
         # ══════════════════════════════════════════════════════════════
         # TAB 1: ASK THE TUTOR
         # ══════════════════════════════════════════════════════════════
         with gr.Tab("🧬 Ask the Tutor", id="ask"):
+            gr.HTML('<div class="module-info">πŸ’‘ Ask any bioinformatics question. RAG-augmented responses from a curated knowledge base covering 15+ domains.</div>')

             gr.ChatInterface(
                 fn=tutor_respond,
                 type="messages",
                 additional_inputs=[
                     gr.Textbox(
                         value=SYSTEM_PROMPTS["ask_tutor"],
                         label="System Prompt",
+                        lines=2,
+                        visible=False,
                     ),
+                    gr.Slider(0.1, 1.5, 0.7, step=0.1, label="Temperature", visible=False),
+                    gr.Slider(256, 4096, 1024, step=256, label="Max Tokens", visible=False),
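+                    # Hidden components still pass their default values to
+                    # tutor_respond; visible=False only hides them in the UI.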
                     rag_store,
                 ],
+                additional_inputs_accordion=gr.Accordion("βš™οΈ Advanced", open=False, visible=False),
                 examples=[
+                    "What is the difference between DESeq2 and edgeR?",
+                    "Explain the GATK variant calling pipeline step by step.",
+                    "What is the difference between alpha and beta diversity?",
                     "Why should I use adjusted p-values instead of raw p-values?",
+                    "Explain the single-cell RNA-seq analysis workflow.",
+                    "What is BQSR and why is it important?",
+                    "How do I choose between STAR and HISAT2 for alignment?",
+                    "What common mistakes do students make with DESeq2?",
                 ],
             )
+            gr.HTML('<div class="safety-notice">⚠️ <strong>Educational use only.</strong> Not for clinical interpretation. Always consult qualified professionals for clinical genomics.</div>')

         # ══════════════════════════════════════════════════════════════
+        # TAB 2: UPLOAD & EXPLAIN
         # ══════════════════════════════════════════════════════════════
         with gr.Tab("πŸ“„ Upload & Explain", id="upload"):
+            gr.HTML('<div class="module-info">πŸ“„ Upload bioinformatics documents (PDF, TXT, FASTA, VCF, etc.) and get AI-powered analysis. Content is indexed and searchable across all modules.</div>')

             with gr.Row():
                 with gr.Column(scale=1):
                     file_input = gr.File(
                         label="Upload Document",
                         file_types=[".pdf", ".txt", ".md", ".csv", ".tsv",
                                     ".fasta", ".fa", ".fastq", ".vcf", ".bed",
+                                    ".gff", ".gtf", ".sam", ".bam"],
                         file_count="single",
                         type="filepath",
                     )
+                    process_btn = gr.Button("πŸ” Analyze Document", variant="primary")
+                    gr.Markdown("**Supported:** PDF, text, FASTA/FASTQ, VCF, BED, GFF/GTF, SAM/BAM, CSV/TSV")

                 with gr.Column(scale=2):
                     explanation_output = gr.Markdown(label="Analysis & Explanation")

                 outputs=[explanation_output, raw_text_output, rag_store],
             )

+            gr.Markdown("### πŸ’¬ Chat About Your Document")
             gr.ChatInterface(
                 fn=upload_chat_respond,
                 type="messages",
                 additional_inputs=[rag_store],
                 additional_inputs_accordion=gr.Accordion("", open=False, visible=False),
                 examples=[
+                    "Summarize the key methods in this paper.",
                     "What bioinformatics tools are mentioned?",
                     "Explain the main findings in simple terms.",
                     "What are the limitations of this analysis?",
890
         # ══════════════════════════════════════════════════════════════
         # TAB 3: QUIZ ME
         # ══════════════════════════════════════════════════════════════
         with gr.Tab("❓ Quiz Me", id="quiz"):
+            gr.HTML('<div class="module-info">🧠 Test your knowledge with AI-generated quizzes across all bioinformatics domains.</div>')

             with gr.Row():
+                quiz_topic = gr.Dropdown(
+                    choices=TOPIC_CHOICES,
+                    label="Select Topic",
+                    allow_custom_value=True,
+                    value="RNA-seq: Differential Expression (DESeq2)"
+                )
+                quiz_type = gr.Radio(
+                    choices=["Multiple Choice (MCQ)", "True/False", "Short Answer"],
+                    value="Multiple Choice (MCQ)",
+                    label="Format"
+                )

             with gr.Row():
+                quiz_difficulty = gr.Radio(
+                    choices=DIFFICULTY_LEVELS,
+                    value="Intermediate",
+                    label="Difficulty"
+                )
+                num_questions = gr.Slider(1, 10, 5, step=1, label="# Questions")
+                generate_quiz_btn = gr.Button("🎲 Generate Quiz", variant="primary")

             quiz_output = gr.Markdown(label="Generated Quiz")
             answer_key_state = gr.State("")
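+
+            # A sketch of the click wiring these components imply (fn and
+            # argument order are assumptions; the actual call follows here):
+            #
+            #     generate_quiz_btn.click(
+            #         fn=generate_quiz,
+            #         inputs=[quiz_topic, quiz_type, num_questions,
+            #                 quiz_difficulty, rag_store],
+            #         outputs=[quiz_output, answer_key_state],
+            #     )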
 

             gr.Markdown("---")
             gr.Markdown("### ✍️ Submit Your Answers")
+            with gr.Row():
+                user_answers = gr.Textbox(
+                    label="Your Answers (e.g., '1: A, 2: B')",
+                    lines=5,
+                    placeholder="Type your answers here...",
+                    scale=3
+                )
+                check_btn = gr.Button("βœ… Check", variant="primary", scale=1)
             feedback_output = gr.Markdown(label="Feedback")

             check_btn.click(
 
         # ══════════════════════════════════════════════════════════════
         # TAB 4: BUILD A LESSON
         # ══════════════════════════════════════════════════════════════
         with gr.Tab("πŸ“š Build a Lesson", id="lesson"):
+            gr.HTML('<div class="module-info">πŸ“š Generate structured lessons with learning objectives, explanations, exercises, and self-assessment quizzes.</div>')

             with gr.Row():
+                lesson_topic = gr.Dropdown(
+                    choices=TOPIC_CHOICES,
+                    label="Lesson Topic",
+                    allow_custom_value=True,
+                    value="RNA-seq: Differential Expression (DESeq2)"
+                )
+                lesson_level = gr.Radio(
+                    choices=DIFFICULTY_LEVELS,
+                    value="Intermediate",
+                    label="Level"
+                )

             with gr.Row():
+                include_exercises = gr.Checkbox(label="Include Exercises", value=True)
+                include_quiz = gr.Checkbox(label="Include Quiz", value=True)
+                generate_lesson_btn = gr.Button("πŸ“ Generate Lesson", variant="primary")

             lesson_output = gr.Markdown(label="Generated Lesson")

         # ══════════════════════════════════════════════════════════════
         # TAB 5: WORKFLOW COACH
         # ══════════════════════════════════════════════════════════════
         with gr.Tab("πŸ”¬ Workflow Coach", id="workflow"):
+            gr.HTML('<div class="module-info">πŸ”¬ Step-by-step guidance through bioinformatics analysis pipelines. Select a workflow and ask about any step.</div>')

             workflow_selector = gr.Dropdown(
                 choices=WORKFLOW_CHOICES,
             )

             gr.ChatInterface(
                 fn=workflow_respond,
                 type="messages",
                 additional_inputs=[
                     workflow_selector,
+                    gr.Slider(0.1, 1.5, 0.7, step=0.1, label="Temperature", visible=False),
                 ],
+                additional_inputs_accordion=gr.Accordion("βš™οΈ", open=False, visible=False),
                 examples=[
+                    "Walk me through the complete pipeline from raw FASTQ to DE results.",
+                    "I'm at alignment. What should I check before counting?",
                     "My mapping rate is only 45%. What could be wrong?",
+                    "How do I choose between STAR and HISAT2?",
+                    "What parameters for GATK HaplotypeCaller on exome data?",
+                    "How do I set DADA2 truncation parameters?",
                 ],
             )

         # ══════════════════════════════════════════════════════════════
         # TAB 6: PAPER TO LESSON
         # ══════════════════════════════════════════════════════════════
         with gr.Tab("πŸ“° Paper to Lesson", id="paper"):
+            gr.HTML('<div class="module-info">πŸ“° Convert research papers into teaching material. Upload a paper in the Upload tab first, then generate lessons, slides, or quizzes from it.</div>')

             output_format = gr.Radio(
                 choices=["Lesson Plan", "Slide Outline", "Study Notes", "Quiz Questions"],
             )

             gr.ChatInterface(
                 fn=paper_to_lesson_respond,
                 type="messages",
+                additional_inputs=[output_format, rag_store],
                 additional_inputs_accordion=gr.Accordion("", open=False, visible=False),
                 examples=[
                     "Convert this paper into a 45-minute lecture plan.",
+                    "Create a slide outline covering the key methods.",
+                    "Generate study notes on the bioinformatics methods.",
+                    "Create quiz questions on this paper's methodology.",
                 ],
             )

         # ══════════════════════════════════════════════════════════════
         # TAB 7: VIVA PRACTICE
         # ══════════════════════════════════════════════════════════════
         with gr.Tab("πŸŽ“ Viva Practice", id="viva"):
+            gr.HTML('<div class="module-info">πŸŽ“ Practice oral examinations. The AI examiner asks probing questions, evaluates your answers, and pushes deeper understanding.</div>')

             with gr.Row():
                 viva_topic = gr.Dropdown(
                     choices=TOPIC_CHOICES,
                 )
                 viva_difficulty = gr.Radio(
                     choices=DIFFICULTY_LEVELS,
                     value="Intermediate",
+                    label="Difficulty"
                 )

             gr.ChatInterface(
                 fn=viva_respond,
                 type="messages",
+                additional_inputs=[viva_topic, viva_difficulty],
+                additional_inputs_accordion=gr.Accordion("βš™οΈ", open=False, visible=False),
                 examples=[
+                    "I'm ready for my viva. Start with your first question.",
+                    "Focus on the statistical aspects of RNA-seq.",
                     "Ask me about variant calling and interpretation.",
+                    "Test my understanding of microbiome diversity.",
                 ],
             )

+        # ── Footer ──────────────────────────────────────────────────────
         gr.HTML("""
         <div style="text-align: center; padding: 20px; margin-top: 20px; border-top: 1px solid #e0e0e0; color: #666; font-size: 0.85em;">
             <p><strong>Bioinformatics with BB Tutor</strong> β€” Educational AI Assistant</p>
+            <p>⚠️ For educational purposes only. Not for clinical use.</p>
+            <p>RNA-seq Β· Exome Β· Genome Β· Microbiome Β· Variants Β· Molecular Genetics Β· scRNA-seq Β· ATAC-seq Β· ChIP-seq Β· Methylation Β· Small RNA Β· Targeted Panels Β· Long-read Β· Spatial Β· Multi-omics</p>
         </div>
         """)
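+
+# A minimal launch sketch, assuming build_app() returns the Blocks instance:
+#
+#     if __name__ == "__main__":
+#         build_app().launch()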