Upload 3 files
Browse files- README.md +13 -7
- app.py +308 -19
- requirements.txt +3 -0
README.md
CHANGED
|
@@ -13,15 +13,16 @@ pinned: false
|
|
| 13 |
|
| 14 |
Pathshala AI is a bilingual AI tutor demo for rural primary students in Nepal.
|
| 15 |
|
| 16 |
-
The Gradio Space mirrors the local Streamlit/web app flow. It can
|
| 17 |
-
|
|
|
|
| 18 |
|
| 19 |
- English explanation
|
| 20 |
- Nepali explanation
|
| 21 |
- 3 simple quiz questions
|
| 22 |
- Retrieved textbook sources
|
| 23 |
-
-
|
| 24 |
-
- Parent/teacher summary
|
| 25 |
|
| 26 |
## Deploy To Hugging Face Spaces
|
| 27 |
|
|
@@ -49,8 +50,13 @@ git push
|
|
| 49 |
## Recommended Submission Mode
|
| 50 |
|
| 51 |
For the easiest hackathon submission, deploy the Space without `BACKEND_URL`.
|
| 52 |
-
It will
|
| 53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
|
| 55 |
For the full RAG workflow, first deploy the FastAPI backend somewhere public, then set `BACKEND_URL` in the Space settings.
|
| 56 |
|
|
@@ -81,7 +87,7 @@ If the backend returns `normalized_question`, the Space shows the interpreted qu
|
|
| 81 |
|
| 82 |
## Mock Mode
|
| 83 |
|
| 84 |
-
If `BACKEND_URL` is missing or the backend is unavailable, the Space uses
|
| 85 |
|
| 86 |
Example question:
|
| 87 |
|
|
|
|
| 13 |
|
| 14 |
Pathshala AI is a bilingual AI tutor demo for rural primary students in Nepal.
|
| 15 |
|
| 16 |
+
The Gradio Space mirrors the local Streamlit/web app flow. It can upload a text-based
|
| 17 |
+
PDF directly inside Hugging Face Spaces, accept a student question in English, Nepali,
|
| 18 |
+
or romanized Nepali, retrieve relevant textbook portions, and then return:
|
| 19 |
|
| 20 |
- English explanation
|
| 21 |
- Nepali explanation
|
| 22 |
- 3 simple quiz questions
|
| 23 |
- Retrieved textbook sources
|
| 24 |
+
- Basic quiz grading in Space-local mode
|
| 25 |
+
- Parent/teacher summary note in Space-local mode
|
| 26 |
|
| 27 |
## Deploy To Hugging Face Spaces
|
| 28 |
|
|
|
|
| 50 |
## Recommended Submission Mode
|
| 51 |
|
| 52 |
For the easiest hackathon submission, deploy the Space without `BACKEND_URL`.
|
| 53 |
+
It will run a Space-local workflow:
|
| 54 |
+
|
| 55 |
+
1. Upload a text-based PDF.
|
| 56 |
+
2. Extract text with PyMuPDF.
|
| 57 |
+
3. Create embeddings with `sentence-transformers`.
|
| 58 |
+
4. Search the uploaded book in memory.
|
| 59 |
+
5. Show Nepali quiz questions and retrieved textbook portions.
|
| 60 |
|
| 61 |
For the full RAG workflow, first deploy the FastAPI backend somewhere public, then set `BACKEND_URL` in the Space settings.
|
| 62 |
|
|
|
|
| 87 |
|
| 88 |
## Mock Mode
|
| 89 |
|
| 90 |
+
If `BACKEND_URL` is missing or the backend is unavailable, the Space uses local PDF extraction and in-memory retrieval. This supports text-based PDFs. For scanned PDFs or persistent student progress, deploy the backend and set `BACKEND_URL`.
|
| 91 |
|
| 92 |
Example question:
|
| 93 |
|
app.py
CHANGED
|
@@ -1,8 +1,10 @@
|
|
| 1 |
import os
|
| 2 |
from typing import Any
|
|
|
|
| 3 |
|
| 4 |
from dotenv import load_dotenv
|
| 5 |
import gradio as gr
|
|
|
|
| 6 |
import requests
|
| 7 |
|
| 8 |
|
|
@@ -18,14 +20,20 @@ EXAMPLE_CONTEXT = (
|
|
| 18 |
"Soil erosion is the removal of topsoil by wind, water, or other natural forces. "
|
| 19 |
"It can make farmland less fertile and can be reduced by planting trees and grass."
|
| 20 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
|
| 23 |
-
def upload_textbook(pdf_path: str | None) -> str:
|
| 24 |
if not pdf_path:
|
| 25 |
-
return "Choose a PDF first."
|
| 26 |
|
| 27 |
if not BACKEND_URL:
|
| 28 |
-
return
|
| 29 |
|
| 30 |
try:
|
| 31 |
with open(pdf_path, "rb") as pdf_file:
|
|
@@ -41,22 +49,55 @@ def upload_textbook(pdf_path: str | None) -> str:
|
|
| 41 |
method_text = f" Text extraction: {extraction_method}." if extraction_method else ""
|
| 42 |
return (
|
| 43 |
f"Uploaded {result['filename']} with {result['page_count']} pages "
|
| 44 |
-
f"and {result['chunk_count']} chunks.{method_text}"
|
|
|
|
|
|
|
| 45 |
)
|
| 46 |
|
| 47 |
-
return _response_error(response, "Upload failed.")
|
| 48 |
except requests.Timeout:
|
| 49 |
-
return "Backend is still processing the PDF. Try a smaller PDF for the demo."
|
| 50 |
except requests.RequestException as exc:
|
| 51 |
-
return f"Could not reach backend: {exc}"
|
| 52 |
except OSError as exc:
|
| 53 |
-
return f"Could not read uploaded PDF: {exc}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
|
| 55 |
|
| 56 |
def ask_tutor(
|
| 57 |
question: str,
|
| 58 |
student_id: str,
|
| 59 |
textbook_context: str,
|
|
|
|
| 60 |
) -> tuple[str, str, str, str, str, dict[str, Any]]:
|
| 61 |
question = question.strip()
|
| 62 |
student_id = (student_id or "hf-space-demo").strip()
|
|
@@ -78,7 +119,12 @@ def ask_tutor(
|
|
| 78 |
if backend_result and not is_insufficient_backend_result(backend_result):
|
| 79 |
return backend_result
|
| 80 |
|
| 81 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
|
| 83 |
|
| 84 |
def ask_backend(
|
|
@@ -145,12 +191,12 @@ def grade_quiz(
|
|
| 145 |
student_id: str,
|
| 146 |
quiz_state: dict[str, Any] | None,
|
| 147 |
) -> str:
|
| 148 |
-
if not BACKEND_URL:
|
| 149 |
-
return "Quiz grading needs the backend. Demo mode can show questions but cannot grade them."
|
| 150 |
-
|
| 151 |
quiz_state = quiz_state or {}
|
| 152 |
quiz_id = quiz_state.get("quiz_id")
|
| 153 |
|
|
|
|
|
|
|
|
|
|
| 154 |
if not quiz_id:
|
| 155 |
return "Ask the tutor first so a quiz can be created."
|
| 156 |
|
|
@@ -177,9 +223,59 @@ def grade_quiz(
|
|
| 177 |
return "Quiz grading returned an invalid response."
|
| 178 |
|
| 179 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
def parent_summary(student_id: str) -> str:
|
| 181 |
if not BACKEND_URL:
|
| 182 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
|
| 184 |
student_id = (student_id or "hf-space-demo").strip()
|
| 185 |
|
|
@@ -225,6 +321,103 @@ def is_insufficient_backend_result(result: tuple[str, str, str, str, str, dict[s
|
|
| 225 |
return any(marker in combined for marker in markers)
|
| 226 |
|
| 227 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 228 |
def mock_response(question: str, textbook_context: str) -> tuple[str, str, str, str, str, dict[str, Any]]:
|
| 229 |
context = textbook_context or EXAMPLE_CONTEXT
|
| 230 |
normalized_question = normalize_question_mock(question)
|
|
@@ -252,6 +445,54 @@ def mock_response(question: str, textbook_context: str) -> tuple[str, str, str,
|
|
| 252 |
)
|
| 253 |
|
| 254 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 255 |
def mock_english_explanation(normalized_question: str, context: str) -> str:
|
| 256 |
text = f"{normalized_question} {context}".lower()
|
| 257 |
|
|
@@ -330,6 +571,53 @@ def mock_nepali_explanation(normalized_question: str, context: str = "") -> str:
|
|
| 330 |
return "यो विषयलाई सरल रूपमा बुझ्न पाठ्यपुस्तकको सन्दर्भ पढेर मुख्य कुरा सम्झनुहोस्।"
|
| 331 |
|
| 332 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 333 |
def normalize_question_mock(question: str) -> str:
|
| 334 |
text = question.lower()
|
| 335 |
|
|
@@ -491,12 +779,13 @@ with gr.Blocks(title=APP_NAME, theme=gr.themes.Soft()) as demo:
|
|
| 491 |
gr.Markdown(
|
| 492 |
"""
|
| 493 |
# Pathshala AI
|
| 494 |
-
Bilingual AI tutor for rural primary students in Nepal. Upload a PDF
|
| 495 |
-
|
| 496 |
"""
|
| 497 |
)
|
| 498 |
|
| 499 |
quiz_state = gr.State({})
|
|
|
|
| 500 |
|
| 501 |
with gr.Row():
|
| 502 |
student_id_input = gr.Textbox(
|
|
@@ -508,7 +797,7 @@ with gr.Blocks(title=APP_NAME, theme=gr.themes.Soft()) as demo:
|
|
| 508 |
label="Status",
|
| 509 |
value=(
|
| 510 |
"Backend connected." if BACKEND_URL else
|
| 511 |
-
"
|
| 512 |
),
|
| 513 |
interactive=False,
|
| 514 |
scale=2,
|
|
@@ -581,18 +870,18 @@ with gr.Blocks(title=APP_NAME, theme=gr.themes.Soft()) as demo:
|
|
| 581 |
status_output,
|
| 582 |
quiz_state,
|
| 583 |
],
|
| 584 |
-
fn=lambda question, context: ask_tutor(question, "hf-space-demo", context),
|
| 585 |
cache_examples=False,
|
| 586 |
)
|
| 587 |
|
| 588 |
upload_button.click(
|
| 589 |
fn=upload_textbook,
|
| 590 |
inputs=[pdf_input],
|
| 591 |
-
outputs=[upload_output],
|
| 592 |
)
|
| 593 |
ask_button.click(
|
| 594 |
fn=ask_tutor,
|
| 595 |
-
inputs=[question_input, student_id_input, context_input],
|
| 596 |
outputs=[
|
| 597 |
english_output,
|
| 598 |
nepali_output,
|
|
|
|
| 1 |
import os
|
| 2 |
from typing import Any
|
| 3 |
+
from functools import lru_cache
|
| 4 |
|
| 5 |
from dotenv import load_dotenv
|
| 6 |
import gradio as gr
|
| 7 |
+
import numpy as np
|
| 8 |
import requests
|
| 9 |
|
| 10 |
|
|
|
|
| 20 |
"Soil erosion is the removal of topsoil by wind, water, or other natural forces. "
|
| 21 |
"It can make farmland less fertile and can be reduced by planting trees and grass."
|
| 22 |
)
|
| 23 |
+
MIN_CHUNK_CHARS = 250
|
| 24 |
+
MAX_CHUNK_CHARS = 900
|
| 25 |
+
EMBEDDING_MODEL = os.getenv(
|
| 26 |
+
"EMBEDDING_MODEL",
|
| 27 |
+
"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
|
| 28 |
+
)
|
| 29 |
|
| 30 |
|
| 31 |
+
def upload_textbook(pdf_path: str | None) -> tuple[str, dict[str, Any], Any]:
|
| 32 |
if not pdf_path:
|
| 33 |
+
return "Choose a PDF first.", {}, gr.update()
|
| 34 |
|
| 35 |
if not BACKEND_URL:
|
| 36 |
+
return upload_textbook_locally(pdf_path)
|
| 37 |
|
| 38 |
try:
|
| 39 |
with open(pdf_path, "rb") as pdf_file:
|
|
|
|
| 49 |
method_text = f" Text extraction: {extraction_method}." if extraction_method else ""
|
| 50 |
return (
|
| 51 |
f"Uploaded {result['filename']} with {result['page_count']} pages "
|
| 52 |
+
f"and {result['chunk_count']} chunks.{method_text}",
|
| 53 |
+
{},
|
| 54 |
+
gr.update(value=""),
|
| 55 |
)
|
| 56 |
|
| 57 |
+
return _response_error(response, "Upload failed."), {}, gr.update()
|
| 58 |
except requests.Timeout:
|
| 59 |
+
return "Backend is still processing the PDF. Try a smaller PDF for the demo.", {}, gr.update()
|
| 60 |
except requests.RequestException as exc:
|
| 61 |
+
return f"Could not reach backend: {exc}", {}, gr.update()
|
| 62 |
except OSError as exc:
|
| 63 |
+
return f"Could not read uploaded PDF: {exc}", {}, gr.update()
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def upload_textbook_locally(pdf_path: str) -> tuple[str, dict[str, Any], Any]:
    """Process a PDF entirely inside the Space: extract, chunk, and embed it.

    Mirrors the backend upload path's return shape:
    (status message, textbook state dict, context-textbox update).
    Any failure is reported as a status message rather than raised.
    """
    try:
        extracted = extract_pdf_text(pdf_path)
        chunks = chunk_text(extracted["text"])
        if not chunks:
            return "No readable text chunks could be created from this PDF.", {}, gr.update()

        vectors = embed_texts(chunks)
        textbook_state = {
            "filename": os.path.basename(pdf_path),
            "page_count": extracted["page_count"],
            "chunk_count": len(chunks),
            "extraction_method": extracted["extraction_method"],
            # Stored as plain lists so the state survives Gradio's JSON round-trip.
            "chunks": chunks,
            "embeddings": vectors.tolist(),
        }
        status = (
            f"Uploaded {textbook_state['filename']} inside this Space with "
            f"{textbook_state['page_count']} pages and {textbook_state['chunk_count']} chunks. "
            f"Text extraction: {textbook_state['extraction_method']}."
        )
        # Clear the pasted-context box so the uploaded book is actually used.
        return status, textbook_state, gr.update(value="")
    except Exception as exc:
        # Broad on purpose: in a demo Space any processing error should surface
        # in the status field instead of crashing the UI.
        return f"Could not process uploaded PDF in this Space: {exc}", {}, gr.update()
|
| 94 |
|
| 95 |
|
| 96 |
def ask_tutor(
|
| 97 |
question: str,
|
| 98 |
student_id: str,
|
| 99 |
textbook_context: str,
|
| 100 |
+
textbook_state: dict[str, Any] | None,
|
| 101 |
) -> tuple[str, str, str, str, str, dict[str, Any]]:
|
| 102 |
question = question.strip()
|
| 103 |
student_id = (student_id or "hf-space-demo").strip()
|
|
|
|
| 119 |
if backend_result and not is_insufficient_backend_result(backend_result):
|
| 120 |
return backend_result
|
| 121 |
|
| 122 |
+
return local_response(
|
| 123 |
+
question=question,
|
| 124 |
+
student_id=student_id,
|
| 125 |
+
textbook_context=textbook_context,
|
| 126 |
+
textbook_state=textbook_state or {},
|
| 127 |
+
)
|
| 128 |
|
| 129 |
|
| 130 |
def ask_backend(
|
|
|
|
| 191 |
student_id: str,
|
| 192 |
quiz_state: dict[str, Any] | None,
|
| 193 |
) -> str:
|
|
|
|
|
|
|
|
|
|
| 194 |
quiz_state = quiz_state or {}
|
| 195 |
quiz_id = quiz_state.get("quiz_id")
|
| 196 |
|
| 197 |
+
if not BACKEND_URL:
|
| 198 |
+
return grade_quiz_locally([answer_1, answer_2, answer_3], quiz_state)
|
| 199 |
+
|
| 200 |
if not quiz_id:
|
| 201 |
return "Ask the tutor first so a quiz can be created."
|
| 202 |
|
|
|
|
| 223 |
return "Quiz grading returned an invalid response."
|
| 224 |
|
| 225 |
|
| 226 |
+
def grade_quiz_locally(answers: list[str], quiz_state: dict[str, Any]) -> str:
    """Grade up to three quiz answers in Space-local mode.

    Compares each student answer against the expected answer stored in
    *quiz_state* via fuzzy matching and returns a plain-text score report.
    """
    questions = quiz_state.get("quiz_questions", [])
    expected_answers = quiz_state.get("expected_answers", [])

    if not questions:
        return "Ask the tutor first so a quiz can be created."

    score = 0
    report: list[str] = []

    for position, question in enumerate(questions[:3]):
        given = answers[position].strip() if position < len(answers) else ""
        expected = str(expected_answers[position]) if position < len(expected_answers) else ""
        correct = is_answer_close(given, expected)
        score += int(correct)

        status = "Correct" if correct else "Needs practice"
        report.append(f"{status}: {question}")
        # Only show the model answer when the student missed it.
        if expected and not correct:
            report.append(f"Expected idea: {expected}")

    return f"Score: {score} / {min(len(questions), 3)}\n" + "\n".join(report)
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
def is_answer_close(student_answer: str, expected_answer: str) -> bool:
    """Fuzzy-match a student answer against the expected one by token overlap.

    Returns True when at least 35% of the expected tokens appear in the
    student's answer, or when the normalized student answer occurs verbatim
    inside the normalized expected answer.
    """
    normalized_student = normalize_answer(student_answer)
    normalized_expected = normalize_answer(expected_answer)
    student_tokens = set(normalized_student.split())
    expected_tokens = set(normalized_expected.split())

    if not student_tokens or not expected_tokens:
        return False

    shared = student_tokens & expected_tokens
    overlap_ratio = len(shared) / max(len(expected_tokens), 1)
    return overlap_ratio >= 0.35 or normalized_student in normalized_expected
|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
def normalize_answer(answer: str) -> str:
    """Lowercase *answer* and strip surrounding punctuation from each word.

    Handles both ASCII punctuation and the Devanagari danda. Words that are
    nothing but punctuation are dropped entirely.
    """
    punctuation = ".,?!:;()[]{}\"'।"
    cleaned = (word.strip(punctuation).lower() for word in answer.split())
    return " ".join(token for token in cleaned if token)
|
| 270 |
+
|
| 271 |
+
|
| 272 |
def parent_summary(student_id: str) -> str:
|
| 273 |
if not BACKEND_URL:
|
| 274 |
+
return (
|
| 275 |
+
"Parent/teacher summary\n\n"
|
| 276 |
+
"The student has practiced with the uploaded or pasted textbook context in this Space. "
|
| 277 |
+
"For persistent progress across sessions, deploy the FastAPI backend and set BACKEND_URL."
|
| 278 |
+
)
|
| 279 |
|
| 280 |
student_id = (student_id or "hf-space-demo").strip()
|
| 281 |
|
|
|
|
| 321 |
return any(marker in combined for marker in markers)
|
| 322 |
|
| 323 |
|
| 324 |
+
def extract_pdf_text(pdf_path: str) -> dict[str, Any]:
    """Extract selectable text from a PDF with PyMuPDF.

    Returns a dict with keys:
        "text": all non-empty page texts joined with blank lines,
        "page_count": total pages in the document,
        "extraction_method": always "pymupdf-local".

    Raises:
        ValueError: if no selectable text is found (e.g. a scanned PDF).
    """
    # Imported lazily so the Space can start even when PyMuPDF is only
    # needed for the Space-local upload path.
    import fitz

    page_texts = []

    with fitz.open(pdf_path) as document:
        for page in document:
            text = page.get_text("text").strip()
            if text:
                page_texts.append(text)

        # Read while the document is still open.
        page_count = document.page_count

    text = "\n\n".join(page_texts).strip()

    if not text:
        raise ValueError(
            "No selectable text was found. For scanned PDFs, deploy with a backend "
            "or paste a short textbook paragraph into the context box."
        )

    return {
        "text": text,
        "page_count": page_count,
        "extraction_method": "pymupdf-local",
    }
|
| 350 |
+
|
| 351 |
+
|
| 352 |
+
def chunk_text(text: str) -> list[str]:
|
| 353 |
+
paragraphs = [part.strip() for part in text.splitlines() if part.strip()]
|
| 354 |
+
chunks = []
|
| 355 |
+
current = ""
|
| 356 |
+
|
| 357 |
+
for paragraph in paragraphs:
|
| 358 |
+
if len(current) + len(paragraph) + 2 <= MAX_CHUNK_CHARS:
|
| 359 |
+
current = f"{current}\n{paragraph}".strip()
|
| 360 |
+
continue
|
| 361 |
+
|
| 362 |
+
if len(current) >= MIN_CHUNK_CHARS:
|
| 363 |
+
chunks.append(current)
|
| 364 |
+
current = paragraph
|
| 365 |
+
else:
|
| 366 |
+
current = f"{current}\n{paragraph}".strip()
|
| 367 |
+
|
| 368 |
+
if current:
|
| 369 |
+
chunks.append(current)
|
| 370 |
+
|
| 371 |
+
return chunks or ([text.strip()] if text.strip() else [])
|
| 372 |
+
|
| 373 |
+
|
| 374 |
+
@lru_cache(maxsize=1)
def get_embedding_model():
    """Load and cache the sentence-transformers embedding model.

    lru_cache(maxsize=1) makes this a one-time, process-wide model load;
    the model named by EMBEDDING_MODEL is downloaded on first use.
    """
    # Imported lazily so backend-connected Spaces never pay the
    # sentence-transformers / torch import cost.
    from sentence_transformers import SentenceTransformer

    return SentenceTransformer(EMBEDDING_MODEL)
|
| 379 |
+
|
| 380 |
+
|
| 381 |
+
def embed_texts(texts: list[str]) -> np.ndarray:
    """Encode *texts* into L2-normalized embedding vectors.

    Normalized embeddings let callers score similarity with a plain dot
    product instead of an explicit cosine computation.
    """
    encoder = get_embedding_model()
    vectors = encoder.encode(
        texts,
        convert_to_numpy=True,
        normalize_embeddings=True,
        show_progress_bar=False,
    )
    return np.asarray(vectors)
|
| 391 |
+
|
| 392 |
+
|
| 393 |
+
def retrieve_local_sources(
    question: str,
    textbook_state: dict[str, Any],
    limit: int = 5,
) -> list[dict[str, Any]]:
    """Rank the stored textbook chunks against *question* by similarity.

    Embeddings are normalized at creation time, so the dot product is the
    cosine similarity. Returns up to *limit* source dicts, best match first;
    an empty list when no chunks or embeddings are stored.
    """
    chunks = [str(chunk) for chunk in textbook_state.get("chunks", [])]
    stored = np.asarray(textbook_state.get("embeddings", []), dtype=float)

    if not chunks or stored.size == 0:
        return []

    query_vector = embed_texts([question])[0]
    similarity = stored @ query_vector
    # argsort + reverse keeps the original stable tie ordering.
    ranked = np.argsort(similarity)[::-1][:limit]

    results: list[dict[str, Any]] = []
    for index in ranked:
        results.append(
            {
                "score": float(similarity[index]),
                "text": chunks[index],
                "metadata": {
                    "filename": textbook_state.get("filename", "uploaded-textbook"),
                    "chunk_index": int(index),
                },
            }
        )
    return results
|
| 419 |
+
|
| 420 |
+
|
| 421 |
def mock_response(question: str, textbook_context: str) -> tuple[str, str, str, str, str, dict[str, Any]]:
|
| 422 |
context = textbook_context or EXAMPLE_CONTEXT
|
| 423 |
normalized_question = normalize_question_mock(question)
|
|
|
|
| 445 |
)
|
| 446 |
|
| 447 |
|
| 448 |
+
def local_response(
    question: str,
    student_id: str,
    textbook_context: str,
    textbook_state: dict[str, Any],
) -> tuple[str, str, str, str, str, dict[str, Any]]:
    """Answer a question with pasted context or the locally indexed textbook.

    Source priority: explicitly pasted context first, then the uploaded
    book's in-memory index. When neither yields any context, falls back to
    the canned mock response. Returns the six-tuple expected by the UI:
    (english, nepali, quiz text, sources text, status, quiz state).
    """
    normalized_question = normalize_question_mock(question)

    if textbook_context.strip():
        # Pasted context wins over the uploaded book; present its chunks as
        # full-score sources.
        sources = [
            {
                "score": 1.0,
                "text": chunk,
                "metadata": {"filename": "pasted-context", "chunk_index": index},
            }
            for index, chunk in enumerate(chunk_text(textbook_context)[:5])
        ]
    elif textbook_state.get("chunks") and textbook_state.get("embeddings"):
        sources = retrieve_local_sources(normalized_question, textbook_state, limit=5)
    else:
        sources = []

    context = "\n\n".join(str(source.get("text", "")) for source in sources).strip()

    if not context:
        return mock_response(question=question, textbook_context=textbook_context)

    english_answer = (
        f"Interpreted question: {normalized_question}\n\n"
        f"Answer from the uploaded textbook context:\n{truncate(context, max_length=700)}"
    )
    quiz_questions = local_nepali_quiz_questions(context)
    new_quiz_state = {
        "student_id": student_id,
        "quiz_questions": quiz_questions,
        # One shared expected answer for all three open-ended questions.
        "expected_answers": [source_answer(sources)] * 3,
    }

    return (
        english_answer,
        local_nepali_answer(normalized_question, context),
        format_quiz(quiz_questions),
        format_sources(sources),
        "Answered with the Hugging Face Space local PDF workflow.",
        new_quiz_state,
    )
|
| 494 |
+
|
| 495 |
+
|
| 496 |
def mock_english_explanation(normalized_question: str, context: str) -> str:
|
| 497 |
text = f"{normalized_question} {context}".lower()
|
| 498 |
|
|
|
|
| 571 |
return "यो विषयलाई सरल रूपमा बुझ्न पाठ्यपुस्तकको सन्दर्भ पढेर मुख्य कुरा सम्झनुहोस्।"
|
| 572 |
|
| 573 |
|
| 574 |
+
def local_nepali_answer(normalized_question: str, context: str) -> str:
    """Produce a Nepali explanation for the Space-local workflow.

    Prefers a topic-specific answer from mock_nepali_explanation; otherwise
    quotes the retrieved context when it is already in Nepali, or returns a
    generic Nepali study hint.
    """
    known_answer = mock_nepali_explanation(normalized_question, context)

    # NOTE(review): this literal must stay byte-identical to the generic
    # fallback string returned by mock_nepali_explanation — any answer other
    # than that fallback is considered topic-specific and returned as-is.
    if known_answer != "यो विषयलाई सरल रूपमा बुझ्न पाठ्यपुस्तकको सन्दर्भ पढेर मुख्य कुरा सम्झनुहोस्।":
        return known_answer

    # Context already written in Devanagari: quote it directly (truncated).
    if has_devanagari(context):
        return (
            "अपलोड गरिएको पाठ्यपुस्तकको सन्दर्भअनुसार मुख्य कुरा यस्तो छ:\n\n"
            f"{truncate(context, max_length=700)}"
        )

    # Non-Nepali context: return a generic Nepali study suggestion.
    return (
        "अपलोड गरिएको पाठ्यपुस्तकको सन्दर्भअनुसार यो विषय महत्त्वपूर्ण छ। "
        "मुख्य शब्दहरू पढ्नुहोस्, उदाहरणसँग जोड्नुहोस्, र आफ्नै सरल शब्दमा उत्तर लेख्ने अभ्यास गर्नुहोस्।"
    )
|
| 590 |
+
|
| 591 |
+
|
| 592 |
+
def local_nepali_quiz_questions(context: str) -> list[str]:
    """Build three simple Nepali quiz questions from the retrieved context."""
    # Anchor the second question on the context's first sentence, capped so
    # the question stays readable.
    short_context = truncate(first_sentence(context), max_length=140)
    return [
        "प्राप्त पाठ्यपुस्तक सन्दर्भको मुख्य कुरा के हो?",
        f"यो वाक्यले के बुझाउँछ: {short_context}",
        "यस विषयलाई आफ्नै सरल शब्दमा कसरी भन्न सकिन्छ?",
    ]
|
| 599 |
+
|
| 600 |
+
|
| 601 |
+
def source_answer(sources: list[dict[str, Any]]) -> str:
    """Derive the expected quiz answer from the top retrieved source."""
    if not sources:
        # Generic fallback: "the main point of the textbook."
        return "पाठ्यपुस्तकको मुख्य कुरा।"

    # Use the first sentence of the best-matching chunk; `or text` only
    # matters when the chunk text is empty and first_sentence returns "".
    text = str(sources[0].get("text", "")).strip()
    return truncate(first_sentence(text) or text, max_length=220)
|
| 607 |
+
|
| 608 |
+
|
| 609 |
+
def first_sentence(text: str) -> str:
    """Return the first sentence of *text*, keeping its terminator.

    Checks the Devanagari danda before ASCII terminators; when no terminator
    is present, returns the whole stripped text.
    """
    for mark in ("।", ".", "?", "!"):
        head, sep, _rest = text.partition(mark)
        if sep:
            return head.strip() + mark
    return text.strip()
|
| 615 |
+
|
| 616 |
+
|
| 617 |
+
def has_devanagari(text: str) -> bool:
    """Report whether *text* contains any Devanagari code point (U+0900-U+097F)."""
    return any(0x0900 <= ord(ch) <= 0x097F for ch in text)
|
| 619 |
+
|
| 620 |
+
|
| 621 |
def normalize_question_mock(question: str) -> str:
|
| 622 |
text = question.lower()
|
| 623 |
|
|
|
|
| 779 |
gr.Markdown(
|
| 780 |
"""
|
| 781 |
# Pathshala AI
|
| 782 |
+
Bilingual AI tutor for rural primary students in Nepal. Upload a PDF directly
|
| 783 |
+
in this Space, or connect a public backend for the full production workflow.
|
| 784 |
"""
|
| 785 |
)
|
| 786 |
|
| 787 |
quiz_state = gr.State({})
|
| 788 |
+
textbook_state = gr.State({})
|
| 789 |
|
| 790 |
with gr.Row():
|
| 791 |
student_id_input = gr.Textbox(
|
|
|
|
| 797 |
label="Status",
|
| 798 |
value=(
|
| 799 |
"Backend connected." if BACKEND_URL else
|
| 800 |
+
"Space-local PDF upload is active. Set BACKEND_URL for the full backend workflow."
|
| 801 |
),
|
| 802 |
interactive=False,
|
| 803 |
scale=2,
|
|
|
|
| 870 |
status_output,
|
| 871 |
quiz_state,
|
| 872 |
],
|
| 873 |
+
fn=lambda question, context: ask_tutor(question, "hf-space-demo", context, {}),
|
| 874 |
cache_examples=False,
|
| 875 |
)
|
| 876 |
|
| 877 |
upload_button.click(
|
| 878 |
fn=upload_textbook,
|
| 879 |
inputs=[pdf_input],
|
| 880 |
+
outputs=[upload_output, textbook_state, context_input],
|
| 881 |
)
|
| 882 |
ask_button.click(
|
| 883 |
fn=ask_tutor,
|
| 884 |
+
inputs=[question_input, student_id_input, context_input, textbook_state],
|
| 885 |
outputs=[
|
| 886 |
english_output,
|
| 887 |
nepali_output,
|
requirements.txt
CHANGED
|
@@ -1,3 +1,6 @@
|
|
| 1 |
gradio>=4.44.0
|
| 2 |
python-dotenv>=1.0.0
|
| 3 |
requests>=2.31.0
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
gradio>=4.44.0
|
| 2 |
python-dotenv>=1.0.0
|
| 3 |
requests>=2.31.0
|
| 4 |
+
numpy>=1.26.0
|
| 5 |
+
PyMuPDF>=1.24.0
|
| 6 |
+
sentence-transformers==2.7.0
|