Benny-Tang committed on
Commit
54aef41
·
verified ·
1 Parent(s): 92fe51a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +259 -170
app.py CHANGED
@@ -1,183 +1,272 @@
1
- # agents.py
2
  import os
 
 
3
  import random
4
- from collections import Counter
5
- from typing import List, Dict, Any
6
-
7
- # Accept both env var names for backward compatibility
8
- GLM_API_KEY = os.getenv("ZHIPUAI_API_KEY") or os.getenv("zhipuai_api_key")
9
-
10
-
11
class AnalyzerAgent:
    """
    Produces topic-level accuracy and weak-topic recommendations.

    Input: per_question dict {qid: {"user": ..., "correct": ..., "topics": [...]}}
    """

    # A topic is reported as weak only when it has at least this many graded
    # questions and its accuracy is below the threshold.
    MIN_QUESTIONS_FOR_WEAK = 3
    WEAK_ACCURACY_PERCENT = 65.0

    def analyze(self, per_question: Dict[str, Dict[str, Any]]) -> Dict[str, Any]:
        """Aggregate per-question results into per-topic accuracy.

        Args:
            per_question: mapping of question id to a dict with the keys
                "user" (submitted answer), "correct" (answer key or None)
                and "topics" (list of topic names).

        Returns:
            A dict with:
              - "topic_accuracy": {topic: {"accuracy_percent": float, "total": int}}
              - "weak_topics": topics with >= 3 graded questions and < 65% accuracy
              - "recommendation": a one-line human-readable summary
        """
        topic_stats: Dict[str, Dict[str, int]] = {}
        for info in (per_question or {}).values():
            answer_key = info.get("correct")
            if answer_key is None:
                # No answer key means the question cannot be graded; counting
                # it would drag the topic's accuracy down for no reason.
                continue
            submitted = info.get("user")
            # Answers are compared as stripped strings, so "4" matches 4.
            is_correct = (
                submitted is not None
                and str(submitted).strip() == str(answer_key).strip()
            )
            for topic in info.get("topics") or []:
                stats = topic_stats.setdefault(topic, {"correct": 0, "total": 0})
                stats["total"] += 1
                if is_correct:
                    stats["correct"] += 1

        topic_accuracy = {}
        weak_topics = []
        for topic, stats in topic_stats.items():
            total = stats["total"]
            accuracy = round((stats["correct"] / total) * 100, 2) if total > 0 else 0.0
            topic_accuracy[topic] = {"accuracy_percent": accuracy, "total": total}
            if total >= self.MIN_QUESTIONS_FOR_WEAK and accuracy < self.WEAK_ACCURACY_PERCENT:
                weak_topics.append(topic)

        if weak_topics:
            recommendation = "Focus on: " + ", ".join(weak_topics)
        else:
            recommendation = "No major weak topics detected."

        return {
            "topic_accuracy": topic_accuracy,
            "weak_topics": weak_topics,
            "recommendation": recommendation,
        }
47
-
48
-
49
class CoachAgent:
    """
    Short actionable coaching guidance for Form5 SPM students.
    """

    def coach(self, analysis: Dict[str, Any], level: str, subject: str) -> Dict[str, Any]:
        """Turn an analysis result into tips, a study plan and practice prompts.

        Args:
            analysis: dict that may carry a "weak_topics" list.
            level: accepted for interface compatibility; not used here.
            subject: accepted for interface compatibility; not used here.

        Returns:
            {"tips": [...], "study_plan": str, "practice": [{"text", "topic"}, ...]}
            with at most three practice prompts, one per leading weak topic.
        """
        weak_topics = analysis.get("weak_topics", [])

        if weak_topics:
            # Targeted advice built around the first weak topic.
            tips = [
                f"Spend 20–30 minutes daily on {weak_topics[0]} (split into focused tasks).",
                "Solve short targeted questions and check worked solutions.",
                "Teach a concept to someone else it stabilizes understanding.",
            ]
        else:
            # Generic revision advice when nothing stands out as weak.
            tips = [
                "Keep revising key topics and time yourself on mock papers.",
                "Review incorrect solutions and understand each step.",
                "Do a mixed-topic mock weekly to build stamina.",
            ]

        practice = [
            {
                "text": f"Short practice prompt on {topic}: (write/solve one short item)",
                "topic": topic,
            }
            for topic in weak_topics[:3]
        ]

        return {
            "tips": tips,
            "study_plan": "20 min/day for weak topics + weekly mock",
            "practice": practice,
        }
76
-
77
-
78
class PredictiveAgent:
    """
    Generates heuristic predicted Form5 questions (in-memory only).

    Public methods:
      - predict(subject, level, count) -> list of question dicts
      - summary(level, subject) -> dict
    """

    # Predicted questions are numbered upwards from this id.
    _BASE_ID = 900000

    # Topics cycled through when generating predictions for a subject.
    _FALLBACK_TOPICS = {
        "BM": ["perbendaharaan_kata", "tatabahasa"],
        "English": ["vocabulary", "grammar"],
        "Math": ["algebra", "geometry"],
        "History": ["events", "dates"],
        "Science": ["physics", "chemistry"],
        "MoralStudies": ["ethics", "values"],
    }

    # subject -> (stem, choices, correct answer)
    _TEMPLATES = {
        # The keyed answer must be a different word from the one in the stem.
        "BM": ("Pilih sinonim bagi perkataan 'gembira'.",
               ("Sedih", "Seronok", "Marah", "Letih"), "Seronok"),
        "English": ("Choose the correct synonym for 'happy'.",
                    ("Sad", "Joyful", "Angry", "Tired"), "Joyful"),
        "Math": ("If 2x + 3 = 11, what is x?",
                 ("2", "3", "4", "5"), "4"),
        "Science": ("What is the SI unit of force?",
                    ("Joule", "Newton", "Pascal", "Watt"), "Newton"),
        "History": ("Which year is associated with Malayan independence?",
                    ("1945", "1957", "1963", "1975"), "1957"),
        "MoralStudies": ("Which value best represents mutual respect?",
                         ("Greed", "Respect", "Laziness", "Selfishness"), "Respect"),
    }

    def __init__(self):
        # Stored for a future LLM-backed path; nothing below reads it yet.
        self.api_key = GLM_API_KEY

    def _top_topics_from_bank(self, question_bank: List[Dict], subject_display: str, top_k=6):
        """Return up to `top_k` most frequent topics among the subject's bank questions."""
        subj_key = f"Form5_{subject_display}"
        counter = Counter()
        for question in question_bank:
            if question.get("subject") == subj_key:
                counter.update(question.get("topics", []))
        return [topic for topic, _ in counter.most_common(top_k)]

    def predict(self, subject: str, level: str = "Form5", count: int = 5) -> List[Dict]:
        """
        Return `count` predicted MCQs built from fixed heuristic templates.

        Predictions have id >= 900000, source='predicted', and a random
        'confidence' between 0.35 and 0.75. `level` is accepted for interface
        compatibility and is not used. A non-positive `count` yields [].

        No LLM call is made: the earlier placeholder raised and swallowed its
        own exception, so the heuristic path was always the one that ran.
        """
        topics = self._FALLBACK_TOPICS.get(subject, ["general"])
        preds = []
        for i in range(count):
            question = self._heuristic_question(subject, topics[i % len(topics)], idx=i + 1)
            question["id"] = self._BASE_ID + i
            question["source"] = "predicted"
            question["confidence"] = round(random.uniform(0.35, 0.75), 2)
            preds.append(question)
        return preds

    def _heuristic_question(self, subject: str, topic: str, idx: int) -> Dict:
        """Build one templated four-choice question for `subject`, tagged with `topic`.

        `idx` is accepted for interface compatibility; the templates do not
        vary by index.
        """
        template = self._TEMPLATES.get(subject)
        if template is None:
            stem = f"Practice predicted question on {topic}."
            choices = ("A", "B", "C", "D")
            correct = "A"
        else:
            stem, choices, correct = template
        return {
            "text": stem,
            # Fresh list per question so callers cannot mutate the shared template.
            "choices": list(choices),
            "correct_answer": correct,
            "topics": [topic],
            "difficulty": 3,
        }

    def summary(self, level: str, subject: str, question_bank: List[Dict] = None) -> Dict:
        """Summarise the subject: top bank topics when a bank is given, else none."""
        topics = self._top_topics_from_bank(question_bank, subject) if question_bank else []
        return {
            "level": level,
            "subject": subject,
            "top_topics": topics,
            "note": "Predictions are practice-oriented heuristics.",
        }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
 
179
- if __name__ == "__main__":
180
- demo.launch(server_name="0.0.0.0", server_port=7860)
181
 
182
 
183
 
 
1
+ # app.py
2
  import os
3
+ import re
4
+ import json
5
  import random
6
+ import subprocess
7
+ import gradio as gr
8
+
9
+ from agents import AnalyzerAgent, CoachAgent, PredictiveAgent
10
+ from ocr_agent import OcrAgent
11
+
12
# Constants
DATA_DIR = "data"  # passed to the OCR agent as its output directory
QUESTIONS_FILE = "questions.json"  # question bank read by load_question_bank()
# Subjects offered in the upload tab's subject dropdown.
VALID_SUBJECTS = ["BM", "English", "Math", "History", "Science", "MoralStudies",
                  "Accounting", "Economics", "Business"]

# Import-time side effect: make sure the data directory exists.
os.makedirs(DATA_DIR, exist_ok=True)

# Agents and OCR (module-level singletons shared by all handlers below)
analyzer = AnalyzerAgent()
coach_agent = CoachAgent()
predictor = PredictiveAgent()
ocr_agent = OcrAgent()
25
+
26
+ # Load question bank safely
27
def load_question_bank(path=None):
    """Load the question bank from a JSON file, best-effort.

    Args:
        path: file to read; defaults to QUESTIONS_FILE when omitted.

    Returns:
        A list of question dicts. An empty list is returned when the file is
        missing, unreadable, empty, not valid JSON, or does not hold a JSON
        array. Array entries that are not objects are dropped, because every
        consumer calls `.get()` on the entries.
    """
    target = QUESTIONS_FILE if path is None else path
    try:
        with open(target, "r", encoding="utf-8") as f:
            content = f.read().strip()
    except (OSError, UnicodeDecodeError):
        # Missing or unreadable bank: start with no questions rather than crash.
        return []

    if not content:
        return []

    try:
        bank = json.loads(content)
    except json.JSONDecodeError:
        return []

    if not isinstance(bank, list):
        return []
    return [q for q in bank if isinstance(q, dict)]
36
+
37
+ QUESTION_BANK = load_question_bank()
38
+
39
+
40
+ # Merge runner
41
def run_merge():
    """Run merge_questions.py (rebuilds questions.json) and reload QUESTION_BANK.

    Returns:
        (ok, message): ok is True when the script exits with status 0 and the
        bank has been reloaded; otherwise False with a failure description.
    """
    # Local import so this function does not rely on a file-level `import sys`.
    import sys

    global QUESTION_BANK

    # Use the interpreter running this app: a bare "python" may be missing
    # from PATH or point at an environment without the project's packages.
    interpreter = sys.executable or "python"
    try:
        subprocess.run([interpreter, "merge_questions.py"], check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # CalledProcessError: the script ran and failed.
        # OSError: the interpreter could not be launched at all.
        return False, f"Merge failed: {e}"

    QUESTION_BANK = load_question_bank()
    return True, "Merge successful."
50
+
51
+
52
+ # Utility: normalize subject token and display
53
def subject_token_from_display(display):
    """Return the lowercase, whitespace-trimmed token for a display subject.

    Falls back to "bm" when `display` is empty or None.
    """
    return display.strip().lower() if display else "bm"
57
+
58
+
59
def normalize_display_subject(token):
    """Map a subject token (e.g. from a filename) to its display name.

    Known aliases map to their canonical display name. Any other token is
    returned trimmed and capitalised (e.g. " accounting " -> "Accounting").
    Empty, None or whitespace-only input falls back to "BM".
    """
    if not token:
        return "BM"
    key = token.strip().lower()
    if not key:
        # Whitespace-only input carries no subject information.
        return "BM"
    aliases = {
        "bm": "BM",
        "bahasa": "BM",
        "bahasamelayu": "BM",
        "english": "English",
        "math": "Math",
        "mathematics": "Math",
        "history": "History",
        "sejarah": "History",
        "science": "Science",
        "physics": "Science",
        "moral": "MoralStudies",
        "moralstudies": "MoralStudies",
    }
    # Capitalise the trimmed key, not the raw token, so surrounding whitespace
    # cannot leak into the display name.
    return aliases.get(key, key.capitalize())
78
+
79
+
80
def autodetect_from_filename(path):
    """Detect year and subject token from filename like spm_2018_bm.pdf

    Returns (year, subject_token) as strings, with the subject lowercased,
    or (None, None) when `path` is empty or the basename does not match.
    """
    if not path:
        return None, None

    match = re.search(
        r"spm[_\-]?(\d{4})[_\-]?([A-Za-z]+)",
        os.path.basename(path),
        re.IGNORECASE,
    )
    if match is None:
        return None, None

    year, subject = match.groups()
    return year, subject.lower()
91
+
92
+
93
+ # ===== OCR upload + auto-merge =====
94
def process_pdf_and_merge(file_path, display_subject, year):
    """
    OCR an uploaded paper into JSON files, then rebuild the question bank.

    file_path: local filepath (gr.File type='filepath')
    display_subject: e.g. "BM"
    year: "2018"

    Returns a one-line status string for the upload tab.
    """
    if not file_path:
        return "No file uploaded."

    subj_token = subject_token_from_display(display_subject)
    try:
        qfile, scheme_file = ocr_agent.extract_questions_to_files(
            pdf_path=file_path,
            year=str(year),
            subject_token=subj_token,
            out_dir=DATA_DIR,
        )
    except Exception as e:
        # UI boundary: report any OCR failure as text instead of raising.
        return f" OCR failed: {e}"

    merged, merge_msg = run_merge()
    # Same message either way; only the leading marker reflects the merge result.
    marker = "✅" if merged else "⚠️"
    saved = f"{os.path.basename(qfile)} & {os.path.basename(scheme_file)}"
    return f"{marker} OCR saved: {saved}. Merge: {merge_msg}"
116
+
117
+
118
+ # ===== Exam generation =====
119
def generate_exam(display_subject, num_questions, include_predicted):
    """
    Build a shuffled exam from the question bank plus optional predictions.

    display_subject: "BM" etc.
    num_questions: requested exam length (int or float, as sent by the slider)
    include_predicted: whether to mix in predictor-generated questions

    Returns (display_items, status_text, state_items):
      - display_items: questions without any answer key, safe to show
      - status_text: summary of what was prepared
      - state_items: the same questions for gr.State; predicted ones also
        carry "correct_answer", since they exist only in memory and cannot be
        looked up in QUESTION_BANK at grading time
    """
    # Slider values may arrive as floats; slicing needs an int.
    wanted = max(0, int(num_questions or 0))

    subj_key = f"Form5_{display_subject}"
    pool = [q for q in QUESTION_BANK if q.get("subject") == subj_key]

    predicted_questions = []
    if include_predicted:
        # NOTE(review): assumes PredictiveAgent exposes
        # generate_predictions(level, subject, n, question_bank) — confirm
        # against agents.py.
        predicted_questions = list(
            predictor.generate_predictions(
                level="Form5",
                subject=display_subject,
                n=min(10, max(1, wanted // 2)),
                question_bank=QUESTION_BANK,
            )
            or []
        )

    combined = pool + predicted_questions
    if not combined:
        return [], f"No questions found for {display_subject}. Upload papers (2018–2024).", []

    random.shuffle(combined)
    selected = combined[:wanted]

    # Identify predicted items by object identity, not by a "source" field the
    # predictor may not have set.
    predicted_refs = {id(q) for q in predicted_questions}

    display_items = []
    state_items = []
    predicted_selected = 0
    for q in selected:
        is_predicted = id(q) in predicted_refs
        public = {
            "id": q.get("id"),
            "text": q.get("text"),
            "choices": q.get("choices", []),
            "topics": q.get("topics", []),
            "source": "predicted" if is_predicted else q.get("source", "pastpaper"),
        }
        display_items.append(public)

        private = dict(public)
        if is_predicted:
            predicted_selected += 1
            # Keep the key server-side only so submit_exam can grade it.
            private["correct_answer"] = q.get("correct_answer")
        state_items.append(private)

    status = f"Prepared {len(display_items)} questions ({predicted_selected} predicted)"
    return display_items, status, state_items
153
+
154
+
155
+ # ===== Submit & grade =====
156
def submit_exam(answers_json, exam_state, display_subject):
    """
    Grade submitted answers and produce analysis, coaching and prediction summary.

    answers_json: dict where keys are stringified ids -> answer text (or choice text)
    exam_state: the exam_data (list) saved in gr.State
    display_subject: subject shown in the exam tab, e.g. "Math"

    Returns a 6-tuple matching the click outputs: score text, analysis dict,
    coaching dict, prediction summary dict, and visibility updates for the
    back and retry buttons.
    """
    exam_data = exam_state or []
    if not exam_data:
        return "No exam data found.", {}, {}, {}, gr.update(visible=False), gr.update(visible=True)

    answers = answers_json if isinstance(answers_json, dict) else {}

    # Index the bank once instead of scanning it for every question. The first
    # entry wins on duplicate ids, matching a first-match linear search.
    bank_answers = {}
    for item in QUESTION_BANK:
        bank_answers.setdefault(item.get("id"), item.get("correct_answer"))

    correct = 0
    graded = 0
    per_question = {}

    for q in exam_data:
        qid = q.get("id")
        key = str(qid)
        user_ans = answers.get(key)

        if q.get("source") == "predicted":
            # Predicted questions live only in the exam state, so their key
            # (if any) travels with the question itself.
            correct_ans = q.get("correct_answer")
        else:
            correct_ans = bank_answers.get(qid)

        per_question[key] = {"user": user_ans, "correct": correct_ans, "topics": q.get("topics", [])}

        # grade only when an answer key exists
        if correct_ans is not None:
            graded += 1
            if user_ans is not None and str(user_ans).strip() == str(correct_ans).strip():
                correct += 1

    if graded > 0:
        score_text = f"Your Score: {round(100 * correct / graded, 2)}%"
    else:
        # No percent sign here: there is no numeric score to qualify.
        score_text = "Your Score: N/A (no answer keys)"

    analysis = analyzer.analyze(per_question)
    coach = coach_agent.coach(analysis, "Form5", display_subject)
    pred_summary = predictor.summary(level="Form5", subject=display_subject, question_bank=QUESTION_BANK)

    return (
        score_text,
        analysis,
        coach,
        pred_summary,
        gr.update(visible=True),
        gr.update(visible=True),
    )
207
+
208
+
209
+ # ===== Prefill handler for upload UI =====
210
def prefill_subject_year_from_file(file_path):
    """Suggest (subject, year) dropdown values from an uploaded file's name.

    Falls back to ("BM", "2018") when no file is given, when nothing can be
    detected, or when the detected value is not one the dropdowns offer.
    A dropdown should not be handed a value outside its choices.
    """
    default_subject, default_year = "BM", "2018"
    if not file_path:
        return default_subject, default_year

    year, subj_token = autodetect_from_filename(file_path)

    subject = normalize_display_subject(subj_token) if subj_token else default_subject
    if subject not in VALID_SUBJECTS:
        subject = default_subject

    # Mirrors the year dropdown's choices (2018 through 2024).
    offered_years = {str(y) for y in range(2018, 2025)}
    if year not in offered_years:
        year = default_year

    return subject, year
216
+
217
+
218
+ # ===== Gradio UI =====
219
# Three-tab UI: upload and OCR a paper, generate an exam, submit answers.
with gr.Blocks() as demo:
    gr.Markdown("## SPM Exam Simulator — Form 5 (Past papers 2018–2024)")

    with gr.Tab("Upload (OCR → JSON → Merge)"):
        pdf_file = gr.File(label="Upload SPM PDF (e.g., spm_2018_bm.pdf)", type="filepath")
        subject_dropdown = gr.Dropdown(choices=VALID_SUBJECTS, value="BM", label="Subject (override)")
        year_dropdown = gr.Dropdown(choices=[str(y) for y in range(2018, 2025)], value="2018", label="Year")
        process_btn = gr.Button("Process PDF → JSON + Merge")
        ocr_status = gr.Textbox(label="Status", interactive=False)

        # Selecting a file pre-fills subject/year from its name; the user can
        # still override both before processing.
        pdf_file.change(prefill_subject_year_from_file, inputs=[pdf_file], outputs=[subject_dropdown, year_dropdown])
        process_btn.click(process_pdf_and_merge, inputs=[pdf_file, subject_dropdown, year_dropdown], outputs=[ocr_status])

    with gr.Tab("Exam Simulator"):
        # NOTE(review): this list omits Accounting/Economics/Business, which
        # VALID_SUBJECTS offers on the upload tab — confirm that is intended.
        subject_sel = gr.Dropdown(choices=["BM", "English", "Math", "History", "Science", "MoralStudies"],
                                  value="Math", label="Subject")
        num_q = gr.Slider(minimum=5, maximum=50, step=5, value=10, label="Number of Questions")
        include_pred = gr.Checkbox(value=True, label="Include AI-predicted questions (in-memory only)")
        start_btn = gr.Button("Start Exam")
        # Holds the generated exam so the submit tab can grade against it.
        exam_state = gr.State()

        exam_display = gr.JSON(label="Exam Questions")
        status_display = gr.Textbox(label="Status", interactive=False)

        start_btn.click(generate_exam,
                        inputs=[subject_sel, num_q, include_pred],
                        outputs=[exam_display, status_display, exam_state])

    with gr.Tab("Submit & Results"):
        answers_input = gr.JSON(label='Submit Your Answers as JSON (e.g. {"1001":"Seronok", "900000":"4"})')
        submit_btn = gr.Button("Submit Answers")

        score_out = gr.Textbox(label="Score")
        analysis_out = gr.JSON(label="Weakness Analysis")
        coach_out = gr.JSON(label="Personalized Coaching")
        pred_out = gr.JSON(label="Prediction Summary")

        # Hidden until a submission; submit_exam toggles their visibility.
        # NOTE(review): neither button has a click handler in the code shown here.
        back_btn = gr.Button("← Back to Exam", visible=False)
        retry_btn = gr.Button("Retry", visible=False)

        # submit takes (answers_input, exam_state, subject_sel)
        submit_btn.click(
            submit_exam,
            inputs=[answers_input, exam_state, subject_sel],
            outputs=[score_out, analysis_out, coach_out, pred_out, back_btn, retry_btn]
        )

# Launch
if __name__ == "__main__":
    # Listen on all interfaces at port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)
269
 
 
 
270
 
271
 
272