Benny-Tang committed on
Commit
8f96aba
·
verified ·
1 Parent(s): 069219d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +217 -145
app.py CHANGED
@@ -1,210 +1,281 @@
1
  import os
 
2
  import json
3
  import random
 
4
  import gradio as gr
 
5
  from agents import AnalyzerAgent, CoachAgent, PredictiveAgent
6
  from ocr_agent import OcrAgent
7
- import subprocess
8
 
9
  # Paths
10
- QUESTION_BANK_PATH = "questions.json"
11
  DATA_DIR = "data"
 
 
 
 
12
 
13
- # Initialize agents
14
  analyzer = AnalyzerAgent()
15
  coach_agent = CoachAgent()
16
  predictor = PredictiveAgent()
17
  ocr_agent = OcrAgent()
18
 
19
- # Load question bank
20
  def load_question_bank():
21
- if not os.path.exists(QUESTION_BANK_PATH):
 
 
 
 
 
 
 
22
  return []
23
- with open(QUESTION_BANK_PATH, "r", encoding="utf-8") as f:
24
- try:
25
- return json.load(f)
26
- except json.JSONDecodeError:
27
- return []
28
 
29
  QUESTION_BANK = load_question_bank()
30
 
31
 
32
- # Merge JSONs after OCR extraction
33
  def merge_questions():
 
34
  try:
35
  subprocess.run(["python", "merge_questions.py"], check=True)
36
  global QUESTION_BANK
37
  QUESTION_BANK = load_question_bank()
38
- return True, "Merge successful."
39
  except subprocess.CalledProcessError as e:
40
  return False, f"Merge failed: {e}"
41
 
42
 
43
- # Extract questions via OCR and save as JSON
44
- def process_pdf(file, subject, year):
45
- if not file:
46
- return "No file uploaded."
47
- output_path = os.path.join(DATA_DIR, f"spm_{year}_{subject.lower()}.json")
48
- os.makedirs(DATA_DIR, exist_ok=True)
49
- ocr_agent.extract_questions(file, output_path)
50
- ok, msg = merge_questions()
51
- return f"OCR extracted data → {output_path}, merge status: {msg}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
 
 
 
 
 
53
 
54
- # Select questions for simulation
55
- def start_exam(level, subject, num_questions, include_predicted=False):
56
- if not QUESTION_BANK:
57
- return [], "Question bank is empty. Upload PDFs first."
58
 
59
- # Filter by subject
60
- pool = [q for q in QUESTION_BANK if q.get("subject") == subject]
 
 
 
 
 
 
 
61
 
62
- # Add predicted if chosen
63
  if include_predicted:
64
- predicted = predictor.predict(subject=subject, level=level, count=num_questions // 2)
65
- pool.extend(predicted)
66
-
67
- if not pool:
68
- return [], f"No questions available for {subject}."
69
-
70
- selected = random.sample(pool, min(num_questions, len(pool)))
71
- return selected, f"Loaded {len(selected)} questions."
72
-
73
-
74
- # Submit exam
75
- def submit_exam(answers, exam_data, level, subject):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  if not exam_data:
77
- return "No questions found.", {}, {}, {}, gr.update(visible=False), gr.update(visible=True)
78
 
79
  correct = 0
80
- graded = 0 # only grade if answer key exists
81
  per_question = {}
82
 
83
  for q in exam_data:
84
- qid = str(q["id"])
85
- user_ans = answers.get(qid)
 
 
86
  correct_ans = None
87
-
88
- if q["id"] < 900000: # past paper
89
- orig = next((item for item in QUESTION_BANK if item["id"] == q["id"]), None)
90
- correct_ans = orig.get("correct_answer") if orig else None
91
- else: # predicted
92
- correct_ans = q.get("correct_answer")
93
-
94
- per_question[qid] = {"user": user_ans, "correct": correct_ans, "topics": q.get("topics", [])}
95
-
96
- if correct_ans is not None: # only grade if valid answer key exists
 
 
 
 
 
 
 
 
 
97
  graded += 1
98
- if user_ans and str(user_ans).strip() == str(correct_ans).strip():
 
99
  correct += 1
100
 
101
- score = round(100 * correct / graded, 2) if graded > 0 else "N/A (no answer keys)"
102
 
103
  analysis = analyzer.analyze(per_question)
104
- coach = coach_agent.coach(analysis, level, subject)
105
- predictions_summary = predictor.summary(level, subject)
106
 
107
  return (
108
  f"Your Score: {score}%",
109
  analysis,
110
  coach,
111
- predictions_summary,
112
  gr.update(visible=False),
113
  gr.update(visible=True)
114
  )
115
 
116
 
117
- # Prefill subject/year from filename
118
- def auto_detect(file):
119
- if not file:
120
- return None, None
121
- fname = os.path.basename(file).lower()
122
- year = None
123
- subject = None
124
- for y in range(2018, 2025):
125
- if str(y) in fname:
126
- year = str(y)
127
- break
128
- subjects = ["bm", "english", "math", "history", "science", "moralstudies"]
129
- for s in subjects:
130
- if s in fname:
131
- subject = s
132
- break
133
- return year, subject
134
-
135
-
136
- def prefill_subject_year(file):
137
- """Return auto-detected subject/year for UI prefill"""
138
- if not file:
139
- return "BM", "2018"
140
- year, subject = auto_detect(file)
141
-
142
- valid_subjects = ["BM", "English", "Math", "History", "Science", "MoralStudies"]
143
- if subject:
144
- subject = subject.upper()
145
- if subject in ["B.M", "BAHASA", "BAHASAMELAYU"]:
146
- subject = "BM"
147
- if subject not in valid_subjects:
148
- subject = "BM"
149
- else:
150
- subject = "BM"
151
-
152
- return subject, year if year else "2018"
153
-
154
-
155
- # UI
156
  with gr.Blocks() as demo:
157
- gr.Markdown("# 📘 SPM Exam Simulation Platform")
158
-
159
- with gr.Tab("📤 Upload Papers"):
160
- pdf_file = gr.File(label="Upload SPM PDF", type="filepath")
161
- subject_dropdown = gr.Dropdown(
162
- label="Subject", choices=["BM", "English", "Math", "History", "Science", "MoralStudies"], value="BM"
163
- )
164
- year_dropdown = gr.Dropdown(
165
- label="Year", choices=[str(y) for y in range(2018, 2025)], value="2018"
166
- )
167
- upload_btn = gr.Button("Process PDF")
168
- upload_status = gr.Textbox(label="Upload Status")
169
-
170
- upload_btn.click(
171
- process_pdf, inputs=[pdf_file, subject_dropdown, year_dropdown], outputs=upload_status
172
- )
173
- pdf_file.change(prefill_subject_year, inputs=pdf_file, outputs=[subject_dropdown, year_dropdown])
174
-
175
- with gr.Tab("📝 Exam Simulation"):
176
- level = gr.Dropdown(label="Level", choices=["Form2", "Form3", "Higher1", "Higher2", "Higher3"], value="Form3")
177
- subject = gr.Dropdown(
178
- label="Subject", choices=["BM", "English", "Math", "History", "Science", "MoralStudies"], value="Math"
179
- )
180
- num_questions = gr.Slider(5, 50, value=10, step=1, label="Number of Questions")
181
- include_predicted = gr.Checkbox(label="Include AI-predicted questions", value=True)
 
 
 
 
182
  start_btn = gr.Button("Start Exam")
 
183
 
184
- exam_output = gr.State()
185
- exam_interface = gr.JSON(label="Exam Questions")
186
-
187
- start_btn.click(
188
- start_exam, inputs=[level, subject, num_questions, include_predicted], outputs=[exam_interface, upload_status]
189
- )
190
 
191
- with gr.Tab("📊 Results"):
192
- answers_input = gr.JSON(label="Submit Your Answers (JSON: {id: choice})")
193
- submit_btn = gr.Button("Submit Exam")
 
 
 
 
194
 
195
- score_output = gr.Textbox(label="Score")
196
- analysis_output = gr.JSON(label="Weakness Analysis")
197
- coach_output = gr.JSON(label="Personalized Coaching")
198
- predictor_output = gr.JSON(label="Predicted Trends")
199
-
200
- back_btn = gr.Button("← Back to Exam", visible=False)
201
- retry_btn = gr.Button("Retry", visible=False)
202
-
203
- submit_btn.click(
204
- submit_exam,
205
- inputs=[answers_input, exam_interface, level, subject],
206
- outputs=[score_output, analysis_output, coach_output, predictor_output, back_btn, retry_btn],
207
- )
208
 
209
  demo.launch()
210
 
@@ -218,3 +289,4 @@ demo.launch()
218
 
219
 
220
 
 
 
1
import json
import os
import random
import re
import subprocess
import sys

import gradio as gr

from agents import AnalyzerAgent, CoachAgent, PredictiveAgent
from ocr_agent import OcrAgent
 
10
 
11
# ---- Paths & on-disk layout ----
DATA_DIR = "data"
QUESTIONS_FILE = "questions.json"

# Guarantee the per-paper JSON output directory exists before any OCR runs.
os.makedirs(DATA_DIR, exist_ok=True)

# ---- Agent singletons, shared across all requests ----
analyzer = AnalyzerAgent()
coach_agent = CoachAgent()
predictor = PredictiveAgent()
ocr_agent = OcrAgent()
23
 
24
+
25
def load_question_bank(path=None):
    """Load the merged question bank from JSON.

    Args:
        path: JSON file to read; defaults to the module-level QUESTIONS_FILE.

    Returns:
        The parsed question list, or [] when the file is missing, empty,
        or not valid JSON.
    """
    if path is None:
        path = QUESTIONS_FILE
    if not os.path.exists(path):
        return []
    try:
        with open(path, "r", encoding="utf-8") as f:
            content = f.read().strip()
        # An empty/whitespace-only file is treated the same as a missing one.
        return json.loads(content) if content else []
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError; OSError covers read failures.
        return []
35
+
 
 
 
 
36
 
37
# In-memory question bank, loaded once at import; merge_questions() refreshes it.
QUESTION_BANK = load_question_bank()
38
 
39
 
40
+ # ---------------- Merge helper ----------------
41
def merge_questions():
    """Run merge_questions.py to rebuild questions.json and reload the bank.

    Returns:
        (ok, message): ok is True when the merge subprocess succeeded.
    """
    try:
        # Use the current interpreter: a bare "python" may resolve to a
        # different installation (e.g. outside the active virtualenv).
        subprocess.run([sys.executable, "merge_questions.py"], check=True)
        global QUESTION_BANK
        QUESTION_BANK = load_question_bank()
        return True, "Merge successful"
    except subprocess.CalledProcessError as e:
        return False, f"Merge failed: {e}"
50
 
51
 
52
+ # ---------------- OCR / Upload ----------------
53
def auto_detect_from_filename(path):
    """Guess (year, subject_token) from an uploaded file's name.

    Recognises names such as ``spm_2018_bm.pdf`` or ``spm-2019-math.pdf``
    (case-insensitive, underscore or dash separators, separators optional).
    Returns (None, None) when nothing can be detected.
    """
    if not path:
        return None, None
    base_name = os.path.basename(path)
    match = re.search(r"spm[_\-]?(\d{4})[_\-]?([A-Za-z]+)", base_name, re.IGNORECASE)
    if match is None:
        return None, None
    detected_year, raw_subject = match.groups()
    return detected_year, raw_subject.lower()
65
+
66
+
67
# Canonical subject labels, in the order shown in the UI dropdowns.
SUBJECT_DISPLAY_ORDER = [
    "BM",
    "English",
    "Math",
    "History",
    "Science",
    "MoralStudies",
    "Accounting",
    "Economics",
    "Business",
]
69
+
70
+
71
def normalize_display_subject(token):
    """Map a raw subject token to its display label ("bm" -> "BM", etc.).

    A missing token maps to the default subject "BM"; an unrecognised
    token falls back to ``token.capitalize()``.
    """
    if not token:
        return "BM"

    aliases = {
        "bm": "BM",
        "bahasa": "BM",
        "bahasamelayu": "BM",
        "english": "English",
        "eng": "English",
        "math": "Math",
        "mathematics": "Math",
        "history": "History",
        "sejarah": "History",
        "science": "Science",
        "moral": "MoralStudies",
        "moralstudies": "MoralStudies",
        "accounting": "Accounting",
        "economics": "Economics",
        "business": "Business",
    }
    lookup_key = token.strip().lower()
    # NOTE: the fallback capitalizes the *raw* token (not the stripped one),
    # matching the historical behavior of this helper.
    return aliases.get(lookup_key, token.capitalize())
94
+
95
+
96
def subject_token_from_display(display_subj):
    """Convert a display subject label (e.g. "BM") to the lowercase token used in filenames."""
    return display_subj.strip().lower() if display_subj else "bm"
102
+
103
+
104
def process_pdf_and_merge(file_path, display_subject, year):
    """OCR an uploaded SPM PDF into data/ JSON files, then rebuild questions.json.

    The OCR agent writes data/spm_{year}_{subject}.json plus its answer-scheme
    file; afterwards the merge step refreshes the in-memory question bank.

    Returns:
        A human-readable status string for the UI.
    """
    if not file_path:
        return "No file provided."

    token = subject_token_from_display(display_subject)
    try:
        # Delegate extraction and file writing to the OCR agent.
        out_qfile, out_scheme = ocr_agent.extract_questions_to_files(
            pdf_path=file_path,
            year=str(year),
            subject_token=token,
            out_dir=DATA_DIR,
        )
    except Exception as e:
        return f"❌ OCR failed: {e}"

    ok, msg = merge_questions()
    if ok:
        return f"✅ OCR saved {out_qfile} and {out_scheme}. Merge result: {msg}"
    return f"⚠️ OCR saved {out_qfile} and {out_scheme}. Merge result: {msg}"
127
 
 
 
 
 
128
 
129
+ # ---------------- Exam logic ----------------
130
def generate_exam(subject_display, num_questions, include_predicted, question_bank=None):
    """Build a randomized exam for one subject.

    Args:
        subject_display: display subject label (e.g. "Math"); the bank stores
            subjects under the key "Form5_<DisplaySubject>".
        num_questions: maximum number of questions to select.
        include_predicted: when True, mixes in AI-predicted questions.
        question_bank: optional explicit bank; defaults to the module-level
            QUESTION_BANK (parameter added for testability, backward compatible).

    Returns:
        (exam_data, status_message, exam_data) — exam_data is returned twice so
        the caller can both display it and store it in gr.State.
        Items: {id, text, choices, topics, source, [correct_answer]}.
    """
    bank = QUESTION_BANK if question_bank is None else question_bank

    subj_key = f"Form5_{subject_display}"
    pool = [q for q in bank if q.get("subject") == subj_key]

    predicted_questions = []
    if include_predicted:
        # Feed the current bank to the predictor so trend information is used.
        predicted_questions = predictor.generate_predictions(
            level="Form5",
            subject=subject_display,
            n=8,
            question_bank=bank,
        )

    combined = pool + predicted_questions
    if not combined:
        return [], f"No questions available for {subject_display}. Upload papers (2018–2024) first.", []

    random.shuffle(combined)
    # int() guards against a float slider value; slicing clamps automatically.
    selected = combined[:int(num_questions)]

    exam_data = []
    for q in selected:
        item = {
            "id": q.get("id"),
            "text": q.get("text"),
            "choices": q.get("choices", []),
            "topics": q.get("topics", []),
            "source": q.get("source", "pastpaper"),
        }
        # BUG FIX: predicted questions exist only in memory (never saved to
        # questions.json), so their answer key must travel on the exam item
        # itself — otherwise grading can never score them.
        if item["source"] == "predicted" and "correct_answer" in q:
            item["correct_answer"] = q["correct_answer"]
        exam_data.append(item)

    return exam_data, f"Prepared {len(exam_data)} questions (includes {len(predicted_questions)} predicted)", exam_data
168
+
169
+
170
def submit_exam_answers(answers_json, exam_data, subject_display):
    """Grade a submitted exam and produce analysis/coaching/prediction views.

    Args:
        answers_json: dict mapping question id (string) -> chosen answer text;
            may be None when the JSON box was left empty.
        exam_data: the list produced by generate_exam (held in gr.State).
        subject_display: display subject label, e.g. "Math".

    Only questions with a known answer key are graded: past-paper questions
    are looked up by id in QUESTION_BANK; predicted questions are graded from
    the 'correct_answer' carried on the exam item itself (when present).

    Returns:
        (score_text, analysis, coach, predictions_summary,
         gr.update(visible=False), gr.update(visible=True))
    """
    if not exam_data:
        return "No exam data found.", {}, {}, {}, gr.update(visible=False), gr.update(visible=True)

    # BUG FIX: tolerate an unset answers box (None) instead of crashing on .get.
    answers_json = answers_json or {}

    correct = 0
    graded = 0
    per_question = {}

    for q in exam_data:
        qid = q.get("id")
        key = str(qid)
        user_ans = answers_json.get(key)

        if q.get("source") == "predicted":
            # Predicted questions live only in memory; their key (if any)
            # travels on the exam item itself.
            correct_ans = q.get("correct_answer")
        else:
            # Past paper: canonical answer comes from the merged bank.
            orig = next((item for item in QUESTION_BANK if item.get("id") == qid), None)
            correct_ans = orig.get("correct_answer") if orig else None

        per_question[key] = {"user": user_ans, "correct": correct_ans, "topics": q.get("topics", [])}

        if correct_ans is not None:
            graded += 1
            # String-normalized comparison so stray whitespace never misgrades.
            if user_ans is not None and str(user_ans).strip() == str(correct_ans).strip():
                correct += 1

    score = round(100 * correct / graded, 2) if graded > 0 else "N/A (no answer keys available)"

    analysis = analyzer.analyze(per_question)
    coach = coach_agent.coach(analysis, "Form5", subject_display)
    pred_summary = predictor.summary(level="Form5", subject=subject_display, question_bank=QUESTION_BANK)

    return (
        f"Your Score: {score}%",
        analysis,
        coach,
        pred_summary,
        gr.update(visible=False),
        gr.update(visible=True)
    )
227
 
228
 
229
+ # ----------------- UI -----------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
# ----------------- UI -----------------
with gr.Blocks() as demo:
    gr.Markdown("## SPM Exam Simulator — Form 5 (Past papers 2018–2024) with AI Predictions & OCR")

    with gr.Tab("Upload Papers (OCR → JSON → Merge)"):
        pdf_file = gr.File(label="Upload SPM PDF (filename like spm_2018_bm.pdf helps auto-detect)",
                           type="filepath")
        subject_dropdown = gr.Dropdown(choices=SUBJECT_DISPLAY_ORDER, value="BM", label="Subject (override)")
        year_dropdown = gr.Dropdown(choices=[str(y) for y in range(2018, 2025)], value="2018", label="Year")
        process_btn = gr.Button("Process PDF → JSON + Merge")
        ocr_status = gr.Textbox(label="Status", interactive=False)

        def prefill(file_path):
            """Auto-fill subject/year dropdowns from the uploaded filename."""
            if not file_path:
                return "BM", "2018"
            year, subj_token = auto_detect_from_filename(file_path)
            subj_display = normalize_display_subject(subj_token) if subj_token else "BM"
            return subj_display, year if year else "2018"

        pdf_file.change(fn=prefill, inputs=[pdf_file], outputs=[subject_dropdown, year_dropdown])
        process_btn.click(fn=process_pdf_and_merge,
                          inputs=[pdf_file, subject_dropdown, year_dropdown],
                          outputs=[ocr_status])

    with gr.Tab("Exam Simulator"):
        subject_sel = gr.Dropdown(choices=SUBJECT_DISPLAY_ORDER, value="Math", label="Subject")
        num_q = gr.Slider(minimum=5, maximum=50, step=5, value=10, label="Number of Questions")
        include_pred = gr.Checkbox(value=True, label="Include AI-predicted questions (in-memory only)")
        start_btn = gr.Button("Start Exam")
        exam_state = gr.State()  # stores exam_data (list) between events
        # BUG FIX: the status Textbox must be a real component created before
        # wiring — it was previously instantiated inline inside outputs=[...].
        exam_status = gr.Textbox(label="Status", interactive=False)
        exam_display = gr.JSON(label="Exam Questions (read-only)")

        start_btn.click(fn=generate_exam,
                        inputs=[subject_sel, num_q, include_pred],
                        outputs=[exam_display, exam_status, exam_state])

    with gr.Tab("Submit & Results"):
        answers_input = gr.JSON(label="Your Answers (JSON dictionary: {\"<id>\": \"<choice text>\"})")
        submit_btn = gr.Button("Submit Answers")
        score_out = gr.Textbox(label="Score")
        analysis_out = gr.JSON(label="Weakness Analysis")
        coach_out = gr.JSON(label="Study Coach")
        pred_out = gr.JSON(label="Predictions Summary")
        # Targets for the two gr.update(...) values returned by the handler.
        back_btn = gr.Button("← Back to Exam", visible=False)
        retry_btn = gr.Button("Retry", visible=False)

        # BUG FIX: pass the shared exam_state (a fresh gr.State() here would
        # always be empty), and give the two gr.update return values real
        # component targets instead of the non-component gr.Update class.
        submit_btn.click(fn=submit_exam_answers,
                         inputs=[answers_input, exam_state, subject_sel],
                         outputs=[score_out, analysis_out, coach_out, pred_out, back_btn, retry_btn])

demo.launch()
281
 
 
289
 
290
 
291
 
292
+