Benny-Tang committed on
Commit
069219d
·
verified ·
1 Parent(s): fc1f560

Update ocr_agent.py

Browse files
Files changed (1) hide show
  1. ocr_agent.py +23 -46
ocr_agent.py CHANGED
@@ -1,55 +1,32 @@
1
- import os
2
- import json
3
  import fitz # PyMuPDF
 
 
4
 
5
class OcrAgent:
    """Extract text from PDFs and turn it into question/scheme JSON files."""

    def __init__(self):
        """Create the agent; it keeps no state."""
        pass

    def extract_from_pdf(self, pdf_path):
        """Return the concatenated text of every page in the PDF.

        Args:
            pdf_path: Path of the PDF file handed to ``fitz.open``.

        Returns:
            The text of all pages joined in page order.
        """
        # Use the document as a context manager so the file handle is
        # released even if text extraction fails part-way through.
        with fitz.open(pdf_path) as doc:
            return "".join(page.get_text() for page in doc)

    def clean_text(self, raw_text):
        """Basic text cleanup (stub): newlines become spaces, ends trimmed."""
        return raw_text.replace("\n", " ").strip()

    def text_to_json(self, text, subject, year, output_dir="data"):
        """Save extracted questions as JSON and create a blank scheme JSON.

        Two files are written into ``output_dir`` (created if missing):
        ``spm_<year>_<subject lowercased>.json`` with the question list and
        ``spm_<year>_<subject lowercased>_scheme.json`` mapping every question
        id to ``{"correct_answer": null}``.

        Args:
            text: Source text; it is split on "." and only the first ten
                chunks are used.
            subject: Subject name, stored as given and lowercased for the
                file name.
            year: Year stored on each question; it is also the prefix of the
                numeric id, so ``f"{year}{i:03}"`` must parse as an int.
            output_dir: Directory that receives both files.

        Returns:
            Path of the questions JSON file.

        Raises:
            ValueError: If ``year`` followed by the index is not a valid int.
        """
        os.makedirs(output_dir, exist_ok=True)

        base_name = f"spm_{year}_{subject.lower()}"
        questions_file = os.path.join(output_dir, f"{base_name}.json")
        scheme_file = os.path.join(output_dir, f"{base_name}_scheme.json")

        # For now, split text into fake MCQs (stub).
        questions = []
        for i, chunk in enumerate(text.split(".")[:10], start=1):
            stripped = chunk.strip()
            questions.append(
                {
                    "id": int(f"{year}{i:03}"),
                    # Empty chunks still get an entry so ids stay contiguous.
                    "text": stripped or f"Question {i} placeholder",
                    "choices": ["A", "B", "C", "D"],
                    "subject": subject,
                    "year": year,
                }
            )

        with open(questions_file, "w", encoding="utf-8") as f:
            json.dump(questions, f, indent=2, ensure_ascii=False)

        # Blank scheme: one entry per question id, answer not yet known.
        scheme_data = {str(q["id"]): {"correct_answer": None} for q in questions}
        with open(scheme_file, "w", encoding="utf-8") as f:
            json.dump(scheme_data, f, indent=2, ensure_ascii=False)

        return questions_file
54
 
55
 
 
 
 
1
  import fitz # PyMuPDF
2
+ import json
3
+ import os
4
 
 
 
 
5
 
6
class OcrAgent:
    """Pull question-like lines out of a PDF and store them as JSON."""

    def extract_questions(self, pdf_path, output_path):
        """Collect question lines from ``pdf_path`` and write them as JSON.

        A line is kept when, after trimming surrounding whitespace, it ends
        with "?" or starts with "Q". Each kept line becomes a record with a
        sequential ``id`` starting at 1000, the trimmed ``text``, placeholder
        ``choices`` A-D, an empty ``topics`` list and a ``correct_answer`` of
        ``None``.

        Args:
            pdf_path: Path of the PDF file handed to ``fitz.open``.
            output_path: Destination JSON file; its parent directory is
                created when the path names one.

        Returns:
            None. The records are written to ``output_path`` and a summary
            line is printed.
        """
        questions = []
        qid = 1000
        # Context manager closes the document even if extraction raises.
        with fitz.open(pdf_path) as doc:
            for page in doc:
                for line in page.get_text("text").splitlines():
                    stripped = line.strip()
                    if stripped.endswith("?") or stripped.startswith("Q"):
                        questions.append(
                            {
                                "id": qid,
                                "text": stripped,
                                "choices": ["A", "B", "C", "D"],
                                "topics": [],
                                "correct_answer": None,
                            }
                        )
                        qid += 1

        # dirname() is "" for a bare file name, and os.makedirs("") raises
        # FileNotFoundError, so only create a directory when one is named.
        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(questions, f, indent=2, ensure_ascii=False)
        print(f"✅ Extracted {len(questions)} questions → {output_path}")
30
 
 
 
 
 
 
 
31
 
32