Spaces:
Runtime error
Runtime error
Update ocr_agent.py
Browse files- ocr_agent.py +17 -14
ocr_agent.py
CHANGED
|
@@ -16,18 +16,20 @@ except Exception:
|
|
| 16 |
pytesseract = None
|
| 17 |
|
| 18 |
|
| 19 |
-
def
|
| 20 |
-
|
| 21 |
-
return
|
| 22 |
|
| 23 |
|
| 24 |
-
def
|
|
|
|
|
|
|
| 25 |
m = re.search(r"\bA[\)\.]\s*", block)
|
| 26 |
if m:
|
| 27 |
start = m.start()
|
| 28 |
qtext = block[:start].strip()
|
| 29 |
-
|
| 30 |
-
items = re.split(r'(?=\b[A-D][\)\.]\s*)',
|
| 31 |
choices = []
|
| 32 |
for it in items:
|
| 33 |
it = it.strip()
|
|
@@ -35,9 +37,10 @@ def _extract_choices(block):
|
|
| 35 |
continue
|
| 36 |
it2 = re.sub(r'^[A-D][\)\.]\s*', '', it).strip()
|
| 37 |
choices.append(it2)
|
| 38 |
-
|
|
|
|
| 39 |
|
| 40 |
-
# fallback
|
| 41 |
lines = block.splitlines()
|
| 42 |
q_lines = []
|
| 43 |
choices = []
|
|
@@ -85,28 +88,27 @@ class OcrAgent:
|
|
| 85 |
if pdfplumber:
|
| 86 |
text = self._extract_pdfplumber(pdf_path)
|
| 87 |
if not text or len(text.strip()) < 120:
|
| 88 |
-
# fallback
|
| 89 |
text = self._extract_tesseract(pdf_path)
|
| 90 |
except Exception:
|
| 91 |
-
text = self._extract_tesseract(pdf_path) if convert_from_path and pytesseract else ""
|
| 92 |
return text
|
| 93 |
|
| 94 |
-
def
|
| 95 |
-
blocks =
|
| 96 |
questions = []
|
| 97 |
for blk in blocks:
|
| 98 |
blk = blk.strip()
|
| 99 |
if not blk:
|
| 100 |
continue
|
| 101 |
blk2 = re.sub(r'^\s*\d+\.\s*', '', blk)
|
| 102 |
-
qtext, choices =
|
| 103 |
qtype = "mcq" if choices else "short_answer"
|
| 104 |
questions.append({"text": qtext, "choices": choices, "question_type": qtype, "topics": [], "difficulty": 3})
|
| 105 |
return questions
|
| 106 |
|
| 107 |
def extract_questions_to_files(self, pdf_path: str, year: str, subject_token: str, out_dir: str = "data"):
|
| 108 |
text = self.extract_text(pdf_path)
|
| 109 |
-
questions = self.
|
| 110 |
|
| 111 |
subject_token = subject_token.lower()
|
| 112 |
qfile = os.path.join(out_dir, f"spm_{year}_{subject_token}.json")
|
|
@@ -126,3 +128,4 @@ class OcrAgent:
|
|
| 126 |
|
| 127 |
|
| 128 |
|
|
|
|
|
|
| 16 |
pytesseract = None
|
| 17 |
|
| 18 |
|
| 19 |
+
def _split_blocks_by_number(text: str):
|
| 20 |
+
# split on lines starting with "1. " or similar
|
| 21 |
+
return re.split(r"\n(?=\s*\d+\.)", text)
|
| 22 |
|
| 23 |
|
| 24 |
+
def _extract_choices_from_block(block: str):
|
| 25 |
+
block = block.strip()
|
| 26 |
+
# look for A) or A. markers
|
| 27 |
m = re.search(r"\bA[\)\.]\s*", block)
|
| 28 |
if m:
|
| 29 |
start = m.start()
|
| 30 |
qtext = block[:start].strip()
|
| 31 |
+
opts_text = block[start:].strip()
|
| 32 |
+
items = re.split(r'(?=\b[A-D][\)\.]\s*)', opts_text)
|
| 33 |
choices = []
|
| 34 |
for it in items:
|
| 35 |
it = it.strip()
|
|
|
|
| 37 |
continue
|
| 38 |
it2 = re.sub(r'^[A-D][\)\.]\s*', '', it).strip()
|
| 39 |
choices.append(it2)
|
| 40 |
+
if choices:
|
| 41 |
+
return qtext, choices
|
| 42 |
|
| 43 |
+
# fallback: lines style
|
| 44 |
lines = block.splitlines()
|
| 45 |
q_lines = []
|
| 46 |
choices = []
|
|
|
|
| 88 |
if pdfplumber:
|
| 89 |
text = self._extract_pdfplumber(pdf_path)
|
| 90 |
if not text or len(text.strip()) < 120:
|
|
|
|
| 91 |
text = self._extract_tesseract(pdf_path)
|
| 92 |
except Exception:
|
| 93 |
+
text = self._extract_tesseract(pdf_path) if (convert_from_path and pytesseract) else ""
|
| 94 |
return text
|
| 95 |
|
| 96 |
+
def parse_questions_from_text(self, raw_text: str):
    """Convert raw text into a list of question dictionaries.

    The text is cut into blocks with ``_split_blocks_by_number``. Blocks
    that are empty after stripping are ignored. For each remaining block
    the leading ``<digits>.`` label is removed and the rest is handed to
    ``_extract_choices_from_block``, which yields the question text and
    its choices.

    Returns:
        A list of dicts with the keys ``text``, ``choices``,
        ``question_type`` ("mcq" when choices were found, otherwise
        "short_answer"), ``topics`` (always an empty list) and
        ``difficulty`` (always 3).
    """
    parsed = []
    for chunk in _split_blocks_by_number(raw_text):
        chunk = chunk.strip()
        if chunk:
            # Drop the "N." label so only the question body is parsed.
            body = re.sub(r'^\s*\d+\.\s*', '', chunk)
            qtext, choices = _extract_choices_from_block(body)
            parsed.append({
                "text": qtext,
                "choices": choices,
                "question_type": "mcq" if choices else "short_answer",
                "topics": [],
                "difficulty": 3,
            })
    return parsed
|
| 108 |
|
| 109 |
def extract_questions_to_files(self, pdf_path: str, year: str, subject_token: str, out_dir: str = "data"):
|
| 110 |
text = self.extract_text(pdf_path)
|
| 111 |
+
questions = self.parse_questions_from_text(text)
|
| 112 |
|
| 113 |
subject_token = subject_token.lower()
|
| 114 |
qfile = os.path.join(out_dir, f"spm_{year}_{subject_token}.json")
|
|
|
|
| 128 |
|
| 129 |
|
| 130 |
|
| 131 |
+
|