nkshirsa committed
Commit 08701e5 · verified · 1 Parent(s): 247b2f8

v2.0: phd_research_os_v2/layer0/parser.py

Files changed (1):
  1. phd_research_os_v2/layer0/parser.py +570 -0
phd_research_os_v2/layer0/parser.py ADDED
"""
Layer 0: Structural PDF Ingestion
===================================
Converts PDF bundles into section-aware, bbox-annotated, quality-scored regions.
Parses with PyMuPDF (fitz) when available, falling back to pdfplumber;
Marker availability is detected but not yet wired into the parse dispatch.
"""

import json
import os
import re
from pathlib import Path
from typing import Optional

from ..core.database import (
    get_db, init_db, gen_id, now_iso, to_fixed, from_fixed, hash_text
)


# Section detection patterns for scientific papers
SECTION_PATTERNS = [
    (r'(?i)^(abstract)\b', 'abstract'),
    (r'(?i)^(introduction)\b', 'introduction'),
    (r'(?i)^(background)\b', 'introduction'),
    (r'(?i)^(related\s+work)\b', 'related_work'),
    (r'(?i)^(materials?\s+and\s+methods?|methods?|experimental)\b', 'methods'),
    (r'(?i)^(results?\s+and\s+discussion)\b', 'results_discussion'),
    (r'(?i)^(results?)\b', 'results'),
    (r'(?i)^(discussion)\b', 'discussion'),
    (r'(?i)^(conclusions?|summary)\b', 'conclusion'),
    (r'(?i)^(acknowledge?ments?)\b', 'acknowledgments'),
    (r'(?i)^(references?|bibliography)\b', 'references'),
    (r'(?i)^(supplementary|supporting\s+information|appendix)\b', 'supplement'),
]


def detect_section(text: str) -> Optional[str]:
    """Detect which section a text block belongs to."""
    first_line = text.strip().split('\n')[0].strip()
    # Remove numbering like "2.1", "3.", "III.", etc.
    first_line = re.sub(r'^[\d]+\.[\d]*\s*', '', first_line)
    first_line = re.sub(r'^[\d]+\.\s*', '', first_line)
    first_line = re.sub(r'^[IVXivx]+\.\s*', '', first_line)
    first_line = first_line.strip()

    for pattern, section in SECTION_PATTERNS:
        if re.match(pattern, first_line):
            return section
    return None
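
# For example (hypothetical inputs): detect_section("2.1 Materials and Methods")
# strips the numbering prefix and returns 'methods', while a caption like
# "Figure 3: ..." matches no pattern and returns None.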


def classify_region_type(text: str) -> str:
    """Classify a text block's type based on content patterns."""
    stripped = text.strip()

    # Table detection
    if '|' in stripped and stripped.count('|') > 3:
        return 'table'
    if re.match(r'(?i)^table\s+\d', stripped):
        return 'caption'

    # Figure detection
    if re.match(r'(?i)^(figure|fig\.?)\s+\d', stripped):
        return 'caption'

    # Equation detection (LaTeX or heavy math symbols)
    if stripped.count('$') >= 2 or '\\frac' in stripped or '\\sum' in stripped:
        return 'equation'
    if re.match(r'^\s*\([\d]+\)\s*$', stripped):
        return 'equation'

    # Reference detection
    if re.match(r'(?i)^references?\s*$', stripped) or re.match(r'^\[\d+\]', stripped):
        return 'reference'

    # Header detection (short, possibly bold/caps)
    if len(stripped) < 100 and stripped.isupper():
        return 'header'
    if len(stripped) < 80 and not stripped.endswith('.'):
        section = detect_section(stripped)
        if section:
            return 'header'

    return 'body_text'


def extract_cross_references(text: str) -> list:
    """Extract in-text references to figures, tables, equations."""
    refs = []

    # Figure references
    for m in re.finditer(r'(?i)(figure|fig\.?)\s+(\d+[a-z]?)', text):
        refs.append({
            "ref_text": m.group(0),
            "ref_type": "figure",
            "ref_number": m.group(2),
            "resolved_to": None,
            "verified": False,
        })

    # Table references
    for m in re.finditer(r'(?i)(table)\s+(\d+[a-z]?)', text):
        refs.append({
            "ref_text": m.group(0),
            "ref_type": "table",
            "ref_number": m.group(2),
            "resolved_to": None,
            "verified": False,
        })

    # Equation references
    for m in re.finditer(r'(?i)(eq\.?|equation)\s+\(?(\d+)\)?', text):
        refs.append({
            "ref_text": m.group(0),
            "ref_type": "equation",
            "ref_number": m.group(2),
            "resolved_to": None,
            "verified": False,
        })

    # Citation references [1], [2,3], [1-5]
    for m in re.finditer(r'\[(\d+(?:[,\-–]\s*\d+)*)\]', text):
        refs.append({
            "ref_text": m.group(0),
            "ref_type": "citation",
            "ref_number": m.group(1),
            "resolved_to": None,
            "verified": False,
        })

    return refs
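
# For example (hypothetical input): extract_cross_references("see Fig. 2 and [3]")
# yields a figure ref ("Fig. 2", number "2") and a citation ref ("[3]",
# number "3"), both with resolved_to=None and verified=False.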


def score_parse_quality(text: str, method: str) -> int:
    """Score parsing quality for a text region (fixed-point ×1000)."""
    score = 1000  # Start at perfect

    if not text or not text.strip():
        return 0

    # Penalize: garbled characters (common in bad OCR/parsing)
    garbled_chars = sum(1 for c in text if ord(c) > 65535 or c in '□■◊▪▫●○◆◇')
    garbled_ratio = garbled_chars / max(len(text), 1)
    score -= int(garbled_ratio * 3000)  # Heavy penalty: even 10% garbled → -300
    if garbled_chars > 0:
        score -= garbled_chars * 50  # Additional per-character penalty

    # Penalize: excessive whitespace (column merge artifact)
    ws_ratio = text.count(' ') / max(len(text), 1)
    score -= int(ws_ratio * 200)

    # Penalize: very short fragments (likely parsing artifact)
    if len(text.strip()) < 20:
        score -= 200

    # Penalize: no sentence structure (no periods, likely garbled)
    if len(text) > 100 and '.' not in text:
        score -= 300

    # Bonus: markdown structure preserved (Marker output)
    if method == 'marker' and '#' in text:
        score += 50

    return max(0, min(1000, score))
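
# Worked example: a clean 200-character paragraph with ~15% spaces scores
# 1000 - int(0.15 * 200) = 970, inside the "extractable" band (>= 700) used
# downstream; the same length of text with no period would lose another 300
# and land in the "low_confidence" band (400-699).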


class StructuralParser:
    """
    Layer 0: Parse PDF bundles into annotated regions.

    Tries PyMuPDF (fitz) first, falls back to pdfplumber; Marker availability
    is probed but not yet used in dispatch. Every region gets: section tag,
    bbox, quality score, cross-references.
    """

    def __init__(self, db_path: str = None):
        self.db_path = db_path or os.environ.get("RESEARCH_OS_DB", "data/research_os_v2.db")
        init_db(self.db_path)
        self._marker_available = None
        self._fitz_available = None
        self._pdfplumber_available = None

    def _check_marker(self) -> bool:
        if self._marker_available is None:
            try:
                import marker
                self._marker_available = True
            except ImportError:
                self._marker_available = False
        return self._marker_available

    def _check_fitz(self) -> bool:
        if self._fitz_available is None:
            try:
                import fitz
                self._fitz_available = True
            except ImportError:
                self._fitz_available = False
        return self._fitz_available

    def _check_pdfplumber(self) -> bool:
        if self._pdfplumber_available is None:
            try:
                import pdfplumber
                self._pdfplumber_available = True
            except ImportError:
                self._pdfplumber_available = False
        return self._pdfplumber_available

    def ingest_document(self, file_path: str, doc_type: str = "main",
                        title: str = None, doi: str = None) -> dict:
        """
        Ingest a single document. Returns ingestion summary.
        """
        if not os.path.exists(file_path):
            return {"error": f"File not found: {file_path}", "doc_id": None}

        doc_id = gen_id("DOC")
        conn = get_db(self.db_path)

        # Create document record
        conn.execute("""
            INSERT INTO documents (doc_id, file_path, doc_type, title, doi,
                                   ingestion_status, schema_version, created_at)
            VALUES (?, ?, ?, ?, ?, 'processing', '2.0', ?)
        """, (doc_id, file_path, doc_type, title, doi, now_iso()))
        conn.commit()

        # Parse based on available tools
        regions = []
        parse_method = "unknown"

        if file_path.lower().endswith('.pdf'):
            if self._check_fitz():
                regions, parse_method = self._parse_with_fitz(file_path, doc_id)
            elif self._check_pdfplumber():
                regions, parse_method = self._parse_with_pdfplumber(file_path, doc_id)
            else:
                conn.execute(
                    "UPDATE documents SET ingestion_status = 'failed' WHERE doc_id = ?",
                    (doc_id,)
                )
                conn.commit()
                conn.close()
                return {"error": "No PDF parser available. Install PyMuPDF: pip install pymupdf", "doc_id": doc_id}
        elif file_path.lower().endswith(('.csv', '.xlsx', '.xls')):
            regions, parse_method = self._parse_tabular(file_path, doc_id)
        elif file_path.lower().endswith(('.md', '.txt')):
            regions, parse_method = self._parse_text(file_path, doc_id)
        else:
            regions, parse_method = self._parse_text(file_path, doc_id)

        # Store regions
        for region in regions:
            conn.execute("""
                INSERT INTO regions (region_id, doc_id, page, bbox, region_type,
                                     section, subsection, content_text, content_markdown,
                                     parse_method, parse_confidence, extraction_status,
                                     quality_flags, cross_refs, schema_version, created_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, '2.0', ?)
            """, (
                region["region_id"], doc_id, region["page"],
                json.dumps(region.get("bbox")),
                region["region_type"], region.get("section"), region.get("subsection"),
                region["content_text"], region.get("content_markdown"),
                parse_method, region["parse_confidence"],
                region["extraction_status"],
                json.dumps(region.get("quality_flags", [])),
                json.dumps(region.get("cross_refs", [])),
                now_iso()
            ))

        # Update document status
        avg_quality = sum(r["parse_confidence"] for r in regions) // max(len(regions), 1)
        conn.execute("""
            UPDATE documents SET ingestion_status = 'complete', parse_method = ?,
                parse_quality_avg = ?, total_regions = ?, created_at = ?
            WHERE doc_id = ?
        """, (parse_method, avg_quality, len(regions), now_iso(), doc_id))
        conn.commit()
        conn.close()

        return {
            "doc_id": doc_id,
            "parse_method": parse_method,
            "total_regions": len(regions),
            "avg_quality": from_fixed(avg_quality),
            "regions_by_type": self._count_by_type(regions),
            "sections_found": list(set(r.get("section") for r in regions if r.get("section"))),
        }
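
    # Shape of the returned summary (values illustrative, not real output):
    # {"doc_id": "DOC-...", "parse_method": "fitz", "total_regions": 42,
    #  "avg_quality": 0.87, "regions_by_type": {"body_text": 30, "table": 4},
    #  "sections_found": ["abstract", "methods"]}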

    def _parse_with_fitz(self, file_path: str, doc_id: str) -> tuple:
        """Parse PDF using PyMuPDF (fitz) with section detection."""
        import fitz
        doc = fitz.open(file_path)
        regions = []
        current_section = None

        for page_num in range(len(doc)):
            page = doc[page_num]
            blocks = page.get_text("dict")["blocks"]

            for block in blocks:
                if block["type"] == 0:  # Text block
                    text = ""
                    for line in block.get("lines", []):
                        for span in line.get("spans", []):
                            text += span.get("text", "")
                        text += "\n"

                    text = text.strip()
                    if not text or len(text) < 5:
                        continue

                    # Detect section from headers
                    detected = detect_section(text)
                    if detected:
                        current_section = detected

                    region_type = classify_region_type(text)
                    quality = score_parse_quality(text, "fitz")
                    cross_refs = extract_cross_references(text)

                    # Extraction status based on quality
                    if quality >= 700:
                        status = "extractable"
                    elif quality >= 400:
                        status = "low_confidence"
                    else:
                        status = "unextractable"

                    bbox = block.get("bbox", [0, 0, 0, 0])

                    regions.append({
                        "region_id": gen_id("REG"),
                        "page": page_num + 1,
                        "bbox": list(bbox),
                        "region_type": region_type,
                        "section": current_section,
                        "subsection": None,
                        "content_text": text,
                        "content_markdown": text,
                        "parse_confidence": quality,
                        "extraction_status": status,
                        "quality_flags": [],
                        "cross_refs": cross_refs,
                    })

                elif block["type"] == 1:  # Image block
                    bbox = block.get("bbox", [0, 0, 0, 0])
                    regions.append({
                        "region_id": gen_id("REG"),
                        "page": page_num + 1,
                        "bbox": list(bbox),
                        "region_type": "figure",
                        "section": current_section,
                        "subsection": None,
                        "content_text": "[Image detected — requires VLM processing]",
                        "content_markdown": "![Figure](image)",
                        "parse_confidence": 500,
                        "extraction_status": "low_confidence",
                        "quality_flags": ["image_region_needs_vlm"],
                        "cross_refs": [],
                    })

        doc.close()
        return regions, "fitz"

    def _parse_with_pdfplumber(self, file_path: str, doc_id: str) -> tuple:
        """Fallback parser using pdfplumber."""
        import pdfplumber
        regions = []
        current_section = None

        with pdfplumber.open(file_path) as pdf:
            for page_num, page in enumerate(pdf.pages):
                text = page.extract_text()
                if not text or len(text.strip()) < 10:
                    continue

                # Split into paragraphs
                paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]

                for para in paragraphs:
                    detected = detect_section(para)
                    if detected:
                        current_section = detected

                    region_type = classify_region_type(para)
                    quality = score_parse_quality(para, "pdfplumber")
                    cross_refs = extract_cross_references(para)

                    status = "extractable" if quality >= 700 else "low_confidence" if quality >= 400 else "unextractable"

                    regions.append({
                        "region_id": gen_id("REG"),
                        "page": page_num + 1,
                        "bbox": None,
                        "region_type": region_type,
                        "section": current_section,
                        "subsection": None,
                        "content_text": para,
                        "content_markdown": para,
                        "parse_confidence": quality,
                        "extraction_status": status,
                        "quality_flags": ["no_bbox_available"],
                        "cross_refs": cross_refs,
                    })

                # Extract tables
                tables = page.extract_tables()
                for table in tables:
                    if not table:
                        continue
                    table_text = "\n".join([" | ".join([str(c) if c else "" for c in row]) for row in table])
                    regions.append({
                        "region_id": gen_id("REG"),
                        "page": page_num + 1,
                        "bbox": None,
                        "region_type": "table",
                        "section": current_section,
                        "subsection": None,
                        "content_text": table_text,
                        "content_markdown": table_text,
                        "parse_confidence": 700,
                        "extraction_status": "extractable",
                        "quality_flags": ["table_extracted"],
                        "cross_refs": [],
                    })

        return regions, "pdfplumber"

    def _parse_tabular(self, file_path: str, doc_id: str) -> tuple:
        """Parse CSV/Excel files as data regions."""
        regions = []
        try:
            if file_path.endswith('.csv'):
                with open(file_path) as f:
                    text = f.read()
            else:
                text = f"[Excel file: {os.path.basename(file_path)} — requires pandas for full parsing]"

            regions.append({
                "region_id": gen_id("REG"),
                "page": 1,
                "bbox": None,
                "region_type": "table",
                "section": "data",
                "subsection": None,
                "content_text": text[:10000],
                "content_markdown": text[:10000],
                "parse_confidence": 900,
                "extraction_status": "extractable",
                "quality_flags": ["tabular_data"],
                "cross_refs": [],
            })
        except Exception as e:
            regions.append({
                "region_id": gen_id("REG"),
                "page": 1, "bbox": None, "region_type": "body_text",
                "section": None, "subsection": None,
                "content_text": f"Error reading file: {e}",
                "content_markdown": "", "parse_confidence": 0,
                "extraction_status": "unextractable",
                "quality_flags": ["parse_error"], "cross_refs": [],
            })
        return regions, "tabular"

    def _parse_text(self, file_path: str, doc_id: str) -> tuple:
        """Parse plain text or markdown files."""
        regions = []
        try:
            with open(file_path, encoding='utf-8', errors='replace') as f:
                text = f.read()

            paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
            current_section = None

            for para in paragraphs:
                detected = detect_section(para)
                if detected:
                    current_section = detected

                regions.append({
                    "region_id": gen_id("REG"),
                    "page": 1, "bbox": None,
                    "region_type": classify_region_type(para),
                    "section": current_section, "subsection": None,
                    "content_text": para, "content_markdown": para,
                    "parse_confidence": 900,
                    "extraction_status": "extractable",
                    "quality_flags": [], "cross_refs": extract_cross_references(para),
                })
        except Exception as e:
            regions.append({
                "region_id": gen_id("REG"),
                "page": 1, "bbox": None, "region_type": "body_text",
                "section": None, "subsection": None,
                "content_text": f"Error: {e}", "content_markdown": "",
                "parse_confidence": 0, "extraction_status": "unextractable",
                "quality_flags": ["parse_error"], "cross_refs": [],
            })
        return regions, "text"

    def _count_by_type(self, regions: list) -> dict:
        counts = {}
        for r in regions:
            t = r["region_type"]
            counts[t] = counts.get(t, 0) + 1
        return counts

    def get_extractable_regions(self, doc_id: str) -> list:
        """Get all extractable regions for a document, ordered by page."""
        conn = get_db(self.db_path)
        rows = conn.execute("""
            SELECT * FROM regions
            WHERE doc_id = ? AND extraction_status = 'extractable'
              AND region_type IN ('body_text', 'table', 'caption')
            ORDER BY page, region_id
        """, (doc_id,)).fetchall()
        conn.close()

        results = []
        for r in rows:
            d = dict(r)
            d["cross_refs"] = json.loads(d.get("cross_refs", "[]"))
            d["quality_flags"] = json.loads(d.get("quality_flags", "[]"))
            d["bbox"] = json.loads(d["bbox"]) if d.get("bbox") else None
            results.append(d)
        return results

    def get_section_chunks(self, doc_id: str) -> list:
        """
        Get section-aware chunks for extraction.
        Merges consecutive body_text regions in the same section.
        """
        regions = self.get_extractable_regions(doc_id)
        chunks = []
        current_chunk = None

        for region in regions:
            section = region.get("section") or "unknown"

            if (current_chunk and
                    current_chunk["section"] == section and
                    region["region_type"] == "body_text" and
                    len(current_chunk["text"]) < 3000):
                # Merge into current chunk
                current_chunk["text"] += "\n\n" + region["content_text"]
                current_chunk["region_ids"].append(region["region_id"])
                current_chunk["min_confidence"] = min(
                    current_chunk["min_confidence"], region["parse_confidence"]
                )
            else:
                # Start new chunk
                if current_chunk:
                    chunks.append(current_chunk)
                current_chunk = {
                    "chunk_id": gen_id("CHK"),
                    "doc_id": doc_id,
                    "section": section,
                    "text": region["content_text"],
                    "region_ids": [region["region_id"]],
                    "page": region["page"],
                    "min_confidence": region["parse_confidence"],
                    "region_type": region["region_type"],
                }

        if current_chunk:
            chunks.append(current_chunk)

        return chunks
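

# Minimal usage sketch. The PDF path is a placeholder, and the database
# location falls back to the module default unless RESEARCH_OS_DB is set.
# Because of the relative import above, run this as a module, e.g.
# `python -m phd_research_os_v2.layer0.parser`, not as a bare script.
if __name__ == "__main__":
    parser = StructuralParser()
    summary = parser.ingest_document("paper.pdf", doc_type="main")
    print(json.dumps(summary, indent=2))
    if summary.get("doc_id") and not summary.get("error"):
        for chunk in parser.get_section_chunks(summary["doc_id"]):
            print(chunk["section"], chunk["region_type"], chunk["min_confidence"])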