dwijverma2 commited on
Commit
f7e9181
·
verified ·
1 Parent(s): 2bf07d4

Upload folder using huggingface_hub

Browse files
__pycache__/main.cpython-313.pyc ADDED
Binary file (524 Bytes). View file
 
controllers/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .ingest_file_controller import ingest_file_controller
controllers/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (203 Bytes). View file
 
controllers/__pycache__/ingest_file_controller.cpython-313.pyc ADDED
Binary file (493 Bytes). View file
 
controllers/ingest_file_controller.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from fastapi import UploadFile
2
+ from app.parser import parse_doc
3
+
4
async def ingest_file_controller(file: UploadFile):
    """Parse an uploaded document and return the extracted blocks.

    Args:
        file: The upload received by the ingestion route.

    Returns:
        Whatever ``parse_doc`` produces for the upload (a list of parsed
        blocks).
    """
    # Return the parsed blocks: a bare ``return`` here would discard the
    # parse result and hand ``None`` back to the caller.
    return await parse_doc(file)
main.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI
2
+ from app.routes import ingest_router
3
+
4
# FastAPI application instance.
app = FastAPI()

# Register the routes exposed by ``ingest_router``.
app.include_router(ingest_router)


@app.get("/")
def health_check():
    """Return a static payload signalling that the service responds."""
    status_payload = {"status": "ok"}
    return status_payload
parser/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .parser import parse_doc
parser/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (195 Bytes). View file
 
parser/__pycache__/parser.cpython-313.pyc ADDED
Binary file (8.34 kB). View file
 
parser/claude.py ADDED
@@ -0,0 +1,558 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ layout_aware_parser.py
3
+ -----------------------
4
+ A layout-aware document parser that handles both PDF and Word (.docx) files.
5
+ Detects and tags: TEXT blocks, TABLES, and IMAGES with their positional metadata.
6
+
7
+ Output is a structured list of ParsedBlock objects — ready to feed into a chunking pipeline.
8
+ """
9
+
10
+ import os
11
+ import io
12
+ import json
13
+ import base64
14
+ from enum import Enum
15
+ from dataclasses import dataclass, field, asdict
16
+ from pathlib import Path
17
+ from typing import Optional
18
+
19
+
20
+ # ─────────────────────────────────────────────
21
+ # Data Models
22
+ # ─────────────────────────────────────────────
23
+
24
class BlockType(str, Enum):
    """Kinds of block a parser can emit.

    The ``str`` mix-in makes every member compare equal to its string value.
    """

    TEXT = "text"        # running paragraph text
    TABLE = "table"      # tabular data
    IMAGE = "image"      # embedded picture
    HEADING = "heading"  # section title
29
+
30
+
31
+
32
@dataclass
class ParsedBlock:
    """
    A single logical unit extracted from the document.

    Every block carries enough metadata to reconstruct its position and
    origin for downstream retrieval.
    """
    block_type: BlockType
    content: str                         # Text content OR markdown table OR image caption placeholder
    page_or_index: int                   # Page number (PDF) or element index (DOCX)
    heading_level: Optional[int] = None  # 1-9 for HEADING blocks, None otherwise
    table_data: Optional[list] = None    # Raw 2D list of cell strings for TABLE blocks
    image_bytes: Optional[bytes] = None  # Raw image bytes for IMAGE blocks (save or send to vision model)
    image_format: Optional[str] = None   # e.g. "png", "jpeg"
    source_file: str = ""
    metadata: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Return the block as a plain dict whose image bytes are base64 text.

        ``bytes`` are not JSON-serialisable, so any non-``None`` payload
        (including an empty one) is encoded as a base64 string.
        """
        d = asdict(self)
        # Compare against None rather than truthiness: an empty ``b""``
        # payload is falsy but would still break ``json.dump`` if left as bytes.
        if d["image_bytes"] is not None:
            d["image_bytes"] = base64.b64encode(d["image_bytes"]).decode("ascii")
        return d
55
+
56
+
57
+ # ─────────────────────────────────────────────
58
+ # Helpers
59
+ # ─────────────────────────────────────────────
60
+
61
+ def _table_to_markdown(table_data: list[list[str]]) -> str:
62
+ """Convert a 2D list of cell strings into a Markdown table."""
63
+ if not table_data:
64
+ return ""
65
+ header = table_data[0]
66
+ separator = ["---"] * len(header)
67
+ rows = table_data[1:]
68
+ lines = []
69
+ lines.append("| " + " | ".join(str(c) for c in header) + " |")
70
+ lines.append("| " + " | ".join(separator) + " |")
71
+ for row in rows:
72
+ # Pad short rows to match header width
73
+ padded = list(row) + [""] * (len(header) - len(row))
74
+ lines.append("| " + " | ".join(str(c) for c in padded) + " |")
75
+ return "\n".join(lines)
76
+
77
+
78
+ # ─────────────────────────────────────────────
79
+ # PDF Parser (uses PyMuPDF / fitz)
80
+ # ─────────────────────────────────────────────
81
+
82
def parse_pdf(file_path: str) -> list[ParsedBlock]:
    """
    Parse a PDF file page by page into TABLE, HEADING, TEXT and IMAGE blocks.

    Strategy per page:
      1. Detect tables with ``page.find_tables()`` and read their cells as
         text. If the method is missing (older PyMuPDF) the page is parsed
         without tables.
      2. Read text blocks from ``page.get_text("dict", sort=True)``. A text
         block lying inside a detected table's bbox (2pt tolerance) is
         skipped. A block shorter than 200 characters that is bold or at
         least 14pt becomes a HEADING; everything else is TEXT.
      3. Interleave tables with the text: a table is emitted before the
         first text block whose top edge is at or below the table's top
         edge. This is a heuristic, not a full layout analysis.
      4. Append embedded images (at least 50x50 px) after the page's text
         and tables; their position on the page is not looked up.

    Args:
        file_path: Path of the PDF to parse.

    Returns:
        The blocks of all pages, page after page.
    """
    import logging

    import fitz  # PyMuPDF

    log = logging.getLogger(__name__)
    source = Path(file_path).name
    blocks: list[ParsedBlock] = []

    # The context manager closes the document even when a page fails to parse.
    with fitz.open(file_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            # ── Tables ──────────────────────────────────────────────────
            table_rects = []
            table_blocks: list[ParsedBlock] = []
            try:
                found_tables = page.find_tables().tables
            except AttributeError:
                # find_tables() is only in newer PyMuPDF; degrade gracefully
                found_tables = []
            except Exception:
                log.warning("Table detection failed on page %d of %s",
                            page_num, source, exc_info=True)
                found_tables = []

            for tab in found_tables:
                try:
                    raw_data = tab.extract()  # list[list[str]]
                    bbox = tuple(tab.bbox)
                except Exception:
                    # Do not register the bbox: the text under a table we
                    # could not read must still be emitted as text blocks.
                    log.warning("Could not extract a table on page %d of %s",
                                page_num, source, exc_info=True)
                    continue
                table_rects.append(bbox)
                table_blocks.append(ParsedBlock(
                    block_type=BlockType.TABLE,
                    content=_table_to_markdown(raw_data),
                    page_or_index=page_num,
                    table_data=raw_data,
                    source_file=source,
                    metadata={
                        "bbox": list(bbox),
                        "row_count": len(raw_data),
                        "col_count": len(raw_data[0]) if raw_data else 0,
                    },
                ))
            # Tables waiting to be placed, topmost (then leftmost) first.
            table_blocks.sort(key=lambda b: (b.metadata["bbox"][1], b.metadata["bbox"][0]))

            # ── Text blocks ─────────────────────────────────────────────
            text_dict = page.get_text("dict", sort=True)  # sort=True → reading order
            for block in text_dict.get("blocks", []):
                # Type 1 is an image block; images are handled via get_images below.
                if block.get("type", -1) != 0:
                    continue

                bx0, by0, bx1, by1 = block["bbox"]
                in_table = any(
                    bx0 >= rx0 - 2 and by0 >= ry0 - 2
                    and bx1 <= rx1 + 2 and by1 <= ry1 + 2
                    for (rx0, ry0, rx1, ry1) in table_rects
                )
                if in_table:
                    continue

                # One output line per PDF line; spans of a line are concatenated.
                line_spans = [line.get("spans", []) for line in block.get("lines", [])]
                full_text = "\n".join(
                    "".join(span.get("text", "") for span in spans)
                    for spans in line_spans
                ).strip()
                if not full_text:
                    continue

                spans = [span for group in line_spans for span in group]
                max_font_size = max([0, *(span.get("size", 0) for span in spans)])
                is_bold = any("bold" in span.get("font", "").lower() for span in spans)

                # Emit every table that starts at or above this text block first.
                while table_blocks and table_blocks[0].metadata["bbox"][1] <= by0:
                    blocks.append(table_blocks.pop(0))

                # Heuristic: large or bold short text = heading
                is_heading = (max_font_size >= 14 or is_bold) and len(full_text) < 200
                if is_heading:
                    # Map font size to heading level (rough heuristic)
                    if max_font_size >= 22:
                        h_level = 1
                    elif max_font_size >= 18:
                        h_level = 2
                    elif max_font_size >= 14:
                        h_level = 3
                    else:
                        h_level = 4

                    blocks.append(ParsedBlock(
                        block_type=BlockType.HEADING,
                        content=full_text,
                        page_or_index=page_num,
                        heading_level=h_level,
                        source_file=source,
                        metadata={
                            "font_size": max_font_size,
                            "bold": is_bold,
                            "bbox": list(block["bbox"]),
                        },
                    ))
                else:
                    blocks.append(ParsedBlock(
                        block_type=BlockType.TEXT,
                        content=full_text,
                        page_or_index=page_num,
                        source_file=source,
                        metadata={
                            "font_size": max_font_size,
                            "bbox": list(block["bbox"]),
                        },
                    ))

            # Tables below the last text block (or on a page without text).
            blocks.extend(table_blocks)

            # ── Images ──────────────────────────────────────────────────
            for img_index, img_info in enumerate(page.get_images(full=True)):
                xref = img_info[0]
                base_image = doc.extract_image(xref)
                if not base_image:
                    continue
                width = base_image["width"]
                height = base_image["height"]

                # Skip tiny images (likely decorative icons / bullets)
                if width < 50 or height < 50:
                    continue

                blocks.append(ParsedBlock(
                    block_type=BlockType.IMAGE,
                    content=f"[IMAGE on page {page_num}, index {img_index} — send to vision model for caption]",
                    page_or_index=page_num,
                    image_bytes=base_image["image"],
                    image_format=base_image["ext"],  # e.g. "png", "jpeg"
                    source_file=source,
                    metadata={
                        "width": width,
                        "height": height,
                        "xref": xref,
                        "image_index": img_index,
                    },
                ))

    return blocks
235
+
236
+
237
+ # ─────────────────────────────────────────────
238
+ # DOCX Parser (uses python-docx)
239
+ # ─────────────────────────────────────────────
240
+
241
def parse_docx(file_path: str) -> list[ParsedBlock]:
    """
    Parse a Word (.docx) file by walking the document body in document order
    (paragraphs and tables are siblings under <body>).

    Strategy:
      - Paragraphs whose style name contains 'Heading' → HEADING blocks
      - Other non-empty paragraphs                     → TEXT blocks
      - Table elements                                 → TABLE blocks (cells read as text)
      - <a:blip> image references in a paragraph       → IMAGE blocks, emitted
        before the paragraph's own text block

    Image bytes are read from the .docx zip via the relationships listed in
    word/_rels/document.xml.rels. Failures while reading them are logged and
    the affected images are skipped; the rest of the document is still parsed.

    Args:
        file_path: Path of the .docx file to parse.

    Returns:
        The blocks in body order. ``page_or_index`` is the 1-based position
        of the body child the block came from.
    """
    import logging
    import posixpath
    import xml.etree.ElementTree as ET
    import zipfile

    from docx import Document
    from docx.oxml.ns import qn
    from docx.table import Table
    from docx.text.paragraph import Paragraph

    log = logging.getLogger(__name__)
    doc = Document(file_path)
    blocks: list[ParsedBlock] = []
    source = Path(file_path).name

    # Build a map: relationship_id → (image bytes, lowercase extension).
    image_map: dict[str, tuple[bytes, str]] = {}
    try:
        # ``with`` releases the zip handle as soon as the images are read.
        with zipfile.ZipFile(file_path) as docx_zip:
            rels_tree = ET.fromstring(docx_zip.read("word/_rels/document.xml.rels"))
            for rel in rels_tree:
                if "image" not in rel.get("Type", "").lower():
                    continue
                if rel.get("TargetMode", "").lower() == "external":
                    continue  # linked, not embedded: nothing to read from the zip
                rel_target = rel.get("Target", "")
                # Absolute targets are relative to the package root; all
                # others are relative to the word/ folder (and may use "..").
                if rel_target.startswith("/"):
                    member = rel_target.lstrip("/")
                else:
                    member = posixpath.normpath(posixpath.join("word", rel_target))
                try:
                    img_bytes = docx_zip.read(member)
                except Exception:
                    log.warning("Image part %s not readable in %s", member, source, exc_info=True)
                    continue
                img_ext = Path(rel_target).suffix.lstrip(".").lower()
                image_map[rel.get("Id", "")] = (img_bytes, img_ext)
    except Exception:
        # Best effort: a document without readable relationships simply
        # yields no IMAGE blocks.
        log.warning("Could not read image relationships from %s", source, exc_info=True)

    # Helper: determine heading level from paragraph style
    def heading_level(style_name: str) -> Optional[int]:
        if "Heading" not in style_name:
            return None
        try:
            return int(style_name.split()[-1])
        except ValueError:
            return 1  # heading style without a trailing number

    # Helper: IMAGE blocks for every embedded picture referenced in a paragraph
    def extract_images_from_para(para: Paragraph, index: int) -> list[ParsedBlock]:
        img_blocks = []
        # Search the whole paragraph element, not just para.runs, so pictures
        # nested deeper (e.g. inside a hyperlink) are found as well.
        for blip in para._p.iter(qn("a:blip")):
            r_embed = blip.get(qn("r:embed"))
            if not r_embed or r_embed not in image_map:
                continue
            img_bytes, img_ext = image_map[r_embed]
            img_blocks.append(ParsedBlock(
                block_type=BlockType.IMAGE,
                content=f"[IMAGE at element index {index} — send to vision model for caption]",
                page_or_index=index,
                image_bytes=img_bytes,
                image_format=img_ext,
                source_file=source,
                metadata={
                    "r_embed": r_embed,
                    "element_index": index,
                },
            ))
        return img_blocks

    # Helper: read a docx Table into a 2D list
    def read_table(table: Table) -> list[list[str]]:
        return [[cell.text.strip() for cell in row.cells] for row in table.rows]

    # ── Iterate document body in order ──────────────────────────────────
    # Body children are <w:p> (paragraph), <w:tbl> (table) or other nodes
    # that are counted but not emitted. DOCX has no page numbers at parse
    # time, so the 1-based child position is used instead.
    for elem_index, child in enumerate(doc.element.body, start=1):
        if child.tag == qn("w:p"):
            para = Paragraph(child, doc)
            text = para.text.strip()
            style_name = para.style.name if para.style else ""

            blocks.extend(extract_images_from_para(para, elem_index))

            if not text:
                continue  # empty paragraph (spacer)

            h_level = heading_level(style_name)
            if h_level is not None:
                blocks.append(ParsedBlock(
                    block_type=BlockType.HEADING,
                    content=text,
                    page_or_index=elem_index,
                    heading_level=h_level,
                    source_file=source,
                    metadata={
                        "style": style_name,
                        "element_index": elem_index,
                    },
                ))
            else:
                blocks.append(ParsedBlock(
                    block_type=BlockType.TEXT,
                    content=text,
                    page_or_index=elem_index,
                    source_file=source,
                    metadata={
                        "style": style_name,
                        "element_index": elem_index,
                    },
                ))

        elif child.tag == qn("w:tbl"):
            raw_data = read_table(Table(child, doc))
            blocks.append(ParsedBlock(
                block_type=BlockType.TABLE,
                content=_table_to_markdown(raw_data),
                page_or_index=elem_index,
                table_data=raw_data,
                source_file=source,
                metadata={
                    "element_index": elem_index,
                    "row_count": len(raw_data),
                    "col_count": len(raw_data[0]) if raw_data else 0,
                },
            ))

    return blocks
405
+
406
+
407
+ # ─────────────────────────────────────────────
408
+ # Unified Entry Point
409
+ # ─────────────────────────────────────────────
410
+
411
def parse_document(file_path: str) -> list[ParsedBlock]:
    """
    Dispatch ``file_path`` to the parser matching its (case-insensitive)
    extension and return that parser's list of ParsedBlock objects.

    Raises:
        ValueError: for legacy ``.doc`` files and for any extension other
            than ``.pdf`` / ``.docx``.
    """
    suffix = Path(file_path).suffix.lower()

    # Reject the known-unsupported legacy format with its own message.
    if suffix == ".doc":
        raise ValueError(".doc (legacy format) is not supported. Please convert to .docx first.")

    if suffix == ".pdf":
        return parse_pdf(file_path)
    if suffix == ".docx":
        return parse_docx(file_path)

    raise ValueError(f"Unsupported file type: {suffix}. Supported: .pdf, .docx")
425
+
426
+
427
+ # ─────────────────────────────────────────────
428
+ # Pretty Printer (for development/debugging)
429
+ # ─────────────────────────────────────────────
430
+
431
def print_parse_summary(blocks: list[ParsedBlock], show_content_preview: bool = True) -> None:
    """Print a human-readable summary of what was parsed.

    Prints the number of blocks per type, then one entry per block with its
    type prefix, location and source file.

    Args:
        blocks: Parsed blocks to summarise.
        show_content_preview: When true, also print the first 180 characters
            of each block's content, plus table dimensions and image details.
    """
    from collections import Counter

    counts = Counter(b.block_type for b in blocks)
    rule = "=" * 60

    print(rule)
    print(f" PARSE SUMMARY — {len(blocks)} total blocks")
    print(rule)
    for btype, count in counts.items():
        print(f" {btype.value.upper():<10} {count} block(s)")
    print("-" * 60)

    for i, block in enumerate(blocks):
        prefix = {
            BlockType.HEADING: f"H{block.heading_level}",
            BlockType.TEXT: "TXT",
            BlockType.TABLE: "TBL",
            BlockType.IMAGE: "IMG",
        }.get(block.block_type, "???")

        # Compare case-insensitively: parse_document lowercases the suffix,
        # so "REPORT.PDF" is parsed as a PDF and must be labelled "page=".
        if block.source_file.lower().endswith(".pdf"):
            location = f"page={block.page_or_index}"
        else:
            location = f"idx={block.page_or_index}"

        print(f"\n[{i:03d}] {prefix:<4} {location} source={block.source_file}")

        if not show_content_preview:
            continue

        preview = block.content[:180].replace("\n", " ↵ ")
        print(f" {preview}{'...' if len(block.content) > 180 else ''}")

        if block.block_type == BlockType.TABLE and block.table_data:
            print(f" rows={block.metadata.get('row_count')} cols={block.metadata.get('col_count')}")

        if block.block_type == BlockType.IMAGE:
            size_kb = len(block.image_bytes) / 1024 if block.image_bytes else 0
            print(f" format={block.image_format} size={size_kb:.1f}KB "
                  f"dims={block.metadata.get('width')}x{block.metadata.get('height')}")

    print(rule)
469
+
470
+
471
def save_images(blocks: list[ParsedBlock], output_dir: str = "./parsed_images") -> None:
    """
    Write every IMAGE block that carries bytes into ``output_dir``.

    The directory is created when missing. File names are built from the
    source file's stem, the block's page/index and an image identifier
    (``image_index``, else ``r_embed``, else the running save counter).
    """
    os.makedirs(output_dir, exist_ok=True)

    written = 0
    for blk in blocks:
        # Guard clause: only image blocks with actual payload are saved.
        if blk.block_type != BlockType.IMAGE or not blk.image_bytes:
            continue

        meta = blk.metadata
        image_tag = meta.get("image_index", meta.get("r_embed", written))
        stem = Path(blk.source_file).stem
        extension = blk.image_format or "png"
        fname = f"{stem}_p{blk.page_or_index}_i{image_tag}.{extension}"

        out_path = os.path.join(output_dir, fname)
        with open(out_path, "wb") as handle:
            handle.write(blk.image_bytes)
        print(f" Saved: {out_path}")
        written += 1

    print(f" Total images saved: {written}")
492
+
493
+
494
+ # ─────────────────────────────────────────────
495
+ # Usage Example
496
+ # ─────────────────────────────────────────────
497
+
498
if __name__ == "__main__":
    # Command-line entry point: parse the file given as the first argument,
    # or run a self-test on a generated DOCX when no argument is given.
    import sys

    if len(sys.argv) < 2:
        # Name the script that was actually invoked instead of a fixed name.
        print(f"Usage: python {Path(sys.argv[0]).name} <path_to_file.pdf_or_docx>")
        print("\nRunning self-test with a synthetic DOCX...")

        # ── Self-test: create a tiny DOCX and parse it ──────────────────
        from docx import Document as DocxDoc
        import tempfile

        # delete=False + close(): only the path is needed, so python-docx
        # can reopen the file for writing.
        tmp = tempfile.NamedTemporaryFile(suffix=".docx", delete=False)
        tmp.close()

        try:
            d = DocxDoc()
            d.add_heading("Standard Operating Procedure: Onboarding", level=1)
            d.add_heading("1. Introduction", level=2)
            d.add_paragraph(
                "This SOP outlines the steps required to onboard a new employee "
                "into the organisation. All steps must be followed in order."
            )
            d.add_heading("2. Approval Matrix", level=2)
            t = d.add_table(rows=3, cols=3)
            table_cells = [
                ("Step", "Responsible", "Deadline"),
                ("Send welcome email", "HR", "Day 1"),
                ("Assign laptop", "IT", "Day 1"),
            ]
            for r, row_values in enumerate(table_cells):
                for c, value in enumerate(row_values):
                    t.cell(r, c).text = value
            d.add_heading("3. Process Notes", level=2)
            d.add_paragraph(
                "If the employee requires special equipment, raise a ticket with IT "
                "at least 5 working days before the start date."
            )
            d.save(tmp.name)

            blocks = parse_document(tmp.name)
            print_parse_summary(blocks)
        finally:
            # Remove the temp file even when building or parsing it fails.
            os.unlink(tmp.name)

    else:
        file_path = sys.argv[1]
        print(f"Parsing: {file_path}")
        blocks = parse_document(file_path)
        print_parse_summary(blocks)

        # Optionally save images
        img_blocks = [b for b in blocks if b.block_type == BlockType.IMAGE]
        if img_blocks:
            print(f"\nFound {len(img_blocks)} image(s). Saving to ./parsed_images/")
            save_images(blocks)

        # Optionally dump JSON (written to the current working directory)
        out_json = Path(file_path).stem + "_parsed.json"
        with open(out_json, "w", encoding="utf-8") as f:
            json.dump([b.to_dict() for b in blocks], f, indent=2)
        print(f"\nFull output saved to: {out_json}")
parser/parser.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import UploadFile
2
+ from docx import Document
3
+ from docx.text.paragraph import Paragraph
4
+ from docx.table import Table
5
+ from enum import Enum
6
+ from dataclasses import dataclass, field
7
+ from typing import Optional
8
+ import uuid
9
+ from pathlib import Path
10
+ import zipfile
11
+ import xml.etree.ElementTree as ET
12
+ import io
13
+
14
+
15
class BlockType(str, Enum):
    """Block categories produced by the parser.

    Members are also ``str`` instances, so they compare equal to their values.
    """

    TEXT = "text"        # paragraph text
    TABLE = "table"      # table rendered as markdown
    IMAGE = "image"      # embedded image reference
    HEADING = "heading"  # heading text
20
+
21
@dataclass
class ParsedBlock:
    """One block extracted from a document body (text, table or image)."""

    BLOCK_TYPE: BlockType
    content: str                         # paragraph text, markdown table or image placeholder
    elem_index: int                      # position of the originating child in the document body
    page_index: int                      # NOTE(review): every visible caller passes 0
    heading_level: Optional[int] = None
    table_data: Optional[list] = None    # raw 2D list of cell strings (was mis-annotated as int)
    image_id: Optional[str] = None       # relationship id of the embedded image
    image_format: Optional[str] = None   # file extension of the image, e.g. "png"
    storage_id: Optional[str] = None     # identifier generated per image block
    source_file: str = ""
    metadata: dict = field(default_factory=dict)
34
+
35
+ # The logic depends on the tag encountered: images are inside p tags, and tables are their own tag, just like p.
36
+ #the main return value is the block here, so when we encounter a p tag we append its block to our List -> block
37
+ # So, for reusability, I will define reusable helper functions for each tag.
38
+
39
def para_text(para: Paragraph) -> str:
    """Return the paragraph's text with surrounding whitespace removed."""
    return para.text.strip()
42
+
43
def read_table(table: Table) -> list[list[str]]:
    """Read a table into a 2D list of stripped cell texts.

    Args:
        table: Object exposing ``rows``; each row exposes ``cells`` and each
            cell exposes ``text``.

    Returns:
        One inner list per row, one string per cell (not a single string, as
        the previous ``-> str`` annotation claimed).
    """
    return [[cell.text.strip() for cell in row.cells] for row in table.rows]
51
+
52
+ # Later: add a description for a table along with the table; the same can be done for images.
53
+
54
def table_to_markdown(table_data: list[list[str]]) -> str:
    """Render a 2D list of cells as a Markdown table.

    The first row is the header. All rows use the same ``| a | b |`` layout
    and are padded to the widest row. ``None`` cells render empty, ``|`` is
    escaped and line breaks inside a cell become ``<br>``.

    Args:
        table_data: Rows of cell values; the first row is the header.

    Returns:
        The Markdown text, or ``""`` for an empty table (previously an
        ``IndexError``).
    """
    if not table_data:
        return ""

    def _cell(value) -> str:
        if value is None:
            return ""
        text = str(value).replace("\r\n", "\n").replace("\r", "\n")
        # A raw pipe would open a new column; a raw newline would end the row.
        return text.replace("|", "\\|").replace("\n", "<br>")

    width = max(len(row) for row in table_data)

    def _line(cells: list[str]) -> str:
        padded = cells + [""] * (width - len(cells))
        return "| " + " | ".join(padded) + " |"

    lines = [
        _line([_cell(c) for c in table_data[0]]),
        _line(["---"] * width),
    ]
    lines.extend(_line([_cell(c) for c in row]) for row in table_data[1:])
    return "\n".join(lines)
76
+
77
def get_level_score(para: Paragraph) -> int:
    """Score how heading-like a paragraph looks.

    The score is the sum of:
      * 1 if the paragraph's style is named "SectionHeader";
      * 2 if the average explicit run font size is above 11pt.

    Runs without an explicit size (``run.font.size`` is ``None``) are left
    out of the average instead of raising ``AttributeError``.

    Args:
        para: Paragraph exposing ``runs`` and ``style``.

    Returns:
        An integer score from 0 to 3.
    """
    explicit_sizes = [
        run.font.size.pt for run in para.runs if run.font.size is not None
    ]
    average_size = sum(explicit_sizes) / len(explicit_sizes) if explicit_sizes else 0

    score = 0
    style = para.style
    if style is not None and style.name == "SectionHeader":
        score += 1
    if average_size > 11:
        score += 2
    return score
93
+
94
def get_image_from_para(para: Paragraph, i: int, image_map: dict[str, tuple[bytes, str]]) -> list[ParsedBlock]:
    """Build one IMAGE block per embedded picture referenced in a paragraph.

    Walks the XML of each run looking for ``blip`` elements and keeps those
    whose ``r:embed`` relationship id is present in ``image_map``.

    Args:
        para: Paragraph whose runs are scanned.
        i: Element index recorded on every produced block.
        image_map: Relationship id → (image bytes, extension). Only the
            extension is stored on the block.

    Returns:
        The IMAGE blocks found, in scan order (empty if none).
    """
    embed_attr = "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
    found: list[ParsedBlock] = []

    for run in para.runs:
        for elem in run._r.iter():
            tag = elem.tag
            # Comments / processing instructions have a non-string tag.
            if not isinstance(tag, str) or tag.split("}")[-1] != "blip":
                continue
            r_id = elem.get(embed_attr)
            if not r_id or r_id not in image_map:
                continue
            _, image_ext = image_map[r_id]
            found.append(ParsedBlock(
                BLOCK_TYPE=BlockType.IMAGE,
                content=f"[Image Here, IMG ID - `{r_id}`]",
                elem_index=i,
                page_index=0,
                image_id=r_id,
                image_format=image_ext,
                # The field is declared as str, so store the UUID's text form.
                storage_id=str(uuid.uuid4()),
            ))
    return found
114
+
115
+
116
+
117
async def parse_doc(file: UploadFile) -> list[ParsedBlock]:
    """Parse an uploaded .docx into text, image and table blocks.

    The upload is read fully into memory, opened once with python-docx and
    once as a zip to collect embedded image bytes. The body's children are
    then visited in order: non-empty paragraphs yield a text block, followed
    by image blocks for pictures in that paragraph; tables yield one table
    block containing markdown.

    Args:
        file: The uploaded document.

    Returns:
        The blocks in body order. ``elem_index`` is the 0-based position of
        the body child and ``source_file`` is the upload's filename, if any.
    """
    import posixpath

    payload = await file.read()
    file_stream = io.BytesIO(payload)
    doc = Document(file_stream)
    source = getattr(file, "filename", None) or ""

    # Relationship id → (image bytes, lowercase extension).
    image_map: dict[str, tuple[bytes, str]] = {}
    # ``with`` closes the zip reader once the images have been collected.
    with zipfile.ZipFile(file_stream) as doc_zip:
        try:
            rels_xml = doc_zip.read("word/_rels/document.xml.rels")
        except KeyError:
            rels_xml = None  # no relationships part → no embedded images

        if rels_xml is not None:
            for rel in ET.fromstring(rels_xml):
                if "image" not in rel.get("Type", "").lower():
                    continue
                if rel.get("TargetMode", "").lower() == "external":
                    continue  # linked image: nothing stored in the zip
                rel_target = rel.get("Target", "")
                # Absolute targets are relative to the package root; others
                # are relative to word/ and may contain "..".
                if rel_target.startswith("/"):
                    image_path = rel_target.lstrip("/")
                else:
                    image_path = posixpath.normpath(posixpath.join("word", rel_target))
                try:
                    image_bytes = doc_zip.read(image_path)
                except KeyError:
                    continue  # dangling relationship: skip this image only
                image_ext = Path(rel_target).suffix.lstrip(".").lower()
                image_map[rel.get("Id", "")] = (image_bytes, image_ext)

    blocks: list[ParsedBlock] = []
    for i, child in enumerate(doc.element.body):
        tag = child.tag.split("}")[-1] if isinstance(child.tag, str) else ""

        if tag == "p":
            # Wrap only real paragraph nodes in Paragraph.
            para = Paragraph(child, doc)
            text = para_text(para)
            if text:
                blocks.append(ParsedBlock(
                    BLOCK_TYPE=BlockType.TEXT,
                    content=text,
                    elem_index=i,
                    page_index=0,
                    heading_level=get_level_score(para),
                    source_file=source,
                ))
            for image_block in get_image_from_para(para, i, image_map):
                image_block.source_file = source
                blocks.append(image_block)

        elif tag == "tbl":
            raw_data = read_table(Table(child, doc))
            blocks.append(ParsedBlock(
                BLOCK_TYPE=BlockType.TABLE,
                # A table without rows has no header row to render.
                content=table_to_markdown(raw_data) if raw_data else "",
                elem_index=i,
                page_index=0,
                source_file=source,
            ))

    return blocks
170
+
171
+
172
+
173
if __name__ == "__main__":
    # Manual smoke test: parse a sample document and dump the blocks to a
    # text file.
    import asyncio

    sample_path = "./documents/Champion_Petfoods_CSS_SOP_Final 1 1.docx"

    # parse_doc is a coroutine function that awaits ``file.read()``, so it
    # needs an UploadFile-like object and must be driven by an event loop;
    # calling it directly only creates an un-awaited coroutine.
    with open(sample_path, "rb") as handle:
        upload = UploadFile(file=handle, filename=Path(sample_path).name)
        blocks = asyncio.run(parse_doc(upload))

    with open("output.txt", "w", encoding="utf-8") as file:
        for block in blocks:
            # Write the plain value ("text", "table", ...) whether the block
            # stores an enum member or a raw string.
            block_type = BlockType(block.BLOCK_TYPE).value
            file.write(f"type: {block_type}\n{block.content}\n\n")

    print("Done! Data saved to output.txt")
183
+
184
+
routes/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .ingestion import router as ingest_router
2
+
3
+ __all__ = ["ingest_router"]
routes/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (238 Bytes). View file
 
routes/__pycache__/ingestion.cpython-313.pyc ADDED
Binary file (699 Bytes). View file
 
routes/ingestion.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ingestion route
2
+ # Receive a file from the frontend and call the parsing controller.
3
+ from fastapi import APIRouter, UploadFile, File
4
+ from app.controllers import ingest_file_controller
5
+
6
# Router for the ingestion endpoints; every path is prefixed with /ingest.
router = APIRouter(prefix="/ingest", tags=["ingest"])


@router.post("")
async def ingest_file_route(file: UploadFile = File(...)):
    """Pass the uploaded file to the ingest controller and return its result."""
    return await ingest_file_controller(file)
12
+