ravimohan19 committed on
Commit
92fd11a
·
verified ·
1 Parent(s): cb4d01a

Upload pdf_extractor.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. pdf_extractor.py +74 -0
pdf_extractor.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PDF text extraction for user-uploaded datasheets.
3
+ Uses PyMuPDF (fitz) for robust PDF parsing.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import logging
9
+ from pathlib import Path
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
def extract_text_from_pdf(file_path: str | Path) -> str:
    """
    Extract all text from a PDF file on disk.

    Pages whose text is empty or whitespace-only are skipped. Every remaining
    page is prefixed with a ``--- Page N ---`` header (N is the 1-based
    position of the page in the document) and the pages are joined with
    newlines.

    This is a best-effort helper: failures are logged, never raised.

    Args:
        file_path: Location of the PDF file.

    Returns:
        The concatenated page text, or ``""`` when PyMuPDF is not installed
        or the file cannot be opened or parsed.
    """
    try:
        import fitz  # PyMuPDF
    except ImportError:
        logger.error("PyMuPDF not installed. Run: pip install pymupdf")
        return ""

    try:
        pages_text = []
        # The context manager closes the document even when a page fails to
        # parse part-way through, so the file handle is never leaked.
        with fitz.open(str(file_path)) as doc:
            for page_num, page in enumerate(doc, start=1):
                text = page.get_text()
                if text.strip():
                    pages_text.append(f"--- Page {page_num} ---\n{text}")
    except Exception as exc:
        # Deliberate catch-all at the API boundary: callers rely on "" rather
        # than an exception. logger.exception keeps the traceback in the log.
        logger.exception("PDF extraction failed for %s: %s", file_path, exc)
        return ""

    full_text = "\n".join(pages_text)
    # NOTE: the page count logged here is the number of pages that yielded
    # text, not the total number of pages in the document.
    logger.info(
        "Extracted %d chars from %d pages of %s",
        len(full_text), len(pages_text), file_path,
    )
    return full_text
44
+
45
+
46
def extract_text_from_bytes(file_bytes: bytes, filename: str = "upload.pdf") -> str:
    """
    Extract text from in-memory PDF bytes (for Gradio file upload handling).

    Pages whose text is empty or whitespace-only are skipped. Every remaining
    page is prefixed with a ``--- Page N ---`` header (N is the 1-based
    position of the page in the document) and the pages are joined with
    newlines.

    This is a best-effort helper: failures are logged, never raised.

    Args:
        file_bytes: Raw content of the PDF.
        filename: Label used only in log messages; it is never opened.

    Returns:
        The concatenated page text, or ``""`` when PyMuPDF is not installed
        or the bytes cannot be parsed as a PDF.
    """
    try:
        import fitz  # PyMuPDF
    except ImportError:
        logger.error("PyMuPDF not installed.")
        return ""

    try:
        pages_text = []
        # The context manager closes the document even when a page fails to
        # parse part-way through, so the underlying resources are released.
        with fitz.open(stream=file_bytes, filetype="pdf") as doc:
            for page_num, page in enumerate(doc, start=1):
                text = page.get_text()
                if text.strip():
                    pages_text.append(f"--- Page {page_num} ---\n{text}")
    except Exception as exc:
        # Deliberate catch-all at the API boundary: callers rely on "" rather
        # than an exception. The filename is included so a failed upload can
        # be identified, and logger.exception keeps the traceback.
        logger.exception("PDF bytes extraction failed for %s: %s", filename, exc)
        return ""

    full_text = "\n".join(pages_text)
    # NOTE: the page count logged here is the number of pages that yielded
    # text, not the total number of pages in the document.
    logger.info(
        "Extracted %d chars from %d pages of %s",
        len(full_text), len(pages_text), filename,
    )
    return full_text