idp-system / src /processor.py
AmeenAktharT's picture
Update src/processor.py
c0a1a00 verified
import pdfplumber
import docx
from PIL import Image, ImageOps
import pytesseract
import io
def get_text_from_base64(file_bytes, file_type):
    """Extract raw text from an in-memory document.

    Despite the name, no base64 decoding happens here: ``file_bytes`` is
    wrapped in a ``BytesIO`` as-is, so it must already be the decoded binary
    content of the file.

    Args:
        file_bytes: Binary content of the document.
        file_type: Case-insensitive type tag. Recognised values are
            ``'pdf'``, ``'docx'`` and, for OCR, ``'image'``, ``'jpg'``,
            ``'jpeg'`` and ``'png'``.

    Returns:
        The extracted text with leading/trailing whitespace stripped. An
        unrecognised ``file_type`` yields an empty string.

    Raises:
        ValueError: If anything goes wrong while reading or parsing the
            data. The underlying exception is attached as ``__cause__``.
    """
    text = ""
    # The parsing libraries expect a file-like object rather than raw bytes.
    file_stream = io.BytesIO(file_bytes)
    try:
        # Normalise once. Kept inside the try so that a non-string
        # file_type is still reported as a ValueError.
        kind = file_type.lower()
        if kind == 'pdf':
            with pdfplumber.open(file_stream) as pdf:
                # Pages with no extractable text are skipped.
                extracted = (page.extract_text() for page in pdf.pages)
                text = "\n".join(content for content in extracted if content)
        elif kind == 'docx':
            doc = docx.Document(file_stream)
            # Only non-empty body paragraphs contribute to the output.
            text = "\n".join(p.text for p in doc.paragraphs if p.text)
        elif kind in ('image', 'jpg', 'jpeg', 'png'):
            with Image.open(file_stream) as image:
                # Apply the EXIF orientation while the image is still the
                # originally opened object; converting first can drop the
                # metadata needed to detect the rotation.
                oriented = ImageOps.exif_transpose(image)
                # Flatten to RGB (drops alpha / palette) before OCR.
                rgb_image = oriented.convert("RGB")
                text = pytesseract.image_to_string(rgb_image)
    except Exception as e:
        # Re-raise as ValueError so the caller can report it, keeping the
        # original exception chained for debugging.
        raise ValueError(f"Processing error ({file_type}): {e}") from e
    return text.strip()