Spaces:
Sleeping
Sleeping
File size: 1,698 Bytes
import pdfplumber
import docx
from PIL import Image, ImageOps
import pytesseract
import io
def get_text_from_base64(file_bytes, file_type):
    """
    Extract raw text from an in-memory file according to its type tag.

    Despite the function name, no base64 decoding happens here:
    ``file_bytes`` is wrapped in a ``BytesIO`` and handed to the parsing
    library as-is, so it must already be the raw binary content.

    Args:
        file_bytes: Raw binary content of the file.
        file_type: Case-insensitive type tag. Recognised values are
            ``'pdf'``, ``'docx'`` and, for OCR, ``'image'``, ``'jpg'``,
            ``'jpeg'`` or ``'png'``.

    Returns:
        The extracted text with leading/trailing whitespace stripped.
        An empty string is returned when ``file_type`` is not one of the
        recognised values or when nothing could be extracted.

    Raises:
        ValueError: If anything fails while processing the file. The
            original exception is chained as ``__cause__``.
    """
    text = ""
    # Wrap bytes in a file-like object for the libraries to read
    file_stream = io.BytesIO(file_bytes)

    try:
        # Normalised inside the try block so that a non-string file_type is
        # reported as a ValueError, like every other processing failure.
        kind = file_type.lower()

        if kind == 'pdf':
            with pdfplumber.open(file_stream) as pdf:
                # Pages that yield no text are skipped so they do not add
                # blank lines to the joined result.
                extracted = (page.extract_text() for page in pdf.pages)
                text = "\n".join(chunk for chunk in extracted if chunk)
        elif kind == 'docx':
            # Only non-empty paragraphs contribute to the output.
            doc = docx.Document(file_stream)
            text = "\n".join(p.text for p in doc.paragraphs if p.text)
        elif kind in ('image', 'jpg', 'jpeg', 'png'):
            # The context manager releases the image once OCR is done.
            with Image.open(file_stream) as image:
                # Apply the EXIF orientation on the freshly opened image,
                # where its metadata is still attached, and only then
                # flatten palette/alpha modes (e.g. RGBA) to RGB for OCR.
                upright = ImageOps.exif_transpose(image)
                rgb_image = upright.convert("RGB")
                text = pytesseract.image_to_string(rgb_image)
    except Exception as e:
        # Surface the failure to the caller (the main API reports it) while
        # keeping the original exception available via __cause__.
        raise ValueError(f"Processing error ({file_type}): {str(e)}") from e

    return text.strip()