import io

import docx
import pdfplumber
import pytesseract
from PIL import Image, ImageOps

# File-type labels that are routed to the OCR path.
_IMAGE_TYPES = frozenset({"image", "jpg", "jpeg", "png"})


def _extract_pdf_text(stream):
    """Return the text of every PDF page that yields any, joined by newlines."""
    with pdfplumber.open(stream) as pdf:
        # extract_text() may give back nothing for a page; such pages are
        # skipped so they do not contribute blank lines.
        page_texts = (page.extract_text() for page in pdf.pages)
        return "\n".join(content for content in page_texts if content)


def _extract_docx_text(stream):
    """Return the non-empty paragraphs of a DOCX document, joined by newlines."""
    doc = docx.Document(stream)
    # Only doc.paragraphs is read; text stored elsewhere in the document is
    # not collected by this function.
    return "\n".join(p.text for p in doc.paragraphs if p.text)


def _extract_image_text(stream):
    """Run Tesseract OCR on an image and return the recognised text."""
    with Image.open(stream) as source:
        # Apply the EXIF orientation first, while working on the image exactly
        # as it was opened, then flatten to RGB (drops alpha/palette modes)
        # before handing it to the OCR engine.
        upright = ImageOps.exif_transpose(source)
        return pytesseract.image_to_string(upright.convert("RGB"))


def get_text_from_base64(file_bytes, file_type):
    """Extract raw text from in-memory file content.

    Despite the function name, ``file_bytes`` is wrapped directly in
    ``io.BytesIO``, so it must be the binary file content itself rather
    than a base64 string.

    Supported ``file_type`` values (case-insensitive; surrounding
    whitespace and a leading dot are ignored):

    * ``pdf`` -- text-based PDFs, read with pdfplumber.
    * ``docx`` -- Word documents, paragraph text only.
    * ``image``, ``jpg``, ``jpeg``, ``png`` -- images, read via Tesseract OCR.

    Args:
        file_bytes: Binary content of the file.
        file_type: Label or extension selecting the extraction strategy.

    Returns:
        The extracted text with leading and trailing whitespace removed.

    Raises:
        ValueError: If ``file_type`` is not a string, is not one of the
            supported types, or if the underlying library fails while
            processing the content (the original exception is chained as
            ``__cause__``).
        TypeError: Propagated from ``io.BytesIO`` when ``file_bytes``
            cannot be wrapped (for example when a ``str`` is passed).
    """
    # Wrap bytes in a file-like object for the libraries to read.
    file_stream = io.BytesIO(file_bytes)

    if not isinstance(file_type, str):
        raise ValueError(
            f"Processing error ({file_type}): file type must be a string"
        )
    kind = file_type.strip().lower().lstrip(".")

    if kind == "pdf":
        extract = _extract_pdf_text
    elif kind == "docx":
        extract = _extract_docx_text
    elif kind in _IMAGE_TYPES:
        extract = _extract_image_text
    else:
        # An unknown type used to fall through and return "", which callers
        # could not tell apart from a genuinely empty document.
        raise ValueError(f"Unsupported file type: {file_type!r}")

    try:
        text = extract(file_stream)
    except Exception as e:
        # Surface any library failure as ValueError so the calling API can
        # report it, keeping the original exception attached for debugging.
        raise ValueError(f"Processing error ({file_type}): {e}") from e

    return text.strip()