Spaces:
Sleeping
Sleeping
| import pdfplumber | |
| import docx | |
| from PIL import Image, ImageOps | |
| import pytesseract | |
| import io | |
def get_text_from_base64(file_bytes: bytes, file_type: str) -> str:
    """Extract raw text from in-memory file contents.

    Despite the name, ``file_bytes`` is consumed as raw binary data (it is
    wrapped directly in ``io.BytesIO``); no base64 decoding happens here.

    Args:
        file_bytes: Binary contents of the document or image.
        file_type: Case-insensitive type tag. ``'pdf'`` is read with
            pdfplumber, ``'docx'`` with python-docx (paragraph text only),
            and ``'image'``, ``'jpg'``, ``'jpeg'`` or ``'png'`` are run
            through Tesseract OCR.

    Returns:
        The extracted text with leading/trailing whitespace stripped. An
        unrecognized ``file_type`` yields an empty string.

    Raises:
        ValueError: If anything fails while reading or parsing the data.
            The original exception is attached as ``__cause__``.
    """
    text = ""
    # Wrap bytes in a file-like object for the libraries to read.
    file_stream = io.BytesIO(file_bytes)

    try:
        # Normalize once; kept inside the try so a non-string file_type is
        # still reported as a ValueError like every other failure.
        kind = file_type.lower()

        if kind == 'pdf':
            with pdfplumber.open(file_stream) as pdf:
                # extract_text() may return nothing for pages without a
                # text layer; skip those pages.
                page_texts = (page.extract_text() for page in pdf.pages)
                text = "\n".join(content for content in page_texts if content)
        elif kind == 'docx':
            doc = docx.Document(file_stream)
            text = "\n".join(p.text for p in doc.paragraphs if p.text)
        elif kind in ('image', 'jpg', 'jpeg', 'png'):
            # Context manager releases the underlying image file handle.
            with Image.open(file_stream) as opened:
                # Apply the EXIF orientation first: the tag is read from the
                # originally opened image, and converting beforehand can
                # leave it unreachable for some formats.
                upright = ImageOps.exif_transpose(opened)
                # Flatten palette/alpha modes to plain RGB for OCR.
                rgb_image = upright.convert("RGB")
                text = pytesseract.image_to_string(rgb_image)
    except Exception as e:
        # Re-raise as ValueError so the main API can report it, keeping the
        # original exception chained for debugging.
        raise ValueError(f"Processing error ({file_type}): {e}") from e

    return text.strip()