File size: 1,698 Bytes
c0a1a00
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import pdfplumber
import docx
from PIL import Image, ImageOps
import pytesseract
import io

def get_text_from_base64(file_bytes: bytes, file_type: str) -> str:
    """Extract plain text from an in-memory PDF, DOCX, or image file.

    Despite the name, no base64 decoding happens here: ``file_bytes`` is
    wrapped in a ``BytesIO`` and handed straight to the parsing library, so
    it must already be the raw binary content of the file.

    Args:
        file_bytes: Raw binary content of the file.
        file_type: File kind or extension. Matching is case-insensitive and
            ignores surrounding whitespace and leading dots. Recognized
            values are ``pdf``, ``docx``, and ``image``/``jpg``/``jpeg``/
            ``png`` (the image kinds are read via OCR).

    Returns:
        The extracted text with leading and trailing whitespace removed.
        An unrecognized ``file_type`` yields an empty string, not an error.

    Raises:
        ValueError: If anything fails while reading or parsing the data
            (including a ``file_type`` that is not a string). The original
            exception is attached as ``__cause__``.
    """
    text = ""
    # The parsing libraries expect a file-like object, not raw bytes.
    file_stream = io.BytesIO(file_bytes)

    try:
        # Normalize once so ".PDF", " pdf " and "pdf" all select the same branch.
        kind = file_type.strip().lstrip(".").lower()

        if kind == "pdf":
            with pdfplumber.open(file_stream) as pdf:
                page_texts = (page.extract_text() for page in pdf.pages)
                # Skip pages for which nothing was extracted.
                text = "\n".join(chunk for chunk in page_texts if chunk)

        elif kind == "docx":
            document = docx.Document(file_stream)
            # Only non-empty paragraphs contribute to the output.
            text = "\n".join(p.text for p in document.paragraphs if p.text)

        elif kind in ("image", "jpg", "jpeg", "png"):
            with Image.open(file_stream) as source:
                # Apply the EXIF orientation to the image as decoded, so the
                # rotation does not depend on metadata surviving convert().
                upright = ImageOps.exif_transpose(source)
                # Flatten palette/alpha modes to plain RGB before OCR.
                rgb_image = upright.convert("RGB")
                text = pytesseract.image_to_string(rgb_image)

    except Exception as e:
        # Surface every failure as ValueError so the caller can report it,
        # while keeping the underlying exception reachable via __cause__.
        raise ValueError(f"Processing error ({file_type}): {e}") from e

    return text.strip()