Spaces:

AmeenAktharT
/

idp-system

Sleeping

idp-system / src /processor.py

Update src/processor.py

c0a1a00 verified about 1 month ago

1.7 kB

	import pdfplumber
	import docx
	from PIL import Image, ImageOps
	import pytesseract
	import io

	def get_text_from_base64(file_bytes, file_type):
	    """Extract plain text from an in-memory PDF, DOCX, or image file.

	    The bytes are read as-is through an in-memory stream; no base64
	    decoding is performed here.

	    Args:
	        file_bytes: Raw binary content of the file.
	        file_type: File extension or type label. Matching ignores case,
	            surrounding whitespace and leading dots, so ``"pdf"``,
	            ``"PDF"`` and ``".pdf"`` are equivalent. Recognised values
	            are ``pdf``, ``docx``, ``image``, ``jpg``, ``jpeg`` and
	            ``png``.

	    Returns:
	        The extracted text with leading/trailing whitespace removed. An
	        unrecognised ``file_type`` yields an empty string.

	    Raises:
	        ValueError: If anything fails while interpreting ``file_type`` or
	            reading the file. The underlying exception is attached as
	            ``__cause__``.
	    """
	    text = ""
	    # The parsing libraries expect a file-like object rather than raw bytes.
	    file_stream = io.BytesIO(file_bytes)

	    try:
	        # Normalise once so ".PDF", "pdf " and "Pdf" all pick the same branch.
	        # Done inside the try so a non-string file_type still surfaces as
	        # ValueError, as it did before.
	        kind = file_type.strip().lower().lstrip(".")

	        if kind == "pdf":
	            with pdfplumber.open(file_stream) as pdf:
	                page_texts = (page.extract_text() for page in pdf.pages)
	                # Pages for which extract_text() returns nothing are skipped.
	                text = "\n".join(chunk for chunk in page_texts if chunk)

	        elif kind == "docx":
	            doc = docx.Document(file_stream)
	            # Only non-empty body paragraphs are collected.
	            text = "\n".join(p.text for p in doc.paragraphs if p.text)

	        elif kind in ("image", "jpg", "jpeg", "png"):
	            with Image.open(file_stream) as source_image:
	                # Apply the EXIF orientation while still holding the image
	                # as opened from the file, then flatten to RGB (drops
	                # alpha/palette modes) before handing it to Tesseract.
	                upright = ImageOps.exif_transpose(source_image)
	                text = pytesseract.image_to_string(upright.convert("RGB"))

	    except Exception as e:
	        # Single error type for the caller; keep the root cause chained.
	        raise ValueError(f"Processing error ({file_type}): {str(e)}") from e

	    return text.strip()