Spaces:

bk939448
/

ocr-api

Paused

ocr-api / app.py

🚀 Final OCR API with format filter and multi-page PDF

6298ba6 10 months ago

1.3 kB

	from fastapi import FastAPI, UploadFile, File
	from fastapi.responses import JSONResponse
	from pdf2image import convert_from_bytes
	from PIL import Image
	import pytesseract
	import io

	# FastAPI application object; the /ocr route below is registered on it
	# through the @app.post decorator.
	app = FastAPI()

	def _run_ocr(contents, is_pdf):
		"""Return the OCR text (Hindi + English) for raw PDF or image bytes.

		Synchronous and CPU-bound: meant to be called from a worker thread.
		For a PDF every page is rasterised and OCR'd, and the page texts are
		joined with a blank line between them.
		"""
		if is_pdf:
			pages = convert_from_bytes(contents)
			return "\n\n".join(
				pytesseract.image_to_string(page, lang="hin+eng") for page in pages
			)

		# Context manager releases the decoder's resources once OCR is done.
		with Image.open(io.BytesIO(contents)) as image:
			return pytesseract.image_to_string(image, lang="hin+eng")


	@app.post("/ocr")
	async def extract_text(file: UploadFile = File(...)):
		"""Extract text from an uploaded JPG, PNG or PDF file using Tesseract.

		Args:
			file: The uploaded file. Its name must end in .jpg, .jpeg, .png or
				.pdf (case-insensitive).

		Returns:
			``{"text": ...}`` with the stripped OCR output (or a placeholder
			message when nothing was recognised); a 400 JSON response for an
			unsupported or missing file name; a 500 JSON response with the
			exception message when conversion or OCR fails.
		"""
		# Local import keeps this block self-contained; fastapi is already a
		# dependency of this module.
		from fastapi.concurrency import run_in_threadpool

		# The upload may arrive without a filename; treat that as unsupported
		# instead of crashing on None.lower().
		filename = (file.filename or "").lower()
		allowed_ext = (".jpg", ".jpeg", ".png", ".pdf")

		if not filename.endswith(allowed_ext):
			return JSONResponse(
				content={"error": "❌ Unsupported file format! Please upload JPG, PNG, or PDF."},
				status_code=400,
			)

		contents = await file.read()

		try:
			# PDF rasterisation and Tesseract are blocking calls; run them in a
			# worker thread so the event loop keeps serving other requests.
			extracted_text = await run_in_threadpool(
				_run_ocr, contents, filename.endswith(".pdf")
			)
		except Exception as e:
			# Endpoint boundary: report any processing failure as a 500 payload.
			return JSONResponse(
				content={"error": "🚫 Failed to process file", "details": str(e)},
				status_code=500,
			)

		return {"text": extracted_text.strip() or "⚠️ No text found."}