Vaibhav Gaikwad committed on
Commit
a80a32e
·
1 Parent(s): 31e30cc

deploy audiolens backend — dit + easyocr + kokoro

Browse files
Files changed (3) hide show
  1. app.py +308 -4
  2. j2_preprocess.py +127 -0
  3. requirements.txt +13 -0
app.py CHANGED
@@ -1,7 +1,311 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
 
1
+ """
2
+ audiolens — app.py
3
+ huggingface space backend (zerogpu + fastapi + gradio)
4
+
5
+ endpoints:
6
+ POST /classify — document type classification (dit-base)
7
+ POST /ocr — text extraction (easyocr)
8
+ POST /speak — text to speech (kokoro)
9
+
10
+ preprocessing (opencv) runs inline — no separate endpoint needed.
11
+ llm extraction (gemini) is called directly from the pwa — not here.
12
+
13
+ models load once at startup into cpu ram (except easyocr which
14
+ lazy-inits inside the gpu function so it can bind to cuda).
15
+ gpu is grabbed per-request via @spaces.GPU and released immediately after.
16
+ """
17
+
18
+ import io
19
+ import os
20
+ import tempfile
21
+ import warnings
22
+ warnings.filterwarnings('ignore')
23
+
24
+ import numpy as np
25
+ import cv2
26
+ from PIL import Image
27
+
28
+ import torch
29
+ import spaces
30
  import gradio as gr
31
+ from fastapi import FastAPI, File, UploadFile, HTTPException
32
+ from fastapi.middleware.cors import CORSMiddleware
33
+ from fastapi.responses import JSONResponse, FileResponse
34
+ from pydantic import BaseModel
35
+ from starlette.background import BackgroundTask
36
+
37
+ from j2_preprocess import preprocess
38
+
39
+
40
# ============================================================
# -- app setup --
# ============================================================

app = FastAPI(title='audiolens api')

# allow pwa to call from any origin
# NOTE(review): allow_origins=['*'] is wide open — acceptable for a public
# demo space, but consider pinning to the pwa's origin in production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=['*'],
    allow_methods=['*'],
    allow_headers=['*'],
)

# dit maps its 16 rvl-cdip classes to audiolens categories
# indices must match the 9 classes we trained with in j1
DIT_CLASS_MAP = {
    0: 'letter',
    1: 'form',
    2: 'email',
    3: 'handwritten',
    4: 'advertisement',
    7: 'specification',
    9: 'news_article',
    10: 'budget',
    11: 'invoice',
}
# rvl-cdip indices we keep, in dict insertion order — used to slice logits
SELECTED_RVL_IDX = list(DIT_CLASS_MAP.keys())
68
+
69
+
70
# ============================================================
# -- model loading (runs once at startup, cpu ram) --
# ============================================================

print('loading models...')

# -- classifier: dit-base (loads to cpu at startup) --
from transformers import AutoImageProcessor, AutoModelForImageClassification

dit_processor = AutoImageProcessor.from_pretrained('microsoft/dit-base-finetuned-rvlcdip')
dit_model = AutoModelForImageClassification.from_pretrained('microsoft/dit-base-finetuned-rvlcdip')
dit_model.eval()  # inference mode — disables dropout
print('dit-base loaded.')

# -- ocr: easyocr (lazy-init inside gpu function so it binds to cuda) --
ocr_reader = None  # populated on first /ocr request inside _run_ocr_gpu
print('easyocr will lazy-init on first ocr request.')

# -- tts: kokoro (loads to cpu at startup) --
import soundfile as sf
from kokoro import KPipeline
kokoro_pipeline = KPipeline(lang_code='b')  # b = british english
print('kokoro loaded.')

print('all models ready.')
95
+
96
+
97
# ============================================================
# -- request schemas --
# ============================================================

class SpeakRequest(BaseModel):
    """json body for POST /speak."""
    # text to synthesise — emptiness is validated in the endpoint, not here
    text: str
    # kokoro voice id — default 'bf_emma' is the british-female voice
    voice: str = 'bf_emma'
104
+
105
+
106
+ # ============================================================
107
+ # -- helpers --
108
+ # ============================================================
109
+
110
def bytes_to_pil(image_bytes):
    """decodes raw image bytes into an rgb pil image."""
    buffer = io.BytesIO(image_bytes)
    decoded = Image.open(buffer)
    return decoded.convert('RGB')
113
+
114
+
115
def bytes_to_cv2(image_bytes):
    """decodes raw image bytes into a bgr numpy array for opencv."""
    raw = np.frombuffer(image_bytes, np.uint8)
    decoded = cv2.imdecode(raw, cv2.IMREAD_COLOR)
    if decoded is None:
        raise ValueError('could not decode image — check the file format')
    return decoded
122
+
123
+
124
+ # ============================================================
125
+ # -- endpoint: health check --
126
+ # ============================================================
127
+
128
@app.get('/health')
async def health():
    """simple ping to check if the space is warm."""
    payload = {'status': 'ok', 'models': ['dit-base', 'easyocr', 'kokoro']}
    return payload
132
+
133
+
134
+ # ============================================================
135
+ # -- endpoint: classify --
136
+ # classifies the document type from the uploaded image.
137
+ # uses zerogpu for inference, releases gpu immediately after.
138
+ # ============================================================
139
+
140
@spaces.GPU
def _run_classify(pil_image):
    """runs dit-base inference on gpu. called inside classify endpoint."""
    dit_model.to('cuda')
    batch = dit_processor(images=pil_image, return_tensors='pt').to('cuda')

    with torch.no_grad():
        output = dit_model(**batch)

    # no need to move back to cpu — zerogpu reclaims on function exit

    # restrict scoring to the 9 rvl-cdip classes audiolens uses
    scores = output.logits[0, SELECTED_RVL_IDX]
    winner = scores.argmax().item()
    probs = torch.softmax(scores, dim=0)
    label = DIT_CLASS_MAP[SELECTED_RVL_IDX[winner]]
    return label, round(probs[winner].item(), 4)
157
+
158
+
159
@app.post('/classify')
async def classify(file: UploadFile = File(...)):
    """
    classifies a document image into one of 9 categories.

    returns:
        doc_type — e.g. 'invoice', 'letter', 'form'
        confidence — float 0–1
    """
    try:
        payload = await file.read()
        if not payload:
            raise HTTPException(status_code=400, detail='empty file uploaded')

        label, score = _run_classify(bytes_to_pil(payload))
        return JSONResponse({'doc_type': label, 'confidence': score})

    except HTTPException:
        # re-raise our own 4xx untouched
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
181
+
182
+
183
+ # ============================================================
184
+ # -- endpoint: ocr --
185
+ # preprocesses the image (cpu, outside gpu) then runs easyocr
186
+ # on gpu via zerogpu. easyocr lazy-inits on first call so it
187
+ # binds to the cuda device provided by zerogpu.
188
+ # ============================================================
189
+
190
@spaces.GPU
def _run_ocr_gpu(clean_image):
    """runs easyocr inference on gpu. reader lazy-inits on first call."""
    global ocr_reader

    # first call constructs the reader inside the gpu context so it
    # binds to the cuda device zerogpu hands us
    if ocr_reader is None:
        import easyocr
        ocr_reader = easyocr.Reader(['en'], gpu=True, verbose=False)
        print('easyocr initialised on gpu.')

    # detail=0 returns plain strings only (no boxes/confidences)
    lines = ocr_reader.readtext(clean_image, detail=0)
    return ' '.join(lines)
201
+
202
+
203
@app.post('/ocr')
async def ocr(file: UploadFile = File(...)):
    """
    extracts all text from a document image.
    preprocessing (deskew, denoise, contrast, binarise) is applied first.

    returns:
        text — raw extracted text string
    """
    try:
        payload = await file.read()
        if not payload:
            raise HTTPException(status_code=400, detail='empty file uploaded')

        # preprocessing runs on cpu — outside the gpu-decorated function
        cleaned = preprocess(bytes_to_cv2(payload))

        # ocr inference on gpu
        extracted = _run_ocr_gpu(cleaned)
        return JSONResponse({'text': extracted})

    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
230
+
231
+
232
+ # ============================================================
233
+ # -- endpoint: speak --
234
+ # converts text to speech using kokoro and returns a wav file.
235
+ # kokoro runs on gpu via zerogpu.
236
+ # temp wav file is cleaned up after the response is sent.
237
+ # ============================================================
238
+
239
@spaces.GPU(duration=30)
def _run_tts(text, voice='bf_emma'):
    """runs kokoro tts on gpu. called inside speak endpoint."""
    # the pipeline yields (graphemes, phonemes, audio) per chunk;
    # we only keep the audio arrays
    pieces = [audio for _, _, audio in kokoro_pipeline(text, voice=voice, speed=1.0)]
    if not pieces:
        return None
    return np.concatenate(pieces)
248
+
249
+
250
@app.post('/speak')
async def speak(req: SpeakRequest):
    """
    converts text to speech using kokoro.

    json body:
        text — the text to synthesise
        voice — kokoro voice id (default: bf_emma — british female)

    returns:
        audio/wav file
    """
    try:
        if not req.text or not req.text.strip():
            raise HTTPException(status_code=400, detail='text cannot be empty')

        samples = _run_tts(req.text, req.voice)
        if samples is None:
            raise HTTPException(status_code=500, detail='tts produced no audio')

        # write wav to a temp file — 24000 hz is kokoro's output rate
        wav_file = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
        sf.write(wav_file.name, samples, 24000)
        wav_file.close()

        # return the file; the background task deletes it once the
        # response has been fully sent
        return FileResponse(
            wav_file.name,
            media_type='audio/wav',
            filename='audiolens_output.wav',
            background=BackgroundTask(os.unlink, wav_file.name),
        )

    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
288
+
289
+
290
# ============================================================
# -- gradio ui --
# minimal gradio interface — required by zerogpu.
# pwa users never see this. it just satisfies the hf spaces sdk.
# ============================================================

with gr.Blocks() as gradio_ui:
    gr.Markdown("""
    ## AudioLens API
    **This space provides the AudioLens backend API.**
    Use the endpoints below from the AudioLens PWA:
    - `POST /classify` — document type classification
    - `POST /ocr` — text extraction
    - `POST /speak` — text to speech
    - `GET /health` — check if space is warm
    """)

    gr.Markdown("_This UI is for reference only. The AudioLens PWA calls the API directly._")


# mount fastapi on gradio — zerogpu requires gradio sdk
# NOTE(review): this rebinds `app`; the fastapi routes above remain mounted,
# gradio itself is served under /gradio.
app = gr.mount_gradio_app(app, gradio_ui, path='/gradio')
j2_preprocess.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ audiolens — j2 image preprocessing
3
+
4
+ prepares a raw phone-captured document image for ocr.
5
+ each preprocessing step is a separate function so they can be
6
+ tested, tuned, or swapped out individually as needed.
7
+
8
+ pipeline order:
9
+ 1. to_grayscale — converts colour input to grayscale
10
+ 2. deskew — corrects tilt from phone capture angle
11
+ 3. denoise — removes grain and compression artifacts
12
+ 4. enhance_contrast — applies clahe for local contrast improvement
13
+ 5. binarise — converts to clean black/white via otsu threshold
14
+ 6. preprocess — runs all steps in order (main entry point)
15
+
16
+ no downloads needed. import preprocess() directly into the pipeline.
17
+ """
18
+
19
+ import numpy as np
20
+ import cv2
21
+
22
+
23
def to_grayscale(image):
    """
    converts a colour image to grayscale.

    handles both 3-channel (bgr) and 4-channel (bgra, e.g. a png with an
    alpha channel) inputs — previously a 4-channel image fell through
    unconverted and broke the downstream `h, w = gray.shape` unpack.
    an already-grayscale image is returned as a copy so callers can
    safely mutate the result.
    """
    if image.ndim == 3:
        if image.shape[2] == 4:
            # drop the alpha channel while converting
            return cv2.cvtColor(image, cv2.COLOR_BGRA2GRAY)
        if image.shape[2] == 3:
            return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return image.copy()
31
+
32
+
33
def deskew(gray):
    """
    detects and corrects the dominant tilt angle of the document.
    common when a user photographs a document at a slight angle.

    uses the minimum area bounding box of dark pixel clusters to
    estimate the skew angle, then rotates to correct it.
    angles under 0.5 degrees are ignored to avoid introducing
    unnecessary interpolation artifacts on near-straight images.
    """
    # (row, col) positions of dark pixels. minAreaRect requires a
    # float32/int32 point array — np.where produces int64, which
    # opencv rejects with a type error, so cast explicitly.
    coords = np.column_stack(np.where(gray < 128)).astype(np.float32)

    # not enough dark pixels to estimate angle reliably
    if len(coords) < 50:
        return gray

    angle = cv2.minAreaRect(coords)[-1]

    # normalise to [-45, 45]. opencv < 4.5 reports angles in [-90, 0);
    # opencv >= 4.5 (we pin >= 4.8) reports (0, 90], so a near-straight
    # page comes back as ~90 — without the second branch it would be
    # rotated a full quarter turn. handle both conventions.
    if angle < -45:
        angle += 90
    elif angle > 45:
        angle -= 90

    # skip tiny corrections
    if abs(angle) < 0.5:
        return gray

    h, w = gray.shape
    center = (w // 2, h // 2)
    matrix = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(
        gray, matrix, (w, h),
        flags=cv2.INTER_CUBIC,
        borderMode=cv2.BORDER_REPLICATE,
    )
    return rotated
68
+
69
+
70
def denoise(gray):
    """
    removes noise, grain, and jpeg compression artifacts from the image.
    opencv's non-local means denoising works well on document scans and
    phone camera captures without blurring text edges.

    a filter strength of 10 is conservative — enough to clean grain but
    not so aggressive that it softens thin strokes in small text.
    """
    strength = 10
    return cv2.fastNlMeansDenoising(gray, h=strength)
80
+
81
+
82
def enhance_contrast(gray):
    """
    applies clahe (contrast limited adaptive histogram equalisation).
    clahe equalises small tiles independently, so it copes with uneven
    lighting — e.g. a shadow across part of a medicine label or a
    receipt photographed in dim light — where global equalisation fails.

    clipLimit=2.0 stops noise in flat regions being over-amplified;
    tileGridSize=(8, 8) balances local against global correction.
    """
    equaliser = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    return equaliser.apply(gray)
94
+
95
+
96
def binarise(gray):
    """
    converts the grayscale image to clean black and white.
    otsu's method picks the threshold automatically from the intensity
    histogram, so no manual tuning is needed.

    binarisation strips remaining grey tones, producing the
    high-contrast input that ocr models perform best on.
    """
    mode = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    _, black_and_white = cv2.threshold(gray, 0, 255, mode)
    return black_and_white
110
+
111
+
112
def preprocess(image):
    """
    runs the full preprocessing pipeline on a raw document image.
    this is the main entry point called from the audiolens pipeline.

    input: numpy array — bgr colour or grayscale, any resolution
    output: numpy array — grayscale binarised image, same resolution

    pipeline: grayscale → deskew → denoise → enhance_contrast → binarise
    """
    steps = (to_grayscale, deskew, denoise, enhance_contrast, binarise)
    result = image
    for step in steps:
        result = step(result)
    return result
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi>=0.110.0,<1.0
2
+ gradio>=6.9.0,<7.0
3
+ spaces>=0.28.0
4
+ uvicorn>=0.27.0,<1.0
5
+ python-multipart>=0.0.7
6
+ transformers>=4.35.0,<4.50
7
+ torch>=2.1.0,<2.5
8
+ easyocr>=1.7.0,<1.8
9
+ kokoro>=0.9.0
10
+ soundfile>=0.12.0
11
+ opencv-python-headless>=4.8.0,<4.11
12
+ numpy>=1.24.0,<2.0
13
+ Pillow>=10.0.0,<11.0