Vaibhav Gaikwad committed on
Commit
374083e
·
1 Parent(s): a80a32e

fix: switch to gradio native api for zerogpu compatibility

Browse files
Files changed (2) hide show
  1. app.py +153 -180
  2. requirements.txt +1 -4
app.py CHANGED
@@ -1,23 +1,21 @@
1
  """
2
  audiolens β€” app.py
3
- huggingface space backend (zerogpu + fastapi + gradio)
4
 
5
- endpoints:
6
- POST /classify β€” document type classification (dit-base)
7
- POST /ocr β€” text extraction (easyocr)
8
- POST /speak β€” text to speech (kokoro)
 
9
 
10
- preprocessing (opencv) runs inline β€” no separate endpoint needed.
11
- llm extraction (gemini) is called directly from the pwa β€” not here.
 
12
 
13
- models load once at startup into cpu ram (except easyocr which
14
- lazy-inits inside the gpu function so it can bind to cuda).
15
- gpu is grabbed per-request via @spaces.GPU and released immediately after.
16
  """
17
 
18
  import io
19
- import os
20
- import tempfile
21
  import warnings
22
  warnings.filterwarnings('ignore')
23
 
@@ -28,31 +26,16 @@ from PIL import Image
28
  import torch
29
  import spaces
30
  import gradio as gr
31
- from fastapi import FastAPI, File, UploadFile, HTTPException
32
- from fastapi.middleware.cors import CORSMiddleware
33
- from fastapi.responses import JSONResponse, FileResponse
34
- from pydantic import BaseModel
35
- from starlette.background import BackgroundTask
36
 
37
  from j2_preprocess import preprocess
38
 
39
 
40
  # ============================================================
41
- # -- app setup --
42
  # ============================================================
43
 
44
- app = FastAPI(title='audiolens api')
45
-
46
- # allow pwa to call from any origin
47
- app.add_middleware(
48
- CORSMiddleware,
49
- allow_origins=['*'],
50
- allow_methods=['*'],
51
- allow_headers=['*'],
52
- )
53
-
54
  # dit maps its 16 rvl-cdip classes to audiolens categories
55
- # indices must match the 9 classes we trained with in j1
56
  DIT_CLASS_MAP = {
57
  0: 'letter',
58
  1: 'form',
@@ -73,7 +56,7 @@ SELECTED_RVL_IDX = list(DIT_CLASS_MAP.keys())
73
 
74
  print('loading models...')
75
 
76
- # -- classifier: dit-base (loads to cpu at startup) --
77
  from transformers import AutoImageProcessor, AutoModelForImageClassification
78
 
79
  dit_processor = AutoImageProcessor.from_pretrained('microsoft/dit-base-finetuned-rvlcdip')
@@ -85,7 +68,7 @@ print('dit-base loaded.')
85
  ocr_reader = None
86
  print('easyocr will lazy-init on first ocr request.')
87
 
88
- # -- tts: kokoro (loads to cpu at startup) --
89
  import soundfile as sf
90
  from kokoro import KPipeline
91
  kokoro_pipeline = KPipeline(lang_code='b') # b = british english
@@ -94,102 +77,57 @@ print('kokoro loaded.')
94
  print('all models ready.')
95
 
96
 
97
- # ============================================================
98
- # -- request schemas --
99
- # ============================================================
100
-
101
- class SpeakRequest(BaseModel):
102
- text: str
103
- voice: str = 'bf_emma'
104
-
105
-
106
  # ============================================================
107
  # -- helpers --
108
  # ============================================================
109
 
110
- def bytes_to_pil(image_bytes):
111
- """converts raw image bytes to a pil image."""
112
- return Image.open(io.BytesIO(image_bytes)).convert('RGB')
113
-
114
-
115
- def bytes_to_cv2(image_bytes):
116
- """converts raw image bytes to a bgr numpy array for opencv."""
117
- arr = np.frombuffer(image_bytes, np.uint8)
118
- img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
119
- if img is None:
120
- raise ValueError('could not decode image β€” check the file format')
121
- return img
122
-
123
-
124
- # ============================================================
125
- # -- endpoint: health check --
126
- # ============================================================
127
-
128
- @app.get('/health')
129
- async def health():
130
- """simple ping to check if the space is warm."""
131
- return {'status': 'ok', 'models': ['dit-base', 'easyocr', 'kokoro']}
132
 
133
 
134
  # ============================================================
135
- # -- endpoint: classify --
136
- # classifies the document type from the uploaded image.
137
- # uses zerogpu for inference, releases gpu immediately after.
138
  # ============================================================
139
 
140
  @spaces.GPU
141
- def _run_classify(pil_image):
142
- """runs dit-base inference on gpu. called inside classify endpoint."""
143
- dit_model.to('cuda')
144
- inputs = dit_processor(images=pil_image, return_tensors='pt').to('cuda')
145
-
146
- with torch.no_grad():
147
- logits = dit_model(**inputs).logits
148
-
149
- # no need to move back to cpu β€” zerogpu reclaims on function exit
150
-
151
- # slice to our 9 selected classes and get the winner
152
- selected_logits = logits[0, SELECTED_RVL_IDX]
153
- pred_idx = selected_logits.argmax().item()
154
- confidence = torch.softmax(selected_logits, dim=0)[pred_idx].item()
155
- doc_type = DIT_CLASS_MAP[SELECTED_RVL_IDX[pred_idx]]
156
- return doc_type, round(confidence, 4)
157
-
158
-
159
- @app.post('/classify')
160
- async def classify(file: UploadFile = File(...)):
161
  """
162
  classifies a document image into one of 9 categories.
 
163
 
164
- returns:
165
- doc_type β€” e.g. 'invoice', 'letter', 'form'
166
- confidence β€” float 0–1
167
  """
 
 
 
168
  try:
169
- image_bytes = await file.read()
170
- if not image_bytes:
171
- raise HTTPException(status_code=400, detail='empty file uploaded')
172
 
173
- pil_image = bytes_to_pil(image_bytes)
174
- doc_type, confidence = _run_classify(pil_image)
175
- return JSONResponse({'doc_type': doc_type, 'confidence': confidence})
176
 
177
- except HTTPException:
178
- raise
179
- except Exception as e:
180
- raise HTTPException(status_code=500, detail=str(e))
 
181
 
 
 
 
 
182
 
183
- # ============================================================
184
- # -- endpoint: ocr --
185
- # preprocesses the image (cpu, outside gpu) then runs easyocr
186
- # on gpu via zerogpu. easyocr lazy-inits on first call so it
187
- # binds to the cuda device provided by zerogpu.
188
- # ============================================================
189
 
190
  @spaces.GPU
191
- def _run_ocr_gpu(clean_image):
192
- """runs easyocr inference on gpu. reader lazy-inits on first call."""
 
 
 
193
  global ocr_reader
194
  if ocr_reader is None:
195
  import easyocr
@@ -200,112 +138,147 @@ def _run_ocr_gpu(clean_image):
200
  return ' '.join(results)
201
 
202
 
203
- @app.post('/ocr')
204
- async def ocr(file: UploadFile = File(...)):
205
  """
206
- extracts all text from a document image.
207
- preprocessing (deskew, denoise, contrast, binarise) is applied first.
208
 
209
- returns:
210
- text β€” raw extracted text string
 
 
 
211
  """
212
- try:
213
- image_bytes = await file.read()
214
- if not image_bytes:
215
- raise HTTPException(status_code=400, detail='empty file uploaded')
216
 
217
- cv2_image = bytes_to_cv2(image_bytes)
 
 
218
 
219
- # preprocessing runs on cpu β€” outside the gpu-decorated function
220
  clean = preprocess(cv2_image)
221
 
222
  # ocr inference on gpu
223
- text = _run_ocr_gpu(clean)
224
- return JSONResponse({'text': text})
225
 
226
- except HTTPException:
227
- raise
228
  except Exception as e:
229
- raise HTTPException(status_code=500, detail=str(e))
230
-
231
 
232
- # ============================================================
233
- # -- endpoint: speak --
234
- # converts text to speech using kokoro and returns a wav file.
235
- # kokoro runs on gpu via zerogpu.
236
- # temp wav file is cleaned up after the response is sent.
237
- # ============================================================
238
 
239
  @spaces.GPU(duration=30)
240
- def _run_tts(text, voice='bf_emma'):
241
- """runs kokoro tts on gpu. called inside speak endpoint."""
242
- chunks = []
243
- for _, _, audio in kokoro_pipeline(text, voice=voice, speed=1.0):
244
- chunks.append(audio)
245
- if not chunks:
246
- return None
247
- return np.concatenate(chunks)
248
-
249
-
250
- @app.post('/speak')
251
- async def speak(req: SpeakRequest):
252
  """
253
  converts text to speech using kokoro.
 
254
 
255
- json body:
256
- text β€” the text to synthesise
257
- voice β€” kokoro voice id (default: bf_emma β€” british female)
258
-
259
- returns:
260
- audio/wav file
261
  """
 
 
 
262
  try:
263
- if not req.text or not req.text.strip():
264
- raise HTTPException(status_code=400, detail='text cannot be empty')
265
 
266
- audio_array = _run_tts(req.text, req.voice)
 
 
267
 
268
- if audio_array is None:
269
- raise HTTPException(status_code=500, detail='tts produced no audio')
270
 
271
- # write wav to a temp file
272
- tmp = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
273
- sf.write(tmp.name, audio_array, 24000)
274
- tmp.close()
275
 
276
- # return the file and clean up after response is sent
277
- return FileResponse(
278
- tmp.name,
279
- media_type='audio/wav',
280
- filename='audiolens_output.wav',
281
- background=BackgroundTask(os.unlink, tmp.name),
282
- )
283
 
284
- except HTTPException:
285
- raise
286
  except Exception as e:
287
- raise HTTPException(status_code=500, detail=str(e))
 
 
 
 
 
 
 
 
 
288
 
289
 
290
  # ============================================================
291
- # -- gradio ui --
292
- # minimal gradio interface β€” required by zerogpu.
293
- # pwa users never see this. it just satisfies the hf spaces sdk.
294
  # ============================================================
295
 
296
- with gr.Blocks() as gradio_ui:
 
297
  gr.Markdown("""
298
  ## AudioLens API
299
- **This space provides the AudioLens backend API.**
300
- Use the endpoints below from the AudioLens PWA:
301
- - `POST /classify` β€” document type classification
302
- - `POST /ocr` β€” text extraction
303
- - `POST /speak` β€” text to speech
304
- - `GET /health` β€” check if space is warm
305
  """)
306
 
307
- gr.Markdown("_This UI is for reference only. The AudioLens PWA calls the API directly._")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
308
 
309
 
310
- # mount fastapi on gradio β€” zerogpu requires gradio sdk
311
- app = gr.mount_gradio_app(app, gradio_ui, path='/gradio')
 
 
1
  """
2
  audiolens β€” app.py
3
+ huggingface space backend (zerogpu + gradio native api)
4
 
5
+ api endpoints (via gradio):
6
+ /call/classify β€” document type classification (dit-base)
7
+ /call/ocr β€” text extraction (easyocr)
8
+ /call/speak β€” text to speech (kokoro)
9
+ /call/health β€” check if space is warm
10
 
11
+ the pwa calls these using the gradio js client (@gradio/client)
12
+ or via gradio's rest api. each function decorated with @spaces.GPU
13
+ gets a gpu allocation only for the duration of that call.
14
 
15
+ llm extraction (gemini) is called directly from the pwa β€” not here.
 
 
16
  """
17
 
18
  import io
 
 
19
  import warnings
20
  warnings.filterwarnings('ignore')
21
 
 
26
  import torch
27
  import spaces
28
  import gradio as gr
 
 
 
 
 
29
 
30
  from j2_preprocess import preprocess
31
 
32
 
33
  # ============================================================
34
+ # -- dit class mapping --
35
  # ============================================================
36
 
 
 
 
 
 
 
 
 
 
 
37
  # dit maps its 16 rvl-cdip classes to audiolens categories
38
+ # indices must match the 9 classes we selected in j1
39
  DIT_CLASS_MAP = {
40
  0: 'letter',
41
  1: 'form',
 
56
 
57
  print('loading models...')
58
 
59
+ # -- classifier: dit-base --
60
  from transformers import AutoImageProcessor, AutoModelForImageClassification
61
 
62
  dit_processor = AutoImageProcessor.from_pretrained('microsoft/dit-base-finetuned-rvlcdip')
 
68
  ocr_reader = None
69
  print('easyocr will lazy-init on first ocr request.')
70
 
71
+ # -- tts: kokoro --
72
  import soundfile as sf
73
  from kokoro import KPipeline
74
  kokoro_pipeline = KPipeline(lang_code='b') # b = british english
 
77
  print('all models ready.')
78
 
79
 
 
 
 
 
 
 
 
 
 
80
  # ============================================================
81
  # -- helpers --
82
  # ============================================================
83
 
84
def pil_to_cv2(pil_image):
    """Convert a PIL RGB image into a BGR numpy array for opencv.

    opencv expects channels in BGR order, so the RGB array produced by
    numpy from the PIL image is channel-swapped before returning.
    """
    return cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
 
90
  # ============================================================
91
+ # -- gpu functions --
 
 
92
  # ============================================================
93
 
94
@spaces.GPU
def classify_fn(image):
    """
    classifies a document image into one of 9 categories.
    called via gradio api: /call/classify

    input: pil image (gradio Image component with type="pil")
    output: json dict with doc_type and confidence
    """
    # guard clause: the gradio client may send nothing at all
    if image is None:
        return {'error': 'no image provided'}

    try:
        # zerogpu grants cuda only for the duration of this call,
        # so the model is moved onto the device here, per request
        dit_model.to('cuda')
        batch = dit_processor(images=image, return_tensors='pt').to('cuda')

        with torch.no_grad():
            all_logits = dit_model(**batch).logits

        # restrict the 16 rvl-cdip logits to the 9 classes audiolens uses,
        # then pick the winner and its softmax probability over that subset
        subset = all_logits[0, SELECTED_RVL_IDX]
        winner = subset.argmax().item()
        score = torch.softmax(subset, dim=0)[winner].item()
        label = DIT_CLASS_MAP[SELECTED_RVL_IDX[winner]]
        return {'doc_type': label, 'confidence': round(score, 4)}

    except Exception as e:
        # surface the failure as json rather than crashing the gradio call
        return {'error': str(e)}
 
 
 
 
 
 
 
124
 
125
  @spaces.GPU
126
+ def ocr_gpu(clean_image):
127
+ """
128
+ runs easyocr on a preprocessed image.
129
+ easyocr lazy-inits on first call so it binds to cuda.
130
+ """
131
  global ocr_reader
132
  if ocr_reader is None:
133
  import easyocr
 
138
  return ' '.join(results)
139
 
140
 
141
def ocr_fn(image):
    """
    extracts text from a document image.
    called via gradio api: /call/ocr

    cpu-side preprocessing (deskew, denoise, contrast, binarise) runs
    first, so the zerogpu allocation is spent on inference only.

    input: pil image (gradio Image component with type="pil")
    output: extracted text string
    """
    if image is None:
        return 'error: no image provided'

    try:
        # pil -> opencv, then the full cpu preprocessing pipeline
        cleaned = preprocess(pil_to_cv2(image))
        # easyocr inference happens inside the @spaces.GPU function
        return ocr_gpu(cleaned)
    except Exception as e:
        # report errors in-band as a string the textbox can display
        return f'error: {str(e)}'
 
168
 
 
 
 
 
 
 
169
 
170
@spaces.GPU(duration=30)
def speak_fn(text, voice):
    """
    converts text to speech using kokoro.
    called via gradio api: /call/speak

    input: text string + voice id (falls back to 'bf_emma' when blank)
    output: (sample_rate, audio_array) tuple for the gradio Audio
            component, or None when there is nothing to synthesise
    """
    # nothing to say -> nothing to synthesise
    if not text or not text.strip():
        return None

    try:
        # blank voice id falls back to the default british female voice
        voice = voice if voice and voice.strip() else 'bf_emma'

        # kokoro yields (graphemes, phonemes, audio) per chunk;
        # we only keep the audio pieces
        segments = [audio for _, _, audio in kokoro_pipeline(text, voice=voice, speed=1.0)]
        if not segments:
            return None

        # gradio Audio expects (sample_rate, numpy_array); kokoro is 24 kHz
        return (24000, np.concatenate(segments))

    except Exception as e:
        print(f'tts error: {e}')
        return None
201
+
202
+
203
def health_fn():
    """
    reports that the space is warm and which models it serves.
    called via gradio api: /call/health
    """
    served = ['dit-base', 'easyocr', 'kokoro']
    return {'status': 'ok', 'models': served}
209
 
210
 
211
  # ============================================================
212
+ # -- gradio ui + api --
 
 
213
  # ============================================================
214
 
215
# each .click() below carries an api_name, which is what exposes the
# handler at /call/<api_name> for the gradio rest api / js client.
with gr.Blocks(title='AudioLens API') as demo:

    gr.Markdown("""
    ## AudioLens API
    **This space provides the AudioLens backend.**
    The AudioLens PWA calls the API endpoints below using the Gradio client.
    """)

    # -- classify tab --
    with gr.Tab('Classify'):
        classify_image = gr.Image(type='pil', label='document image')
        classify_btn = gr.Button('classify')
        classify_out = gr.JSON(label='result')
        classify_btn.click(
            fn=classify_fn,
            inputs=classify_image,
            outputs=classify_out,
            api_name='classify',
        )

    # -- ocr tab --
    with gr.Tab('OCR'):
        ocr_image = gr.Image(type='pil', label='document image')
        ocr_btn = gr.Button('extract text')
        ocr_out = gr.Textbox(label='extracted text', lines=10)
        ocr_btn.click(
            fn=ocr_fn,
            inputs=ocr_image,
            outputs=ocr_out,
            api_name='ocr',
        )

    # -- speak tab --
    with gr.Tab('Speak'):
        speak_text = gr.Textbox(label='text to speak', lines=5)
        speak_voice = gr.Textbox(label='voice id', value='bf_emma')
        speak_btn = gr.Button('generate speech')
        speak_out = gr.Audio(label='output audio')
        speak_btn.click(
            fn=speak_fn,
            inputs=[speak_text, speak_voice],
            outputs=speak_out,
            api_name='speak',
        )

    # -- health (hidden, api only) --
    # invisible widgets keep the ui clean while still registering
    # the /call/health endpoint with the gradio api
    health_btn = gr.Button('health', visible=False)
    health_out = gr.JSON(visible=False)
    health_btn.click(
        fn=health_fn,
        inputs=[],
        outputs=health_out,
        api_name='health',
    )

    gr.Markdown("""
    ---
    **API endpoints** (use via [@gradio/client](https://www.gradio.app/guides/getting-started-with-the-js-client)):
    - `/call/classify` — document type classification
    - `/call/ocr` — text extraction with preprocessing
    - `/call/speak` — text to speech
    - `/call/health` — check if space is warm

    _This UI is for testing. The AudioLens PWA calls the API directly._
    """)


# launch — hf spaces handles this automatically
if __name__ == '__main__':
    demo.launch(server_name='0.0.0.0', server_port=7860)
requirements.txt CHANGED
@@ -1,8 +1,5 @@
1
- fastapi>=0.110.0,<1.0
2
  gradio>=6.9.0,<7.0
3
  spaces>=0.28.0
4
- uvicorn>=0.27.0,<1.0
5
- python-multipart>=0.0.7
6
  transformers>=4.35.0,<4.50
7
  torch>=2.1.0,<2.5
8
  easyocr>=1.7.0,<1.8
@@ -10,4 +7,4 @@ kokoro>=0.9.0
10
  soundfile>=0.12.0
11
  opencv-python-headless>=4.8.0,<4.11
12
  numpy>=1.24.0,<2.0
13
- Pillow>=10.0.0,<11.0
 
 
1
  gradio>=6.9.0,<7.0
2
  spaces>=0.28.0
 
 
3
  transformers>=4.35.0,<4.50
4
  torch>=2.1.0,<2.5
5
  easyocr>=1.7.0,<1.8
 
7
  soundfile>=0.12.0
8
  opencv-python-headless>=4.8.0,<4.11
9
  numpy>=1.24.0,<2.0
10
+ Pillow>=10.0.0,<11.0