# /pb/py/chunker/hf/main.py
import asyncio
import glob
import io
import json
import os
import shutil
import zipfile

import uvicorn
from fastapi import FastAPI, UploadFile, File, Form
from fastapi.responses import HTMLResponse, StreamingResponse, FileResponse

# Import the chunking pipeline from the existing combined script;
# run_chunking_process must be a coroutine so it can be scheduled as a task
from chunker_2 import run_chunking_process
app = FastAPI()

# Module-level queue: the background chunking task pushes progress events
# into it and the SSE endpoint streams them out to the UI. One shared queue
# means the app supports a single active extraction at a time.
progress_queue = asyncio.Queue()
@app.get("/", response_class=HTMLResponse)
async def get_ui():
with open("index.html", "r") as f:
return f.read()
"""
@app.post("/upload")
async def handle_upload(file: UploadFile = File(...)):
# Save the uploaded PDF to a local temp file
temp_path = f"temp_{file.filename}"
with open(temp_path, "wb") as buffer:
shutil.copyfileobj(file.file, buffer)
# Start the chunking in the background so the UI doesn't freeze
# We pass the queue so the script can "push" updates to it
asyncio.create_task(run_chunking_process(temp_path, progress_queue))
return {"status": "Processing started", "filename": file.filename}
"""
@app.get("/stream")
async def stream_updates():
"""
This is the SSE endpoint. The browser listens here to get
real-time updates as chunks are created.
"""
async def event_generator():
while True:
# Wait for a new chunk/summary from the background task
data = await progress_queue.get()
if data == "DONE":
yield "data: {\"type\": \"done\"}\n\n"
break
yield f"data: {json.dumps(data)}\n\n"
return StreamingResponse(event_generator(), media_type="text/event-stream")
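
# For reference, a minimal sketch of the producer side of this contract,
# i.e. what run_chunking_process is expected to do with the queue. The
# event shape {"type": ..., "content": ...} is illustrative only, not the
# actual schema emitted by chunker_2:
#
#     async def example_producer(queue: asyncio.Queue):
#         for i in range(3):
#             await queue.put({"type": "chunk", "content": f"chunk {i}"})
#         await queue.put("DONE")  # sentinel the SSE generator watches for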
@app.post("/upload")
async def handle_upload(
file: UploadFile = File(...),
whole: str = Form("false"),
start: str = Form("20"),
end: str = Form("30")
):
temp_path = f"temp_{file.filename}"
with open(temp_path, "wb") as buffer:
shutil.copyfileobj(file.file, buffer)
# Fix: Convert strings to proper types
is_whole = whole.lower() == "true"
s_page = int(start)
#s_page = s_page-1 if s_page != 1 else 0
e_page = int(end)
#Debugging the values received from the UI
print(f"📡 UI SIGNAL RECEIVED: whole={is_whole}, start={s_page}, end={e_page}")
# Start the task with explicit parameters; pass everything to the aggregator
asyncio.create_task(run_chunking_process(
temp_path,
progress_queue,
whole=is_whole,
start_p=s_page,
end_p=e_page
))
return {"status": "Processing started"}
#"""
@app.get("/download-latest")
async def download_latest():
# Look for files matching our pattern
files = glob.glob("knowledge_tree_*.json")
if not files:
return {"error": "No JSON files found yet. Finish an extraction first."}
# Sort by creation time to get the newest one
latest_file = max(files, key=os.path.getctime)
return FileResponse(path=latest_file, filename=os.path.basename(latest_file))
#"""
@app.get("/download-markdown")
async def download_md(type: str = "nested"):
pattern = "nested_knowledge_*.md" if type == "nested" else "table_knowledge_*.md"
files = glob.glob(pattern)
if not files: return {"error": "No markdown found"}
latest = max(files, key=os.path.getctime)
return FileResponse(path=latest, filename=os.path.basename(latest))
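
# Illustrative usage: GET /download-markdown?type=nested (the default)
# returns the newest nested export; any other value returns the newest
# table-style export.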
@app.get("/download-all")
async def download_all():
# Find the latest files for each type
json_files = glob.glob("knowledge_tree_*.json")
nested_files = glob.glob("nested_knowledge_*.md")
table_files = glob.glob("table_knowledge_*.md")
if not json_files:
return {"error": "No files found. Please complete a run first."}
# Identify the newest ones
latest_json = max(json_files, key=os.path.getctime)
# Match the timestamp from the JSON to get the corresponding MDs
timestamp = os.path.basename(latest_json).replace("knowledge_tree_", "").replace(".json", "")
files_to_zip = [
latest_json,
f"nested_knowledge_{timestamp}.md",
f"table_knowledge_{timestamp}.md"
]
# Create an in-memory ZIP file
zip_buffer = io.BytesIO()
with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zip_file:
for file_path in files_to_zip:
if os.path.exists(file_path):
zip_file.write(file_path, os.path.basename(file_path))
zip_buffer.seek(0)
return StreamingResponse(
zip_buffer,
media_type="application/x-zip-compressed",
headers={"Content-Disposition": f"attachment; filename=jung_knowledge_base_{timestamp}.zip"}
)
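
# Example consumer of this endpoint (illustrative; standard library only):
#
#     import io, zipfile, urllib.request
#     data = urllib.request.urlopen("http://localhost:7860/download-all").read()
#     with zipfile.ZipFile(io.BytesIO(data)) as zf:
#         print(zf.namelist())  # JSON tree plus the two markdown exports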
if __name__ == "__main__":
    # Hugging Face Spaces expects the app to listen on port 7860
    uvicorn.run(app, host="0.0.0.0", port=7860)