# /pb/py/chunker/hf/main.py
import asyncio
import glob
import io
import json
import os
import shutil
import zipfile

import uvicorn
from fastapi import FastAPI, UploadFile, File, Form
from fastapi.responses import HTMLResponse, StreamingResponse, FileResponse

# Import the chunking pipeline from the existing combined script;
# run_chunking_process must be a coroutine so it can be scheduled as a task
from chunker_2 import run_chunking_process
app = FastAPI()

# Module-level queue: the background chunking task pushes progress events
# into it and the SSE endpoint streams them out to the UI. One shared queue
# means the app supports a single active extraction at a time.
progress_queue = asyncio.Queue()
@app.get("/", response_class=HTMLResponse)
async def get_ui():
with open("index.html", "r") as f:
return f.read()
"""
@app.post("/upload")
async def handle_upload(file: UploadFile = File(...)):
# Save the uploaded PDF to a local temp file
temp_path = f"temp_{file.filename}"
with open(temp_path, "wb") as buffer:
shutil.copyfileobj(file.file, buffer)
# Start the chunking in the background so the UI doesn't freeze
# We pass the queue so the script can "push" updates to it
asyncio.create_task(run_chunking_process(temp_path, progress_queue))
return {"status": "Processing started", "filename": file.filename}
"""
@app.get("/stream")
async def stream_updates():
"""
This is the SSE endpoint. The browser listens here to get
real-time updates as chunks are created.
"""
async def event_generator():
while True:
# Wait for a new chunk/summary from the background task
data = await progress_queue.get()
if data == "DONE":
yield "data: {\"type\": \"done\"}\n\n"
break
yield f"data: {json.dumps(data)}\n\n"
return StreamingResponse(event_generator(), media_type="text/event-stream")
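
# For reference, a minimal sketch of the producer side of this contract,
# i.e. what run_chunking_process is expected to do with the queue. The
# event shape {"type": ..., "content": ...} is illustrative only, not the
# actual schema emitted by chunker_2:
#
#     async def example_producer(queue: asyncio.Queue):
#         for i in range(3):
#             await queue.put({"type": "chunk", "content": f"chunk {i}"})
#         await queue.put("DONE")  # sentinel the SSE generator watches for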
@app.post("/upload")
async def handle_upload(
file: UploadFile = File(...),
whole: str = Form("false"),
start: str = Form("20"),
end: str = Form("30")
):
temp_path = f"temp_{file.filename}"
with open(temp_path, "wb") as buffer:
shutil.copyfileobj(file.file, buffer)
# Fix: Convert strings to proper types
is_whole = whole.lower() == "true"
s_page = int(start)
#s_page = s_page-1 if s_page != 1 else 0
e_page = int(end)
#Debugging the values received from the UI
print(f"📡 UI SIGNAL RECEIVED: whole={is_whole}, start={s_page}, end={e_page}")
# Start the task with explicit parameters; pass everything to the aggregator
asyncio.create_task(run_chunking_process(
temp_path,
progress_queue,
whole=is_whole,
start_p=s_page,
end_p=e_page
))
return {"status": "Processing started"}
#"""
@app.get("/download-latest")
async def download_latest():
# Look for files matching our pattern
files = glob.glob("knowledge_tree_*.json")
if not files:
return {"error": "No JSON files found yet. Finish an extraction first."}
# Sort by creation time to get the newest one
latest_file = max(files, key=os.path.getctime)
return FileResponse(path=latest_file, filename=os.path.basename(latest_file))
#"""
@app.get("/download-markdown")
async def download_md(type: str = "nested"):
pattern = "nested_knowledge_*.md" if type == "nested" else "table_knowledge_*.md"
files = glob.glob(pattern)
if not files: return {"error": "No markdown found"}
latest = max(files, key=os.path.getctime)
return FileResponse(path=latest, filename=os.path.basename(latest))
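
# Illustrative usage: GET /download-markdown?type=nested (the default)
# returns the newest nested export; any other value returns the newest
# table-style export.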
@app.get("/download-all")
async def download_all():
# Find the latest files for each type
json_files = glob.glob("knowledge_tree_*.json")
nested_files = glob.glob("nested_knowledge_*.md")
table_files = glob.glob("table_knowledge_*.md")
if not json_files:
return {"error": "No files found. Please complete a run first."}
# Identify the newest ones
latest_json = max(json_files, key=os.path.getctime)
# Match the timestamp from the JSON to get the corresponding MDs
timestamp = os.path.basename(latest_json).replace("knowledge_tree_", "").replace(".json", "")
files_to_zip = [
latest_json,
f"nested_knowledge_{timestamp}.md",
f"table_knowledge_{timestamp}.md"
]
# Create an in-memory ZIP file
zip_buffer = io.BytesIO()
with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zip_file:
for file_path in files_to_zip:
if os.path.exists(file_path):
zip_file.write(file_path, os.path.basename(file_path))
zip_buffer.seek(0)
return StreamingResponse(
zip_buffer,
media_type="application/x-zip-compressed",
headers={"Content-Disposition": f"attachment; filename=jung_knowledge_base_{timestamp}.zip"}
)
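
# Example consumer of this endpoint (illustrative; standard library only):
#
#     import io, zipfile, urllib.request
#     data = urllib.request.urlopen("http://localhost:7860/download-all").read()
#     with zipfile.ZipFile(io.BytesIO(data)) as zf:
#         print(zf.namelist())  # JSON tree plus the two markdown exports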
if __name__ == "__main__":
    # Hugging Face Spaces expects the app to listen on port 7860
    uvicorn.run(app, host="0.0.0.0", port=7860)