Spaces:
Sleeping
Sleeping
Upload 22 files
Browse files
app.py
CHANGED
|
@@ -44,38 +44,103 @@ def reset_kb():
|
|
| 44 |
# TAB 1 — UPLOAD & EXTRACT
|
| 45 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 46 |
|
| 47 |
-
def
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
proc = PDFProcessor()
|
| 53 |
ai = AIExtractor()
|
| 54 |
dedup = Deduplicator()
|
| 55 |
-
kb = get_kb()
|
| 56 |
-
log = []
|
| 57 |
-
totals = {k:{"added":0,"merged":0,"skipped":0} for k in ("strategies","formulas","systems")}
|
| 58 |
hf_files = []
|
| 59 |
|
| 60 |
-
for i,
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
log.append(f"\nπ [{i+1}/{len(pdf_files)}] {path.name}")
|
| 64 |
try:
|
| 65 |
chunks = list(proc.process(path))
|
| 66 |
-
log.append(f" β {len(chunks)} chunks")
|
| 67 |
except Exception as e:
|
| 68 |
-
log.append(f" β {e}")
|
|
|
|
| 69 |
|
| 70 |
for chunk in chunks:
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
for
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
for cid, rec in kb["strategies"].items():
|
| 81 |
hf_files.append((f"extracted/strategies/{slugify(rec.get('name',''))}.md",
|
|
@@ -84,24 +149,82 @@ def run_extraction(pdf_files, progress=gr.Progress()):
|
|
| 84 |
hf_files.append((f"extracted/formulas/{slugify(rec.get('name',''))}.md",
|
| 85 |
formula_md(rec).encode()))
|
| 86 |
|
| 87 |
-
progress(0.
|
| 88 |
hf.kb_save(kb)
|
| 89 |
if hf_files and cfg.HF_TOKEN:
|
| 90 |
pushed = hf.push_batch(hf_files, "Update extracted knowledge")
|
| 91 |
-
log.append(f"\nβοΈ Pushed {pushed} files to HuggingFace")
|
| 92 |
reset_kb()
|
|
|
|
| 93 |
|
| 94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
summary = f"""β
Extraction Complete
|
| 96 |
|
| 97 |
-
PDFs processed: {len(
|
| 98 |
Strategies β added: {totals['strategies']['added']} merged: {totals['strategies']['merged']} skipped: {totals['strategies']['skipped']}
|
| 99 |
Formulas β added: {totals['formulas']['added']} merged: {totals['formulas']['merged']} skipped: {totals['formulas']['skipped']}
|
| 100 |
Systems β added: {totals['systems']['added']} merged: {totals['systems']['merged']} skipped: {totals['systems']['skipped']}
|
| 101 |
|
| 102 |
-
KB totals: {counts['strategies']} strategies Β· {counts['formulas']} formulas Β· {counts['systems']} systems
|
| 103 |
-
Tokens used: {
|
| 104 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
|
| 106 |
|
| 107 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -364,15 +487,39 @@ with gr.Blocks(title="Quant Knowledge Extractor β Julia Engine") as demo:
|
|
| 364 |
|
| 365 |
# Tab 1 — Extract
|
| 366 |
with gr.Tab("π€ Upload & Extract"):
|
| 367 |
-
gr.Markdown("### Upload algorithmic trading PDFs
|
|
|
|
|
|
|
| 368 |
with gr.Row():
|
| 369 |
with gr.Column(scale=2):
|
| 370 |
-
pdf_in = gr.File(label="Drop PDFs here", file_count="multiple",
|
| 371 |
-
|
|
|
|
| 372 |
with gr.Column(scale=1):
|
| 373 |
-
ext_out = gr.Textbox(label="Result", lines=14, interactive=False,
|
| 374 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 375 |
ext_btn.click(fn=run_extraction, inputs=[pdf_in], outputs=[ext_out, ext_log])
|
|
|
|
|
|
|
|
|
|
|
|
|
| 376 |
|
| 377 |
# Tab 2 — Browse
|
| 378 |
with gr.Tab("π Knowledge Base"):
|
|
|
|
| 44 |
# TAB 1 — UPLOAD & EXTRACT
|
| 45 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 46 |
|
| 47 |
+
def _save_and_resolve_pdfs(pdf_files) -> list:
    """
    Copy uploaded PDFs to a stable local directory and mirror them to HuggingFace.

    Gradio hands uploads over as paths into a temp dir (plain strings in
    Gradio 6, objects with a ``.name`` attribute in Gradio 5) that may be
    cleaned up before or during processing, so every file is copied away
    immediately.

    For each upload this function:
      1. Copies the file to ``cfg.TMP / "pdfs"`` unless a file with that name
         is already there.
      2. Uploads the copy via ``hf.pdf_upload`` when both ``cfg.HF_TOKEN`` and
         ``cfg.HF_DATASET_REPO`` are set.
      3. Records the stable local path.

    Args:
        pdf_files: Iterable of uploaded files (str paths or objects exposing
            ``.name``). ``None`` or an empty value yields an empty list.

    Returns:
        List of ``Path`` objects inside ``cfg.TMP / "pdfs"``, one per distinct
        file name, in upload order. Uploads that are missing on disk or fail
        to copy are logged and left out.
    """
    import shutil

    pdf_dir = cfg.TMP / "pdfs"
    pdf_dir.mkdir(parents=True, exist_ok=True)

    resolved = []
    seen = set()  # destinations already queued in this batch
    for f in (pdf_files or []):
        try:
            # Gradio 6: f is a str path; Gradio 5: f has .name attribute
            src = Path(f.name if hasattr(f, "name") else f)
            if not src.exists():
                logger.warning(f"Uploaded path does not exist: {src}")
                continue
            dst = pdf_dir / src.name
            if dst in seen:
                # Two uploads with the same file name map to one destination;
                # queueing it again would extract the same PDF twice.
                logger.warning(f"Duplicate upload name skipped: {src.name}")
                continue
            if not dst.exists():
                shutil.copy2(str(src), str(dst))
            seen.add(dst)
            resolved.append(dst)
        except Exception as e:
            logger.error(f"Failed to resolve upload {f}: {e}")
            continue

        # Persist to HuggingFace. Kept outside the handler above so that an
        # upload failure is not misreported as a failure to resolve the file:
        # the local copy is already usable for extraction.
        if cfg.HF_TOKEN and cfg.HF_DATASET_REPO:
            try:
                hf.pdf_upload(dst)
            except Exception as e:
                logger.warning(f"Failed to upload {dst.name} to HuggingFace: {e}")
    return resolved
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def load_pdfs_from_hf() -> list:
    """List PDFs previously uploaded to the HuggingFace dataset.

    Returns:
        Sorted repo-relative paths that start with ``pdfs/`` and end with a
        ``.pdf`` extension in any letter case. An empty list is returned when
        the listing fails for any reason; the failure is logged as a warning.
    """
    try:
        from huggingface_hub import list_repo_files

        files = list_repo_files(
            repo_id=cfg.HF_DATASET_REPO,
            repo_type="dataset",
            token=cfg.HF_TOKEN,
        )
        # Compare the extension case-insensitively so a stored "REPORT.PDF"
        # is offered for re-processing as well.
        return sorted(
            f for f in files
            if f.startswith("pdfs/") and f.lower().endswith(".pdf")
        )
    except Exception as e:
        logger.warning(f"Could not list HF PDFs: {e}")
        return []
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def download_pdf_from_hf(remote_path: str) -> Path | None:
    """Fetch one PDF from the HuggingFace dataset into the local PDF cache.

    Args:
        remote_path: Repo-relative file name inside ``cfg.HF_DATASET_REPO``.

    Returns:
        ``Path`` of the downloaded file, or ``None`` if anything goes wrong
        (the error is logged as a warning).
    """
    try:
        from huggingface_hub import hf_hub_download

        cache_dir = cfg.TMP / "pdfs"
        cache_dir.mkdir(parents=True, exist_ok=True)
        # NOTE(review): hf_hub_download presumably keeps the repo-relative
        # path below local_dir, so "pdfs/x.pdf" would land in
        # <cache_dir>/pdfs/x.pdf — confirm against the hub documentation.
        return Path(
            hf_hub_download(
                repo_id=cfg.HF_DATASET_REPO,
                filename=remote_path,
                repo_type="dataset",
                token=cfg.HF_TOKEN,
                local_dir=str(cache_dir),
                force_download=False,
            )
        )
    except Exception as e:
        logger.warning(f"Failed to download {remote_path}: {e}")
        return None
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def _extract_paths(paths: list, log: list, totals: dict, progress, kb: dict):
|
| 116 |
+
"""Core extraction loop β shared by new upload and re-process from HF."""
|
| 117 |
proc = PDFProcessor()
|
| 118 |
ai = AIExtractor()
|
| 119 |
dedup = Deduplicator()
|
|
|
|
|
|
|
|
|
|
| 120 |
hf_files = []
|
| 121 |
|
| 122 |
+
for i, path in enumerate(paths):
|
| 123 |
+
progress((i + 1) / max(len(paths), 1), desc=f"{path.name}")
|
| 124 |
+
log.append(f"\nπ [{i+1}/{len(paths)}] {path.name}")
|
|
|
|
| 125 |
try:
|
| 126 |
chunks = list(proc.process(path))
|
| 127 |
+
log.append(f" β {len(chunks)} chunks extracted")
|
| 128 |
except Exception as e:
|
| 129 |
+
log.append(f" β PDF read error: {e}")
|
| 130 |
+
continue
|
| 131 |
|
| 132 |
for chunk in chunks:
|
| 133 |
+
try:
|
| 134 |
+
extracted = ai.extract(chunk)
|
| 135 |
+
stats = dedup.process(extracted, kb)
|
| 136 |
+
for kind in ("strategies", "formulas", "systems"):
|
| 137 |
+
for act in ("added", "merged", "skipped"):
|
| 138 |
+
totals[kind][act] += stats[kind][act]
|
| 139 |
+
except Exception as e:
|
| 140 |
+
log.append(f" β οΈ Chunk error: {e}")
|
| 141 |
+
|
| 142 |
+
log.append(f" β New: {totals['strategies']['added']} strats, "
|
| 143 |
+
f"{totals['formulas']['added']} formulas")
|
| 144 |
|
| 145 |
for cid, rec in kb["strategies"].items():
|
| 146 |
hf_files.append((f"extracted/strategies/{slugify(rec.get('name',''))}.md",
|
|
|
|
| 149 |
hf_files.append((f"extracted/formulas/{slugify(rec.get('name',''))}.md",
|
| 150 |
formula_md(rec).encode()))
|
| 151 |
|
| 152 |
+
progress(0.95, desc="Saving to HuggingFaceβ¦")
|
| 153 |
hf.kb_save(kb)
|
| 154 |
if hf_files and cfg.HF_TOKEN:
|
| 155 |
pushed = hf.push_batch(hf_files, "Update extracted knowledge")
|
| 156 |
+
log.append(f"\nβοΈ Pushed {pushed} markdown files to HuggingFace")
|
| 157 |
reset_kb()
|
| 158 |
+
return ai.tokens_used
|
| 159 |
|
| 160 |
+
|
| 161 |
+
def run_extraction(pdf_files, progress=gr.Progress()):
    """Save uploaded PDFs, run extraction over them and build the UI report.

    Args:
        pdf_files: Uploaded files as delivered by the Gradio file input.
        progress: Gradio progress tracker, called with a fraction and ``desc``.

    Returns:
        ``(summary, log_text)``. ``summary`` is a status/result message and
        ``log_text`` is the last 50 log entries joined by newlines (empty
        string on the early-exit paths).
    """
    # NOTE(review): the status glyphs in the strings below (❌ ⚠️ ✅ → · …)
    # were restored from mis-decoded UTF-8; confirm them against the
    # repository copy of this file.
    if not cfg.ANTHROPIC_API_KEY:
        return "❌ ANTHROPIC_API_KEY secret not set.", ""
    if not cfg.HF_DATASET_REPO:
        return "❌ HF_DATASET_REPO secret not set.", ""

    # Step 1: resolve uploads → stable local paths + upload to HF
    progress(0.0, desc="Saving uploads to HuggingFace…")
    paths = _save_and_resolve_pdfs(pdf_files)

    if not paths:
        return ("⚠️ No valid PDFs found. Upload files above, "
                "or use 'Re-process from HF' to reprocess previously uploaded PDFs."), ""

    kb = get_kb()
    log = []
    totals = {k: {"added": 0, "merged": 0, "skipped": 0}
              for k in ("strategies", "formulas", "systems")}

    # Step 2: extraction fills `log`, `totals` and `kb` in place and returns
    # the token count reported by the extractor.
    tokens = _extract_paths(paths, log, totals, progress, kb)

    counts = {k: len(kb[k]) for k in kb}
    summary = f"""✅ Extraction Complete

PDFs processed : {len(paths)}
Strategies → added: {totals['strategies']['added']} merged: {totals['strategies']['merged']} skipped: {totals['strategies']['skipped']}
Formulas → added: {totals['formulas']['added']} merged: {totals['formulas']['merged']} skipped: {totals['formulas']['skipped']}
Systems → added: {totals['systems']['added']} merged: {totals['systems']['merged']} skipped: {totals['systems']['skipped']}

KB totals : {counts['strategies']} strategies · {counts['formulas']} formulas · {counts['systems']} systems
Tokens used : {tokens:,}
PDFs stored : HuggingFace → {cfg.HF_DATASET_REPO}/pdfs/"""
    # Only the tail of the log is shown to keep the textbox readable.
    return summary, "\n".join(log[-50:])
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
def reprocess_from_hf(selected_pdfs, progress=gr.Progress()):
    """Download selected PDFs from HF and re-extract.

    Args:
        selected_pdfs: Repo-relative PDF paths chosen in the UI.
        progress: Gradio progress tracker, called with a fraction and ``desc``.

    Returns:
        ``(summary, log_text)``. ``log_text`` is the last 50 log entries
        joined by newlines (empty string on the early-exit paths).
    """
    # NOTE(review): the status glyphs in the strings below (❌ ⚠️ ✅ · …)
    # were restored from mis-decoded UTF-8; confirm them against the
    # repository copy of this file.
    if not cfg.ANTHROPIC_API_KEY:
        return "❌ ANTHROPIC_API_KEY secret not set.", ""
    if not cfg.HF_DATASET_REPO:
        return "❌ HF_DATASET_REPO secret not set.", ""
    if not selected_pdfs:
        return "⚠️ No PDFs selected.", ""

    progress(0.0, desc="Downloading from HuggingFace…")
    paths = []
    failed = []  # remote names that could not be fetched
    for remote in selected_pdfs:
        p = download_pdf_from_hf(remote)
        if p:
            paths.append(p)
        else:
            failed.append(remote)

    if not paths:
        return "❌ Could not download any PDFs from HuggingFace.", ""

    kb = get_kb()
    log = [f"Re-processing {len(paths)} PDF(s) from HuggingFace\n"]
    totals = {k: {"added": 0, "merged": 0, "skipped": 0}
              for k in ("strategies", "formulas", "systems")}

    tokens = _extract_paths(paths, log, totals, progress, kb)

    # Report skipped downloads at the end of the log so they survive the
    # 50-entry truncation below instead of vanishing silently.
    log.extend(f"⚠️ Could not download: {remote}" for remote in failed)

    counts = {k: len(kb[k]) for k in kb}
    return (f"✅ Re-extraction complete\n"
            f"PDFs: {len(paths)} · "
            f"Strategies: +{totals['strategies']['added']} · "
            f"Formulas: +{totals['formulas']['added']}\n"
            f"KB totals: {counts['strategies']} strategies · "
            f"{counts['formulas']} formulas\n"
            f"Tokens: {tokens:,}"), "\n".join(log[-50:])
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
def refresh_hf_pdf_list():
    """Return a Gradio update that reloads the HF PDF choices and clears the selection."""
    return gr.update(choices=load_pdfs_from_hf(), value=[])
|
| 228 |
|
| 229 |
|
| 230 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 487 |
|
| 488 |
# Tab 1 — Extract
|
| 489 |
with gr.Tab("π€ Upload & Extract"):
|
| 490 |
+
gr.Markdown("""### Upload algorithmic trading PDFs
|
| 491 |
+
PDFs are **saved to HuggingFace** (`pdfs/` folder) so you can re-process them anytime without re-uploading.
|
| 492 |
+
OCR is applied automatically to scanned pages.""")
|
| 493 |
with gr.Row():
|
| 494 |
with gr.Column(scale=2):
|
| 495 |
+
pdf_in = gr.File(label="Drop PDFs here", file_count="multiple",
|
| 496 |
+
file_types=[".pdf"])
|
| 497 |
+
ext_btn = gr.Button("π Upload + Extract", variant="primary", size="lg")
|
| 498 |
with gr.Column(scale=1):
|
| 499 |
+
ext_out = gr.Textbox(label="Result", lines=14, interactive=False,
|
| 500 |
+
elem_classes=["status-box"])
|
| 501 |
+
ext_log = gr.Textbox(label="Log", lines=8, interactive=False,
|
| 502 |
+
elem_classes=["status-box"])
|
| 503 |
+
|
| 504 |
+
gr.Markdown("---\n### Re-process PDFs already on HuggingFace")
|
| 505 |
+
gr.Markdown("*Use this if the container restarted and lost your session, "
|
| 506 |
+
"or to re-extract with updated prompts.*")
|
| 507 |
+
with gr.Row():
|
| 508 |
+
hf_refresh = gr.Button("π Refresh HF PDF list")
|
| 509 |
+
hf_pdf_list = gr.CheckboxGroup(label="PDFs stored on HuggingFace",
|
| 510 |
+
choices=[], value=[])
|
| 511 |
+
rep_btn = gr.Button("β»οΈ Re-process selected PDFs from HuggingFace",
|
| 512 |
+
variant="secondary")
|
| 513 |
+
rep_out = gr.Textbox(label="Re-process result", lines=6, interactive=False,
|
| 514 |
+
elem_classes=["status-box"])
|
| 515 |
+
rep_log = gr.Textbox(label="Re-process log", lines=6, interactive=False,
|
| 516 |
+
elem_classes=["status-box"])
|
| 517 |
+
|
| 518 |
ext_btn.click(fn=run_extraction, inputs=[pdf_in], outputs=[ext_out, ext_log])
|
| 519 |
+
hf_refresh.click(fn=refresh_hf_pdf_list, outputs=[hf_pdf_list])
|
| 520 |
+
rep_btn.click(fn=reprocess_from_hf, inputs=[hf_pdf_list],
|
| 521 |
+
outputs=[rep_out, rep_log])
|
| 522 |
+
demo.load(fn=refresh_hf_pdf_list, outputs=[hf_pdf_list])
|
| 523 |
|
| 524 |
# Tab 2 — Browse
|
| 525 |
with gr.Tab("π Knowledge Base"):
|