cyberkyne committed on
Commit
4465bd2
·
verified ·
1 Parent(s): 04d44fa

Upload 22 files

Browse files
Files changed (1) hide show
  1. app.py +181 -34
app.py CHANGED
@@ -44,38 +44,103 @@ def reset_kb():
44
  # TAB 1 β€” UPLOAD & EXTRACT
45
  # ═══════════════════════════════════════════════════
46
 
47
- def run_extraction(pdf_files, progress=gr.Progress()):
48
- if not pdf_files: return "⚠️ No PDFs uploaded.", ""
49
- if not cfg.ANTHROPIC_API_KEY: return "❌ ANTHROPIC_API_KEY secret not set.", ""
50
- if not cfg.HF_DATASET_REPO: return "❌ HF_DATASET_REPO secret not set.", ""
51
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  proc = PDFProcessor()
53
  ai = AIExtractor()
54
  dedup = Deduplicator()
55
- kb = get_kb()
56
- log = []
57
- totals = {k:{"added":0,"merged":0,"skipped":0} for k in ("strategies","formulas","systems")}
58
  hf_files = []
59
 
60
- for i, pdf_file in enumerate(pdf_files):
61
- path = Path(pdf_file.name if hasattr(pdf_file, "name") else pdf_file)
62
- progress(i/len(pdf_files), desc=f"{path.name}")
63
- log.append(f"\nπŸ“– [{i+1}/{len(pdf_files)}] {path.name}")
64
  try:
65
  chunks = list(proc.process(path))
66
- log.append(f" β†’ {len(chunks)} chunks")
67
  except Exception as e:
68
- log.append(f" ❌ {e}"); continue
 
69
 
70
  for chunk in chunks:
71
- extracted = ai.extract(chunk)
72
- stats = dedup.process(extracted, kb)
73
- for kind in ("strategies","formulas","systems"):
74
- for act in ("added","merged","skipped"):
75
- totals[kind][act] += stats[kind][act]
76
-
77
- log.append(f" β†’ New: {totals['strategies']['added']} strats, {totals['formulas']['added']} formulas")
78
- if cfg.HF_TOKEN: hf.pdf_upload(path)
 
 
 
79
 
80
  for cid, rec in kb["strategies"].items():
81
  hf_files.append((f"extracted/strategies/{slugify(rec.get('name',''))}.md",
@@ -84,24 +149,82 @@ def run_extraction(pdf_files, progress=gr.Progress()):
84
  hf_files.append((f"extracted/formulas/{slugify(rec.get('name',''))}.md",
85
  formula_md(rec).encode()))
86
 
87
- progress(0.9, desc="Saving to HuggingFace…")
88
  hf.kb_save(kb)
89
  if hf_files and cfg.HF_TOKEN:
90
  pushed = hf.push_batch(hf_files, "Update extracted knowledge")
91
- log.append(f"\n☁️ Pushed {pushed} files to HuggingFace")
92
  reset_kb()
 
93
 
94
- counts = {k: len(kb[k]) for k in kb}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  summary = f"""βœ… Extraction Complete
96
 
97
- PDFs processed: {len(pdf_files)}
98
  Strategies β€” added: {totals['strategies']['added']} merged: {totals['strategies']['merged']} skipped: {totals['strategies']['skipped']}
99
  Formulas β€” added: {totals['formulas']['added']} merged: {totals['formulas']['merged']} skipped: {totals['formulas']['skipped']}
100
  Systems β€” added: {totals['systems']['added']} merged: {totals['systems']['merged']} skipped: {totals['systems']['skipped']}
101
 
102
- KB totals: {counts['strategies']} strategies Β· {counts['formulas']} formulas Β· {counts['systems']} systems
103
- Tokens used: {ai.tokens_used:,}"""
104
- return summary, "\n".join(log[-40:])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
 
107
  # ═══════════════════════════════════════════════════
@@ -364,15 +487,39 @@ with gr.Blocks(title="Quant Knowledge Extractor β€” Julia Engine") as demo:
364
 
365
  # Tab 1 β€” Extract
366
  with gr.Tab("πŸ“€ Upload & Extract"):
367
- gr.Markdown("### Upload algorithmic trading PDFs β€” OCR applied automatically")
 
 
368
  with gr.Row():
369
  with gr.Column(scale=2):
370
- pdf_in = gr.File(label="Drop PDFs here", file_count="multiple", file_types=[".pdf"])
371
- ext_btn = gr.Button("πŸš€ Extract Knowledge", variant="primary", size="lg")
 
372
  with gr.Column(scale=1):
373
- ext_out = gr.Textbox(label="Result", lines=14, interactive=False, elem_classes=["status-box"])
374
- ext_log = gr.Textbox(label="Log", lines=8, interactive=False, elem_classes=["status-box"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
375
  ext_btn.click(fn=run_extraction, inputs=[pdf_in], outputs=[ext_out, ext_log])
 
 
 
 
376
 
377
  # Tab 2 β€” Browse
378
  with gr.Tab("πŸ“š Knowledge Base"):
 
44
  # TAB 1 β€” UPLOAD & EXTRACT
45
  # ═══════════════════════════════════════════════════
46
 
47
+ def _save_and_resolve_pdfs(pdf_files) -> list:
48
+ """
49
+ Gradio 6 passes uploaded files as plain string paths into a temp dir
50
+ that may be cleaned up before or during processing.
51
+
52
+ This function:
53
+ 1. Immediately copies every uploaded file to /tmp/quant/pdfs/ (persistent for session)
54
+ 2. Uploads each to HuggingFace dataset pdfs/ folder (persistent across restarts)
55
+ 3. Returns stable local Path objects ready for processing
56
+ """
57
+ import shutil
58
+ PDF_DIR = cfg.TMP / "pdfs"
59
+ PDF_DIR.mkdir(parents=True, exist_ok=True)
60
+ resolved = []
61
+ for f in (pdf_files or []):
62
+ try:
63
+ # Gradio 6: f is a str path; Gradio 5: f has .name attribute
64
+ src = Path(f.name if hasattr(f, "name") else f)
65
+ if not src.exists():
66
+ logger.warning(f"Uploaded path does not exist: {src}")
67
+ continue
68
+ dst = PDF_DIR / src.name
69
+ if not dst.exists():
70
+ shutil.copy2(str(src), str(dst))
71
+ resolved.append(dst)
72
+ # Persist to HuggingFace
73
+ if cfg.HF_TOKEN and cfg.HF_DATASET_REPO:
74
+ hf.pdf_upload(dst)
75
+ except Exception as e:
76
+ logger.error(f"Failed to resolve upload {f}: {e}")
77
+ return resolved
78
+
79
+
80
def load_pdfs_from_hf() -> list:
    """Return the sorted list of PDFs previously uploaded to the HF dataset.

    Returns an empty list if the repo cannot be listed (no token, no repo,
    network failure) — callers treat that as "nothing stored yet".
    """
    try:
        from huggingface_hub import list_repo_files
        repo_files = list_repo_files(
            repo_id=cfg.HF_DATASET_REPO,
            repo_type="dataset",
            token=cfg.HF_TOKEN,
        )
        pdfs = [p for p in repo_files if p.startswith("pdfs/") and p.endswith(".pdf")]
        pdfs.sort()
        return pdfs
    except Exception as e:
        logger.warning(f"Could not list HF PDFs: {e}")
        return []
93
+
94
+
95
def download_pdf_from_hf(remote_path: str) -> Path | None:
    """Download a single PDF from the HuggingFace dataset to the local cache.

    Returns the local Path on success, or None if the download failed.
    """
    try:
        from huggingface_hub import hf_hub_download
        cache_dir = cfg.TMP / "pdfs"
        cache_dir.mkdir(parents=True, exist_ok=True)
        local_path = hf_hub_download(
            repo_id=cfg.HF_DATASET_REPO,
            filename=remote_path,
            repo_type="dataset",
            token=cfg.HF_TOKEN,
            local_dir=str(cache_dir),
            force_download=False,
        )
        return Path(local_path)
    except Exception as e:
        logger.warning(f"Failed to download {remote_path}: {e}")
        return None
113
+
114
+
115
+ def _extract_paths(paths: list, log: list, totals: dict, progress, kb: dict):
116
+ """Core extraction loop β€” shared by new upload and re-process from HF."""
117
  proc = PDFProcessor()
118
  ai = AIExtractor()
119
  dedup = Deduplicator()
 
 
 
120
  hf_files = []
121
 
122
+ for i, path in enumerate(paths):
123
+ progress((i + 1) / max(len(paths), 1), desc=f"{path.name}")
124
+ log.append(f"\nπŸ“– [{i+1}/{len(paths)}] {path.name}")
 
125
  try:
126
  chunks = list(proc.process(path))
127
+ log.append(f" β†’ {len(chunks)} chunks extracted")
128
  except Exception as e:
129
+ log.append(f" ❌ PDF read error: {e}")
130
+ continue
131
 
132
  for chunk in chunks:
133
+ try:
134
+ extracted = ai.extract(chunk)
135
+ stats = dedup.process(extracted, kb)
136
+ for kind in ("strategies", "formulas", "systems"):
137
+ for act in ("added", "merged", "skipped"):
138
+ totals[kind][act] += stats[kind][act]
139
+ except Exception as e:
140
+ log.append(f" ⚠️ Chunk error: {e}")
141
+
142
+ log.append(f" β†’ New: {totals['strategies']['added']} strats, "
143
+ f"{totals['formulas']['added']} formulas")
144
 
145
  for cid, rec in kb["strategies"].items():
146
  hf_files.append((f"extracted/strategies/{slugify(rec.get('name',''))}.md",
 
149
  hf_files.append((f"extracted/formulas/{slugify(rec.get('name',''))}.md",
150
  formula_md(rec).encode()))
151
 
152
+ progress(0.95, desc="Saving to HuggingFace…")
153
  hf.kb_save(kb)
154
  if hf_files and cfg.HF_TOKEN:
155
  pushed = hf.push_batch(hf_files, "Update extracted knowledge")
156
+ log.append(f"\n☁️ Pushed {pushed} markdown files to HuggingFace")
157
  reset_kb()
158
+ return ai.tokens_used
159
 
160
+
161
def run_extraction(pdf_files, progress=gr.Progress()):
    """Tab-1 handler: persist uploads, run extraction, and summarize results.

    Args:
        pdf_files: Gradio upload value (list of str paths or file objects).
        progress: Gradio progress tracker.

    Returns:
        (summary, log) tuple of strings for the two output textboxes.
    """
    # Fail fast on missing secrets before doing any work.
    if not cfg.ANTHROPIC_API_KEY:
        return "❌ ANTHROPIC_API_KEY secret not set.", ""
    if not cfg.HF_DATASET_REPO:
        return "❌ HF_DATASET_REPO secret not set.", ""

    # Step 1: resolve uploads → stable local paths + upload to HF
    progress(0.0, desc="Saving uploads to HuggingFace…")
    paths = _save_and_resolve_pdfs(pdf_files)

    if not paths:
        return ("⚠️ No valid PDFs found. Upload files above, "
                "or use 'Re-process from HF' to reprocess previously uploaded PDFs."), ""

    kb = get_kb()
    log = []
    totals = {k: {"added": 0, "merged": 0, "skipped": 0}
              for k in ("strategies", "formulas", "systems")}

    tokens = _extract_paths(paths, log, totals, progress, kb)

    counts = {k: len(kb[k]) for k in kb}
    summary = f"""✅ Extraction Complete

PDFs processed : {len(paths)}
Strategies — added: {totals['strategies']['added']} merged: {totals['strategies']['merged']} skipped: {totals['strategies']['skipped']}
Formulas — added: {totals['formulas']['added']} merged: {totals['formulas']['merged']} skipped: {totals['formulas']['skipped']}
Systems — added: {totals['systems']['added']} merged: {totals['systems']['merged']} skipped: {totals['systems']['skipped']}

KB totals : {counts['strategies']} strategies · {counts['formulas']} formulas · {counts['systems']} systems
Tokens used : {tokens:,}
PDFs stored : HuggingFace → {cfg.HF_DATASET_REPO}/pdfs/"""
    return summary, "\n".join(log[-50:])
192
+
193
+
194
def reprocess_from_hf(selected_pdfs, progress=gr.Progress()):
    """Download selected PDFs from HF and re-extract them.

    Args:
        selected_pdfs: repo-relative paths (e.g. "pdfs/foo.pdf") chosen in the UI.
        progress: Gradio progress tracker.

    Returns:
        (result, log) tuple of strings for the re-process output textboxes.
    """
    if not cfg.ANTHROPIC_API_KEY:
        return "❌ ANTHROPIC_API_KEY secret not set.", ""
    if not cfg.HF_DATASET_REPO:
        return "❌ HF_DATASET_REPO secret not set.", ""
    if not selected_pdfs:
        return "⚠️ No PDFs selected.", ""

    progress(0.0, desc="Downloading from HuggingFace…")
    paths = []
    for remote in selected_pdfs:
        p = download_pdf_from_hf(remote)
        if p:  # failed downloads return None and are skipped
            paths.append(p)

    if not paths:
        return "❌ Could not download any PDFs from HuggingFace.", ""

    kb = get_kb()
    log = [f"Re-processing {len(paths)} PDF(s) from HuggingFace\n"]
    totals = {k: {"added": 0, "merged": 0, "skipped": 0}
              for k in ("strategies", "formulas", "systems")}

    tokens = _extract_paths(paths, log, totals, progress, kb)
    counts = {k: len(kb[k]) for k in kb}
    return (f"✅ Re-extraction complete\n"
            f"PDFs: {len(paths)} · "
            f"Strategies: +{totals['strategies']['added']} · "
            f"Formulas: +{totals['formulas']['added']}\n"
            f"KB totals: {counts['strategies']} strategies · "
            f"{counts['formulas']} formulas\n"
            f"Tokens: {tokens:,}"), "\n".join(log[-50:])
223
+
224
+
225
def refresh_hf_pdf_list():
    """Repopulate the HF-PDF CheckboxGroup and clear any prior selection."""
    available = load_pdfs_from_hf()
    return gr.update(choices=available, value=[])
228
 
229
 
230
  # ═══════════════════════════════════════════════════
 
487
 
488
  # Tab 1 β€” Extract
489
  with gr.Tab("πŸ“€ Upload & Extract"):
490
+ gr.Markdown("""### Upload algorithmic trading PDFs
491
+ PDFs are **saved to HuggingFace** (`pdfs/` folder) so you can re-process them anytime without re-uploading.
492
+ OCR is applied automatically to scanned pages.""")
493
  with gr.Row():
494
  with gr.Column(scale=2):
495
+ pdf_in = gr.File(label="Drop PDFs here", file_count="multiple",
496
+ file_types=[".pdf"])
497
+ ext_btn = gr.Button("πŸš€ Upload + Extract", variant="primary", size="lg")
498
  with gr.Column(scale=1):
499
+ ext_out = gr.Textbox(label="Result", lines=14, interactive=False,
500
+ elem_classes=["status-box"])
501
+ ext_log = gr.Textbox(label="Log", lines=8, interactive=False,
502
+ elem_classes=["status-box"])
503
+
504
+ gr.Markdown("---\n### Re-process PDFs already on HuggingFace")
505
+ gr.Markdown("*Use this if the container restarted and lost your session, "
506
+ "or to re-extract with updated prompts.*")
507
+ with gr.Row():
508
+ hf_refresh = gr.Button("πŸ”„ Refresh HF PDF list")
509
+ hf_pdf_list = gr.CheckboxGroup(label="PDFs stored on HuggingFace",
510
+ choices=[], value=[])
511
+ rep_btn = gr.Button("♻️ Re-process selected PDFs from HuggingFace",
512
+ variant="secondary")
513
+ rep_out = gr.Textbox(label="Re-process result", lines=6, interactive=False,
514
+ elem_classes=["status-box"])
515
+ rep_log = gr.Textbox(label="Re-process log", lines=6, interactive=False,
516
+ elem_classes=["status-box"])
517
+
518
  ext_btn.click(fn=run_extraction, inputs=[pdf_in], outputs=[ext_out, ext_log])
519
+ hf_refresh.click(fn=refresh_hf_pdf_list, outputs=[hf_pdf_list])
520
+ rep_btn.click(fn=reprocess_from_hf, inputs=[hf_pdf_list],
521
+ outputs=[rep_out, rep_log])
522
+ demo.load(fn=refresh_hf_pdf_list, outputs=[hf_pdf_list])
523
 
524
  # Tab 2 β€” Browse
525
  with gr.Tab("πŸ“š Knowledge Base"):