Commit · 207b3fa
Parent(s): 819e85a

updated changes in phase0102 and in main

Files changed:
- index.html +8 -2
- main.py +49 -1
- phase0102_chunker_aggregator_2.py +58 -2
index.html
CHANGED
@@ -110,10 +110,16 @@ function listenToStream() {
 
                 if (data.type === 'done') {
                     status.innerText = "✅ EXTRACTION FINISHED!";
+                    //dlBtn.style.display = "block"; // Show the button
+                    //dlBtn.onclick = () => window.location.href = '/download-latest';
+                    //eventSource.close();
+                    //return;
+                    const dlBtn = document.getElementById('downloadBtn');
                     dlBtn.style.display = "block"; // Show the button
-                    dlBtn.onclick = () => window.location.href = '/download-latest';
+                    dlBtn.innerHTML = "📥 Download All Files (.zip)";
+                    dlBtn.onclick = () => window.location.href = '/download-all'; // Points to the ZIP endpoint
                     eventSource.close();
-
+
                 }
 
                 // Add Leaf or Summary to the UI
main.py
CHANGED
@@ -2,12 +2,15 @@
 # /pb/py/chunker/hf/main.py
 # ./main.py
 
+import zipfile
+import io
+
 import os
 import asyncio
 import json
 import uvicorn
 from fastapi import FastAPI, UploadFile, File, Form, BackgroundTasks
-from fastapi.responses import HTMLResponse, StreamingResponse, FileResponse
+from fastapi.responses import HTMLResponse, StreamingResponse, FileResponse
 from fastapi.staticfiles import StaticFiles
 import shutil
 import glob

@@ -89,6 +92,7 @@ async def handle_upload(
     ))
     return {"status": "Processing started"}
 
+#"""
 @app.get("/download-latest")
 async def download_latest():
     # Look for files matching our pattern

@@ -98,6 +102,50 @@ async def download_latest():
     # Sort by creation time to get the newest one
     latest_file = max(files, key=os.path.getctime)
     return FileResponse(path=latest_file, filename=os.path.basename(latest_file))
+#"""
+
+@app.get("/download-markdown")
+async def download_md(type: str = "nested"):
+    pattern = "nested_knowledge_*.md" if type == "nested" else "table_knowledge_*.md"
+    files = glob.glob(pattern)
+    if not files: return {"error": "No markdown found"}
+    latest = max(files, key=os.path.getctime)
+    return FileResponse(path=latest, filename=os.path.basename(latest))
+
+@app.get("/download-all")
+async def download_all():
+    # Find the latest files for each type
+    json_files = glob.glob("knowledge_tree_*.json")
+    nested_files = glob.glob("nested_knowledge_*.md")
+    table_files = glob.glob("table_knowledge_*.md")
+
+    if not json_files:
+        return {"error": "No files found. Please complete a run first."}
+
+    # Identify the newest ones
+    latest_json = max(json_files, key=os.path.getctime)
+    # Match the timestamp from the JSON to get the corresponding MDs
+    timestamp = os.path.basename(latest_json).replace("knowledge_tree_", "").replace(".json", "")
+
+    files_to_zip = [
+        latest_json,
+        f"nested_knowledge_{timestamp}.md",
+        f"table_knowledge_{timestamp}.md"
+    ]
+
+    # Create an in-memory ZIP file
+    zip_buffer = io.BytesIO()
+    with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zip_file:
+        for file_path in files_to_zip:
+            if os.path.exists(file_path):
+                zip_file.write(file_path, os.path.basename(file_path))
+
+    zip_buffer.seek(0)
+    return StreamingResponse(
+        zip_buffer,
+        media_type="application/x-zip-compressed",
+        headers={"Content-Disposition": f"attachment; filename=jung_knowledge_base_{timestamp}.zip"}
+    )
 
 
 if __name__ == "__main__":
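For context on how these endpoints are meant to be consumed, here is a minimal client-side sketch. It assumes the Space is reachable at a local address (http://localhost:7860 is a placeholder, not part of the commit), uses the third-party requests library, and the output filenames are likewise illustrative.

# Hypothetical client for the new download endpoints; BASE_URL is an assumption.
import requests

BASE_URL = "http://localhost:7860"  # placeholder address for the running Space

# /download-all streams a ZIP bundling the latest JSON tree plus both markdown views
resp = requests.get(f"{BASE_URL}/download-all", timeout=60)
resp.raise_for_status()
with open("jung_knowledge_base.zip", "wb") as f:
    f.write(resp.content)

# /download-markdown returns a single markdown file; type is "nested" or "table"
md = requests.get(f"{BASE_URL}/download-markdown", params={"type": "table"}, timeout=60)
md.raise_for_status()
with open("table_knowledge_latest.md", "w", encoding="utf-8") as f:
    f.write(md.text)

Note that when no completed run exists, both endpoints return a JSON error object with HTTP 200 rather than an error status, so a real client would also need to inspect the response content type.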
phase0102_chunker_aggregator_2.py
CHANGED
@@ -219,6 +219,15 @@ async def run_chunking_process(pdf_path, queue=None, whole=WHOLE, start_p=START_
             # Semantic Jump Logic, find the break text and move cursor
             break_text = res.get('break_text', "")
             cursor += (lookahead.find(break_text) + len(break_text)) if break_text in lookahead else 2000
+
+            new_chunk = {
+                "type": "leaf",
+                "filename": res.get('filename', 'untitled'),
+                "content": res.get('rewritten_text', ''),
+                "original": lookahead[:len(res.get('break_text', '')) + 500]  # Save a snippet of the original
+            }
+
+
 
             # Throttling to stay under 6000 TPM limit
             await asyncio.sleep(7)

@@ -277,7 +286,10 @@ async def run_chunking_process(pdf_path, queue=None, whole=WHOLE, start_p=START_
     output_file = f"knowledge_tree_{timestamp}.json"
     with open(output_file, "w") as f:
         json.dump(final_data, f, indent=4)
-
+
+    # CALL TO CREATE NESTED AND TABULAR MARKDOWNs
+    export_visual_formats(final_data, timestamp)
+
     if queue: await queue.put("DONE")
 
     """

@@ -309,4 +321,48 @@ async def generate_summary_block(chunks_to_summarize, label="Level-1 Cluster"):
         "synthesis": "the dense summary text"
     }}
     """
-    return await call_groq_json(system_prompt, combined_content)
+    return await call_groq_json(system_prompt, combined_content)
+
+"""
+Nested Markdown
+
+Contextual Integrity - Acts as a "Read Me" for the Jungian Agent. It can follow the # headers to understand the hierarchy.
+Auditability: By including the SOURCE TEXT vs AI INTERPRETATION, it becomes possible to verify whether the LLM is "hallucinating" terms like individuation or if it's a valid AI interpretation in the Jungian sense, owing to the alchemical symbols.
+
+Table Markdown
+
+Visual Clarity: Table Markdown is perfect for a quick bird's-eye view, such as the number of chunks under each chapter
+"""
+# --- NESTED AND TABULAR MARKDOWN
+def export_visual_formats(final_data, timestamp):
+    # --- NESTED MARKDOWN ---
+    md_nested = f"# π VOLUME: {final_data['metadata']['pages']}\n"
+    md_nested += f"> {final_data['l3_volume']['content'] if final_data['l3_volume'] else 'N/A'}\n\n"
+
+    for l2 in final_data['l2_chapters']:
+        md_nested += f"## π CHAPTER: {l2['name']}\n> {l2['content']}\n\n"
+    # Logic to associate children would go here; for now, we list all relevant nodes
+    for l1 in final_data['l1_clusters']:
+        md_nested += f"### β CLUSTER: {l1['name']}\n> {l1['content']}\n\n"
+    for leaf in final_data['leaves']:
+        md_nested += f"#### π {leaf['name']}\n"
+        md_nested += f"**[AI INTERPRETATION]:** {leaf['content']}\n\n"
+        md_nested += f"**[ORIGINAL TEXT]:** {leaf.get('original', 'N/A')[:250]}...\n\n---\n"
+
+    # --- TABULAR MARKDOWN ---
+    md_table = "| Level | Name | Content Snippet |\n| :--- | :--- | :--- |\n"
+    if final_data['l3_volume']:
+        md_table += f"| π VOLUME | {final_data['l3_volume']['name']} | {final_data['l3_volume']['content'][:150]}... |\n"
+    for l2 in final_data['l2_chapters']:
+        md_table += f"| π CHAPTER | {l2['name']} | {l2['content'][:150]}... |\n"
+    for l1 in final_data['l1_clusters']:
+        md_table += f"| β CLUSTER | {l1['name']} | {l1['content'][:150]}... |\n"
+    for leaf in final_data['leaves']:
+        md_table += f"| π LEAF | {leaf['name']} | **[AI]** {leaf['content'][:150]}... |\n"
+
+    # Save files
+    with open(f"nested_knowledge_{timestamp}.md", "w", encoding="utf-8") as f: f.write(md_nested)
+    with open(f"table_knowledge_{timestamp}.md", "w", encoding="utf-8") as f: f.write(md_table)
+
+
+    print(f"✅ Visual Markdowns created: nested_knowledge_{timestamp}.md and table_knowledge_{timestamp}.md")
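As a rough illustration of the data shape export_visual_formats expects, here is a hypothetical smoke test that can run outside the chunking pipeline, assuming phase0102_chunker_aggregator_2 imports cleanly on its own. The toy final_data mirrors only the keys the function reads (metadata.pages, l3_volume, l2_chapters, l1_clusters, leaves, each node carrying name/content and, for leaves, original); every value below is invented for illustration, and in the real run this dict is assembled by run_chunking_process before being dumped to knowledge_tree_<timestamp>.json.

# Hypothetical, standalone smoke test; all values are made up for illustration only.
from phase0102_chunker_aggregator_2 import export_visual_formats

toy_data = {
    "metadata": {"pages": 12},
    "l3_volume": {"name": "Sample Volume", "content": "Volume-level synthesis."},
    "l2_chapters": [{"name": "Chapter One", "content": "Chapter-level summary."}],
    "l1_clusters": [{"name": "Cluster A", "content": "Cluster-level summary."}],
    "leaves": [{
        "name": "leaf_01",
        "content": "AI interpretation of the passage.",
        "original": "Snippet of the original source text.",
    }],
}

# Writes nested_knowledge_smoketest.md and table_knowledge_smoketest.md to the working directory
export_visual_formats(toy_data, "smoketest")

Running it should produce the same two markdown files the pipeline now emits after writing the JSON tree, which is also what the new /download-all endpoint in main.py bundles into the ZIP.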