Commit · 207b3fa
Parent(s): 819e85a

updated changes in phase0102 and in main

Files changed:
- index.html +8 -2
- main.py +49 -1
- phase0102_chunker_aggregator_2.py +58 -2
index.html
CHANGED
@@ -110,10 +110,16 @@ function listenToStream() {
 
                 if (data.type === 'done') {
                     status.innerText = "✅ EXTRACTION FINISHED!";
+                    //dlBtn.style.display = "block"; // Show the button
+                    //dlBtn.onclick = () => window.location.href = '/download-latest';
+                    //eventSource.close();
+                    //return;
+                    const dlBtn = document.getElementById('downloadBtn');
                     dlBtn.style.display = "block"; // Show the button
-                    dlBtn.onclick = () => window.location.href = '/download-latest';
+                    dlBtn.innerHTML = "📥 Download All Files (.zip)";
+                    dlBtn.onclick = () => window.location.href = '/download-all'; // Points to the ZIP endpoint
                     eventSource.close();
-
+
                 }
 
                 // Add Leaf or Summary to the UI
main.py
CHANGED
@@ -2,12 +2,15 @@
 # /pb/py/chunker/hf/main.py
 # ./main.py
 
+import zipfile
+import io
+
 import os
 import asyncio
 import json
 import uvicorn
 from fastapi import FastAPI, UploadFile, File, Form, BackgroundTasks
-from fastapi.responses import HTMLResponse, StreamingResponse, FileResponse
+from fastapi.responses import HTMLResponse, StreamingResponse, FileResponse
 from fastapi.staticfiles import StaticFiles
 import shutil
 import glob

@@ -89,6 +92,7 @@ async def handle_upload(
     ))
     return {"status": "Processing started"}
 
+#"""
 @app.get("/download-latest")
 async def download_latest():
     # Look for files matching our pattern

@@ -98,6 +102,50 @@ async def download_latest():
     # Sort by creation time to get the newest one
     latest_file = max(files, key=os.path.getctime)
     return FileResponse(path=latest_file, filename=os.path.basename(latest_file))
+#"""
+
+@app.get("/download-markdown")
+async def download_md(type: str = "nested"):
+    pattern = "nested_knowledge_*.md" if type == "nested" else "table_knowledge_*.md"
+    files = glob.glob(pattern)
+    if not files: return {"error": "No markdown found"}
+    latest = max(files, key=os.path.getctime)
+    return FileResponse(path=latest, filename=os.path.basename(latest))
+
+@app.get("/download-all")
+async def download_all():
+    # Find the latest files for each type
+    json_files = glob.glob("knowledge_tree_*.json")
+    nested_files = glob.glob("nested_knowledge_*.md")
+    table_files = glob.glob("table_knowledge_*.md")
+
+    if not json_files:
+        return {"error": "No files found. Please complete a run first."}
+
+    # Identify the newest ones
+    latest_json = max(json_files, key=os.path.getctime)
+    # Match the timestamp from the JSON to get the corresponding MDs
+    timestamp = os.path.basename(latest_json).replace("knowledge_tree_", "").replace(".json", "")
+
+    files_to_zip = [
+        latest_json,
+        f"nested_knowledge_{timestamp}.md",
+        f"table_knowledge_{timestamp}.md"
+    ]
+
+    # Create an in-memory ZIP file
+    zip_buffer = io.BytesIO()
+    with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zip_file:
+        for file_path in files_to_zip:
+            if os.path.exists(file_path):
+                zip_file.write(file_path, os.path.basename(file_path))
+
+    zip_buffer.seek(0)
+    return StreamingResponse(
+        zip_buffer,
+        media_type="application/x-zip-compressed",
+        headers={"Content-Disposition": f"attachment; filename=jung_knowledge_base_{timestamp}.zip"}
+    )
 
 
 if __name__ == "__main__":
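For context on how these endpoints are meant to be consumed, here is a minimal client-side sketch. It assumes the Space is reachable at a local address (http://localhost:7860 is a placeholder, not part of the commit), uses the third-party requests library, and the output filenames are likewise illustrative.

# Hypothetical client for the new download endpoints; BASE_URL is an assumption.
import requests

BASE_URL = "http://localhost:7860"  # placeholder address for the running Space

# /download-all streams a ZIP bundling the latest JSON tree plus both markdown views
resp = requests.get(f"{BASE_URL}/download-all", timeout=60)
resp.raise_for_status()
with open("jung_knowledge_base.zip", "wb") as f:
    f.write(resp.content)

# /download-markdown returns a single markdown file; type is "nested" or "table"
md = requests.get(f"{BASE_URL}/download-markdown", params={"type": "table"}, timeout=60)
md.raise_for_status()
with open("table_knowledge_latest.md", "w", encoding="utf-8") as f:
    f.write(md.text)

Note that when no completed run exists, both endpoints return a JSON error object with HTTP 200 rather than an error status, so a real client would also need to inspect the response content type.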
phase0102_chunker_aggregator_2.py
CHANGED
@@ -219,6 +219,15 @@ async def run_chunking_process(pdf_path, queue=None, whole=WHOLE, start_p=START_
             # Semantic Jump Logic, find the break text and move cursor
             break_text = res.get('break_text', "")
             cursor += (lookahead.find(break_text) + len(break_text)) if break_text in lookahead else 2000
+
+            new_chunk = {
+                "type": "leaf",
+                "filename": res.get('filename', 'untitled'),
+                "content": res.get('rewritten_text', ''),
+                "original": lookahead[:len(res.get('break_text', '')) + 500]  # Save a snippet of the original
+            }
+
+
 
             # Throttling to stay under 6000 TPM limit
             await asyncio.sleep(7)

@@ -277,7 +286,10 @@ async def run_chunking_process(pdf_path, queue=None, whole=WHOLE, start_p=START_
     output_file = f"knowledge_tree_{timestamp}.json"
     with open(output_file, "w") as f:
         json.dump(final_data, f, indent=4)
-
+
+    # CALL TO CREATE NESTED AND TABULAR MARKDOWNs
+    export_visual_formats(final_data, timestamp)
+
     if queue: await queue.put("DONE")
 
     """

@@ -309,4 +321,48 @@ async def generate_summary_block(chunks_to_summarize, label="Level-1 Cluster"):
         "synthesis": "the dense summary text"
     }}
     """
-    return await call_groq_json(system_prompt, combined_content)
+    return await call_groq_json(system_prompt, combined_content)
+
+"""
+Nested Markdown
+
+Contextual Integrity - Acts as a "Read Me" for the Jungian Agent. It can follow the # headers to understand the hierarchy.
+Auditability: By including the SOURCE TEXT vs AI INTERPRETATION, it becomes possible to verify whether the LLM is "hallucinating" terms like individuation or if it's a valid AI interpretation in the Jungian sense, owing to the alchemical symbols.
+
+Table Markdown
+
+Visual Clarity: Table Markdown is perfect for a quick bird's-eye view, such as the number of chunks under each chapter
+"""
+# --- NESTED AND TABULAR MARKDOWN
+def export_visual_formats(final_data, timestamp):
+    # --- NESTED MARKDOWN ---
+    md_nested = f"# π VOLUME: {final_data['metadata']['pages']}\n"
+    md_nested += f"> {final_data['l3_volume']['content'] if final_data['l3_volume'] else 'N/A'}\n\n"
+
+    for l2 in final_data['l2_chapters']:
+        md_nested += f"## π CHAPTER: {l2['name']}\n> {l2['content']}\n\n"
+    # Logic to associate children would go here; for now, we list all relevant nodes
+    for l1 in final_data['l1_clusters']:
+        md_nested += f"### β CLUSTER: {l1['name']}\n> {l1['content']}\n\n"
+    for leaf in final_data['leaves']:
+        md_nested += f"#### π {leaf['name']}\n"
+        md_nested += f"**[AI INTERPRETATION]:** {leaf['content']}\n\n"
+        md_nested += f"**[ORIGINAL TEXT]:** {leaf.get('original', 'N/A')[:250]}...\n\n---\n"
+
+    # --- TABULAR MARKDOWN ---
+    md_table = "| Level | Name | Content Snippet |\n| :--- | :--- | :--- |\n"
+    if final_data['l3_volume']:
+        md_table += f"| π VOLUME | {final_data['l3_volume']['name']} | {final_data['l3_volume']['content'][:150]}... |\n"
+    for l2 in final_data['l2_chapters']:
+        md_table += f"| π CHAPTER | {l2['name']} | {l2['content'][:150]}... |\n"
+    for l1 in final_data['l1_clusters']:
+        md_table += f"| β CLUSTER | {l1['name']} | {l1['content'][:150]}... |\n"
+    for leaf in final_data['leaves']:
+        md_table += f"| π LEAF | {leaf['name']} | **[AI]** {leaf['content'][:150]}... |\n"
+
+    # Save files
+    with open(f"nested_knowledge_{timestamp}.md", "w", encoding="utf-8") as f: f.write(md_nested)
+    with open(f"table_knowledge_{timestamp}.md", "w", encoding="utf-8") as f: f.write(md_table)
+
+
+    print(f"✅ Visual Markdowns created: nested_knowledge_{timestamp}.md and table_knowledge_{timestamp}.md")
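As a rough illustration of the data shape export_visual_formats expects, here is a hypothetical smoke test that can run outside the chunking pipeline, assuming phase0102_chunker_aggregator_2 imports cleanly on its own. The toy final_data mirrors only the keys the function reads (metadata.pages, l3_volume, l2_chapters, l1_clusters, leaves, each node carrying name/content and, for leaves, original); every value below is invented for illustration, and in the real run this dict is assembled by run_chunking_process before being dumped to knowledge_tree_<timestamp>.json.

# Hypothetical, standalone smoke test; all values are made up for illustration only.
from phase0102_chunker_aggregator_2 import export_visual_formats

toy_data = {
    "metadata": {"pages": 12},
    "l3_volume": {"name": "Sample Volume", "content": "Volume-level synthesis."},
    "l2_chapters": [{"name": "Chapter One", "content": "Chapter-level summary."}],
    "l1_clusters": [{"name": "Cluster A", "content": "Cluster-level summary."}],
    "leaves": [{
        "name": "leaf_01",
        "content": "AI interpretation of the passage.",
        "original": "Snippet of the original source text.",
    }],
}

# Writes nested_knowledge_smoketest.md and table_knowledge_smoketest.md to the working directory
export_visual_formats(toy_data, "smoketest")

Running it should produce the same two markdown files the pipeline now emits after writing the JSON tree, which is also what the new /download-all endpoint in main.py bundles into the ZIP.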