prashantmatlani committed
Commit 207b3fa · 1 Parent(s): 819e85a

updated changes in phase0102 and in main

Files changed (3)
  1. index.html +8 -2
  2. main.py +49 -1
  3. phase0102_chunker_aggregator_2.py +58 -2
index.html CHANGED
@@ -110,10 +110,16 @@ function listenToStream() {
 
         if (data.type === 'done') {
             status.innerText = "✅ EXTRACTION FINISHED!";
+            //dlBtn.style.display = "block"; // Show the button
+            //dlBtn.onclick = () => window.location.href = '/download-latest';
+            //eventSource.close();
+            //return;
+            const dlBtn = document.getElementById('downloadBtn');
             dlBtn.style.display = "block"; // Show the button
-            dlBtn.onclick = () => window.location.href = '/download-latest';
+            dlBtn.innerHTML = "📥 Download All Files (.zip)";
+            dlBtn.onclick = () => window.location.href = '/download-all'; // Points to the ZIP endpoint
             eventSource.close();
-            return;
+
         }
 
         // Add Leaf or Summary to the UI
main.py CHANGED
@@ -2,12 +2,15 @@
 # /pb/py/chunker/hf/main.py
 # ./main.py
 
+import zipfile
+import io
+
 import os
 import asyncio
 import json
 import uvicorn
 from fastapi import FastAPI, UploadFile, File, Form, BackgroundTasks
-from fastapi.responses import HTMLResponse, StreamingResponse, FileResponse # Added FileResponse
+from fastapi.responses import HTMLResponse, StreamingResponse, FileResponse
 from fastapi.staticfiles import StaticFiles
 import shutil
 import glob
@@ -89,6 +92,7 @@ async def handle_upload(
     ))
     return {"status": "Processing started"}
 
+#"""
 @app.get("/download-latest")
 async def download_latest():
     # Look for files matching our pattern
@@ -98,6 +102,50 @@ async def download_latest():
     # Sort by creation time to get the newest one
     latest_file = max(files, key=os.path.getctime)
     return FileResponse(path=latest_file, filename=os.path.basename(latest_file))
+#"""
+
+@app.get("/download-markdown")
+async def download_md(type: str = "nested"):
+    pattern = "nested_knowledge_*.md" if type == "nested" else "table_knowledge_*.md"
+    files = glob.glob(pattern)
+    if not files: return {"error": "No markdown found"}
+    latest = max(files, key=os.path.getctime)
+    return FileResponse(path=latest, filename=os.path.basename(latest))
+
+@app.get("/download-all")
+async def download_all():
+    # Find the latest files for each type
+    json_files = glob.glob("knowledge_tree_*.json")
+    nested_files = glob.glob("nested_knowledge_*.md")
+    table_files = glob.glob("table_knowledge_*.md")
+
+    if not json_files:
+        return {"error": "No files found. Please complete a run first."}
+
+    # Identify the newest ones
+    latest_json = max(json_files, key=os.path.getctime)
+    # Match the timestamp from the JSON to get the corresponding MDs
+    timestamp = os.path.basename(latest_json).replace("knowledge_tree_", "").replace(".json", "")
+
+    files_to_zip = [
+        latest_json,
+        f"nested_knowledge_{timestamp}.md",
+        f"table_knowledge_{timestamp}.md"
+    ]
+
+    # Create an in-memory ZIP file
+    zip_buffer = io.BytesIO()
+    with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zip_file:
+        for file_path in files_to_zip:
+            if os.path.exists(file_path):
+                zip_file.write(file_path, os.path.basename(file_path))
+
+    zip_buffer.seek(0)
+    return StreamingResponse(
+        zip_buffer,
+        media_type="application/x-zip-compressed",
+        headers={"Content-Disposition": f"attachment; filename=jung_knowledge_base_{timestamp}.zip"}
+    )
 
 
 if __name__ == "__main__":
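
For reference, a rough sketch of how the new download endpoints might be exercised from a client script. The base URL, port, and the use of the `requests` package are assumptions made for illustration and are not part of this commit:

# client_download_example.py -- illustrative only, not part of this commit
import requests

BASE = "http://localhost:7860"  # assumed address of the running FastAPI app

# Fetch the latest table-style markdown via the new /download-markdown endpoint
resp = requests.get(f"{BASE}/download-markdown", params={"type": "table"})
resp.raise_for_status()
with open("table_knowledge_latest.md", "wb") as f:
    f.write(resp.content)

# Fetch the zipped bundle (JSON tree plus both markdowns) via /download-all
resp = requests.get(f"{BASE}/download-all")
resp.raise_for_status()
with open("jung_knowledge_base.zip", "wb") as f:
    f.write(resp.content)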
phase0102_chunker_aggregator_2.py CHANGED
@@ -219,6 +219,15 @@ async def run_chunking_process(pdf_path, queue=None, whole=WHOLE, start_p=START_
             # Semantic Jump Logic, find the break text and move cursor
             break_text = res.get('break_text', "")
             cursor += (lookahead.find(break_text) + len(break_text)) if break_text in lookahead else 2000
+
+            new_chunk = {
+                "type": "leaf",
+                "filename": res.get('filename', 'untitled'),
+                "content": res.get('rewritten_text', ''),
+                "original": lookahead[:len(res.get('break_text', '')) + 500] # Save a snippet of the original
+            }
+
+
 
             # Throttling to stay under 6000 TPM limit
             await asyncio.sleep(7)
@@ -277,7 +286,10 @@ async def run_chunking_process(pdf_path, queue=None, whole=WHOLE, start_p=START_
     output_file = f"knowledge_tree_{timestamp}.json"
     with open(output_file, "w") as f:
         json.dump(final_data, f, indent=4)
-
+
+    # CALL TO CREATE NESTED AND TABULAR MARKDOWNs
+    export_visual_formats(final_data, timestamp)
+
     if queue: await queue.put("DONE")
 
 """
@@ -309,4 +321,48 @@ async def generate_summary_block(chunks_to_summarize, label="Level-1 Cluster"):
         "synthesis": "the dense summary text"
     }}
     """
-    return await call_groq_json(system_prompt, combined_content)
+    return await call_groq_json(system_prompt, combined_content)
+
+"""
+Nested Markdown
+
+Contextual Integrity - Acts as a "Read Me" for the Jungian Agent. It can follow the # headers to understand the hierarchy.
+Auditability: By including the SOURCE TEXT vs AI INTERPRETATION, it becomes possible to verify whether the LLM is "hallucinating" terms like individuation or if it's a valid AI interpretation in the Jungian sense, owing to the alchemical symbols.
+
+Table Markdown
+
+Visual Clarity: Table Markdown is perfect for a quick bird's-eye view, such as the number of chunks under each chapter.
+"""
+# --- NESTED AND TABULAR MARKDOWN
+def export_visual_formats(final_data, timestamp):
+    # --- NESTED MARKDOWN ---
+    md_nested = f"# 👑 VOLUME: {final_data['metadata']['pages']}\n"
+    md_nested += f"> {final_data['l3_volume']['content'] if final_data['l3_volume'] else 'N/A'}\n\n"
+
+    for l2 in final_data['l2_chapters']:
+        md_nested += f"## 💎 CHAPTER: {l2['name']}\n> {l2['content']}\n\n"
+    # Logic to associate children would go here; for now, we list all relevant nodes
+    for l1 in final_data['l1_clusters']:
+        md_nested += f"### ⭐ CLUSTER: {l1['name']}\n> {l1['content']}\n\n"
+    for leaf in final_data['leaves']:
+        md_nested += f"#### 📄 {leaf['name']}\n"
+        md_nested += f"**[AI INTERPRETATION]:** {leaf['content']}\n\n"
+        md_nested += f"**[ORIGINAL TEXT]:** {leaf.get('original', 'N/A')[:250]}...\n\n---\n"
+
+    # --- TABULAR MARKDOWN ---
+    md_table = "| Level | Name | Content Snippet |\n| :--- | :--- | :--- |\n"
+    if final_data['l3_volume']:
+        md_table += f"| 👑 VOLUME | {final_data['l3_volume']['name']} | {final_data['l3_volume']['content'][:150]}... |\n"
+    for l2 in final_data['l2_chapters']:
+        md_table += f"| 💎 CHAPTER | {l2['name']} | {l2['content'][:150]}... |\n"
+    for l1 in final_data['l1_clusters']:
+        md_table += f"| ⭐ CLUSTER | {l1['name']} | {l1['content'][:150]}... |\n"
+    for leaf in final_data['leaves']:
+        md_table += f"| 📄 LEAF | {leaf['name']} | **[AI]** {leaf['content'][:150]}... |\n"
+
+    # Save files
+    with open(f"nested_knowledge_{timestamp}.md", "w", encoding="utf-8") as f: f.write(md_nested)
+    with open(f"table_knowledge_{timestamp}.md", "w", encoding="utf-8") as f: f.write(md_table)
+
+
+    print(f"✅ Visual Markdowns created: nested_knowledge_{timestamp}.md and table_knowledge_{timestamp}.md")
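
For clarity, a minimal sketch of the final_data shape that export_visual_formats appears to expect, inferred from the keys it reads in the diff above; the placeholder values are invented for illustration only:

# illustrative stand-in for the aggregator output -- field values are made up
sample_final_data = {
    "metadata": {"pages": "312"},
    "l3_volume": {"name": "Volume Synthesis", "content": "Top-level summary of the volume."},
    "l2_chapters": [
        {"name": "Chapter 1", "content": "Chapter-level summary."}
    ],
    "l1_clusters": [
        {"name": "Cluster A", "content": "Cluster-level summary."}
    ],
    "leaves": [
        {"name": "leaf_001", "content": "AI-rewritten chunk.", "original": "Original source snippet."}
    ],
}

# Produces nested_knowledge_<timestamp>.md and table_knowledge_<timestamp>.md
export_visual_formats(sample_final_data, "20240101_120000")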