prashantmatlani committed on
Commit
1735959
·
verified ·
1 Parent(s): 36529d5

Upload phase0102_chunker_aggregator_2.py

Browse files
Files changed (1) hide show
  1. phase0102_chunker_aggregator_2.py +67 -23
phase0102_chunker_aggregator_2.py CHANGED
@@ -8,11 +8,12 @@ https://www.linkedin.com/pulse/new-way-encode-documents-ai-agents-navigable-tree
8
  https://gist.github.com/karpathy/442a6bf555914893e9891c11519de94f
9
 
10
 
11
- The Logic of the Knowledge-Pyramid
12
- L0 (Leaves): 1-2 pages of raw text rewritten.
13
- L1 (Clusters/Branches): Summary of 5 Leaves (~10 pages).
14
- L2 (Chapters): Summary of 5 L1 Clusters/Branches (~50 pages).
15
- L3 (Volume): Summary of all L2 Nodes (The entire book).
 
16
 
17
  The combined script - with two phases, I and II, fired sequentially - aligns with a/ the "Dense Theory" of knowledge extraction and b/ with Makarevych's "Incremental Aggregation" logic, in which the availability of a set of chunks triggers the system to generate a summary. The "Dense Theory" of knowledge extraction is the idea that the LLM should not only extract chunks but also immediately synthesize them into higher-level summaries, creating a "Knowledge Tree" with multiple levels of abstraction.
18
 
@@ -135,10 +136,15 @@ async def run_chunking_process(pdf_path, queue=None, whole=WHOLE, start_p=START_
135
 
136
  cursor = 0
137
  l0_buffer = [] # Holds Leaves for L1 (Clusters/Branches)
138
- l1_buffer = [] # Holds L1 Summaries for L2 (Chapters)
139
- l2_buffer = [] # Holds L2 Summaries for L3 (Volumes)
 
 
 
 
 
140
 
141
- l0_buffer_size = 5 # CHUNK_GROUP_SIZE
142
 
143
  #all_leaves = []
144
  #summary_blocks = []
@@ -174,28 +180,38 @@ async def run_chunking_process(pdf_path, queue=None, whole=WHOLE, start_p=START_
174
  res = await call_groq_json(prompt, lookahead)
175
 
176
  leaf = {"type": "leaf", "name": res['filename'], "content": res['rewritten_text']}
 
 
177
  l0_buffer.append(leaf) # stack-up leaves
178
 
179
  # PUSH TO UI
180
  if queue: await queue.put(leaf)
181
 
182
  # --- PHASE II: AGGREGATE LEAVES; TRIGGER L1 (Every 5 Leaves) ---
183
- if len(l0_buffer) >= l0_buffer_size:
184
  print("⭐ Creating L1 Cluster...")
185
  l1_res = await generate_summary_block(l0_buffer, "Level-1 Cluster")
186
  l1_node = {"type": "summary_l1", "name": l1_res['summary_name'], "content": l1_res['synthesis']}
187
 
188
- l1_buffer.append(l1_node) # stack-up clusters/branches
 
 
189
  if queue: await queue.put(l1_node)
 
190
  l0_buffer = [] # Reset L0
191
 
192
  # --- PHASE III: TRIGGER L2 (Every 5 L1 Clusters) ---
193
- if len(l1_buffer) >= 5:
 
194
  print("💎 Creating L2 Chapter...")
195
- l2_res = await generate_summary_block(l1_buffer, "Level-2 Chapter")
 
 
 
196
  l2_node = {"type": "summary_l2", "name": l2_res['summary_name'], "content": l2_res['synthesis']}
197
 
198
- l2_buffer.append(l2_node) # stack-up chapters
 
199
  if queue: await queue.put(l2_node)
200
  l1_buffer = [] # Reset L1
201
 
@@ -218,24 +234,52 @@ async def run_chunking_process(pdf_path, queue=None, whole=WHOLE, start_p=START_
218
  await asyncio.sleep(10) # Longer pause on error
219
  continue
220
 
221
- # --- FINAL WRAP UP: L3 VOLUME SUMMARY ---
222
- if l2_buffer:
223
- print("👑 Creating L3 Volume Summary...")
224
- l3_res = await generate_summary_block(l2_buffer, "Level-3 Volume")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
  l3_node = {"type": "summary_l3", "name": l3_res['summary_name'], "content": l3_res['synthesis']}
226
  if queue: await queue.put(l3_node)
227
 
228
- if queue: await queue.put("DONE")
229
 
230
 
231
- # Final Save
232
  timestamp = datetime.datetime.now().strftime("%m%d%Y_%H%M")
233
- final_data = {"leaves": leaf, "cluster/branch summary": l1_node, "chapter": l2_node, "volume": l3_node}
234
- with open(f"knowledge_tree_{timestamp}.json", "w") as f:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
  json.dump(final_data, f, indent=4)
236
 
237
- if queue:
238
- await queue.put("DONE")
239
 
240
  # Helper for summary
241
  async def generate_summary_block(chunks):
 
8
  https://gist.github.com/karpathy/442a6bf555914893e9891c11519de94f
9
 
10
 
11
+ ----- The Logic of the Knowledge-Pyramid: -----
12
+
13
+ L0 (Leaves): 1-2 pages of raw text rewritten
14
+ L1 (Clusters/Branches): Summary of 5 Leaves (~10 pages)
15
+ L2 (Chapters): Summary of 5 L1 Clusters/Branches (~50 pages)
16
+ L3 (Volume): Summary of all L2 Nodes (The entire book)
17
 
18
  The combined script - with two phases, I and II, fired sequentially - aligns with a/ the "Dense Theory" of knowledge extraction and b/ with Makarevych's "Incremental Aggregation" logic, in which the availability of a set of chunks triggers the system to generate a summary. The "Dense Theory" of knowledge extraction is the idea that the LLM should not only extract chunks but also immediately synthesize them into higher-level summaries, creating a "Knowledge Tree" with multiple levels of abstraction.
19
 
 
136
 
137
  cursor = 0
138
  l0_buffer = [] # Holds Leaves for L1 (Clusters/Branches)
139
+ #l1_buffer = [] # Holds L1 Summaries for L2 (Chapters)
140
+ #l2_buffer = [] # Holds L2 Summaries for L3 (Volumes)
141
+
142
+ all_leaves = [] # Final collection
143
+ all_l1_summaries = []
144
+ all_l2_summaries = []
145
+ l3_node = None # The final crown
146
 
147
+ l_buffer_size = 5 # CHUNK_GROUP_SIZE
148
 
149
  #all_leaves = []
150
  #summary_blocks = []
 
180
  res = await call_groq_json(prompt, lookahead)
181
 
182
  leaf = {"type": "leaf", "name": res['filename'], "content": res['rewritten_text']}
183
+
184
+ all_leaves.append(leaf)
185
  l0_buffer.append(leaf) # stack-up leaves
186
 
187
  # PUSH TO UI
188
  if queue: await queue.put(leaf)
189
 
190
  # --- PHASE II: AGGREGATE LEAVES; TRIGGER L1 (Every 5 Leaves) ---
191
+ if len(l0_buffer) >= l_buffer_size:
192
  print("⭐ Creating L1 Cluster...")
193
  l1_res = await generate_summary_block(l0_buffer, "Level-1 Cluster")
194
  l1_node = {"type": "summary_l1", "name": l1_res['summary_name'], "content": l1_res['synthesis']}
195
 
196
+ all_l1_summaries.append(l1_node)
197
+ #l1_buffer.append(l1_node) # stack-up clusters/branches
198
+
199
  if queue: await queue.put(l1_node)
200
+
201
  l0_buffer = [] # Reset L0
202
 
203
  # --- PHASE III: TRIGGER L2 (Every 5 L1 Clusters) ---
204
+ #if len(l1_buffer) >= l_buffer_size:
205
+ if len(all_l1_summaries) >= l_buffer_size and len(all_l1_summaries) % 5 == 0:
206
  print("💎 Creating L2 Chapter...")
207
+ # We take the last 5 L1s
208
+
209
+ l2_res = await generate_summary_block(all_l1_summaries[-5:], "Level-2 Chapter")
210
+
211
  l2_node = {"type": "summary_l2", "name": l2_res['summary_name'], "content": l2_res['synthesis']}
212
 
213
+ all_l2_summaries.append(l2_node)
214
+ #l2_buffer.append(l2_node) # stack-up chapters
215
  if queue: await queue.put(l2_node)
216
  l1_buffer = [] # Reset L1
217
 
 
234
  await asyncio.sleep(10) # Longer pause on error
235
  continue
236
 
237
+ # --- FINAL FLUSH (The "Cleanup" Phase) ---
238
+ # If the book ends and we have leftover leaves (1-4), summarize them now!
239
+ if l0_buffer:
240
+ l1_res = await generate_summary_block(l0_buffer, "Final Level-1 Cluster")
241
+ l1_node = {"type": "summary_l1", "name": l1_res['summary_name'], "content": l1_res['synthesis']}
242
+ all_l1_summaries.append(l1_node)
243
+ if queue: await queue.put(l1_node)
244
+
245
+ # Summarize all L1s into L2 if we haven't already
246
+ if all_l1_summaries and not all_l2_summaries:
247
+ l2_res = await generate_summary_block(all_l1_summaries, "Level-2 Chapter")
248
+ l2_node = {"type": "summary_l2", "name": l2_res['summary_name'], "content": l2_res['synthesis']}
249
+ all_l2_summaries.append(l2_node)
250
+ if queue: await queue.put(l2_node)
251
+
252
+ # FINAL VOLUME SUMMARY (L3)
253
+ if all_l2_summaries:
254
+ l3_res = await generate_summary_block(all_l2_summaries, "Level-3 Volume")
255
  l3_node = {"type": "summary_l3", "name": l3_res['summary_name'], "content": l3_res['synthesis']}
256
  if queue: await queue.put(l3_node)
257
 
258
+ #if queue: await queue.put("DONE")
259
 
260
 
261
+ # --- THE SAFE SAVE ---
262
  timestamp = datetime.datetime.now().strftime("%m%d%Y_%H%M")
263
+ #final_data = {
264
+ # "metadata": {"pages": f"{start_p}-{end_p}", "date": timestamp},
265
+ # "leaves": all_leaves,
266
+ # "l1_clusters": all_l1_summaries,
267
+ # "l2_chapters": all_l2_summaries,
268
+ # "l3_volume": l3_node
269
+ #}
270
+
271
+ final_data = {"date": timestamp,
272
+ "leaves": all_leaves,
273
+ "l1_clusters": all_l1_summaries,
274
+ "l2_chapters": all_l2_summaries,
275
+ "l3_volume": l3_node}
276
+
277
+ output_file = f"knowledge_tree_{timestamp}.json"
278
+ with open(output_file, "w") as f:
279
  json.dump(final_data, f, indent=4)
280
 
281
+ if queue: await queue.put("DONE")
282
+
283
 
284
  # Helper for summary
285
  async def generate_summary_block(chunks):