phase0102_chunker_aggregator_2.py
CHANGED
@@ -8,11 +8,12 @@ https://www.linkedin.com/pulse/new-way-encode-documents-ai-agents-navigable-tree
 https://gist.github.com/karpathy/442a6bf555914893e9891c11519de94f
 
 
-The Logic of the Knowledge-Pyramid
-
-
-
-
+----- The Logic of the Knowledge-Pyramid: -----
+
+L0 (Leaves): 1-2 pages of raw text, rewritten
+L1 (Clusters/Branches): summary of 5 Leaves (~10 pages)
+L2 (Chapters): summary of 5 L1 Clusters/Branches (~50 pages)
+L3 (Volume): summary of all L2 nodes (the entire book)
 
 The combined script - with two phases, I and II, fired sequentially - aligns with a/ the "Dense Theory" of knowledge extraction and b/ Makarevych's "Incremental Aggregation" logic, in which the availability of a complete set of chunks triggers the system to generate a summary. The "Dense Theory" of knowledge extraction is the idea that the LLM should not only extract chunks but also immediately synthesize them into higher-level summaries, creating a "Knowledge Tree" with multiple levels of abstraction.
 
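The fan-in arithmetic behind this pyramid is easy to check. A minimal back-of-the-envelope sketch, using only the figures from the notes above (~2 pages per leaf, 5-way aggregation at each level; the function name is illustrative, not part of the script):

# Sketch of the pyramid's fan-in, based on the notes above.
import math

def pyramid_shape(total_pages, pages_per_leaf=2, fan_in=5):
    """Approximate node count at each level for a book of total_pages."""
    l0 = math.ceil(total_pages / pages_per_leaf)   # leaves (~2 pages each)
    l1 = math.ceil(l0 / fan_in)                    # clusters (~10 pages each)
    l2 = math.ceil(l1 / fan_in)                    # chapters (~50 pages each)
    return {"L0_leaves": l0, "L1_clusters": l1, "L2_chapters": l2, "L3_volume": 1}

print(pyramid_shape(250))
# {'L0_leaves': 125, 'L1_clusters': 25, 'L2_chapters': 5, 'L3_volume': 1}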
@@ -135,10 +136,15 @@ async def run_chunking_process(pdf_path, queue=None, whole=WHOLE, start_p=START_
 
     cursor = 0
     l0_buffer = []  # Holds Leaves for L1 (Clusters/Branches)
-    l1_buffer = []  # Holds L1 Summaries for L2 (Chapters)
-    l2_buffer = []  # Holds L2 Summaries for L3 (Volumes)
+    #l1_buffer = []  # Holds L1 Summaries for L2 (Chapters)
+    #l2_buffer = []  # Holds L2 Summaries for L3 (Volumes)
+
+    all_leaves = []         # Final collection
+    all_l1_summaries = []
+    all_l2_summaries = []
+    l3_node = None          # The final crown
 
-
+    l_buffer_size = 5  # CHUNK_GROUP_SIZE
 
     #all_leaves = []
     #summary_blocks = []
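Every node the new accumulators collect, from leaf to volume, shares one dict shape; only the "type" field distinguishes the levels. A minimal schema sketch (the TypedDict and its name are mine, for illustration; the script itself just builds plain dicts):

# Sketch of the node schema used throughout the script.
from typing import TypedDict

class NodeDict(TypedDict):
    type: str     # "leaf", "summary_l1", "summary_l2", or "summary_l3"
    name: str     # res['filename'] for leaves, res['summary_name'] for summaries
    content: str  # res['rewritten_text'] for leaves, res['synthesis'] for summaries

leaf: NodeDict = {"type": "leaf", "name": "ch01_intro", "content": "..."}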
@@ -174,28 +180,38 @@ async def run_chunking_process(pdf_path, queue=None, whole=WHOLE, start_p=START_
             res = await call_groq_json(prompt, lookahead)
 
             leaf = {"type": "leaf", "name": res['filename'], "content": res['rewritten_text']}
+
+            all_leaves.append(leaf)
             l0_buffer.append(leaf)  # stack-up leaves
 
             # PUSH TO UI
             if queue: await queue.put(leaf)
 
             # --- PHASE II: AGGREGATE LEAVES; TRIGGER L1 (Every 5 Leaves) ---
-            if len(l0_buffer) >= 5:
+            if len(l0_buffer) >= l_buffer_size:
                 print("⭐ Creating L1 Cluster...")
                 l1_res = await generate_summary_block(l0_buffer, "Level-1 Cluster")
                 l1_node = {"type": "summary_l1", "name": l1_res['summary_name'], "content": l1_res['synthesis']}
 
-                l1_buffer.append(l1_node)  # stack-up clusters/branches
+                all_l1_summaries.append(l1_node)
+                #l1_buffer.append(l1_node)  # stack-up clusters/branches
+
                 if queue: await queue.put(l1_node)
+
                 l0_buffer = []  # Reset L0
 
                 # --- PHASE III: TRIGGER L2 (Every 5 L1 Clusters) ---
-                if len(l1_buffer) >= 5:
+                #if len(l1_buffer) >= l_buffer_size:
+                if len(all_l1_summaries) >= l_buffer_size and len(all_l1_summaries) % 5 == 0:
                     print("💎 Creating L2 Chapter...")
-                    l2_res = await generate_summary_block(l1_buffer, "Level-2 Chapter")
+                    # We take the last 5 L1s
+
+                    l2_res = await generate_summary_block(all_l1_summaries[-5:], "Level-2 Chapter")
+
                     l2_node = {"type": "summary_l2", "name": l2_res['summary_name'], "content": l2_res['synthesis']}
 
-                    l2_buffer.append(l2_node)  # stack-up chapters
+                    all_l2_summaries.append(l2_node)
+                    #l2_buffer.append(l2_node)  # stack-up chapters
                     if queue: await queue.put(l2_node)
                     l1_buffer = []  # Reset L1
 
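The two triggers above can be dry-run without any LLM or PDF in the loop. A minimal simulation with stub values in place of real summaries, showing the cadence (one L1 per 5 leaves, one L2 per 5 L1s, with the modulo guard firing only on every 5th cluster):

# Dry run of the aggregation cadence with stub nodes - no LLM calls.
l0_buffer, all_l1_summaries, all_l2_summaries = [], [], []
l_buffer_size = 5

for leaf_no in range(1, 51):              # pretend 50 leaves arrive in order
    l0_buffer.append(f"leaf{leaf_no}")
    if len(l0_buffer) >= l_buffer_size:   # PHASE II: every 5 leaves -> one L1
        all_l1_summaries.append(f"L1#{len(all_l1_summaries) + 1}")
        l0_buffer = []
        # PHASE III: fires only on every 5th L1 cluster
        if len(all_l1_summaries) >= l_buffer_size and len(all_l1_summaries) % 5 == 0:
            all_l2_summaries.append(f"L2#{len(all_l2_summaries) + 1}")

print(len(all_l1_summaries), len(all_l2_summaries))  # -> 10 2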
@@ -218,24 +234,52 @@ async def run_chunking_process(pdf_path, queue=None, whole=WHOLE, start_p=START_
             await asyncio.sleep(10)  # Longer pause on error
             continue
 
-    # --- FINAL
-
-
-    l3_res = await generate_summary_block(l2_buffer, "Level-3 Volume")
+    # --- FINAL FLUSH (The "Cleanup" Phase) ---
+    # If the book ends and we have leftover leaves (1-4), summarize them now!
+    if l0_buffer:
+        l1_res = await generate_summary_block(l0_buffer, "Final Level-1 Cluster")
+        l1_node = {"type": "summary_l1", "name": l1_res['summary_name'], "content": l1_res['synthesis']}
+        all_l1_summaries.append(l1_node)
+        if queue: await queue.put(l1_node)
+
+    # Summarize all L1s into L2 if we haven't already
+    if all_l1_summaries and not all_l2_summaries:
+        l2_res = await generate_summary_block(all_l1_summaries, "Level-2 Chapter")
+        l2_node = {"type": "summary_l2", "name": l2_res['summary_name'], "content": l2_res['synthesis']}
+        all_l2_summaries.append(l2_node)
+        if queue: await queue.put(l2_node)
+
+    # FINAL VOLUME SUMMARY (L3)
+    if all_l2_summaries:
+        l3_res = await generate_summary_block(all_l2_summaries, "Level-3 Volume")
     l3_node = {"type": "summary_l3", "name": l3_res['summary_name'], "content": l3_res['synthesis']}
     if queue: await queue.put(l3_node)
 
-    if queue: await queue.put("DONE")
+    #if queue: await queue.put("DONE")
 
 
-    #
+    # --- THE SAFE SAVE ---
     timestamp = datetime.datetime.now().strftime("%m%d%Y_%H%M")
-    final_data = {
-
+    #final_data = {
+    #    "metadata": {"pages": f"{start_p}-{end_p}", "date": timestamp},
+    #    "leaves": all_leaves,
+    #    "l1_clusters": all_l1_summaries,
+    #    "l2_chapters": all_l2_summaries,
+    #    "l3_volume": l3_node
+    #}
+
+    final_data = {"date": timestamp,
+                  "leaves": all_leaves,
+                  "l1_clusters": all_l1_summaries,
+                  "l2_chapters": all_l2_summaries,
+                  "l3_volume": l3_node}
+
+    output_file = f"knowledge_tree_{timestamp}.json"
+    with open(output_file, "w") as f:
         json.dump(final_data, f, indent=4)
 
-    if queue:
-        await queue.put("DONE")
+    if queue: await queue.put("DONE")
+
 
 # Helper for summary
 async def generate_summary_block(chunks):
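One loose end remains in the context lines: the helper is still declared as async def generate_summary_block(chunks): while every call site now passes a second label argument ("Level-1 Cluster", "Level-2 Chapter", "Level-3 Volume"). A minimal sketch of a signature that matches the call sites and returns the two keys the callers read, summary_name and synthesis (the prompt wording is illustrative; call_groq_json is the script's own helper, assumed to return parsed JSON):

# Sketch of a helper compatible with the call sites above; the label tells
# the model which level of the pyramid it is writing.
async def generate_summary_block(chunks, label):
    joined = "\n\n".join(c["content"] for c in chunks)
    prompt = (
        f"Synthesize the following {len(chunks)} sections into a single {label}. "
        "Respond with JSON containing the keys 'summary_name' and 'synthesis'."
    )
    # call_groq_json is the script's existing helper, reused here as-is.
    return await call_groq_json(prompt, joined)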