prashantmatlani commited on
Commit
57761e2
·
1 Parent(s): d6bfcc5

updated chunker_2

Browse files
main.py CHANGED
@@ -17,7 +17,7 @@ import glob
17
 
18
  # Import chunking logic from the existing combined script
19
  # Note: Ensure script functions are wrap-able or callable
20
- from phase0102_chunker_aggregator_2 import run_chunking_process
21
 
22
  app = FastAPI()
23
 
 
17
 
18
  # Import chunking logic from the existing combined script
19
  # Note: Ensure script functions are wrap-able or callable
20
+ from chunker_2 import run_chunking_process
21
 
22
  app = FastAPI()
23
 
phase0102_chunker_aggregator_2.py DELETED
@@ -1,420 +0,0 @@
1
-
2
- # ./phase0102_chunker_aggregator_2.py
3
-
4
- """
5
-
6
- https://www.linkedin.com/pulse/new-way-encode-documents-ai-agents-navigable-trees-sergii-makarevych-a6cof/
7
-
8
- https://gist.github.com/karpathy/442a6bf555914893e9891c11519de94f
9
-
10
-
11
- ----- The Logic of the Knowledge-Pyramid: -----
12
-
13
- L0 (Leaves): 1-2 pages of raw text rewritten
14
- L1 (Clusters/Branches): Summary of 5 Leaves (~10 pages)
15
- L2 (Chapters): Summary of 5 L1 Clusters/Branches (~50 pages)
16
- L3 (Volume): Summary of all L2 Nodes (The entire book)
17
-
18
- The combined script - with two phases, I and II, fired sequentially - aligns with a/ the "Dense Theory" of knowledge extraction and b/ Makarevych's "Incremental Aggregation" logic, in which the availability of a set of chunks triggers the system to generate a summary. The "Dense Theory" of knowledge extraction is the idea that the LLM should not only extract chunks but also immediately synthesize them into higher-level summaries, creating a "Knowledge Tree" with multiple levels of abstraction.
19
-
20
- . The temp_group: Acts as a "waiting room." Once it hits 5 chunks, it empties itself into the Phase II Aggregator.
21
- . Memory Continuity: When the summary_node is created, it's saved to context_buffer["latest_summary"]. This means chunk #6 will actually "know" the summary of chunks #1–5, helping it stay consistent with the themes already established.
22
- . The "Children" Key: In the final JSON, each summary block now lists which leaf chunks belong to it. This is what makes it a Navigable Tree.
23
-
24
-
25
- > Phase I - Extract and rewrite chunks (The "Leaves")
26
-
27
- The Semantic Split: Instead of splitting at exactly 1000 characters, we give the LLM a 6000-character window and ask it to find the natural "Topic End" (break_text).
28
-
29
- Self-Sufficiency: The prompt tells the LLM to resolve pronouns; in a text where "it" could refer to a concept mentioned three paragraphs ago, this is vital.
30
-
31
- The Cursor: cursor += relative_break_point ensures we never lose our place in a document spanning thousands of words and hundreds of pages.
32
-
33
-
34
- > Phase II - Incremental Aggregation into Summaries (The "Branches")
35
-
36
- Summary Block: With about five chunks, system builds a Summary Block
37
-
38
- Continuity: This Summary Block is then fed back into the context_buffer so the next set of Phase I chunks knows what the previous summary was.
39
-
40
- "Knowledge Tree" is thus created of summaries as branches connecting chunks as leaves
41
-
42
- """
43
-
44
- import os
45
- import json
46
- import datetime
47
- import asyncio
48
- import tiktoken
49
- import pymupdf4llm
50
- from groq import Groq
51
-
52
- from dotenv import load_dotenv
53
- from pathlib import Path
54
-
55
- import time
56
- import datetime
57
- import sys
58
-
59
-
60
- load_dotenv()
61
- client = Groq(api_key=os.getenv("GROQ_API_KEY"))
62
- MODEL = "llama-3.1-8b-instant"
63
- encoding = tiktoken.get_encoding("cl100k_base")
64
-
65
- # 2. Define the folder and the filename
66
- #pdf_folder = Path("C:\\Users\\wd052\\OneDrive\\Desktop\\00\\01\\PDFs\\J\\CW")
67
- #pdf_path = r"C:\Users\wd052\OneDrive\Desktop\00\01\PDFs\J\CW\Collected Works of Dr. C.G. Jung - Vol. 6 - Psychological-Types.pdf"
68
- #pdf_folder = Path("C:/Users/wd052/OneDrive/Desktop/00/01/PDFs/J/CW")
69
- #pdf_name = "Collected Works of Dr. C.G. Jung - Vol. 6 - Psychological-Types.pdf"
70
-
71
- # Combine them
72
- #pdf_path = pdf_folder / pdf_name
73
-
74
- #WHOLE = False # Set to True to process the whole book; False to process a page range
75
- #START_PAGE = 8
76
- #END_PAGE = 10
77
-
78
- laf = 2000 # look-ahead factor
79
- djf = 0.1 # dynamic jump factor
80
-
81
async def call_groq_json(system_prompt, user_content):
    """
    Send one chat-completion request to Groq and return the parsed JSON reply.

    The Groq client call is synchronous, so it is dispatched to the default
    executor; the event loop (and therefore the UI) stays responsive while
    the request is in flight.

    Args:
        system_prompt: Instructions for the model. A strict "valid JSON only"
            reminder is appended before the request is sent.
        user_content: Text sent as the user message.

    Returns:
        The object decoded from the first choice's message content.

    Raises:
        ValueError: If the reply is empty or is not valid JSON
            (json.JSONDecodeError is a ValueError subclass).
    """
    strict_system_prompt = system_prompt + "\nIMPORTANT: Ensure all internal quotes are escaped. Respond ONLY in valid JSON."

    def _request():
        # Low temperature keeps the model from getting "creative" with the
        # formatting, which makes a well-formed JSON object more likely.
        return client.chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "system", "content": strict_system_prompt},
                {"role": "user", "content": user_content},
            ],
            response_format={"type": "json_object"},
            temperature=0.2,
        )

    # We are inside a coroutine, so a loop is guaranteed to be running.
    loop = asyncio.get_running_loop()
    completion = await loop.run_in_executor(None, _request)

    # Answers come back as a list of "choices"; a single answer is at index 0.
    raw = completion.choices[0].message.content
    if not raw:
        # json.loads(None) would raise an opaque TypeError; fail with a clear message.
        raise ValueError("Groq returned an empty completion; expected a JSON object.")
    return json.loads(raw)
103
-
104
- # - 1 to START PAGE; Python's range(5, 7) gives pages 6 and 7, to get to the exact specified range we do START_PAGE-1
105
- # Alignment: Convert Human (1-indexed) to Library (0-indexed)
106
- # Human page 5 is internal page 4
107
- #async def run_chunking_process(pdf_path, queue=None, whole=WHOLE, start_p=START_PAGE-1, end_p=END_PAGE):
108
async def run_chunking_process(pdf_path, queue=None, whole=False, start_p=1, end_p=1):
    """
    Chunk a PDF into a knowledge tree and stream every node to the UI.

    Phase I asks the LLM to carve a self-sufficient leaf (L0) out of a
    look-ahead window of `laf` characters. Every 5 leaves are summarised into
    an L1 cluster, every 5 clusters into an L2 chapter, and at the end all
    chapters into one L3 volume node. The tree is written to
    `knowledge_tree_<timestamp>.json` and passed to `export_visual_formats`.

    Args:
        pdf_path: Path of the PDF to process.
        queue: Optional queue with an awaitable `put`; each node is put on it
            as soon as it exists, and the sentinel "DONE" is always put last.
        whole: Process every page when True; otherwise use the page range.
        start_p: First page to read, 1-indexed (human numbering).
        end_p: Last page to read, 1-indexed and inclusive.

    Returns:
        The dictionary that was saved to the JSON file.
    """
    from bisect import bisect_right  # local import: only needed for the page lookup

    async def emit(item):
        """Forward an item to the UI queue when one was supplied."""
        if queue is not None:
            await queue.put(item)

    async def try_summarize(nodes, label, node_type):
        """Summarise `nodes` into one parent node; return None if the LLM call fails."""
        try:
            res = await generate_summary_block(nodes, label)
        except Exception as e:
            print(f"❌ ERROR building {label}: {e}")
            return None
        return {
            "type": node_type,
            "name": res.get("summary_name", "untitled"),
            "content": res.get("synthesis", ""),
        }

    # 1. Determine page range
    if whole:
        # PyMuPDF4LLM uses None to process all pages
        pages_to_read = None
        print("📚 Processing the WHOLE book...")
    else:
        # Convert human (1-indexed) pages to library (0-indexed) pages.
        first_index = max(int(start_p) - 1, 0)
        pages_to_read = list(range(first_index, int(end_p)))
        print(f"📑 Processing pages {start_p} to {end_p}...")

    # 2. Extract markdown ONCE, page by page, remembering where each page
    #    starts so that every leaf can be stamped with its real page number.
    # NOTE(review): assumes the concatenated per-page texts are equivalent to
    # the single-string output of to_markdown() - confirm for the installed version.
    pages_data = pymupdf4llm.to_markdown(str(pdf_path), pages=pages_to_read, page_chunks=True)
    print(f"📖 Page-Aware Engine Started. Total Pages to process: {len(pages_data)}")

    page_starts, page_numbers, page_texts = [], [], []
    offset = 0
    for page in pages_data:
        page_starts.append(offset)
        page_numbers.append(page["metadata"].get("page_number", "??"))
        page_texts.append(page["text"])
        offset += len(page["text"])
    md_text = "".join(page_texts)
    total_len = len(md_text)

    def page_at(position):
        """Return the page number of the page containing character `position`."""
        if not page_numbers:
            return "??"
        return page_numbers[max(bisect_right(page_starts, position) - 1, 0)]

    # Characters skipped after a failed chunk: 10% of the text, clamped to 500..2000.
    dynamic_jump = min(2000, max(500, int(total_len * djf)))

    print(f"filepath -> {pdf_path}")
    print(f"\n# of characters -> {total_len}; dynamic jump at -> {dynamic_jump}")

    cursor = 0
    l0_buffer = []  # leaves waiting to become an L1 cluster
    l1_buffer = []  # clusters waiting to become an L2 chapter
    all_leaves = []
    all_l1_summaries = []
    all_l2_summaries = []
    l3_node = None  # the final crown
    l_buffer_size = 5  # nodes per parent

    prompt = "Extract self-sufficient Jungian chunk. JSON: 'break_text', 'rewritten_text', 'filename'."

    try:
        while cursor < total_len:
            lookahead = md_text[cursor:cursor + laf]

            # A blank window is skipped, not treated as end-of-document:
            # real text may still follow a long run of whitespace.
            if not lookahead.strip():
                cursor += len(lookahead)
                continue

            page_num = page_at(cursor)
            start_snippet = lookahead[:80].replace('\n', ' ')
            print(f"🔍 DEBUG: Cursor at {cursor} (page {page_num}). Current text starts with: '{start_snippet}'")

            # --- PHASE I: CREATE L0 LEAF ---
            try:
                res = await call_groq_json(prompt, lookahead)
                if not isinstance(res, dict):
                    raise ValueError("LLM reply is not a JSON object")
            except Exception as e:
                if "429" in str(e):
                    print("  ⚠️ Rate limited! Cooling down for 30 seconds...")
                    await asyncio.sleep(30)  # non-blocking, unlike time.sleep
                print(f"❌ ERROR AT CURSOR {cursor}: {e}")
                cursor += dynamic_jump  # use our automated jump
                await asyncio.sleep(10)  # longer pause on error
                continue

            # Semantic jump: move the cursor just past the break text. An empty,
            # missing or unmatched break text falls back to the whole window,
            # so the cursor ALWAYS advances (an empty string is "in" every
            # string and would otherwise advance by 0 forever).
            break_text = res.get('break_text', "")
            if not isinstance(break_text, str):
                break_text = ""
            break_at = lookahead.find(break_text) if break_text else -1
            consumed = break_at + len(break_text) if break_at != -1 else len(lookahead)

            leaf = {
                "type": "leaf",
                "page": page_num,
                "page_num": page_num,  # key read by export_visual_formats
                "name": res.get('filename', 'untitled'),
                "content": res.get('rewritten_text', ''),
                "original": lookahead[:consumed],  # only the text analysed for this leaf
            }
            all_leaves.append(leaf)
            l0_buffer.append(leaf)
            await emit(leaf)
            cursor += consumed

            # --- PHASE II: TRIGGER L1 (every 5 leaves) ---
            if len(l0_buffer) >= l_buffer_size:
                print("⭐ Creating L1 Cluster...")
                l1_node = await try_summarize(l0_buffer, "Level-1 Cluster", "summary_l1")
                # On failure the leaves stay buffered and are retried after the next leaf.
                if l1_node is not None:
                    all_l1_summaries.append(l1_node)
                    l1_buffer.append(l1_node)
                    await emit(l1_node)
                    l0_buffer = []

                    # --- PHASE III: TRIGGER L2 (every 5 L1 clusters) ---
                    # Checked only right after a new L1 was added, so the same
                    # five clusters can never be summarised twice.
                    if len(l1_buffer) >= l_buffer_size:
                        print("💎 Creating L2 Chapter...")
                        l2_node = await try_summarize(l1_buffer, "Level-2 Chapter", "summary_l2")
                        if l2_node is not None:
                            all_l2_summaries.append(l2_node)
                            await emit(l2_node)
                            l1_buffer = []

            # Throttling to stay under 6000 TPM limit
            await asyncio.sleep(7)

        # --- FINAL FLUSH (the "cleanup" phase) ---
        # Leftover leaves (1-4) still get their own cluster.
        if l0_buffer:
            l1_node = await try_summarize(l0_buffer, "Final Level-1 Cluster", "summary_l1")
            if l1_node is not None:
                all_l1_summaries.append(l1_node)
                l1_buffer.append(l1_node)
                await emit(l1_node)

        # Clusters not yet covered by a chapter get one now.
        if l1_buffer:
            l2_node = await try_summarize(l1_buffer, "Level-2 Chapter", "summary_l2")
            if l2_node is not None:
                all_l2_summaries.append(l2_node)
                await emit(l2_node)

        # FINAL VOLUME SUMMARY (L3)
        if all_l2_summaries:
            l3_node = await try_summarize(all_l2_summaries, "Level-3 Volume", "summary_l3")
            if l3_node is not None:
                await emit(l3_node)

        # --- THE SAFE SAVE ---
        timestamp = datetime.datetime.now().strftime("%m%d%Y_%H%M")
        final_data = {
            "metadata": {"pages": "all" if whole else f"{start_p}-{end_p}", "date": timestamp},
            "date": timestamp,
            "leaves": all_leaves,
            "l1_clusters": all_l1_summaries,
            "l2_chapters": all_l2_summaries,
            "l3_volume": l3_node,
        }
        output_file = f"knowledge_tree_{timestamp}.json"
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(final_data, f, indent=4)

        # Create the nested and tabular Markdown views
        export_visual_formats(final_data, timestamp)
    finally:
        # The UI waits for this sentinel, so send it even when something failed.
        await emit("DONE")

    return final_data
328
-
329
- """
330
- # Helper for summary
331
- async def generate_summary_block(chunks):
332
- combined = "\n\n".join([f"{c['filename']}: {c['content']}" for c in chunks])
333
- prompt = "Synthesize these Jungian chunks into a single high-density Level-1 summary. JSON keys: 'summary_name', 'synthesis'."
334
-
335
- return await call_groq_json(prompt, combined)
336
- """
337
-
338
- # Add 'label' as a second parameter with a default value
339
async def generate_summary_block(chunks_to_summarize, label="Level-1 Cluster"):
    """
    Ask the LLM to synthesise a list of tree nodes into one dense summary.

    Args:
        chunks_to_summarize: Node dictionaries with a 'content' entry and a
            'name' entry ('filename' is accepted as a fallback for the name).
        label: Name of the level being built (for example "Level-2 Chapter");
            it is inserted into the prompt so the model knows the scale.

    Returns:
        The parsed JSON reply of `call_groq_json`; the prompt requests the
        keys 'summary_name' and 'synthesis'.

    Raises:
        ValueError: If `chunks_to_summarize` is empty - there is nothing to
            summarise, and calling the LLM anyway would only invent content.
    """
    if not chunks_to_summarize:
        raise ValueError(f"Cannot build a '{label}' summary from an empty list of nodes.")

    sections = []
    for chunk in chunks_to_summarize:
        # Tolerate leaves keyed by 'filename' instead of 'name'.
        source_name = chunk.get('name', chunk.get('filename', 'untitled'))
        sections.append(f"Source: {source_name}\n{chunk.get('content', '')}")
    combined_content = "\n\n".join(sections)

    # We use the 'label' in the prompt to help the LLM understand the scale
    system_prompt = f"""
You are creating a '{label}' for a Knowledge Tree of Carl Jung's work.

TASK:
Synthesize the provided content into a single, high-density summary.
- DO NOT say 'This section covers...'.
- DO say 'Psychological concepts in this section include...'
- Maintain the information density of the original inputs.

RESPONSE FORMAT (JSON):
{{
"summary_name": "thematic_cluster_name",
"synthesis": "the dense summary text"
}}
"""
    return await call_groq_json(system_prompt, combined_content)
359
-
360
- """
361
- Nested Markdown
362
-
363
- Contextual Integrity - Acts as a "Read Me" for the Jungian Agent. It can follow the # headers to understand the hierarchy.
364
- Auditability: By including the SOURCE TEXT vs AI INTERPRETATION, it becomes possible to verify whether the LLM is "hallucinating" terms like individuation or if it's a valid AI interpretation in the Jungian sense, owing to the alchemical symbols.
365
-
366
- Table Markdown
367
-
368
- Visual Clarity: Table Markdown is perfect for a quick bird's-eye view, such as the number of chunks under each chapter
369
- """
370
- # --- NESTED AND TABULAR MARKDOWN
371
def export_visual_formats(final_data, timestamp):
    """
    Write the knowledge tree as two Markdown files: a nested outline and a table.

    Creates `nested_knowledge_<timestamp>.md` and `table_knowledge_<timestamp>.md`
    in the current working directory.

    Args:
        final_data: Tree dictionary with the keys 'l3_volume' (node or None),
            'l2_chapters', 'l1_clusters' and 'leaves' (lists of nodes).
        timestamp: String used in the output file names.

    NOTE: the tree does not record which children belong to which parent, so
    every cluster is listed under every chapter and every leaf under every
    cluster. TODO: store child links on the nodes and group by them here.
    """
    def cell(text):
        """Make text safe for one Markdown table cell (no raw pipes or line breaks)."""
        return str(text).replace("|", "\\|").replace("\n", " ")

    def page_of(leaf):
        """Return the leaf's page number; leaves may carry it as 'page_num' or 'page'."""
        return leaf.get('page_num', leaf.get('page', '??'))

    def name_of(leaf):
        """Return the leaf's display name; 'filename' is accepted as a fallback."""
        return leaf.get('name', leaf.get('filename', 'untitled'))

    l3 = final_data['l3_volume']
    chapters = final_data['l2_chapters']
    clusters = final_data['l1_clusters']
    leaves = final_data['leaves']

    # --- NESTED MARKDOWN ---
    # Parts are collected in a list and joined once instead of growing a string.
    nested = [
        "# 👑 VOLUME SUMMARY\n",
        f"> {l3['content'] if l3 else 'N/A'}\n\n",
    ]
    for l2 in chapters:
        nested.append(f"## 💎 CHAPTER: {l2['name']}\n> {l2['content']}\n\n")
        for l1 in clusters:
            nested.append(f"### ⭐ CLUSTER: {l1['name']}\n> {l1['content']}\n\n")
            for leaf in leaves:
                page_label = f" (Page {page_of(leaf)})"
                nested.append(f"#### 📄 [LEAF]: {name_of(leaf)}{page_label}\n")
                nested.append(f"**[AI INTERPRETATION]:** {leaf['content']}\n\n")
                # Source text next to the AI text makes the interpretation auditable.
                nested.append(f"**[ORIGINAL TEXT]:** {leaf.get('original', 'N/A')[:250]}...\n\n---\n")
    md_nested = "".join(nested)

    # --- TABULAR MARKDOWN ---
    # Six header columns to match the six cells written per row.
    table = [
        "| Volume (L3) | Chapter (L2) | Cluster/Summary (L1) | Page | Chunk (L0) | Original |\n",
        "| :--- | :--- | :--- | :--- | :--- | :--- |\n",
    ]
    l3_name = cell(l3['name']) if l3 else "Volume"

    for l2 in chapters:
        l2_name = cell(l2['name'])
        l2_summary = cell(l2['content'][:100]) + "..."

        for l1 in clusters:
            l1_name = cell(l1['name'])
            l1_summary = cell(l1['content'][:100]) + "..."

            for leaf in leaves:
                pg = page_of(leaf)
                leaf_content = f"**[P.{pg} AI]** " + cell(leaf['content'][:150]) + "..."
                orig_text = cell(leaf.get('original', 'N/A')[:100]) + "..."

                table.append(
                    f"| 👑 VOLUME: {l3_name} | 💎 CHAPTER: **{l2_name}**: {l2_summary} "
                    f"| **⭐ CLUSTER: {l1_name}**: {l1_summary} | {pg} "
                    f"| 📄 LEAF: {leaf_content} | ORIGINAL: {orig_text} |\n"
                )
    md_table = "".join(table)

    # Save files
    with open(f"nested_knowledge_{timestamp}.md", "w", encoding="utf-8") as f:
        f.write(md_nested)
    with open(f"table_knowledge_{timestamp}.md", "w", encoding="utf-8") as f:
        f.write(md_table)

    print(f"✅ Created: \n\nVisual Markdowns: \nnested_knowledge_{timestamp}.md \ntable_knowledge_{timestamp}.md \n\nand JSON: \n\nknowledge_tree_{timestamp}.json")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
phase0102_chunker_aggregator_2_l0l1.py DELETED
@@ -1,249 +0,0 @@
1
-
2
- # ./phase0102_chunker_aggregator_2_l0l1.py
3
-
4
- """
5
-
6
- https://www.linkedin.com/pulse/new-way-encode-documents-ai-agents-navigable-trees-sergii-makarevych-a6cof/
7
-
8
- https://gist.github.com/karpathy/442a6bf555914893e9891c11519de94f
9
-
10
-
11
- The combined script - with two phases, I and II, fired sequentially - aligns with a/ the "Dense Theory" of knowledge extraction and b/ Makarevych's "Incremental Aggregation" logic, in which the availability of a set of chunks triggers the system to generate a summary. The "Dense Theory" of knowledge extraction is the idea that the LLM should not only extract chunks but also immediately synthesize them into higher-level summaries, creating a "Knowledge Tree" with multiple levels of abstraction.
12
-
13
- . The temp_group: Acts as a "waiting room." Once it hits 5 chunks, it empties itself into the Phase II Aggregator.
14
- . Memory Continuity: When the summary_node is created, it's saved to context_buffer["latest_summary"]. This means chunk #6 will actually "know" the summary of chunks #1–5, helping it stay consistent with the themes already established.
15
- . The "Children" Key: In the final JSON, each summary block now lists which leaf chunks belong to it. This is what makes it a Navigable Tree.
16
-
17
-
18
- > Phase I - Extract and rewrite chunks (The "Leaves")
19
-
20
- The Semantic Split: Instead of splitting at exactly 1000 characters, we give the LLM a 6000-character window and ask it to find the natural "Topic End" (break_text).
21
-
22
- Self-Sufficiency: The prompt tells the LLM to resolve pronouns; in a text where "it" could refer to a concept mentioned three paragraphs ago, this is vital.
23
-
24
- The Cursor: cursor += relative_break_point ensures we never lose our place in a document spanning thousands of words and hundreds of pages.
25
-
26
-
27
- > Phase II - Incremental Aggregation into Summaries (The "Branches")
28
-
29
- Summary Block: With about five chunks, system builds a Summary Block
30
-
31
- Continuity: This Summary Block is then fed back into the context_buffer so the next set of Phase I chunks knows what the previous summary was.
32
-
33
- "Knowledge Tree" is thus created of summaries as branches connecting chunks as leaves
34
-
35
- """
36
-
37
- import os
38
- import json
39
- import datetime
40
- import asyncio
41
- import tiktoken
42
- import pymupdf4llm
43
- from groq import Groq
44
-
45
- from dotenv import load_dotenv
46
- from pathlib import Path
47
-
48
- import time
49
- import datetime
50
- import sys
51
-
52
-
53
- load_dotenv()
54
- client = Groq(api_key=os.getenv("GROQ_API_KEY"))
55
- MODEL = "llama-3.1-8b-instant"
56
- encoding = tiktoken.get_encoding("cl100k_base")
57
-
58
- # 2. Define the folder and the filename
59
- #pdf_folder = Path("C:\\Users\\wd052\\OneDrive\\Desktop\\00\\01\\PDFs\\J\\CW")
60
- #pdf_path = r"C:\Users\wd052\OneDrive\Desktop\00\01\PDFs\J\CW\Collected Works of Dr. C.G. Jung - Vol. 6 - Psychological-Types.pdf"
61
- #pdf_folder = Path("C:/Users/wd052/OneDrive/Desktop/00/01/PDFs/J/CW")
62
- #pdf_name = "Collected Works of Dr. C.G. Jung - Vol. 6 - Psychological-Types.pdf"
63
-
64
- # Combine them
65
- #pdf_path = pdf_folder / pdf_name
66
-
67
- WHOLE = False # Set to True to process the whole book; False to process a page range
68
- START_PAGE = 8
69
- END_PAGE = 10
70
-
71
- laf = 2000 # look-ahead factor
72
- djf = 0.1 # dynamic jump factor
73
-
74
async def call_groq_json(system_prompt, user_content):
    """
    Send one chat-completion request to Groq and return the parsed JSON reply.

    The Groq client call is synchronous, so it is dispatched to the default
    executor; the event loop (and therefore the UI) stays responsive while
    the request is in flight.

    Args:
        system_prompt: Instructions for the model. A strict "valid JSON only"
            reminder is appended before the request is sent.
        user_content: Text sent as the user message.

    Returns:
        The object decoded from the first choice's message content.

    Raises:
        ValueError: If the reply is empty or is not valid JSON
            (json.JSONDecodeError is a ValueError subclass).
    """
    strict_system_prompt = system_prompt + "\nIMPORTANT: Ensure all internal quotes are escaped. Respond ONLY in valid JSON."

    def _request():
        # Low temperature keeps the model from getting "creative" with the
        # formatting, which makes a well-formed JSON object more likely.
        return client.chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "system", "content": strict_system_prompt},
                {"role": "user", "content": user_content},
            ],
            response_format={"type": "json_object"},
            temperature=0.2,
        )

    # We are inside a coroutine, so a loop is guaranteed to be running.
    loop = asyncio.get_running_loop()
    completion = await loop.run_in_executor(None, _request)

    # Answers come back as a list of "choices"; a single answer is at index 0.
    raw = completion.choices[0].message.content
    if not raw:
        # json.loads(None) would raise an opaque TypeError; fail with a clear message.
        raise ValueError("Groq returned an empty completion; expected a JSON object.")
    return json.loads(raw)
96
-
97
- """
98
- completion = client.chat.completions.create(
99
- model=MODEL,
100
- messages=[
101
- {"role": "system", "content": strict_system_prompt},
102
- {"role": "user", "content": user_content}
103
- ],
104
- response_format={"type": "json_object"},
105
- temperature=0.2
106
- )
107
- return json.loads(completion.choices[0].message.content)
108
- """
109
-
110
- #async def run_chunking_process(pdf_path, queue=None, whole=False, start_p=20, end_p=30):
111
- # - 1 to START PAGE; Python's range(5, 7) gives pages 6 and 7, to get to the exact specified range we do START_PAGE-1
112
- # Alignment: Convert Human (1-indexed) to Library (0-indexed)
113
- # Human page 5 is internal page 4
114
async def run_chunking_process(pdf_path, queue=None, whole=WHOLE, start_p=START_PAGE-1, end_p=END_PAGE):
    """
    Chunk a PDF into leaves (L0) and Level-1 summaries, streaming both to the UI.

    The LLM is given a look-ahead window of `laf` characters and asked for a
    self-sufficient chunk plus the text at which the topic ends. Every 5
    leaves are summarised into one summary node whose 'children' entry lists
    the leaf file names. The result is saved to `knowledge_tree_<timestamp>.json`.

    Args:
        pdf_path: Path of the PDF to process.
        queue: Optional queue with an awaitable `put`; each node is put on it
            as soon as it exists, and the sentinel "DONE" is put exactly once
            at the very end.
        whole: Process every page when True; otherwise use the page range.
        start_p: First page to read, 0-indexed (library numbering).
        end_p: End of the page range, exclusive in 0-indexed terms, which is
            the last page in human (1-indexed) numbering.

    Returns:
        The dictionary that was saved to the JSON file.
    """
    async def emit(item):
        """Forward an item to the UI queue when one was supplied."""
        if queue is not None:
            await queue.put(item)

    # 1. Determine page range
    if whole:
        # PyMuPDF4LLM uses None to process all pages
        pages_to_read = None
        print("📚 Processing the WHOLE book...")
    else:
        pages_to_read = list(range(start_p, end_p))
        # Report the range actually requested (start_p is 0-indexed, so add 1
        # for the human page number) rather than the module-level defaults.
        print(f"📑 Processing pages {start_p + 1} to {end_p}...")

    # 2. Extract markdown
    md_text = pymupdf4llm.to_markdown(str(pdf_path), pages=pages_to_read)
    total_len = len(md_text)

    # Characters skipped after a failed chunk: 10% of the text, clamped to 500..2000.
    dynamic_jump = min(2000, max(500, int(total_len * djf)))

    print(f"filepath -> {pdf_path}")
    print(f"\n# of characters -> {total_len}; dynamic jump at -> {dynamic_jump}")

    cursor = 0
    all_leaves = []
    summary_blocks = []
    temp_group = []  # "waiting room" of leaves for the next summary
    CHUNK_GROUP_SIZE = 5

    context_buffer = {"predecessor": "Start", "latest_summary": "None"}

    async def flush_group():
        """Summarise the waiting leaves into one summary node; keep them on failure."""
        nonlocal temp_group
        try:
            summary_res = await generate_summary_block(temp_group)
        except Exception as e:
            # The leaves stay in the waiting room and are retried later.
            print(f"❌ ERROR building L1 summary: {e}")
            return
        summary_node = {
            "type": "summary",
            "name": summary_res.get('summary_name', 'untitled_summary'),
            "content": summary_res.get('synthesis', ''),
            "children": [c['filename'] for c in temp_group],
        }
        summary_blocks.append(summary_node)
        # Memory continuity: the next chunks are prompted with this summary.
        context_buffer["latest_summary"] = summary_node["content"]
        await emit(summary_node)
        temp_group = []

    try:
        while cursor < total_len:
            lookahead = md_text[cursor:cursor + laf]

            # A blank window is skipped, not treated as end-of-document:
            # real text may still follow a long run of whitespace.
            if not lookahead.strip():
                cursor += len(lookahead)
                continue

            # ---- DEBUG: show where we are ----
            start_snippet = lookahead[:80].replace('\n', ' ')
            print(f"🔍 DEBUG: Cursor at {cursor}. Current text starts with: '{start_snippet}'")
            # Search backwards from the cursor for the last 'Page ' marker.
            current_page_search = md_text[:cursor].rfind("Page ")
            if current_page_search != -1:
                page_num = md_text[current_page_search:current_page_search + 10]
                print(f"📖 DEBUG: Currently scanning near {page_num}")

            prompt = f"Context: {context_buffer['latest_summary']} | Prev: {context_buffer['predecessor'][:200]}...\nExtract a self-sufficient Jungian chunk. JSON keys: 'break_text', 'rewritten_text', 'filename'."

            # --- PHASE I: extract one leaf ---
            try:
                result = await call_groq_json(prompt, lookahead)
                if not isinstance(result, dict):
                    raise ValueError("LLM reply is not a JSON object")
            except Exception as e:
                if "429" in str(e):
                    print("  ⚠️ Rate limited! Cooling down for 30 seconds...")
                    await asyncio.sleep(30)  # non-blocking, unlike time.sleep
                print(f"❌ ERROR AT CURSOR {cursor}: {e}")
                cursor += dynamic_jump  # use our automated jump
                await asyncio.sleep(10)  # longer pause on error
                continue

            # Semantic jump: move the cursor just past the break text. An empty,
            # missing or unmatched break text falls back to the whole window,
            # so the cursor ALWAYS advances (an empty string is "in" every
            # string and would otherwise advance by 0 forever).
            break_text = result.get('break_text', "")
            if not isinstance(break_text, str):
                break_text = ""
            break_at = lookahead.find(break_text) if break_text else -1
            relative_break = break_at + len(break_text) if break_at != -1 else len(lookahead)

            new_chunk = {
                "type": "leaf",
                "filename": result.get('filename', 'untitled_chunk'),
                "content": result.get('rewritten_text', ''),
            }
            all_leaves.append(new_chunk)
            temp_group.append(new_chunk)
            await emit(new_chunk)

            context_buffer["predecessor"] = str(new_chunk["content"])
            cursor += relative_break

            # Throttling to stay under 6000 TPM limit
            await asyncio.sleep(7)

            # --- PHASE II: aggregation - trigger L1 summary ---
            if len(temp_group) >= CHUNK_GROUP_SIZE:
                print("⭐ TRIGGER L1 AGGREGATION - PREPARE SUMMARY")
                await flush_group()

            # 5-second pause after every chunk to stay under TPM limits
            print("  ⏳ Throttling for 5s to avoid Rate Limits...")
            await asyncio.sleep(5)

        # Leftover leaves (1-4) still get a summary instead of being orphaned.
        if temp_group:
            await flush_group()

        # Final save
        timestamp = datetime.datetime.now().strftime("%m%d%Y_%H%M")
        final_data = {"leaves": all_leaves, "summaries": summary_blocks}
        with open(f"knowledge_tree_{timestamp}.json", "w", encoding="utf-8") as f:
            json.dump(final_data, f, indent=4)
    finally:
        # Sent exactly once, and only after the save has been attempted.
        await emit("DONE")

    return final_data
244
-
245
- # Helper for summary
246
async def generate_summary_block(chunks):
    """
    Condense a group of leaf chunks into one Level-1 summary via the LLM.

    Args:
        chunks: Leaf dictionaries; each must provide 'filename' and 'content'.

    Returns:
        The parsed JSON reply of `call_groq_json`; the prompt requests the
        keys 'summary_name' and 'synthesis'.
    """
    labelled_chunks = []
    for chunk in chunks:
        # One "<filename>: <content>" entry per leaf.
        labelled_chunks.append(f"{chunk['filename']}: {chunk['content']}")
    user_text = "\n\n".join(labelled_chunks)

    instruction = "Synthesize these Jungian chunks into a single high-density Level-1 summary. JSON keys: 'summary_name', 'synthesis'."
    summary = await call_groq_json(instruction, user_text)
    return summary
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
phase0102_chunker_aggregator_2_mod.py DELETED
@@ -1,243 +0,0 @@
1
- # ./phase0102_chunker_aggregator_2_mod.py
2
-
3
- """
4
-
5
- https://www.linkedin.com/pulse/new-way-encode-documents-ai-agents-navigable-trees-sergii-makarevych-a6cof/
6
-
7
- https://gist.github.com/karpathy/442a6bf555914893e9891c11519de94f
8
-
9
-
10
- The combined script - with two phases, I and II, fired sequentially - aligns with a/ the "Dense Theory" of knowledge extraction and b/ Makarevych's "Incremental Aggregation" logic, in which the availability of a set of chunks triggers the system to generate a summary. The "Dense Theory" of knowledge extraction is the idea that the LLM should not only extract chunks but also immediately synthesize them into higher-level summaries, creating a "Knowledge Tree" with multiple levels of abstraction.
11
-
12
- . The temp_group: Acts as a "waiting room." Once it hits 5 chunks, it empties itself into the Phase II Aggregator.
13
- . Memory Continuity: When the summary_node is created, it's saved to context_buffer["latest_summary"]. This means chunk #6 will actually "know" the summary of chunks #1–5, helping it stay consistent with the themes already established.
14
- . The "Children" Key: In the final JSON, each summary block now lists which leaf chunks belong to it. This is what makes it a Navigable Tree.
15
-
16
-
17
- > Phase I - Extract and rewrite chunks (The "Leaves")
18
-
19
- The Semantic Split: Instead of splitting at exactly 1000 characters, we give the LLM a 6000-character window and ask it to find the natural "Topic End" (break_text).
20
-
21
- Self-Sufficiency: The prompt tells the LLM to resolve pronouns; in a text where "it" could refer to a concept mentioned three paragraphs ago, this is vital.
22
-
23
- The Cursor: cursor += relative_break ensures we never lose our place in a document spanning thousands of words and hundreds of pages.
24
-
25
-
26
- > Phase II - Incremental Aggregation into Summaries (The "Branches")
27
-
28
- Summary Block: Once five chunks have accumulated (CHUNK_GROUP_SIZE), the system builds a Summary Block.
29
-
30
- Continuity: This Summary Block is then fed back into the context_buffer so the next set of Phase I chunks knows what the previous summary was.
31
-
32
- A "Knowledge Tree" is thus created, with summaries as branches connecting chunks as leaves.
33
-
34
- """
35
-
36
- import os
37
- import json
38
- import datetime
39
- import asyncio
40
- import tiktoken
41
- import pymupdf4llm
42
- from groq import Groq
43
-
44
- from dotenv import load_dotenv
45
- from pathlib import Path
46
-
47
- import time
48
- import datetime
49
- import sys
50
-
51
-
52
- load_dotenv()
53
- client = Groq(api_key=os.getenv("GROQ_API_KEY"))
54
- MODEL = "llama-3.1-8b-instant"
55
- encoding = tiktoken.get_encoding("cl100k_base")
56
-
57
- # 2. Define the folder and the filename
58
- #pdf_folder = Path("C:\\Users\\wd052\\OneDrive\\Desktop\\00\\01\\PDFs\\J\\CW")
59
- #pdf_path = r"C:\Users\wd052\OneDrive\Desktop\00\01\PDFs\J\CW\Collected Works of Dr. C.G. Jung - Vol. 6 - Psychological-Types.pdf"
60
- #pdf_folder = Path("C:/Users/wd052/OneDrive/Desktop/00/01/PDFs/J/CW")
61
- #pdf_name = "Collected Works of Dr. C.G. Jung - Vol. 6 - Psychological-Types.pdf"
62
-
63
- # Combine them
64
- #pdf_path = pdf_folder / pdf_name
65
-
66
- WHOLE = False # Set to True to process the whole book; False to process a page range
67
- START_PAGE = 8
68
- END_PAGE = 10
69
-
70
- async def call_groq_json(system_prompt, user_content):
71
- strict_system_prompt = system_prompt + "\nIMPORTANT: Ensure all internal quotes are escaped. Respond ONLY in valid JSON."
72
-
73
- # Use loop.run_in_executor to keep the Groq call from blocking the UI
74
- loop = asyncio.get_event_loop()
75
- completion = await loop.run_in_executor(
76
- None,
77
- lambda: client.chat.completions.create(
78
- model=MODEL,
79
- messages=[
80
- {"role": "system", "content": strict_system_prompt},
81
- {"role": "user", "content": user_content}
82
- ],
83
- response_format={"type": "json_object"},
84
- temperature=0.2
85
- )
86
- )
87
- return json.loads(completion.choices[0].message.content)
88
-
89
- """
90
- completion = client.chat.completions.create(
91
- model=MODEL,
92
- messages=[
93
- {"role": "system", "content": strict_system_prompt},
94
- {"role": "user", "content": user_content}
95
- ],
96
- response_format={"type": "json_object"},
97
- temperature=0.2
98
- )
99
- return json.loads(completion.choices[0].message.content)
100
- """
101
-
102
- #async def run_chunking_process(pdf_path, queue=None, whole=False, start_p=20, end_p=30):
103
- # START_PAGE - 1: range(5, 7) yields the 0-indexed pages 5 and 6, i.e. human pages 6 and 7; to begin at exactly the specified page we pass START_PAGE - 1
104
- # Alignment: Convert Human (1-indexed) to Library (0-indexed)
105
- # Human page 5 is internal page 4
106
- async def run_chunking_process(pdf_path, queue=None, whole=WHOLE, start_p=START_PAGE-1, end_p=END_PAGE):
107
- """
108
- Main entry point for the chunking logic.
109
- If queue is provided, it 'yields' results to the UI.
110
- """
111
- #print(f"\nwhole: {whole}, start_p: {start_p}, end_p: {end_p}")
112
-
113
- # 1. Determine Page Range
114
- if whole:
115
- # PyMuPDF4LLM uses None to process all pages
116
- pages_to_read = None
117
- print("📚 Processing the WHOLE book...")
118
- else:
119
- pages_to_read = list(range(start_p, end_p))
120
- print(f"📑 Processing pages {START_PAGE} to {END_PAGE}...") # for print purposes subtract and add back 1 from start and end pages, aligning with those specified in the code
121
-
122
- # 2. Extract Markdown
123
- md_text = pymupdf4llm.to_markdown(str(pdf_path), pages=pages_to_read)
124
-
125
- # --- Initialize the number of characters permitted to be skipped, depending on the total number of characters in the document ---
126
- total_len = len(md_text)
127
-
128
- # DYNAMIC JUMP: 10% of the text length, clamped to between 500 and 2000 chars
129
- dynamic_jump = min(2000, max(500, int(total_len * 0.1)))
130
- # --- Initialize the number of characters permitted to be skipped, depending on the total number of characters in the document - End ---
131
-
132
- print(f"filepath -> {pdf_path}")
133
- print(f"\n# of words -> {total_len}; dynamic jump at -> {dynamic_jump}")
134
-
135
- cursor = 0
136
- all_leaves = []
137
- summary_blocks = []
138
- temp_group = []
139
- CHUNK_GROUP_SIZE = 5
140
-
141
- context_buffer = {"predecessor": "Start", "latest_summary": "None"}
142
-
143
- while cursor < len(md_text):
144
- lookahead = md_text[cursor : cursor + 6000]
145
-
146
- # ---- DEBUG: Print first 80 characters to see the starting sentence ----
147
- start_snippet = lookahead[:80].replace('\n', ' ')
148
- print(f"🔍 DEBUG: Cursor at {cursor}. Current text starts with: '{start_snippet}'")
149
-
150
- # Since pymupdf4llm inserts page markers like '----- Page 5 -----', we search backwards from the cursor to find the last page tag/number
151
- current_page_search = md_text[:cursor].rfind("Page ")
152
- if current_page_search != -1:
153
- page_num = md_text[current_page_search:current_page_search+10]
154
- print(f"📖 DEBUG: Currently scanning near {page_num}")
155
- # ---- DEBUG: Print first 80 characters to see the starting sentence - End ----
156
-
157
- if not lookahead.strip(): break
158
-
159
- prompt = f"Context: {context_buffer['latest_summary']} | Prev: {context_buffer['predecessor'][:200]}...\nExtract a self-sufficient Jungian chunk. JSON keys: 'break_text', 'rewritten_text', 'filename'."
160
-
161
- try:
162
- #prompt = "Extract self-sufficient Jungian chunk. JSON keys: 'break_text', 'rewritten_text', 'filename'."
163
-
164
- # Note: Ensure call_groq_json is an async function or run in executor
165
- result = await call_groq_json(prompt, lookahead)
166
-
167
- new_chunk = {
168
- "type": "leaf",
169
- "filename": result.get('filename', 'untitled'),
170
- "content": result.get('rewritten_text', '')
171
- }
172
-
173
- context_buffer["predecessor"] = new_chunk["content"]
174
-
175
- all_leaves.append(new_chunk)
176
-
177
- # PUSH TO UI
178
- if queue:
179
- await queue.put(new_chunk)
180
-
181
- # Semantic Jump Logic; find the break text and move cursor
182
- break_text = result.get('break_text', "")
183
- relative_break = lookahead.find(break_text) + len(break_text) if break_text in lookahead else 2000
184
-
185
- cursor += relative_break
186
-
187
- temp_group.append(new_chunk)
188
- # Throttling to stay under 6000 TPM limit
189
- await asyncio.sleep(7)
190
-
191
-
192
- # PHASE II: AGGREGATION - TRIGGER L1 SUMMARY
193
- if len(temp_group) >= CHUNK_GROUP_SIZE:
194
- print("⭐ TRIGGER L1 AGGREGATION - PREPARE SUMMARY")
195
- from phase0102_chunker_aggregator_2 import generate_summary_block # Ensure helper is available
196
- summary_res = await generate_summary_block(temp_group)
197
-
198
- summary_node = {
199
- "type": "summary",
200
- "name": summary_res['summary_name'],
201
- "content": summary_res['synthesis'],
202
- "children": [c['filename'] for c in temp_group]
203
- }
204
- summary_blocks.append(summary_node)
205
- context_buffer["latest_summary"] = summary_node["content"]
206
-
207
- if queue:
208
- await queue.put(summary_node)
209
-
210
- temp_group = []
211
-
212
- # 5-second pause after every chunk to stay under TPM limits
213
- print(" ⏳ Throttling for 5s to avoid Rate Limits...")
214
- time.sleep(5)
215
-
216
- except Exception as e:
217
- if "429" in str(e):
218
- print(" ⚠️ Rate limited! Cooling down for 30 seconds...")
219
- time.sleep(30)
220
-
221
- print(f"❌ ERROR AT CURSOR {cursor}: {e}")
222
- #cursor += 3000
223
- cursor += dynamic_jump # Use our automated jump
224
- await asyncio.sleep(10) # Longer pause on error
225
-
226
- continue
227
-
228
- if queue: await queue.put("DONE")
229
-
230
- # Final Save
231
- timestamp = datetime.datetime.now().strftime("%m%d%Y_%H%M")
232
- final_data = {"leaves": all_leaves, "summaries": summary_blocks}
233
- with open(f"knowledge_tree_{timestamp}.json", "w") as f:
234
- json.dump(final_data, f, indent=4)
235
-
236
- if queue:
237
- await queue.put("DONE")
238
-
239
- # Helper for summary
240
- async def generate_summary_block(chunks):
241
- combined = "\n\n".join([f"{c['filename']}: {c['content']}" for c in chunks])
242
- prompt = "Synthesize these Jungian chunks into a single high-density Level-1 summary. JSON keys: 'summary_name', 'synthesis'."
243
- return await call_groq_json(prompt, combined)