prashantmatlani committed
Commit 21b1abb · 1 Parent(s): b95739e

modified phase0102

Files changed (2)
  1. main.py +1 -1
  2. phase0102_chunker_aggregator_2.py +65 -23
main.py CHANGED
@@ -76,7 +76,7 @@ async def handle_upload(
     # Fix: Convert strings to proper types
     is_whole = whole.lower() == "true"
     s_page = int(start)
-    s_page = s_page-1 if s_page != 1 else 0
+    #s_page = s_page-1 if s_page != 1 else 0
     e_page = int(end)
 
     #Debugging the values received from the UI
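The page-shift line is retired here because the 1-indexed-to-0-indexed conversion now lives inside run_chunking_process (see the chunker diff below). A minimal standalone sketch of that convention, assuming the library's pages argument is 0-indexed as the chunker's comments state:

def human_range_to_internal(start_p: int, end_p: int) -> list:
    # Human pages 8-10 become internal indices 7, 8, 9;
    # range(start_p - 1, end_p) keeps the human end page inclusive.
    return list(range(int(start_p) - 1, int(end_p)))

assert human_range_to_internal(8, 10) == [7, 8, 9]
assert human_range_to_internal(1, 1) == [0]  # just the first page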
phase0102_chunker_aggregator_2.py CHANGED
@@ -71,9 +71,9 @@ encoding = tiktoken.get_encoding("cl100k_base")
 # Combine them
 #pdf_path = pdf_folder / pdf_name
 
-WHOLE = False # Set to True to process the whole book; False to process a page range
-START_PAGE = 8
-END_PAGE = 10
+#WHOLE = False # Set to True to process the whole book; False to process a page range
+#START_PAGE = 8
+#END_PAGE = 10
 
 laf = 2000 # look-ahead factor
 djf = 0.1 # dynamic jump factor
@@ -104,7 +104,8 @@ async def call_groq_json(system_prompt, user_content):
 # - 1 to START PAGE; Python's range(5, 7) gives pages 6 and 7, to get to the exact specified range we do START_PAGE-1
 # Alignment: Convert Human (1-indexed) to Library (0-indexed)
 # Human page 5 is internal page 4
-async def run_chunking_process(pdf_path, queue=None, whole=WHOLE, start_p=START_PAGE-1, end_p=END_PAGE):
+#async def run_chunking_process(pdf_path, queue=None, whole=WHOLE, start_p=START_PAGE-1, end_p=END_PAGE):
+async def run_chunking_process(pdf_path, queue=None, whole=False, start_p=1, end_p=1):
     """
     Main entry point for the chunking logic.
     If queue is provided, it 'yields' results to the UI.
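The new defaults make the function callable without the retired module-level globals. Hypothetical call sites (not part of this diff), assuming run_chunking_process is importable from the module under change:

from phase0102_chunker_aggregator_2 import run_chunking_process  # assumed import path

async def demo(pdf_path, queue):
    # Whole book: the page arguments are ignored
    await run_chunking_process(pdf_path, queue=queue, whole=True)
    # Human pages 8-10; the -1 shift now happens inside run_chunking_process
    await run_chunking_process(pdf_path, queue=queue, whole=False, start_p=8, end_p=10)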
@@ -117,21 +118,33 @@ async def run_chunking_process(pdf_path, queue=None, whole=WHOLE, start_p=START_
         pages_to_read = None
         print("📚 Processing the WHOLE book...")
     else:
-        pages_to_read = list(range(start_p, end_p))
-        print(f"📑 Processing pages {START_PAGE} to {END_PAGE}...") # for print purposes subtract and add back 1 from start and end pages, aligning with those specified in the code
+        # start_p-1 -> adjustment for 0-indexing
+        pages_to_read = list(range(int(start_p-1), int(end_p)))
+        #print(f"📑 Processing pages {START_PAGE} to {END_PAGE}...") # for print purposes subtract and add back 1 from start and end pages, aligning with those specified in the code
 
     # 2. Extract Markdown
     md_text = pymupdf4llm.to_markdown(str(pdf_path), pages=pages_to_read)
 
     # Returns a list of dictionaries, one for each page
     #pagesscanned = pymupdf4llm.to_markdown("your_document.pdf", page_chunks=True)
-    pagesscanned = pymupdf4llm.to_markdown(str(pdf_path), page_chunks=True)
+    allpages = pymupdf4llm.to_markdown(str(pdf_path), page_chunks=True)
 
+    pages_data = pymupdf4llm.to_markdown(str(pdf_path), pages=pages_to_read, page_chunks=True)
+
+    print(f"📖 Page-Aware Engine Started. Total Pages to process: {len(pages_data)}")
+
+    # pull page number from the chunk's metadata
+    for page in pages_data:
+        # Extract metadata from this specific page
+        current_page_text = page["text"]
+        real_page_num = page["metadata"].get("page_number", "??")
+
+    """
     # Instead of a single string of text, we have a list to pull directly the page numbers being scanned from each chunk's metadata
     for p in pagesscanned:
         real_page_num = p["metadata"]["page_number"] # This is the real-time detected page
         text_content = p["text"]
-
+    """
     # --- Initialize the number of characters permitted to be skipped, depending on the total number of words in the document ---
     total_len = len(md_text)
 
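The switch to page_chunks=True is what makes per-leaf page numbers possible: instead of one markdown string, each page arrives as a dict carrying its own metadata. A small sketch of the shape this diff relies on (keys mirrored from the committed code, not independently verified against the pymupdf4llm docs):

pages_data = [
    {"text": "...markdown for page 8...", "metadata": {"page_number": 8}},
    {"text": "...markdown for page 9...", "metadata": {"page_number": 9}},
]

for page in pages_data:
    current_page_text = page["text"]
    # .get(...) avoids a crash if the key is ever missing, as in the diff
    real_page_num = page["metadata"].get("page_number", "??")
    print(f"page {real_page_num}: {len(current_page_text)} chars")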
@@ -188,7 +201,7 @@ async def run_chunking_process(pdf_path, queue=None, whole=WHOLE, start_p=START_
     # Note: Ensure call_groq_json is an async function or run in executor
     res = await call_groq_json(prompt, lookahead)
 
-    leaf = {"type": "leaf", "name": res['filename'], "content": res['rewritten_text']}
+    leaf = {"type": "leaf", "page": real_page_num, "name": res['filename'], "content": res['rewritten_text']}
 
     all_leaves.append(leaf)
     l0_buffer.append(leaf) # stack-up leaves
@@ -227,13 +240,23 @@ async def run_chunking_process(pdf_path, queue=None, whole=WHOLE, start_p=START_
     # Process the break and update cursor; also "result.get(...)" prevents crashes if keys are missing
     # Semantic Jump Logic, find the break text and move cursor
     break_text = res.get('break_text', "")
-    cursor += (lookahead.find(break_text) + len(break_text)) if break_text in lookahead else 2000
+    cursor += (lookahead.find(break_text) + len(break_text)) if break_text in lookahead else laf # laf -> 2000
 
+    # Calculate exactly where the chunk ends
+    if break_text in lookahead:
+        end_index = lookahead.find(break_text) + len(break_text)
+    else:
+        end_index = laf # Fallback
+
+    # This captures ONLY the text analyzed for this specific leaf
+    actual_original_text = lookahead[:end_index]
+
     new_chunk = {
         "type": "leaf",
         "filename": res.get('filename', 'untitled'),
         "content": res.get('rewritten_text', ''),
-        "original": lookahead[:len(res.get('break_text', '')) + 500] # Save a snippet of the original
+        "page_num": page["metadata"]["page_number"], # capture page number
+        "original": actual_original_text, # Save a snippet of the original
     }
 
     # Throttling to stay under 6000 TPM limit
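The cursor arithmetic and the new end_index computation are the same calculation; a pure-function sketch makes the shared logic explicit, with one extra guard (an empty break_text would otherwise match everywhere and advance the cursor by zero):

def chunk_end(lookahead: str, break_text: str, laf: int = 2000) -> int:
    if break_text and break_text in lookahead:
        return lookahead.find(break_text) + len(break_text)
    return laf  # fallback: jump one fixed look-ahead window

window = "Alpha. Bravo. BREAK Charlie."
assert chunk_end(window, "BREAK") == window.find("BREAK") + len("BREAK")
assert chunk_end(window, "missing") == 2000
# the leaf's 'original' text is then lookahead[:chunk_end(...)]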
@@ -285,7 +308,9 @@ async def run_chunking_process(pdf_path, queue=None, whole=WHOLE, start_p=START_
     #}
     #"""
     final_data = {
-        "metadata": {"pages": f"{pagesscanned}", "date": timestamp},
+        #"metadata": {"pages": f"{allpages}", "date": timestamp},
+        #"metadata": {"page_number": f"{page_num}", "date": timestamp},
+        "metadata": {"pages": f"{start_p}-{end_p}", "date": timestamp},
         "date": timestamp,
         "leaves": all_leaves,
         "l1_clusters": all_l1_summaries,
@@ -345,8 +370,11 @@ Visual Clarity: Table Markdown is perfect for a quick bird's-eye view, such as t
 # --- NESTED AND TABULAR MARKDOWN
 def export_visual_formats(final_data, timestamp):
     # --- NESTED MARKDOWN ---
-    #md_nested = f"# 👑 VOLUME: {final_data['metadata']['pages']}\n"
-    md_nested = f"# 👑 VOLUME SUMMARY\n"
+
+    # --- Uncomment the below to include the whole text - 'pages' - of the document in the generated "nested_knowledge_xxxx" markdown and in the JSON; useful for short documents, articles, papers, etc. ---
+    #md_nested = f"# 👑 VOLUME: {final_data['metadata']['pages']}\n"
+    md_nested = f"# 👑 VOLUME: {final_data['metadata']['pages']}\n" # 'pages' holds the "start-end" range set in final_data
+    #md_nested = f"# 👑 VOLUME SUMMARY\n"
     md_nested += f"> {final_data['l3_volume']['content'] if final_data['l3_volume'] else 'N/A'}\n\n"
 
     for l2 in final_data['l2_chapters']:
@@ -355,24 +383,38 @@ def export_visual_formats(final_data, timestamp):
     for l1 in final_data['l1_clusters']:
         md_nested += f"### ⭐ CLUSTER: {l1['name']}\n> {l1['content']}\n\n"
         for leaf in final_data['leaves']:
+            page_label = f" (Page {leaf.get('page_num', '??')})"
-            md_nested += f"#### 📄 [LEAF]: {leaf['name']}\n"
+            md_nested += f"#### 📄 [LEAF]: {leaf['name']}{page_label}\n"
             md_nested += f"**[AI INTERPRETATION]:** {leaf['content']}\n\n"
             md_nested += f"**[ORIGINAL TEXT]:** {leaf.get('original', 'N/A')[:250]}...\n\n---\n"
 
     # --- TABULAR MARKDOWN ---
-    md_table = "| Level | Name | Content Snippet |\n| :--- | :--- | :--- |\n"
-    if final_data['l3_volume']:
-        md_table += f"| 👑 VOLUME | {final_data['l3_volume']['name']} | {final_data['l3_volume']['content'][:150]}... |\n"
+    md_table = "| Volume (L3) | Chapter (L2) | Cluster/Summary (L1) | Page | Chunk (L0) | Original |\n"
+    md_table += "| :--- | :--- | :--- | :--- | :--- | :--- |\n"
+
+    l3_name = final_data['l3_volume']['name'] if final_data['l3_volume'] else "Volume"
+
     for l2 in final_data['l2_chapters']:
-        md_table += f"| 💎 CHAPTER | {l2['name']} | {l2['content'][:150]}... |\n"
-        for l1 in final_data['l1_clusters']:
-            md_table += f"| ⭐ CLUSTER | {l1['name']} | {l1['content'][:150]}... |\n"
-            for leaf in final_data['leaves']:
-                md_table += f"| 📄 LEAF | {leaf['name']} | **[AI]** {leaf['content'][:150]}... |\n"
+        l2_name = l2['name']
+        l2_summary = l2['content'][:100] + "..."
+
+        for l1 in final_data['l1_clusters']:
+            l1_name = l1['name']
+            l1_summary = l1['content'][:100] + "..."
+
+            for leaf in final_data['leaves']:
+                leaf_name = leaf['name']
+                # Include page number in the table for extra clarity
+                pg = leaf.get('page_num', '??')
+                leaf_content = f"**[P.{pg} AI]** " + leaf['content'][:150] + "..."
+                orig_text = leaf.get('original', 'N/A')[:100] + "..."
+
+                md_table += f"| 👑 VOLUME: {l3_name} | 💎 CHAPTER: **{l2_name}**: {l2_summary} | **⭐ CLUSTER: {l1_name}**: {l1_summary} | {pg} | 📄 LEAF **{leaf_name}**: {leaf_content} | ORIGINAL: {orig_text} |\n"
+
 
     # Save files
     with open(f"nested_knowledge_{timestamp}.md", "w", encoding="utf-8") as f: f.write(md_nested)
     with open(f"table_knowledge_{timestamp}.md", "w", encoding="utf-8") as f: f.write(md_table)
 
 
-    print(f"✅ Visual Markdowns created: nested_knowledge_{timestamp}.md and table_knowledge_{timestamp}.md")
+    print(f"✅ Created: \n\nVisual Markdowns: \nnested_knowledge_{timestamp}.md \ntable_knowledge_{timestamp}.md \n\nand JSON: \n\nknowledge_tree_{timestamp}.json")
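A property worth keeping in mind for the flattened table: every data row must carry the same number of cells as the header (six here, since each row also appends an ORIGINAL cell), or Markdown renderers misalign the columns. A tiny self-check, with placeholder strings standing in for the real summaries:

header = "| Volume (L3) | Chapter (L2) | Cluster/Summary (L1) | Page | Chunk (L0) | Original |"
row = "| 👑 VOLUME: Book | 💎 CHAPTER: **Ch1**: ... | **⭐ CLUSTER: C1**: ... | 8 | 📄 LEAF **L1**: ... | ORIGINAL: ... |"

def cell_count(line: str) -> int:
    # strip the outer pipes, then count the interior cells
    return len(line.strip().strip("|").split("|"))

assert cell_count(header) == cell_count(row) == 6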
 