prashantmatlani committed on
Commit
1579f7c
·
1 Parent(s): 100a70e

update phase0102 with the inclusion of page numbers scanned

Browse files
Files changed (1) hide show
  1. phase0102_chunker_aggregator_2.py +16 -5
phase0102_chunker_aggregator_2.py CHANGED
@@ -95,7 +95,7 @@ async def call_groq_json(system_prompt, user_content):
95
  temperature=0.2 # Lower temperature = more stable JSON; the LLM is less "creative" with formatting at temperature of 0.2, and more likely to follow a perfect JSON structure
96
  )
97
  )
98
-
99
  # LLM can technically generate multiple different versions of an answer if its asked to
100
  # Groq returns these as a list called "choices", since even a single answer is inside a list, Python must be told to look at index 0 to get the actual content
101
  # Then we access the "message" key, followed by "content" key to get the raw JSON string
@@ -123,6 +123,15 @@ async def run_chunking_process(pdf_path, queue=None, whole=WHOLE, start_p=START_
123
  # 2. Extract Markdown
124
  md_text = pymupdf4llm.to_markdown(str(pdf_path), pages=pages_to_read)
125
 
 
 
 
 
 
 
 
 
 
126
  # --- Initialize the number of characters permitted to be skipped, depending on the total number of words in the document ---
127
  total_len = len(md_text)
128
 
@@ -274,13 +283,15 @@ async def run_chunking_process(pdf_path, queue=None, whole=WHOLE, start_p=START_
274
  # "l2_chapters": all_l2_summaries,
275
  # "l3_volume": l3_node
276
  #}
277
-
278
- final_data = {"date": timestamp,
 
 
279
  "leaves": all_leaves,
280
  "l1_clusters": all_l1_summaries,
281
  "l2_chapters": all_l2_summaries,
282
  "l3_volume": l3_node}
283
-
284
  output_file = f"knowledge_tree_{timestamp}.json"
285
  with open(output_file, "w") as f:
286
  json.dump(final_data, f, indent=4)
@@ -334,7 +345,7 @@ Visual Clarity: Table Markdown is perfect for a quick bird's-eye view, such as t
334
  # --- NESTED AND TABULAR MARKDOWN
335
  def export_visual_formats(final_data, timestamp):
336
  # --- NESTED MARKDOWN ---
337
- #md_nested = f"# 👑 VOLUME: {final_data['metadata']['pages']}\n"
338
  md_nested += f"> {final_data['l3_volume']['content'] if final_data['l3_volume'] else 'N/A'}\n\n"
339
 
340
  for l2 in final_data['l2_chapters']:
 
95
  temperature=0.2 # Lower temperature = more stable JSON; the LLM is less "creative" with formatting at temperature of 0.2, and more likely to follow a perfect JSON structure
96
  )
97
  )
98
+
99
  # LLM can technically generate multiple different versions of an answer if its asked to
100
  # Groq returns these as a list called "choices", since even a single answer is inside a list, Python must be told to look at index 0 to get the actual content
101
  # Then we access the "message" key, followed by "content" key to get the raw JSON string
 
123
  # 2. Extract Markdown
124
  md_text = pymupdf4llm.to_markdown(str(pdf_path), pages=pages_to_read)
125
 
126
+ # Returns a list of dictionaries, one for each page
127
+ #pagesscanned = pymupdf4llm.to_markdown("your_document.pdf", page_chunks=True)
128
+ pagesscanned = pymupdf4llm.to_markdown(str(pdf_path), page_chunks=True)
129
+
130
+ # Instead of a single string of text, we have a list to pull directly the page numbers being scanned from each chunk's metadata
131
+ for p in pagesscanned:
132
+ real_page_num = p["metadata"]["page_number"] # This is the real-time detected page
133
+ text_content = p["text"]
134
+
135
  # --- Initialize the number of characters permitted to be skipped, depending on the total number of words in the document ---
136
  total_len = len(md_text)
137
 
 
283
  # "l2_chapters": all_l2_summaries,
284
  # "l3_volume": l3_node
285
  #}
286
+ #"""
287
+ final_data = {
288
+ "metadata": {"pages": f"{pagesscanned}", "date": timestamp},
289
+ "date": timestamp,
290
  "leaves": all_leaves,
291
  "l1_clusters": all_l1_summaries,
292
  "l2_chapters": all_l2_summaries,
293
  "l3_volume": l3_node}
294
+ #"""
295
  output_file = f"knowledge_tree_{timestamp}.json"
296
  with open(output_file, "w") as f:
297
  json.dump(final_data, f, indent=4)
 
345
  # --- NESTED AND TABULAR MARKDOWN
346
  def export_visual_formats(final_data, timestamp):
347
  # --- NESTED MARKDOWN ---
348
+ md_nested = f"# 👑 VOLUME: {final_data['metadata']['pagesscanned']}\n"
349
  md_nested += f"> {final_data['l3_volume']['content'] if final_data['l3_volume'] else 'N/A'}\n\n"
350
 
351
  for l2 in final_data['l2_chapters']: