Spaces:
Running
Running
Commit ·
1579f7c
1
Parent(s): 100a70e
update phase0102 with the inclusion of page numbers scanned
Browse files
phase0102_chunker_aggregator_2.py
CHANGED
|
@@ -95,7 +95,7 @@ async def call_groq_json(system_prompt, user_content):
|
|
| 95 |
temperature=0.2 # Lower temperature = more stable JSON; the LLM is less "creative" with formatting at temperature of 0.2, and more likely to follow a perfect JSON structure
|
| 96 |
)
|
| 97 |
)
|
| 98 |
-
|
| 99 |
# LLM can technically generate multiple different versions of an answer if its asked to
|
| 100 |
# Groq returns these as a list called "choices", since even a single answer is inside a list, Python must be told to look at index 0 to get the actual content
|
| 101 |
# Then we access the "message" key, followed by "content" key to get the raw JSON string
|
|
@@ -123,6 +123,15 @@ async def run_chunking_process(pdf_path, queue=None, whole=WHOLE, start_p=START_
|
|
| 123 |
# 2. Extract Markdown
|
| 124 |
md_text = pymupdf4llm.to_markdown(str(pdf_path), pages=pages_to_read)
|
| 125 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
# --- Initialize the number of characters permitted to be skipped, depending on the total number of words in the document ---
|
| 127 |
total_len = len(md_text)
|
| 128 |
|
|
@@ -274,13 +283,15 @@ async def run_chunking_process(pdf_path, queue=None, whole=WHOLE, start_p=START_
|
|
| 274 |
# "l2_chapters": all_l2_summaries,
|
| 275 |
# "l3_volume": l3_node
|
| 276 |
#}
|
| 277 |
-
|
| 278 |
-
final_data = {
|
|
|
|
|
|
|
| 279 |
"leaves": all_leaves,
|
| 280 |
"l1_clusters": all_l1_summaries,
|
| 281 |
"l2_chapters": all_l2_summaries,
|
| 282 |
"l3_volume": l3_node}
|
| 283 |
-
|
| 284 |
output_file = f"knowledge_tree_{timestamp}.json"
|
| 285 |
with open(output_file, "w") as f:
|
| 286 |
json.dump(final_data, f, indent=4)
|
|
@@ -334,7 +345,7 @@ Visual Clarity: Table Markdown is perfect for a quick bird's-eye view, such as t
|
|
| 334 |
# --- NESTED AND TABULAR MARKDOWN
|
| 335 |
def export_visual_formats(final_data, timestamp):
|
| 336 |
# --- NESTED MARKDOWN ---
|
| 337 |
-
|
| 338 |
md_nested += f"> {final_data['l3_volume']['content'] if final_data['l3_volume'] else 'N/A'}\n\n"
|
| 339 |
|
| 340 |
for l2 in final_data['l2_chapters']:
|
|
|
|
| 95 |
temperature=0.2 # Lower temperature = more stable JSON; the LLM is less "creative" with formatting at temperature of 0.2, and more likely to follow a perfect JSON structure
|
| 96 |
)
|
| 97 |
)
|
| 98 |
+
|
| 99 |
# LLM can technically generate multiple different versions of an answer if its asked to
|
| 100 |
# Groq returns these as a list called "choices", since even a single answer is inside a list, Python must be told to look at index 0 to get the actual content
|
| 101 |
# Then we access the "message" key, followed by "content" key to get the raw JSON string
|
|
|
|
| 123 |
# 2. Extract Markdown
|
| 124 |
md_text = pymupdf4llm.to_markdown(str(pdf_path), pages=pages_to_read)
|
| 125 |
|
| 126 |
+
# Returns a list of dictionaries, one for each page
|
| 127 |
+
#pagesscanned = pymupdf4llm.to_markdown("your_document.pdf", page_chunks=True)
|
| 128 |
+
pagesscanned = pymupdf4llm.to_markdown(str(pdf_path), page_chunks=True)
|
| 129 |
+
|
| 130 |
+
# Instead of a single string of text, we have a list to pull directly the page numbers being scanned from each chunk's metadata
|
| 131 |
+
for p in pagesscanned:
|
| 132 |
+
real_page_num = p["metadata"]["page_number"] # This is the real-time detected page
|
| 133 |
+
text_content = p["text"]
|
| 134 |
+
|
| 135 |
# --- Initialize the number of characters permitted to be skipped, depending on the total number of words in the document ---
|
| 136 |
total_len = len(md_text)
|
| 137 |
|
|
|
|
| 283 |
# "l2_chapters": all_l2_summaries,
|
| 284 |
# "l3_volume": l3_node
|
| 285 |
#}
|
| 286 |
+
#"""
|
| 287 |
+
final_data = {
|
| 288 |
+
"metadata": {"pages": f"{pagesscanned}", "date": timestamp},
|
| 289 |
+
"date": timestamp,
|
| 290 |
"leaves": all_leaves,
|
| 291 |
"l1_clusters": all_l1_summaries,
|
| 292 |
"l2_chapters": all_l2_summaries,
|
| 293 |
"l3_volume": l3_node}
|
| 294 |
+
#"""
|
| 295 |
output_file = f"knowledge_tree_{timestamp}.json"
|
| 296 |
with open(output_file, "w") as f:
|
| 297 |
json.dump(final_data, f, indent=4)
|
|
|
|
| 345 |
# --- NESTED AND TABULAR MARKDOWN
|
| 346 |
def export_visual_formats(final_data, timestamp):
|
| 347 |
# --- NESTED MARKDOWN ---
|
| 348 |
+
md_nested = f"# 👑 VOLUME: {final_data['metadata']['pagesscanned']}\n"
|
| 349 |
md_nested += f"> {final_data['l3_volume']['content'] if final_data['l3_volume'] else 'N/A'}\n\n"
|
| 350 |
|
| 351 |
for l2 in final_data['l2_chapters']:
|