# chunker / generated_updated_aggregator.py
# Provenance: uploaded by prashantmatlani (commit 27f4d1e, verified).
import os
def generate_updated_aggregator():
    """Return the source code of the chunking/aggregation worker script.

    The returned string is a complete, standalone Python script that:
      1. extracts markdown from a PDF with ``pymupdf4llm``,
      2. asks a Groq-hosted LLM to cut it into self-sufficient "leaf"
         chunks (with an LLM-chosen break point per chunk),
      3. every CHUNK_GROUP_SIZE leaves, synthesizes a level-1 summary
         node, back-fills each leaf's ``parent_summary``, and writes one
         contextual ``.md`` file per leaf,
      4. dumps the full leaf/summary tree to a timestamped JSON file.

    Returns:
        str: the generated script, ready to be written to a ``.py`` file.
    """
    # NOTE: raw string so that the "\n" sequences inside the prompt
    # literals survive verbatim into the generated source.
    script_content = r'''import os
import json
import datetime
import asyncio
import tiktoken
import pymupdf4llm
from groq import Groq
from dotenv import load_dotenv
from pathlib import Path

# 1. SETUP
load_dotenv()
client = Groq(api_key=os.getenv("GROQ_API_KEY"))
MODEL = "llama-3.1-8b-instant"
encoding = tiktoken.get_encoding("cl100k_base")


def call_groq_json(system_prompt, user_content):
    """Call the Groq chat API in JSON mode and return the parsed object."""
    strict_system_prompt = system_prompt + "\nIMPORTANT: Ensure all internal quotes are escaped. Respond ONLY in valid JSON."
    completion = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": strict_system_prompt},
            {"role": "user", "content": user_content}
        ],
        response_format={"type": "json_object"},
        temperature=0.2
    )
    return json.loads(completion.choices[0].message.content)


def generate_summary_block(chunks):
    """Synthesize a group of leaf chunks into one level-1 summary dict."""
    combined = "\n\n".join([f"{c['filename']}: {c['content']}" for c in chunks])
    prompt = "Synthesize these Jungian chunks into a dense Level-1 summary. JSON keys: 'summary_name', 'synthesis'."
    return call_groq_json(prompt, combined)


async def run_chunking_process(pdf_path, queue=None, whole=False, start_p=20, end_p=30):
    """Chunk a PDF into contextual leaves + rolling summaries.

    Args:
        pdf_path: path to the source PDF.
        queue: optional asyncio.Queue; each new leaf/summary node (and a
            final "DONE" sentinel) is pushed to it for live UI updates.
        whole: process the entire book instead of a page range.
        start_p / end_p: page range used when ``whole`` is False.
    """
    # Setup Directory for Markdown Files
    timestamp = datetime.datetime.now().strftime("%m%d%Y_%H%M")
    md_folder = Path(f"jungian_agent_data_{timestamp}")
    md_folder.mkdir(exist_ok=True)

    # 1. Determine Page Range
    pages_to_read = None if whole else list(range(start_p, end_p))
    print(f"🚀 {'WHOLE BOOK' if whole else f'Pages {start_p}-{end_p}'} processing started...")

    # 2. Extract Markdown
    md_text = pymupdf4llm.to_markdown(str(pdf_path), pages=pages_to_read)

    cursor = 0
    all_leaves = []
    summary_blocks = []
    temp_group = []
    CHUNK_GROUP_SIZE = 5
    # context_buffer holds the 'rolling state'
    context_buffer = {"predecessor": "Start", "latest_summary": "None"}

    while cursor < len(md_text):
        lookahead = md_text[cursor : cursor + 6000]
        if not lookahead.strip():
            break
        prompt = f"Context: {context_buffer['latest_summary']} | Prev: {context_buffer['predecessor'][:200]}...\nExtract a self-sufficient Jungian chunk. JSON keys: 'break_text', 'rewritten_text', 'filename'."
        try:
            result = call_groq_json(prompt, lookahead)
            # Semantic Jump Logic: advance the cursor past the LLM-chosen
            # break text; fall back to a fixed 2000-char step if the model
            # returned text that is not actually in the window.
            break_text = result.get('break_text', "")
            relative_break = lookahead.find(break_text) + len(break_text) if (break_text and break_text in lookahead) else 2000
            new_chunk = {
                "type": "leaf",
                "filename": result.get('filename', 'untitled_chunk').replace(" ", "_"),
                "content": result.get('rewritten_text', ''),
                "parent_summary": context_buffer["latest_summary"]
            }
            all_leaves.append(new_chunk)
            temp_group.append(new_chunk)
            # PUSH TO UI
            if queue:
                await queue.put(new_chunk)
            context_buffer["predecessor"] = new_chunk["content"]
            cursor += relative_break

            # PHASE II: AGGREGATION
            if len(temp_group) >= CHUNK_GROUP_SIZE:
                summary_res = generate_summary_block(temp_group)
                summary_node = {
                    "type": "summary",
                    "name": summary_res['summary_name'].replace(" ", "_"),
                    "content": summary_res['synthesis'],
                    "children": [c['filename'] for c in temp_group]
                }
                summary_blocks.append(summary_node)
                context_buffer["latest_summary"] = summary_node["content"]
                # Update all chunks in this group with their official parent
                # summary, and save each one's contextual markdown file.
                for c in temp_group:
                    c["parent_summary"] = summary_node["content"]
                    # SAVE CONTEXTUAL MARKDOWN FILE
                    md_filename = md_folder / f"{c['filename']}.md"
                    with open(md_filename, "w", encoding="utf-8") as md_file:
                        md_file.write(f"--- CONTEXT ---\n{summary_node['content']}\n\n")
                        md_file.write(f"--- CONTENT ---\n{c['content']}")
                if queue:
                    await queue.put(summary_node)
                temp_group = []
        except Exception as e:
            # Best-effort recovery: log, skip ahead a fixed distance, continue.
            print(f"Error: {e}")
            cursor += 3000
            continue

    # Final Save of the Master JSON
    final_data = {"leaves": all_leaves, "summaries": summary_blocks}
    with open(f"knowledge_tree_{timestamp}.json", "w", encoding="utf-8") as f:
        json.dump(final_data, f, indent=4)
    if queue:
        await queue.put("DONE")
'''
    return script_content
# Emit the generated script only when run as a program, so importing this
# module (e.g. to call generate_updated_aggregator directly) has no side effects.
if __name__ == "__main__":
    print(generate_updated_aggregator())