import os


def generate_updated_aggregator():
    script_content = r'''import os
import json
import datetime
import asyncio
import tiktoken
import pymupdf4llm
from groq import Groq
from dotenv import load_dotenv
from pathlib import Path

# 1. SETUP
load_dotenv()
client = Groq(api_key=os.getenv("GROQ_API_KEY"))
MODEL = "llama-3.1-8b-instant"
encoding = tiktoken.get_encoding("cl100k_base")


def call_groq_json(system_prompt, user_content):
    strict_system_prompt = system_prompt + "\nIMPORTANT: Ensure all internal quotes are escaped. Respond ONLY in valid JSON."
    completion = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": strict_system_prompt},
            {"role": "user", "content": user_content}
        ],
        response_format={"type": "json_object"},
        temperature=0.2
    )
    return json.loads(completion.choices[0].message.content)


def generate_summary_block(chunks):
    combined = "\n\n".join([f"{c['filename']}: {c['content']}" for c in chunks])
    prompt = "Synthesize these Jungian chunks into a dense Level-1 summary. JSON keys: 'summary_name', 'synthesis'."
    return call_groq_json(prompt, combined)


async def run_chunking_process(pdf_path, queue=None, whole=False, start_p=20, end_p=30):
    # Setup Directory for Markdown Files
    timestamp = datetime.datetime.now().strftime("%m%d%Y_%H%M")
    md_folder = Path(f"jungian_agent_data_{timestamp}")
    md_folder.mkdir(exist_ok=True)

    # 1. Determine Page Range
    pages_to_read = None if whole else list(range(start_p, end_p))
    print(f"🚀 {'WHOLE BOOK' if whole else f'Pages {start_p}-{end_p}'} processing started...")

    # 2. Extract Markdown
    md_text = pymupdf4llm.to_markdown(str(pdf_path), pages=pages_to_read)

    cursor = 0
    all_leaves = []
    summary_blocks = []
    temp_group = []
    CHUNK_GROUP_SIZE = 5

    # context_buffer holds the 'rolling state'
    context_buffer = {"predecessor": "Start", "latest_summary": "None"}

    while cursor < len(md_text):
        lookahead = md_text[cursor : cursor + 6000]
        if not lookahead.strip():
            break

        prompt = f"Context: {context_buffer['latest_summary']} | Prev: {context_buffer['predecessor'][:200]}...\nExtract a self-sufficient Jungian chunk. JSON keys: 'break_text', 'rewritten_text', 'filename'."

        try:
            result = call_groq_json(prompt, lookahead)

            # Semantic Jump Logic: advance past the model's chosen break point,
            # or fall back to a fixed 2000-character step.
            break_text = result.get('break_text', "")
            relative_break = lookahead.find(break_text) + len(break_text) if (break_text and break_text in lookahead) else 2000

            new_chunk = {
                "type": "leaf",
                "filename": result.get('filename', 'untitled_chunk').replace(" ", "_"),
                "content": result.get('rewritten_text', ''),
                "parent_summary": context_buffer["latest_summary"]
            }
            all_leaves.append(new_chunk)
            temp_group.append(new_chunk)

            # PUSH TO UI
            if queue:
                await queue.put(new_chunk)

            context_buffer["predecessor"] = new_chunk["content"]
            cursor += relative_break

            # PHASE II: AGGREGATION
            if len(temp_group) >= CHUNK_GROUP_SIZE:
                summary_res = generate_summary_block(temp_group)
                summary_node = {
                    "type": "summary",
                    "name": summary_res['summary_name'].replace(" ", "_"),
                    "content": summary_res['synthesis'],
                    "children": [c['filename'] for c in temp_group]
                }
                summary_blocks.append(summary_node)
                context_buffer["latest_summary"] = summary_node["content"]

                # Update all chunks in this group with their official parent summary
                for c in temp_group:
                    c["parent_summary"] = summary_node["content"]

                    # SAVE CONTEXTUAL MARKDOWN FILE
                    md_filename = md_folder / f"{c['filename']}.md"
                    with open(md_filename, "w", encoding="utf-8") as md_file:
                        md_file.write(f"--- CONTEXT ---\n{summary_node['content']}\n\n")
                        md_file.write(f"--- CONTENT ---\n{c['content']}")

                if queue:
                    await queue.put(summary_node)
                temp_group = []

        except Exception as e:
            print(f"Error: {e}")
            cursor += 3000
            continue

    # Final Save of the Master JSON
    final_data = {"leaves": all_leaves, "summaries": summary_blocks}
    with open(f"knowledge_tree_{timestamp}.json", "w") as f:
        json.dump(final_data, f, indent=4)

    if queue:
        await queue.put("DONE")
'''
    return script_content


print(generate_updated_aggregator())
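
# --- Usage sketch (an assumption, not part of the original script) ---
# The generator above only prints the aggregator source to stdout. One way to use
# it is to write that source to a module and run its entry point with asyncio;
# "jungian_aggregator.py" and "book.pdf" below are hypothetical names chosen for
# illustration.
#
# from pathlib import Path
# Path("jungian_aggregator.py").write_text(generate_updated_aggregator(), encoding="utf-8")
#
# Then, against the generated module:
# import asyncio
# from jungian_aggregator import run_chunking_process
# asyncio.run(run_chunking_process("book.pdf", start_p=20, end_p=30))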