# Export residue from the hosting UI ("Spaces: Running") — kept as a comment so the file parses as Python.
"""Builder for the Jungian PDF chunking/aggregation script.

This module constructs, as a single string, a standalone script that:
extracts markdown from a PDF (pymupdf4llm), semantically chunks it via the
Groq chat API in JSON mode, aggregates every CHUNK_GROUP_SIZE chunks into a
rolling Level-1 summary, writes one contextual markdown file per chunk, and
dumps the whole tree to a timestamped JSON file.
"""
import os


def generate_updated_aggregator():
    """Return the full source text of the updated aggregator script.

    Returns:
        str: valid, self-contained Python source. It is NOT executed here;
        callers typically print it or write it to a file.
    """
    script_content = r'''import os
import json
import datetime
import asyncio
import tiktoken
import pymupdf4llm
from groq import Groq
from dotenv import load_dotenv
from pathlib import Path

# 1. SETUP
load_dotenv()
client = Groq(api_key=os.getenv("GROQ_API_KEY"))
MODEL = "llama-3.1-8b-instant"
# NOTE(review): `encoding` is currently unused below — kept for parity with
# earlier revisions; confirm before removing.
encoding = tiktoken.get_encoding("cl100k_base")


def call_groq_json(system_prompt, user_content):
    """Call the Groq chat API in strict JSON mode and return the parsed dict."""
    strict_system_prompt = system_prompt + "\nIMPORTANT: Ensure all internal quotes are escaped. Respond ONLY in valid JSON."
    completion = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": strict_system_prompt},
            {"role": "user", "content": user_content}
        ],
        response_format={"type": "json_object"},
        temperature=0.2
    )
    return json.loads(completion.choices[0].message.content)


def generate_summary_block(chunks):
    """Synthesize a group of leaf chunks into one Level-1 summary dict."""
    combined = "\n\n".join([f"{c['filename']}: {c['content']}" for c in chunks])
    prompt = "Synthesize these Jungian chunks into a dense Level-1 summary. JSON keys: 'summary_name', 'synthesis'."
    return call_groq_json(prompt, combined)


async def run_chunking_process(pdf_path, queue=None, whole=False, start_p=20, end_p=30):
    """Chunk a PDF into contextual markdown leaves with rolling summaries.

    Args:
        pdf_path: path to the source PDF.
        queue: optional asyncio.Queue; leaf/summary nodes (and a final "DONE")
            are pushed to it for live UI updates.
        whole: process the entire book instead of a page range.
        start_p, end_p: page range used when `whole` is False.
    """
    # Setup directory for the per-chunk markdown files
    timestamp = datetime.datetime.now().strftime("%m%d%Y_%H%M")
    md_folder = Path(f"jungian_agent_data_{timestamp}")
    md_folder.mkdir(exist_ok=True)

    # 1. Determine Page Range (None => whole document for pymupdf4llm)
    pages_to_read = None if whole else list(range(start_p, end_p))
    print(f"🚀 {'WHOLE BOOK' if whole else f'Pages {start_p}-{end_p}'} processing started...")

    # 2. Extract Markdown
    md_text = pymupdf4llm.to_markdown(str(pdf_path), pages=pages_to_read)

    cursor = 0
    all_leaves = []
    summary_blocks = []
    temp_group = []
    CHUNK_GROUP_SIZE = 5
    # context_buffer holds the 'rolling state'
    context_buffer = {"predecessor": "Start", "latest_summary": "None"}

    def aggregate_group(group):
        """Summarize *group*, persist each member's markdown file, return the summary node."""
        summary_res = generate_summary_block(group)
        summary_node = {
            "type": "summary",
            "name": summary_res['summary_name'].replace(" ", "_"),
            "content": summary_res['synthesis'],
            "children": [c['filename'] for c in group]
        }
        summary_blocks.append(summary_node)
        context_buffer["latest_summary"] = summary_node["content"]
        # Update all chunks in this group with their official parent summary
        # and save the contextual markdown file for each.
        for c in group:
            c["parent_summary"] = summary_node["content"]
            md_filename = md_folder / f"{c['filename']}.md"
            with open(md_filename, "w", encoding="utf-8") as md_file:
                md_file.write(f"--- CONTEXT ---\n{summary_node['content']}\n\n")
                md_file.write(f"--- CONTENT ---\n{c['content']}")
        return summary_node

    while cursor < len(md_text):
        lookahead = md_text[cursor : cursor + 6000]
        if not lookahead.strip():
            break
        prompt = f"Context: {context_buffer['latest_summary']} | Prev: {context_buffer['predecessor'][:200]}...\nExtract a self-sufficient Jungian chunk. JSON keys: 'break_text', 'rewritten_text', 'filename'."
        try:
            result = call_groq_json(prompt, lookahead)
            # Semantic Jump Logic: advance past the model-chosen break point,
            # falling back to a fixed 2000-char stride when it is unusable.
            break_text = result.get('break_text', "")
            relative_break = lookahead.find(break_text) + len(break_text) if (break_text and break_text in lookahead) else 2000
            new_chunk = {
                "type": "leaf",
                "filename": result.get('filename', 'untitled_chunk').replace(" ", "_"),
                "content": result.get('rewritten_text', ''),
                "parent_summary": context_buffer["latest_summary"]
            }
            all_leaves.append(new_chunk)
            temp_group.append(new_chunk)
            # PUSH TO UI
            if queue:
                await queue.put(new_chunk)
            context_buffer["predecessor"] = new_chunk["content"]
            cursor += relative_break

            # PHASE II: AGGREGATION
            if len(temp_group) >= CHUNK_GROUP_SIZE:
                summary_node = aggregate_group(temp_group)
                if queue:
                    await queue.put(summary_node)
                temp_group = []
        except Exception as e:
            # Best-effort: skip ahead a fixed stride and keep chunking.
            print(f"Error: {e}")
            cursor += 3000
            continue

    # BUGFIX: flush any trailing chunks that never filled a complete group,
    # so their summaries and markdown files are not silently dropped.
    if temp_group:
        try:
            summary_node = aggregate_group(temp_group)
            if queue:
                await queue.put(summary_node)
        except Exception as e:
            print(f"Error: {e}")

    # Final Save of the Master JSON
    final_data = {"leaves": all_leaves, "summaries": summary_blocks}
    with open(f"knowledge_tree_{timestamp}.json", "w") as f:
        json.dump(final_data, f, indent=4)
    if queue:
        await queue.put("DONE")
'''
    return script_content


if __name__ == "__main__":
    # Guarded so importing this module no longer prints the script as a side effect.
    print(generate_updated_aggregator())