# chunker / generated_updated_aggregator.py
# Provenance: uploaded by prashantmatlani (commit 27f4d1e, verified).
import os
def generate_updated_aggregator():
    """Return the source code of the chunking/aggregation worker script.

    The returned string is a complete, standalone Python script that:
      1. extracts markdown from a PDF with ``pymupdf4llm``,
      2. asks a Groq-hosted LLM to cut it into self-sufficient "leaf"
         chunks (with an LLM-chosen break point per chunk),
      3. every CHUNK_GROUP_SIZE leaves, synthesizes a level-1 summary
         node, back-fills each leaf's ``parent_summary``, and writes one
         contextual ``.md`` file per leaf,
      4. dumps the full leaf/summary tree to a timestamped JSON file.

    Returns:
        str: the generated script, ready to be written to a ``.py`` file.
    """
    # NOTE: raw string so that the "\n" sequences inside the prompt
    # literals survive verbatim into the generated source.
    script_content = r'''import os
import json
import datetime
import asyncio
import tiktoken
import pymupdf4llm
from groq import Groq
from dotenv import load_dotenv
from pathlib import Path

# 1. SETUP
load_dotenv()
client = Groq(api_key=os.getenv("GROQ_API_KEY"))
MODEL = "llama-3.1-8b-instant"
encoding = tiktoken.get_encoding("cl100k_base")


def call_groq_json(system_prompt, user_content):
    """Call the Groq chat API in JSON mode and return the parsed object."""
    strict_system_prompt = system_prompt + "\nIMPORTANT: Ensure all internal quotes are escaped. Respond ONLY in valid JSON."
    completion = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": strict_system_prompt},
            {"role": "user", "content": user_content}
        ],
        response_format={"type": "json_object"},
        temperature=0.2
    )
    return json.loads(completion.choices[0].message.content)


def generate_summary_block(chunks):
    """Synthesize a group of leaf chunks into one level-1 summary dict."""
    combined = "\n\n".join([f"{c['filename']}: {c['content']}" for c in chunks])
    prompt = "Synthesize these Jungian chunks into a dense Level-1 summary. JSON keys: 'summary_name', 'synthesis'."
    return call_groq_json(prompt, combined)


async def run_chunking_process(pdf_path, queue=None, whole=False, start_p=20, end_p=30):
    """Chunk a PDF into contextual leaves + rolling summaries.

    Args:
        pdf_path: path to the source PDF.
        queue: optional asyncio.Queue; each new leaf/summary node (and a
            final "DONE" sentinel) is pushed to it for live UI updates.
        whole: process the entire book instead of a page range.
        start_p / end_p: page range used when ``whole`` is False.
    """
    # Setup Directory for Markdown Files
    timestamp = datetime.datetime.now().strftime("%m%d%Y_%H%M")
    md_folder = Path(f"jungian_agent_data_{timestamp}")
    md_folder.mkdir(exist_ok=True)

    # 1. Determine Page Range
    pages_to_read = None if whole else list(range(start_p, end_p))
    print(f"🚀 {'WHOLE BOOK' if whole else f'Pages {start_p}-{end_p}'} processing started...")

    # 2. Extract Markdown
    md_text = pymupdf4llm.to_markdown(str(pdf_path), pages=pages_to_read)

    cursor = 0
    all_leaves = []
    summary_blocks = []
    temp_group = []
    CHUNK_GROUP_SIZE = 5
    # context_buffer holds the 'rolling state'
    context_buffer = {"predecessor": "Start", "latest_summary": "None"}

    while cursor < len(md_text):
        lookahead = md_text[cursor : cursor + 6000]
        if not lookahead.strip():
            break
        prompt = f"Context: {context_buffer['latest_summary']} | Prev: {context_buffer['predecessor'][:200]}...\nExtract a self-sufficient Jungian chunk. JSON keys: 'break_text', 'rewritten_text', 'filename'."
        try:
            result = call_groq_json(prompt, lookahead)
            # Semantic Jump Logic: advance the cursor past the LLM-chosen
            # break text; fall back to a fixed 2000-char step if the model
            # returned text that is not actually in the window.
            break_text = result.get('break_text', "")
            relative_break = lookahead.find(break_text) + len(break_text) if (break_text and break_text in lookahead) else 2000
            new_chunk = {
                "type": "leaf",
                "filename": result.get('filename', 'untitled_chunk').replace(" ", "_"),
                "content": result.get('rewritten_text', ''),
                "parent_summary": context_buffer["latest_summary"]
            }
            all_leaves.append(new_chunk)
            temp_group.append(new_chunk)
            # PUSH TO UI
            if queue:
                await queue.put(new_chunk)
            context_buffer["predecessor"] = new_chunk["content"]
            cursor += relative_break

            # PHASE II: AGGREGATION
            if len(temp_group) >= CHUNK_GROUP_SIZE:
                summary_res = generate_summary_block(temp_group)
                summary_node = {
                    "type": "summary",
                    "name": summary_res['summary_name'].replace(" ", "_"),
                    "content": summary_res['synthesis'],
                    "children": [c['filename'] for c in temp_group]
                }
                summary_blocks.append(summary_node)
                context_buffer["latest_summary"] = summary_node["content"]
                # Update all chunks in this group with their official parent
                # summary, and save each one's contextual markdown file.
                for c in temp_group:
                    c["parent_summary"] = summary_node["content"]
                    # SAVE CONTEXTUAL MARKDOWN FILE
                    md_filename = md_folder / f"{c['filename']}.md"
                    with open(md_filename, "w", encoding="utf-8") as md_file:
                        md_file.write(f"--- CONTEXT ---\n{summary_node['content']}\n\n")
                        md_file.write(f"--- CONTENT ---\n{c['content']}")
                if queue:
                    await queue.put(summary_node)
                temp_group = []
        except Exception as e:
            # Best-effort recovery: log, skip ahead a fixed distance, continue.
            print(f"Error: {e}")
            cursor += 3000
            continue

    # Final Save of the Master JSON
    final_data = {"leaves": all_leaves, "summaries": summary_blocks}
    with open(f"knowledge_tree_{timestamp}.json", "w", encoding="utf-8") as f:
        json.dump(final_data, f, indent=4)
    if queue:
        await queue.put("DONE")
'''
    return script_content
# Emit the generated script only when run as a program, so importing this
# module (e.g. to call generate_updated_aggregator directly) has no side effects.
if __name__ == "__main__":
    print(generate_updated_aggregator())