# Source: Hugging Face Space "prashantmatlani/chunker" (status: Running)
# File size: 5,131 bytes
# Revision: 27f4d1e

import os

def generate_updated_aggregator():
    """Return the source code of the PDF chunking / aggregation script.

    The script is kept verbatim in a raw string so that the backslash
    escapes it contains are emitted unchanged. When run, the emitted script
    builds a Groq client from the ``GROQ_API_KEY`` environment variable and
    defines:

    * ``call_groq_json`` - one chat completion, parsed as a JSON object.
    * ``generate_summary_block`` - summary request for a group of leaves.
    * ``run_chunking_process`` - coroutine that converts a PDF to markdown,
      walks it in 6000-character windows, asks the model for one chunk per
      window, aggregates every ``CHUNK_GROUP_SIZE`` leaves into a summary,
      writes one markdown file per leaf and finally a master JSON file.

    Behaviour of the emitted script that this version guarantees:

    * Leaves left over when the text ends (fewer than ``CHUNK_GROUP_SIZE``)
      are still summarised and written to markdown; aggregation lives in a
      single nested ``flush_group`` coroutine used in both places.
    * The 3000-character error skip is applied only when the current window
      produced no chunk, so a failure during aggregation no longer skips
      unprocessed text.
    * Path separators in the model-supplied file name are replaced, and a
      missing or empty name falls back to ``untitled_chunk``.

    Returns:
        str: the complete script text.
    """
    script_content = r'''import os

import json

import datetime

import asyncio

import tiktoken

import pymupdf4llm

from groq import Groq

from dotenv import load_dotenv

from pathlib import Path



# 1. SETUP

load_dotenv()

client = Groq(api_key=os.getenv("GROQ_API_KEY"))

MODEL = "llama-3.1-8b-instant"

encoding = tiktoken.get_encoding("cl100k_base")



def call_groq_json(system_prompt, user_content):

    strict_system_prompt = system_prompt + "\nIMPORTANT: Ensure all internal quotes are escaped. Respond ONLY in valid JSON."

    completion = client.chat.completions.create(

        model=MODEL,

        messages=[

            {"role": "system", "content": strict_system_prompt},

            {"role": "user", "content": user_content}

        ],

        response_format={"type": "json_object"},

        temperature=0.2

    )

    return json.loads(completion.choices[0].message.content)



def generate_summary_block(chunks):

    combined = "\n\n".join([f"{c['filename']}: {c['content']}" for c in chunks])

    prompt = "Synthesize these Jungian chunks into a dense Level-1 summary. JSON keys: 'summary_name', 'synthesis'."

    return call_groq_json(prompt, combined)



async def run_chunking_process(pdf_path, queue=None, whole=False, start_p=20, end_p=30):

    # Setup Directory for Markdown Files

    timestamp = datetime.datetime.now().strftime("%m%d%Y_%H%M")

    md_folder = Path(f"jungian_agent_data_{timestamp}")

    md_folder.mkdir(exist_ok=True)



    # 1. Determine Page Range

    pages_to_read = None if whole else list(range(start_p, end_p))

    print(f"🚀 {'WHOLE BOOK' if whole else f'Pages {start_p}-{end_p}'} processing started...")



    # 2. Extract Markdown

    md_text = pymupdf4llm.to_markdown(str(pdf_path), pages=pages_to_read)



    cursor = 0

    all_leaves = []

    summary_blocks = []

    temp_group = []

    CHUNK_GROUP_SIZE = 5



    # context_buffer holds the 'rolling state'

    context_buffer = {"predecessor": "Start", "latest_summary": "None"}



    async def flush_group():

        # PHASE II: AGGREGATION

        # Summarise the pending leaves, save their markdown files, then empty the group.

        summary_res = generate_summary_block(temp_group)

        summary_node = {

            "type": "summary",

            "name": summary_res['summary_name'].replace(" ", "_"),

            "content": summary_res['synthesis'],

            "children": [c['filename'] for c in temp_group]

        }

        summary_blocks.append(summary_node)

        context_buffer["latest_summary"] = summary_node["content"]

        # Update all chunks in this group with their official parent summary

        for c in temp_group:

            c["parent_summary"] = summary_node["content"]

            # SAVE CONTEXTUAL MARKDOWN FILE

            md_filename = md_folder / f"{c['filename']}.md"

            with open(md_filename, "w", encoding="utf-8") as md_file:

                md_file.write(f"--- CONTEXT ---\n{summary_node['content']}\n\n")

                md_file.write(f"--- CONTENT ---\n{c['content']}")

        if queue:

            await queue.put(summary_node)

        # Emptied in place: the leaf dicts stay referenced by all_leaves.

        temp_group.clear()



    while cursor < len(md_text):

        lookahead = md_text[cursor : cursor + 6000]

        if not lookahead.strip(): break



        prompt = f"Context: {context_buffer['latest_summary']} | Prev: {context_buffer['predecessor'][:200]}...\nExtract a self-sufficient Jungian chunk. JSON keys: 'break_text', 'rewritten_text', 'filename'."

        # Becomes True once this window has moved the cursor forward.

        advanced = False

        try:

            result = call_groq_json(prompt, lookahead)

            # Semantic Jump Logic

            break_text = result.get('break_text', "")

            relative_break = lookahead.find(break_text) + len(break_text) if (break_text and break_text in lookahead) else 2000

            # The model chooses the name and it becomes a file name: neutralise path separators.

            raw_name = str(result.get('filename') or 'untitled_chunk')

            new_chunk = {

                "type": "leaf",

                "filename": raw_name.replace(" ", "_").replace("/", "_").replace("\\", "_"),

                "content": result.get('rewritten_text', ''),

                "parent_summary": context_buffer["latest_summary"]

            }

            all_leaves.append(new_chunk)

            temp_group.append(new_chunk)



            # PUSH TO UI

            if queue:

                await queue.put(new_chunk)



            context_buffer["predecessor"] = new_chunk["content"]

            cursor += relative_break

            advanced = True



            if len(temp_group) >= CHUNK_GROUP_SIZE:

                await flush_group()



        except Exception as e:

            print(f"Error: {e}")

            # Skip ahead only when this window yielded no chunk; after a failed

            # aggregation the cursor has already moved and the text ahead is unread.

            if not advanced:

                cursor += 3000

            continue



    # Leaves left over at the end of the text still need a summary and markdown files.

    if temp_group:

        try:

            await flush_group()

        except Exception as e:

            print(f"Error: {e}")



    # Final Save of the Master JSON

    final_data = {"leaves": all_leaves, "summaries": summary_blocks}

    with open(f"knowledge_tree_{timestamp}.json", "w") as f:

        json.dump(final_data, f, indent=4)



    if queue:

        await queue.put("DONE")

'''
    return script_content

# Build the aggregator script text, then emit it on standard output.
_aggregator_source = generate_updated_aggregator()
print(_aggregator_source)