File size: 5,131 Bytes
27f4d1e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135

import os

def generate_updated_aggregator():
    """Return the source text of the aggregator script as one string.

    The text is kept in a raw triple-quoted literal, so backslash
    sequences such as ``\\n`` inside it are returned as two literal
    characters and only become escapes when the generated script is
    itself executed. Nothing is written to disk here; the caller decides
    what to do with the returned text.
    """
    # Raw literal: the embedded script must be handed back verbatim.
    return r'''import os

import json

import datetime

import asyncio

import tiktoken

import pymupdf4llm

from groq import Groq

from dotenv import load_dotenv

from pathlib import Path



# 1. SETUP

load_dotenv()

client = Groq(api_key=os.getenv("GROQ_API_KEY"))

MODEL = "llama-3.1-8b-instant"

encoding = tiktoken.get_encoding("cl100k_base")



def call_groq_json(system_prompt, user_content):

    strict_system_prompt = system_prompt + "\nIMPORTANT: Ensure all internal quotes are escaped. Respond ONLY in valid JSON."

    completion = client.chat.completions.create(

        model=MODEL,

        messages=[

            {"role": "system", "content": strict_system_prompt},

            {"role": "user", "content": user_content}

        ],

        response_format={"type": "json_object"},

        temperature=0.2

    )

    return json.loads(completion.choices[0].message.content)



def generate_summary_block(chunks):

    combined = "\n\n".join([f"{c['filename']}: {c['content']}" for c in chunks])

    prompt = "Synthesize these Jungian chunks into a dense Level-1 summary. JSON keys: 'summary_name', 'synthesis'."

    return call_groq_json(prompt, combined)



async def run_chunking_process(pdf_path, queue=None, whole=False, start_p=20, end_p=30):

    # Setup Directory for Markdown Files

    timestamp = datetime.datetime.now().strftime("%m%d%Y_%H%M")

    md_folder = Path(f"jungian_agent_data_{timestamp}")

    md_folder.mkdir(exist_ok=True)



    # 1. Determine Page Range

    pages_to_read = None if whole else list(range(start_p, end_p))

    print(f"🚀 {'WHOLE BOOK' if whole else f'Pages {start_p}-{end_p}'} processing started...")



    # 2. Extract Markdown

    md_text = pymupdf4llm.to_markdown(str(pdf_path), pages=pages_to_read)

    

    cursor = 0

    all_leaves = []

    summary_blocks = []

    temp_group = []

    CHUNK_GROUP_SIZE = 5

    

    # context_buffer holds the 'rolling state'

    context_buffer = {"predecessor": "Start", "latest_summary": "None"}



    while cursor < len(md_text):

        lookahead = md_text[cursor : cursor + 6000]

        if not lookahead.strip(): break



        prompt = f"Context: {context_buffer['latest_summary']} | Prev: {context_buffer['predecessor'][:200]}...\nExtract a self-sufficient Jungian chunk. JSON keys: 'break_text', 'rewritten_text', 'filename'."

        

        try:

            result = call_groq_json(prompt, lookahead)

            

            # Semantic Jump Logic

            break_text = result.get('break_text', "")

            relative_break = lookahead.find(break_text) + len(break_text) if (break_text and break_text in lookahead) else 2000

            

            new_chunk = {

                "type": "leaf",

                "filename": result.get('filename', 'untitled_chunk').replace(" ", "_"),

                "content": result.get('rewritten_text', ''),

                "parent_summary": context_buffer["latest_summary"]

            }

            

            all_leaves.append(new_chunk)

            temp_group.append(new_chunk)



            # PUSH TO UI

            if queue:

                await queue.put(new_chunk)



            context_buffer["predecessor"] = new_chunk["content"]

            cursor += relative_break



            # PHASE II: AGGREGATION

            if len(temp_group) >= CHUNK_GROUP_SIZE:

                summary_res = generate_summary_block(temp_group)

                

                summary_node = {

                    "type": "summary",

                    "name": summary_res['summary_name'].replace(" ", "_"),

                    "content": summary_res['synthesis'],

                    "children": [c['filename'] for c in temp_group]

                }

                summary_blocks.append(summary_node)

                context_buffer["latest_summary"] = summary_node["content"]

                

                # Update all chunks in this group with their official parent summary

                for c in temp_group:

                    c["parent_summary"] = summary_node["content"]

                    

                    # SAVE CONTEXTUAL MARKDOWN FILE

                    md_filename = md_folder / f"{c['filename']}.md"

                    with open(md_filename, "w", encoding="utf-8") as md_file:

                        md_file.write(f"--- CONTEXT ---\n{summary_node['content']}\n\n")

                        md_file.write(f"--- CONTENT ---\n{c['content']}")



                if queue:

                    await queue.put(summary_node)

                

                temp_group = []



        except Exception as e:

            print(f"Error: {e}")

            cursor += 3000

            continue



    # Final Save of the Master JSON

    final_data = {"leaves": all_leaves, "summaries": summary_blocks}

    with open(f"knowledge_tree_{timestamp}.json", "w") as f:

        json.dump(final_data, f, indent=4)

    

    if queue:

        await queue.put("DONE")

'''

# Emit the generated aggregator script on standard output.
_aggregator_source = generate_updated_aggregator()
print(_aggregator_source)