prashantmatlani committed on
Commit
537abbd
·
verified ·
1 Parent(s): c46bd99

chunker premiere push to hf

Browse files
Files changed (6) hide show
  1. .gitignore +18 -0
  2. Dockerfile +17 -0
  3. index.html +89 -0
  4. main.py +59 -0
  5. phase0102_chunker_aggregator_2.py +145 -0
  6. requirements.txt +7 -0
.gitignore ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ venv/
2
+ node_modules/
3
+ *.log
4
+
5
+ # Python
6
+ __pycache__/
7
+ *.pyc
8
+
9
+ # Env
10
+ .env
11
+
12
+ # VS Code / Visual Studio
13
+ .vs/
14
+ .vscode/
15
+
16
+ # OS
17
+ .DS_Store
18
+ Thumbs.db
Dockerfile ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11

WORKDIR /code

# Install system dependencies for PDF processing.
# NOTE(review): python:3.11 is based on Debian bookworm, where the legacy
# `libgl1-mesa-glx` package was removed; `libgl1` is its replacement, so the
# old name makes `apt-get install` (and the whole build) fail.
RUN apt-get update && apt-get install -y \
    build-essential \
    libgl1 \
    && rm -rf /var/lib/apt/lists/*

# Install Python deps first so this layer is cached across code-only changes.
COPY ./requirements.txt /code/requirements.txt
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

COPY . .

# HF Spaces expects the app on port 7860
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
index.html ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ <!DOCTYPE html>
3
+ <html lang="en">
4
+ <head>
5
+ <meta charset="UTF-8">
6
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
7
+ <title>Jung Chunker UI</title>
8
+ <style>
9
+ :root { --bg: #fdf6e3; --text: #586e75; --accent: #268bd2; }
10
+ body { font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; background: var(--bg); color: var(--text); margin: 0; display: flex; height: 100vh; }
11
+
12
+ /* Sidebar for the Tree */
13
+ #sidebar { width: 350px; border-right: 1px solid #ddd; overflow-y: auto; padding: 20px; background: #eee8d5; }
14
+ #viewer { flex-grow: 1; padding: 40px; overflow-y: auto; line-height: 1.6; }
15
+
16
+ .tree-node { margin-left: 15px; border-left: 2px solid #ccc; padding-left: 10px; margin-bottom: 10px; }
17
+ .summary-block { font-weight: bold; color: var(--accent); cursor: pointer; display: block; margin-top: 15px; }
18
+ .leaf-node { font-size: 0.9em; cursor: pointer; color: #657b83; display: block; margin: 5px 0; }
19
+ .leaf-node:hover { text-decoration: underline; }
20
+
21
+ h1 { font-size: 1.2em; border-bottom: 1px solid #ccc; padding-bottom: 10px; }
22
+ .chunk-content { background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 5px rgba(0,0,0,0.1); }
23
+ .upload-section { margin-bottom: 20px; padding: 15px; background: #fff; border-radius: 5px; }
24
+ </style>
25
+ </head>
26
+ <body>
27
+
28
+ <div id="sidebar">
29
+ <h1>Jungian Chunker</h1>
30
+
31
+ <div class="upload-section">
32
+ <input type="file" id="pdfUpload" accept=".pdf">
33
+ <button onclick="uploadFile()">Upload PDF</button>
34
+ <div id="status" style="font-size: 0.8em; margin-top: 5px;">Ready.</div>
35
+ </div>
36
+
37
+ <div id="tree-container">
38
+ <!-- Tree nodes will appear here dynamically -->
39
+ <p style="font-style: italic;">Tree will build here...</p>
40
+ </div>
41
+ </div>
42
+
43
+ <div id="viewer">
44
+ <div id="content-display" class="chunk-content">
45
+ <h2>Content Viewer</h2>
46
+ <p>Select a chunk from the tree to read the self-sufficient version here.</p>
47
+ </div>
48
+ </div>
49
+
50
+ <script>
51
// Placeholder: simulates the upload flow until the backend call is wired in.
async function uploadFile() {
    const status = document.getElementById('status');
    const fileInput = document.getElementById('pdfUpload');
    if (!fileInput.files[0]) return alert("Select a file first!");

    status.innerText = "Uploading & Processing...";

    // In your real HF Space, you'll use:
    // const formData = new FormData();
    // formData.append('file', fileInput.files[0]);
    // await fetch('/upload', {method: 'POST', body: formData});

    status.innerText = "Simulating Tree Build...";
    mockTreeBuild();
}
67
+
68
// Simulates the incremental tree updates the Python pipeline will stream.
function mockTreeBuild() {
    const container = document.getElementById('tree-container');

    // Reset the sidebar, then append one example summary node whose
    // leaf entries call view() with hard-coded demo content.
    container.innerHTML = "";

    const summary = document.createElement('div');
    summary.className = "tree-node";
    summary.innerHTML = `<span class="summary-block">⭐ Psychological Development</span>
        <span class="leaf-node" onclick="view('Introversion vs Extraversion', 'This is the self-sufficient text about mechanisms...')">∟ extraverted_vs_introverted</span>
        <span class="leaf-node" onclick="view('The Personal Equation', 'Psychological constellation affects the observer...')">∟ personal_equation</span>`;

    container.appendChild(summary);
}
81
+
82
// Render a chunk in the viewer pane.
// FIX: the old version interpolated title/text straight into innerHTML, so
// chunk text containing markup (e.g. "<script>") would be parsed as HTML once
// real LLM output flows through here. Build the nodes with textContent instead;
// for the current plain-text callers the rendered result is identical.
function view(title, text) {
    const display = document.getElementById('content-display');
    display.innerHTML = "";
    const heading = document.createElement('h2');
    heading.textContent = title;
    const body = document.createElement('p');
    body.textContent = text;
    display.append(heading, body);
}
86
+ </script>
87
+
88
+ </body>
89
+ </html>
main.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# /pb/py/chunker/main.py
# ./main.py

import os
import asyncio
import json
import shutil

# NOTE(review): the original line imported `BackgroundTask` from fastapi.
# FastAPI's public API exports `BackgroundTasks` (plural) only, so that import
# raised ImportError at startup; the name was unused, so it is simply dropped.
from fastapi import FastAPI, UploadFile, File
from fastapi.responses import HTMLResponse, StreamingResponse
from fastapi.staticfiles import StaticFiles

# Import chunking logic from the existing combined script
# Note: Ensure script functions are wrap-able or callable
from phase0102_chunker_aggregator_2 import run_chunking_process

app = FastAPI()

# Global store to keep track of progress for the UI.
# Single shared queue: the background chunker pushes leaf/summary dicts in,
# the /stream SSE endpoint pops them out and relays them to the browser.
progress_queue = asyncio.Queue()
21
+
22
@app.get("/", response_class=HTMLResponse)
async def get_ui():
    """Serve the single-page UI.

    Reads index.html from the working directory on every request, so edits
    show up without a server restart. Encoding is pinned to UTF-8 because the
    page contains non-ASCII glyphs (the tree markers) that a platform-default
    codec such as cp1252 would fail to decode or would mangle.
    """
    with open("index.html", "r", encoding="utf-8") as f:
        return f.read()
26
+
27
@app.post("/upload")
async def handle_upload(file: UploadFile = File(...)):
    """Accept a PDF upload and start chunking it in the background.

    Saves the upload to a local temp file, schedules run_chunking_process as
    a background task (so the request returns immediately and the UI doesn't
    freeze), and hands it the shared progress_queue so it can push updates to
    the /stream SSE endpoint.
    """
    # Use only the basename of the client-supplied filename: a crafted name
    # such as "../../x.pdf" must not be able to write outside the workdir.
    safe_name = os.path.basename(file.filename or "upload.pdf")
    temp_path = f"temp_{safe_name}"
    with open(temp_path, "wb") as buffer:
        shutil.copyfileobj(file.file, buffer)

    # Fire-and-forget: progress flows back to the browser via progress_queue.
    asyncio.create_task(run_chunking_process(temp_path, progress_queue))

    return {"status": "Processing started", "filename": file.filename}
39
+
40
@app.get("/stream")
async def stream_updates():
    """SSE endpoint: the browser listens here for real-time chunk updates."""

    async def event_generator():
        # Relay queue items to the client until the "DONE" sentinel arrives.
        while True:
            item = await progress_queue.get()
            if item == "DONE":
                yield "data: {\"type\": \"done\"}\n\n"
                return
            yield f"data: {json.dumps(item)}\n\n"

    return StreamingResponse(event_generator(), media_type="text/event-stream")
56
+
57
if __name__ == "__main__":
    # Local-dev entry point; in the HF Space the Dockerfile CMD runs uvicorn.
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)
phase0102_chunker_aggregator_2.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
# stdlib
import os
import sys
import json
import asyncio
import datetime
from pathlib import Path

# third-party
import tiktoken
import pymupdf4llm
from groq import Groq
from dotenv import load_dotenv

# Load GROQ_API_KEY from .env at import time.
# NOTE(review): Groq() is constructed here, so the module fails to import if
# the key is absent — confirm the Space has the secret configured.
load_dotenv()
client = Groq(api_key=os.getenv("GROQ_API_KEY"))
MODEL = "llama-3.1-8b-instant"
encoding = tiktoken.get_encoding("cl100k_base")

# Default PDF location for standalone (non-server) runs.
# NOTE(review): developer-machine Windows path; it is only a fallback default —
# main.py overrides it by passing the uploaded file's path explicitly.
pdf_folder = Path("C:/Users/wd052/OneDrive/Desktop/00/01/PDFs/J/CW")
pdf_name = "Collected Works of Dr. C.G. Jung - Vol. 6 - Psychological-Types.pdf"

# Combine them
pdf_path = pdf_folder / pdf_name

WHOLE = False      # True: process the whole book; False: use the page range below
START_PAGE = 20
END_PAGE = 30
34
+
35
def call_groq_json(system_prompt, user_content):
    """Call the Groq chat API in JSON mode and return the parsed dict.

    A strictness suffix is appended to the system prompt to nudge the model
    toward emitting parseable JSON; json.loads still raises if it fails.
    """
    hardened_prompt = (
        system_prompt
        + "\nIMPORTANT: Ensure all internal quotes are escaped. Respond ONLY in valid JSON."
    )
    completion = client.chat.completions.create(
        model=MODEL,
        temperature=0.2,
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": hardened_prompt},
            {"role": "user", "content": user_content},
        ],
    )
    raw = completion.choices[0].message.content
    print(f"\nLLM raw response: {raw}\n")
    return json.loads(raw)
48
+
49
async def run_chunking_process(pdf_path, queue=None, whole=WHOLE, start_p=START_PAGE, end_p=END_PAGE):
    """Chunk a PDF into self-sufficient leaf chunks plus level-1 summaries.

    Args:
        pdf_path: path to the PDF to process (str or Path).
        queue: optional asyncio.Queue; when given, every new leaf/summary dict
            is pushed to it for the UI stream, with the string "DONE" as the
            end-of-run sentinel.
        whole: True processes every page; False uses [start_p, end_p).
        start_p, end_p: page range used when whole is False.

    Side effects: writes knowledge_tree_<timestamp>.json in the working dir.
    """
    print(f"\nwhole: {whole}, start_p: {start_p}, end_p: {end_p}")

    # 1. Determine page range (pymupdf4llm treats pages=None as "all pages").
    if whole:
        pages_to_read = None
        print("📚 Processing the WHOLE book...")
    else:
        pages_to_read = list(range(start_p, end_p))
        print(f"📑 Processing pages {start_p} to {end_p}...")

    print(f"\ngot to CW6 chunking logic with pdf_path: {pdf_path} and pages_to_read: {pages_to_read}")
    # 2. Extract markdown.
    # NOTE(review): this call and the Groq calls below are blocking and will
    # stall the event loop; consider asyncio.to_thread() if the server must
    # stay responsive while a book is processed.
    md_text = pymupdf4llm.to_markdown(str(pdf_path), pages=pages_to_read)

    cursor = 0
    all_leaves = []
    summary_blocks = []
    temp_group = []
    CHUNK_GROUP_SIZE = 5   # leaves per level-1 summary node
    WINDOW = 6000          # chars of lookahead shown to the model per step
    FALLBACK_STEP = 2000   # advance when the model gives no usable break point
    ERROR_STEP = 3000      # advance past a window whose processing raised

    context_buffer = {"predecessor": "Start", "latest_summary": "None"}

    while cursor < len(md_text):
        lookahead = md_text[cursor : cursor + WINDOW]
        if not lookahead.strip():
            break

        prompt = f"Context: {context_buffer['latest_summary']} | Prev: {context_buffer['predecessor'][:200]}...\nExtract a self-sufficient Jungian chunk. JSON keys: 'break_text', 'rewritten_text', 'filename'."

        try:
            result = call_groq_json(prompt, lookahead)

            # Semantic jump: advance to just past the model's break marker.
            # BUG FIX: an empty break_text used to pass the `in` test
            # (`"" in s` is True) and yield find("")+len("") == 0, so the
            # cursor never advanced and the loop ran forever on one window.
            # Empty/missing breaks now take the fallback step, and the
            # advance is clamped to at least 1 char so progress is guaranteed.
            break_text = result.get('break_text', "")
            if break_text and break_text in lookahead:
                relative_break = lookahead.find(break_text) + len(break_text)
            else:
                relative_break = FALLBACK_STEP
            relative_break = max(relative_break, 1)

            new_chunk = {
                "type": "leaf",
                "filename": result.get('filename', 'untitled'),
                "content": result.get('rewritten_text', '')
            }

            all_leaves.append(new_chunk)
            temp_group.append(new_chunk)

            # PUSH TO UI
            if queue:
                await queue.put(new_chunk)

            context_buffer["predecessor"] = new_chunk["content"]
            cursor += relative_break

            # PHASE II: aggregate every CHUNK_GROUP_SIZE leaves into a summary.
            # (generate_summary_block is defined below in this module; the old
            # self-import of it was redundant and has been removed.)
            if len(temp_group) >= CHUNK_GROUP_SIZE:
                print(f"\nreached phase0102")
                summary_res = generate_summary_block(temp_group)

                summary_node = {
                    "type": "summary",
                    "name": summary_res['summary_name'],
                    "content": summary_res['synthesis'],
                    "children": [c['filename'] for c in temp_group]
                }
                summary_blocks.append(summary_node)
                context_buffer["latest_summary"] = summary_node["content"]

                if queue:
                    await queue.put(summary_node)

                temp_group = []

        except Exception as e:
            # Best-effort: log and skip ahead rather than aborting the run.
            print(f"Error: {e}")
            cursor += ERROR_STEP
            continue

    # Final save of the whole knowledge tree.
    timestamp = datetime.datetime.now().strftime("%m%d%Y_%H%M")
    final_data = {"leaves": all_leaves, "summaries": summary_blocks}
    with open(f"knowledge_tree_{timestamp}.json", "w") as f:
        json.dump(final_data, f, indent=4)

    if queue:
        await queue.put("DONE")
140
+
141
# Helper for summary
def generate_summary_block(chunks):
    """Fuse a group of leaf chunks into one level-1 summary via the LLM."""
    combined_text = "\n\n".join(
        f"{chunk['filename']}: {chunk['content']}" for chunk in chunks
    )
    prompt = "Synthesize these Jungian chunks into a dense Level-1 summary. JSON keys: 'summary_name', 'synthesis'."
    return call_groq_json(prompt, combined_text)
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ python-dotenv
4
+ groq
5
+ tiktoken
6
+ pymupdf4llm
7
+ python-multipart