Spaces:
Running
Running
chunker premier push to hf
Browse files- .gitignore +18 -0
- Dockerfile +17 -0
- index.html +89 -0
- main.py +59 -0
- phase0102_chunker_aggregator_2.py +145 -0
- requirements.txt +7 -0
.gitignore
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
venv/
node_modules/
*.log

# Python
__pycache__/
*.pyc

# Env
.env

# App output (generated at runtime by main.py / the chunker)
temp_*.pdf
knowledge_tree_*.json

# VS Code / Visual Studio
.vs/
.vscode/

# OS
.DS_Store
Thumbs.db
Dockerfile
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
FROM python:3.11

WORKDIR /code

# Install system dependencies for PDF processing.
# NOTE(review): python:3.11 is Debian bookworm, where the legacy
# libgl1-mesa-glx package was dropped — installing it fails the build.
# libgl1 is its bookworm replacement.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    libgl1 \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first so the pip layer is cached across code-only changes.
COPY ./requirements.txt /code/requirements.txt
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

COPY . .

# HF Spaces expects the app on port 7860
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
index.html
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Jung Chunker UI</title>
    <style>
        :root { --bg: #fdf6e3; --text: #586e75; --accent: #268bd2; }
        body { font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; background: var(--bg); color: var(--text); margin: 0; display: flex; height: 100vh; }

        /* Sidebar for the Tree */
        #sidebar { width: 350px; border-right: 1px solid #ddd; overflow-y: auto; padding: 20px; background: #eee8d5; }
        #viewer { flex-grow: 1; padding: 40px; overflow-y: auto; line-height: 1.6; }

        .tree-node { margin-left: 15px; border-left: 2px solid #ccc; padding-left: 10px; margin-bottom: 10px; }
        .summary-block { font-weight: bold; color: var(--accent); cursor: pointer; display: block; margin-top: 15px; }
        .leaf-node { font-size: 0.9em; cursor: pointer; color: #657b83; display: block; margin: 5px 0; }
        .leaf-node:hover { text-decoration: underline; }

        h1 { font-size: 1.2em; border-bottom: 1px solid #ccc; padding-bottom: 10px; }
        .chunk-content { background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 5px rgba(0,0,0,0.1); }
        .upload-section { margin-bottom: 20px; padding: 15px; background: #fff; border-radius: 5px; }
    </style>
</head>
<body>

    <div id="sidebar">
        <h1>Jungian Chunker</h1>

        <div class="upload-section">
            <input type="file" id="pdfUpload" accept=".pdf">
            <button onclick="uploadFile()">Upload PDF</button>
            <div id="status" style="font-size: 0.8em; margin-top: 5px;">Ready.</div>
        </div>

        <div id="tree-container">
            <!-- Tree nodes appear here as the backend streams them -->
            <p style="font-style: italic;">Tree will build here...</p>
        </div>
    </div>

    <div id="viewer">
        <div id="content-display" class="chunk-content">
            <h2>Content Viewer</h2>
            <p>Select a chunk from the tree to read the self-sufficient version here.</p>
        </div>
    </div>

    <script>
        // Upload the selected PDF to the FastAPI backend (/upload) and then
        // subscribe to the SSE stream (/stream) for real-time tree updates.
        // (Replaces the previous mockTreeBuild() placeholder.)
        async function uploadFile() {
            const fileInput = document.getElementById('pdfUpload');
            const status = document.getElementById('status');
            if (!fileInput.files[0]) return alert("Select a file first!");

            status.innerText = "Uploading & Processing...";

            const formData = new FormData();
            formData.append('file', fileInput.files[0]);
            const resp = await fetch('/upload', { method: 'POST', body: formData });
            if (!resp.ok) {
                status.innerText = "Upload failed.";
                return;
            }

            document.getElementById('tree-container').innerHTML = ""; // Clear
            listenForUpdates();
        }

        // Open the Server-Sent Events channel; the backend pushes one JSON
        // node per chunk/summary and a {"type": "done"} sentinel at the end.
        function listenForUpdates() {
            const status = document.getElementById('status');
            const source = new EventSource('/stream');
            source.onmessage = (event) => {
                const node = JSON.parse(event.data);
                if (node.type === 'done') {
                    status.innerText = "Done.";
                    source.close();
                    return;
                }
                renderNode(node);
            };
            source.onerror = () => {
                status.innerText = "Stream error.";
                source.close();
            };
        }

        // Append one leaf or summary node to the sidebar tree.
        function renderNode(node) {
            const container = document.getElementById('tree-container');
            const el = document.createElement('span');
            if (node.type === 'summary') {
                el.className = "summary-block";
                el.innerText = `⭐ ${node.name}`;
            } else {
                el.className = "leaf-node";
                el.innerText = `∟ ${node.filename}`;
            }
            el.onclick = () => view(node.name || node.filename, node.content);
            container.appendChild(el);
        }

        // NOTE(review): text is injected via innerHTML, so LLM output could
        // contain live HTML — consider textContent if that matters.
        function view(title, text) {
            const display = document.getElementById('content-display');
            display.innerHTML = `<h2>${title}</h2><p>${text}</p>`;
        }
    </script>

</body>
</html>
main.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# /pb/py/chunker/main.py
# ./main.py

import os
import asyncio
import json
import shutil

# BUGFIX: the original did `from fastapi import ... BackgroundTask`, which
# does not exist in fastapi (it exports `BackgroundTasks`); the name was
# unused and the ImportError crashed the app at startup, so it is dropped.
from fastapi import FastAPI, UploadFile, File
from fastapi.responses import HTMLResponse, StreamingResponse
from fastapi.staticfiles import StaticFiles

# Import chunking logic from the existing combined script
# Note: Ensure script functions are wrap-able or callable
from phase0102_chunker_aggregator_2 import run_chunking_process

app = FastAPI()

# Single shared queue the background chunker pushes progress events into;
# the /stream SSE endpoint drains it.
# NOTE(review): shared across ALL clients — fine for a single-user Space,
# not safe for concurrent users.
progress_queue = asyncio.Queue()
@app.get("/", response_class=HTMLResponse)
async def get_ui():
    """Serve the single-page UI from index.html in the working directory."""
    # Explicit encoding so the page bytes don't depend on the host locale.
    with open("index.html", "r", encoding="utf-8") as f:
        return f.read()
@app.post("/upload")
async def handle_upload(file: UploadFile = File(...)):
    """Accept a PDF upload and kick off chunking in the background.

    Returns immediately; progress is streamed to the UI via /stream.
    """
    # BUGFIX/security: use only the basename of the client-supplied filename
    # so a name like "../../etc/x" cannot escape the working directory.
    safe_name = os.path.basename(file.filename or "upload.pdf")
    temp_path = f"temp_{safe_name}"
    with open(temp_path, "wb") as buffer:
        shutil.copyfileobj(file.file, buffer)

    # Start the chunking in the background so the UI doesn't freeze.
    # We pass the queue so the script can "push" updates to it.
    asyncio.create_task(run_chunking_process(temp_path, progress_queue))

    return {"status": "Processing started", "filename": file.filename}
@app.get("/stream")
async def stream_updates():
    """Server-Sent Events endpoint.

    The browser keeps a connection open here and receives each chunk or
    summary node as soon as the background task queues it.
    """
    async def event_generator():
        # Forward queue items as SSE frames until the "DONE" sentinel,
        # then emit the terminal frame and finish the stream.
        while (item := await progress_queue.get()) != "DONE":
            yield f"data: {json.dumps(item)}\n\n"
        yield "data: {\"type\": \"done\"}\n\n"

    return StreamingResponse(event_generator(), media_type="text/event-stream")
if __name__ == "__main__":
    # Local-dev entry point; in the HF Space the Dockerfile CMD starts
    # uvicorn directly, so this branch never runs there.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
phase0102_chunker_aggregator_2.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Stdlib imports (BUGFIX: the duplicate second `import datetime` was removed).
import os
import json
import datetime
import asyncio
import sys
from pathlib import Path

# Third-party imports.
import tiktoken
import pymupdf4llm
from groq import Groq
from dotenv import load_dotenv

load_dotenv()
client = Groq(api_key=os.getenv("GROQ_API_KEY"))
MODEL = "llama-3.1-8b-instant"
encoding = tiktoken.get_encoding("cl100k_base")

# 2. Define the folder and the filename.
# NOTE(review): hard-coded Windows path — only meaningful when this module
# is run directly on the author's machine; the FastAPI app passes its own
# pdf_path into run_chunking_process instead.
pdf_folder = Path("C:/Users/wd052/OneDrive/Desktop/00/01/PDFs/J/CW")
pdf_name = "Collected Works of Dr. C.G. Jung - Vol. 6 - Psychological-Types.pdf"

# Combine them
pdf_path = pdf_folder / pdf_name

WHOLE = False  # Set to True to process the whole book; False to process a page range
START_PAGE = 20
END_PAGE = 30
def call_groq_json(system_prompt, user_content):
    """Call the Groq chat API in strict-JSON mode and return the parsed dict.

    Raises json.JSONDecodeError if the model emits invalid JSON despite the
    json_object response format.
    """
    # Harden the caller's prompt so the model stays in strict-JSON mode.
    hardened = system_prompt + "\nIMPORTANT: Ensure all internal quotes are escaped. Respond ONLY in valid JSON."
    messages = [
        {"role": "system", "content": hardened},
        {"role": "user", "content": user_content},
    ]
    completion = client.chat.completions.create(
        model=MODEL,
        messages=messages,
        response_format={"type": "json_object"},
        temperature=0.2,
    )
    raw = completion.choices[0].message.content
    print(f"\nLLM raw response: {raw}\n")
    return json.loads(raw)
async def run_chunking_process(pdf_path, queue=None, whole=WHOLE, start_p=START_PAGE, end_p=END_PAGE):
    """Chunk a PDF into self-sufficient leaf chunks plus level-1 summaries.

    Args:
        pdf_path: Path to the PDF to process.
        queue: Optional asyncio.Queue; each new leaf/summary node is pushed
            onto it for the SSE UI, and the sentinel "DONE" marks the end.
        whole: True to process every page; False to use [start_p, end_p).
        start_p / end_p: Page range when whole is False.

    Side effect: writes knowledge_tree_<timestamp>.json to the CWD.

    NOTE(review): the Groq calls are synchronous, so this coroutine blocks
    the event loop while each LLM request is in flight.
    """
    print(f"\nwhole: {whole}, start_p: {start_p}, end_p: {end_p}")

    # 1. Determine Page Range
    if whole:
        # PyMuPDF4LLM uses None to process all pages
        pages_to_read = None
        print("📚 Processing the WHOLE book...")
    else:
        pages_to_read = list(range(start_p, end_p))
        print(f"📑 Processing pages {start_p} to {end_p}...")

    print(f"\ngot to CW6 chunking logic with pdf_path: {pdf_path} and pages_to_read: {pages_to_read}")
    # 2. Extract Markdown
    md_text = pymupdf4llm.to_markdown(str(pdf_path), pages=pages_to_read)

    cursor = 0
    all_leaves = []
    summary_blocks = []
    temp_group = []
    CHUNK_GROUP_SIZE = 5

    context_buffer = {"predecessor": "Start", "latest_summary": "None"}

    while cursor < len(md_text):
        lookahead = md_text[cursor : cursor + 6000]
        if not lookahead.strip():
            break

        prompt = f"Context: {context_buffer['latest_summary']} | Prev: {context_buffer['predecessor'][:200]}...\nExtract a self-sufficient Jungian chunk. JSON keys: 'break_text', 'rewritten_text', 'filename'."

        try:
            result = call_groq_json(prompt, lookahead)

            # Semantic jump: advance the cursor to just past the text the
            # model chose as the chunk boundary; fall back to a fixed jump.
            break_text = result.get('break_text', "")
            if break_text and break_text in lookahead:
                relative_break = lookahead.find(break_text) + len(break_text)
            else:
                relative_break = 2000
            # BUGFIX: an empty break_text used to yield relative_break == 0
            # ("" is "in" every string, find("") == 0, len("") == 0), which
            # froze the cursor and looped forever. Guarantee forward progress.
            if relative_break <= 0:
                relative_break = 2000

            new_chunk = {
                "type": "leaf",
                "filename": result.get('filename', 'untitled'),
                "content": result.get('rewritten_text', '')
            }

            all_leaves.append(new_chunk)
            temp_group.append(new_chunk)

            # PUSH TO UI
            if queue:
                await queue.put(new_chunk)

            context_buffer["predecessor"] = new_chunk["content"]
            cursor += relative_break

            # PHASE II: AGGREGATION — fold every CHUNK_GROUP_SIZE leaves
            # into one summary node. (The redundant self-import of
            # generate_summary_block was removed; it lives in this module.)
            if len(temp_group) >= CHUNK_GROUP_SIZE:
                print(f"\nreached phase0102")
                summary_res = generate_summary_block(temp_group)

                summary_node = {
                    "type": "summary",
                    "name": summary_res['summary_name'],
                    "content": summary_res['synthesis'],
                    "children": [c['filename'] for c in temp_group]
                }
                summary_blocks.append(summary_node)
                context_buffer["latest_summary"] = summary_node["content"]

                if queue:
                    await queue.put(summary_node)

                temp_group = []

        except Exception as e:
            # Best-effort: log and skip ahead so one bad LLM response
            # doesn't kill the whole run.
            print(f"Error: {e}")
            cursor += 3000
            continue

    # Final Save
    timestamp = datetime.datetime.now().strftime("%m%d%Y_%H%M")
    final_data = {"leaves": all_leaves, "summaries": summary_blocks}
    with open(f"knowledge_tree_{timestamp}.json", "w", encoding="utf-8") as f:
        json.dump(final_data, f, indent=4)

    if queue:
        await queue.put("DONE")
# Helper for summary
def generate_summary_block(chunks):
    """Collapse a group of leaf chunks into one level-1 summary node dict."""
    parts = (f"{chunk['filename']}: {chunk['content']}" for chunk in chunks)
    merged = "\n\n".join(parts)
    prompt = "Synthesize these Jungian chunks into a dense Level-1 summary. JSON keys: 'summary_name', 'synthesis'."
    return call_groq_json(prompt, merged)
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Web app
fastapi
uvicorn
# .env loading (GROQ_API_KEY)
python-dotenv
# LLM client + tokenizer
groq
tiktoken
# PDF -> markdown extraction
pymupdf4llm
# Needed by FastAPI to parse multipart form uploads (UploadFile)
python-multipart