Spaces:
Running
Running
chunker premier push to hf
Browse files- .gitignore +18 -0
- Dockerfile +17 -0
- index.html +89 -0
- main.py +59 -0
- phase0102_chunker_aggregator_2.py +145 -0
- requirements.txt +7 -0
.gitignore
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
venv/
node_modules/
*.log

# Python
__pycache__/
*.pyc

# Env
.env

# App output (generated at runtime by main.py / the chunker)
temp_*.pdf
knowledge_tree_*.json

# VS Code / Visual Studio
.vs/
.vscode/

# OS
.DS_Store
Thumbs.db
Dockerfile
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
FROM python:3.11

WORKDIR /code

# Install system dependencies for PDF processing.
# NOTE(review): python:3.11 is Debian bookworm, where the legacy
# libgl1-mesa-glx package was dropped — installing it fails the build.
# libgl1 is its bookworm replacement.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    libgl1 \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first so the pip layer is cached across code-only changes.
COPY ./requirements.txt /code/requirements.txt
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

COPY . .

# HF Spaces expects the app on port 7860
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
index.html
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Jung Chunker UI</title>
    <style>
        :root { --bg: #fdf6e3; --text: #586e75; --accent: #268bd2; }
        body { font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; background: var(--bg); color: var(--text); margin: 0; display: flex; height: 100vh; }

        /* Sidebar for the Tree */
        #sidebar { width: 350px; border-right: 1px solid #ddd; overflow-y: auto; padding: 20px; background: #eee8d5; }
        #viewer { flex-grow: 1; padding: 40px; overflow-y: auto; line-height: 1.6; }

        .tree-node { margin-left: 15px; border-left: 2px solid #ccc; padding-left: 10px; margin-bottom: 10px; }
        .summary-block { font-weight: bold; color: var(--accent); cursor: pointer; display: block; margin-top: 15px; }
        .leaf-node { font-size: 0.9em; cursor: pointer; color: #657b83; display: block; margin: 5px 0; }
        .leaf-node:hover { text-decoration: underline; }

        h1 { font-size: 1.2em; border-bottom: 1px solid #ccc; padding-bottom: 10px; }
        .chunk-content { background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 5px rgba(0,0,0,0.1); }
        .upload-section { margin-bottom: 20px; padding: 15px; background: #fff; border-radius: 5px; }
    </style>
</head>
<body>

    <div id="sidebar">
        <h1>Jungian Chunker</h1>

        <div class="upload-section">
            <input type="file" id="pdfUpload" accept=".pdf">
            <button onclick="uploadFile()">Upload PDF</button>
            <div id="status" style="font-size: 0.8em; margin-top: 5px;">Ready.</div>
        </div>

        <div id="tree-container">
            <!-- Tree nodes appear here as the backend streams them -->
            <p style="font-style: italic;">Tree will build here...</p>
        </div>
    </div>

    <div id="viewer">
        <div id="content-display" class="chunk-content">
            <h2>Content Viewer</h2>
            <p>Select a chunk from the tree to read the self-sufficient version here.</p>
        </div>
    </div>

    <script>
        // Upload the selected PDF to the FastAPI backend (/upload) and then
        // subscribe to the SSE stream (/stream) for real-time tree updates.
        // (Replaces the previous mockTreeBuild() placeholder.)
        async function uploadFile() {
            const fileInput = document.getElementById('pdfUpload');
            const status = document.getElementById('status');
            if (!fileInput.files[0]) return alert("Select a file first!");

            status.innerText = "Uploading & Processing...";

            const formData = new FormData();
            formData.append('file', fileInput.files[0]);
            const resp = await fetch('/upload', { method: 'POST', body: formData });
            if (!resp.ok) {
                status.innerText = "Upload failed.";
                return;
            }

            document.getElementById('tree-container').innerHTML = ""; // Clear
            listenForUpdates();
        }

        // Open the Server-Sent Events channel; the backend pushes one JSON
        // node per chunk/summary and a {"type": "done"} sentinel at the end.
        function listenForUpdates() {
            const status = document.getElementById('status');
            const source = new EventSource('/stream');
            source.onmessage = (event) => {
                const node = JSON.parse(event.data);
                if (node.type === 'done') {
                    status.innerText = "Done.";
                    source.close();
                    return;
                }
                renderNode(node);
            };
            source.onerror = () => {
                status.innerText = "Stream error.";
                source.close();
            };
        }

        // Append one leaf or summary node to the sidebar tree.
        function renderNode(node) {
            const container = document.getElementById('tree-container');
            const el = document.createElement('span');
            if (node.type === 'summary') {
                el.className = "summary-block";
                el.innerText = `⭐ ${node.name}`;
            } else {
                el.className = "leaf-node";
                el.innerText = `∟ ${node.filename}`;
            }
            el.onclick = () => view(node.name || node.filename, node.content);
            container.appendChild(el);
        }

        // NOTE(review): text is injected via innerHTML, so LLM output could
        // contain live HTML — consider textContent if that matters.
        function view(title, text) {
            const display = document.getElementById('content-display');
            display.innerHTML = `<h2>${title}</h2><p>${text}</p>`;
        }
    </script>

</body>
</html>
main.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# /pb/py/chunker/main.py
# ./main.py

import os
import asyncio
import json
import shutil

# BUGFIX: the original did `from fastapi import ... BackgroundTask`, which
# does not exist in fastapi (it exports `BackgroundTasks`); the name was
# unused and the ImportError crashed the app at startup, so it is dropped.
from fastapi import FastAPI, UploadFile, File
from fastapi.responses import HTMLResponse, StreamingResponse
from fastapi.staticfiles import StaticFiles

# Import chunking logic from the existing combined script
# Note: Ensure script functions are wrap-able or callable
from phase0102_chunker_aggregator_2 import run_chunking_process

app = FastAPI()

# Single shared queue the background chunker pushes progress events into;
# the /stream SSE endpoint drains it.
# NOTE(review): shared across ALL clients — fine for a single-user Space,
# not safe for concurrent users.
progress_queue = asyncio.Queue()
@app.get("/", response_class=HTMLResponse)
async def get_ui():
    """Serve the single-page UI from index.html in the working directory."""
    # Explicit encoding so the page bytes don't depend on the host locale.
    with open("index.html", "r", encoding="utf-8") as f:
        return f.read()
@app.post("/upload")
async def handle_upload(file: UploadFile = File(...)):
    """Accept a PDF upload and kick off chunking in the background.

    Returns immediately; progress is streamed to the UI via /stream.
    """
    # BUGFIX/security: use only the basename of the client-supplied filename
    # so a name like "../../etc/x" cannot escape the working directory.
    safe_name = os.path.basename(file.filename or "upload.pdf")
    temp_path = f"temp_{safe_name}"
    with open(temp_path, "wb") as buffer:
        shutil.copyfileobj(file.file, buffer)

    # Start the chunking in the background so the UI doesn't freeze.
    # We pass the queue so the script can "push" updates to it.
    asyncio.create_task(run_chunking_process(temp_path, progress_queue))

    return {"status": "Processing started", "filename": file.filename}
@app.get("/stream")
async def stream_updates():
    """Server-Sent Events endpoint.

    The browser keeps a connection open here and receives each chunk or
    summary node as soon as the background task queues it.
    """
    async def event_generator():
        # Forward queue items as SSE frames until the "DONE" sentinel,
        # then emit the terminal frame and finish the stream.
        while (item := await progress_queue.get()) != "DONE":
            yield f"data: {json.dumps(item)}\n\n"
        yield "data: {\"type\": \"done\"}\n\n"

    return StreamingResponse(event_generator(), media_type="text/event-stream")
if __name__ == "__main__":
    # Local-dev entry point; in the HF Space the Dockerfile CMD starts
    # uvicorn directly, so this branch never runs there.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
phase0102_chunker_aggregator_2.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Stdlib imports (BUGFIX: the duplicate second `import datetime` was removed).
import os
import json
import datetime
import asyncio
import sys
from pathlib import Path

# Third-party imports.
import tiktoken
import pymupdf4llm
from groq import Groq
from dotenv import load_dotenv

load_dotenv()
client = Groq(api_key=os.getenv("GROQ_API_KEY"))
MODEL = "llama-3.1-8b-instant"
encoding = tiktoken.get_encoding("cl100k_base")

# 2. Define the folder and the filename.
# NOTE(review): hard-coded Windows path — only meaningful when this module
# is run directly on the author's machine; the FastAPI app passes its own
# pdf_path into run_chunking_process instead.
pdf_folder = Path("C:/Users/wd052/OneDrive/Desktop/00/01/PDFs/J/CW")
pdf_name = "Collected Works of Dr. C.G. Jung - Vol. 6 - Psychological-Types.pdf"

# Combine them
pdf_path = pdf_folder / pdf_name

WHOLE = False  # Set to True to process the whole book; False to process a page range
START_PAGE = 20
END_PAGE = 30
def call_groq_json(system_prompt, user_content):
    """Call the Groq chat API in strict-JSON mode and return the parsed dict.

    Raises json.JSONDecodeError if the model emits invalid JSON despite the
    json_object response format.
    """
    # Harden the caller's prompt so the model stays in strict-JSON mode.
    hardened = system_prompt + "\nIMPORTANT: Ensure all internal quotes are escaped. Respond ONLY in valid JSON."
    messages = [
        {"role": "system", "content": hardened},
        {"role": "user", "content": user_content},
    ]
    completion = client.chat.completions.create(
        model=MODEL,
        messages=messages,
        response_format={"type": "json_object"},
        temperature=0.2,
    )
    raw = completion.choices[0].message.content
    print(f"\nLLM raw response: {raw}\n")
    return json.loads(raw)
async def run_chunking_process(pdf_path, queue=None, whole=WHOLE, start_p=START_PAGE, end_p=END_PAGE):
    """Chunk a PDF into self-sufficient leaf chunks plus level-1 summaries.

    Args:
        pdf_path: Path to the PDF to process.
        queue: Optional asyncio.Queue; each new leaf/summary node is pushed
            onto it for the SSE UI, and the sentinel "DONE" marks the end.
        whole: True to process every page; False to use [start_p, end_p).
        start_p / end_p: Page range when whole is False.

    Side effect: writes knowledge_tree_<timestamp>.json to the CWD.

    NOTE(review): the Groq calls are synchronous, so this coroutine blocks
    the event loop while each LLM request is in flight.
    """
    print(f"\nwhole: {whole}, start_p: {start_p}, end_p: {end_p}")

    # 1. Determine Page Range
    if whole:
        # PyMuPDF4LLM uses None to process all pages
        pages_to_read = None
        print("📚 Processing the WHOLE book...")
    else:
        pages_to_read = list(range(start_p, end_p))
        print(f"📑 Processing pages {start_p} to {end_p}...")

    print(f"\ngot to CW6 chunking logic with pdf_path: {pdf_path} and pages_to_read: {pages_to_read}")
    # 2. Extract Markdown
    md_text = pymupdf4llm.to_markdown(str(pdf_path), pages=pages_to_read)

    cursor = 0
    all_leaves = []
    summary_blocks = []
    temp_group = []
    CHUNK_GROUP_SIZE = 5

    context_buffer = {"predecessor": "Start", "latest_summary": "None"}

    while cursor < len(md_text):
        lookahead = md_text[cursor : cursor + 6000]
        if not lookahead.strip():
            break

        prompt = f"Context: {context_buffer['latest_summary']} | Prev: {context_buffer['predecessor'][:200]}...\nExtract a self-sufficient Jungian chunk. JSON keys: 'break_text', 'rewritten_text', 'filename'."

        try:
            result = call_groq_json(prompt, lookahead)

            # Semantic jump: advance the cursor to just past the text the
            # model chose as the chunk boundary; fall back to a fixed jump.
            break_text = result.get('break_text', "")
            if break_text and break_text in lookahead:
                relative_break = lookahead.find(break_text) + len(break_text)
            else:
                relative_break = 2000
            # BUGFIX: an empty break_text used to yield relative_break == 0
            # ("" is "in" every string, find("") == 0, len("") == 0), which
            # froze the cursor and looped forever. Guarantee forward progress.
            if relative_break <= 0:
                relative_break = 2000

            new_chunk = {
                "type": "leaf",
                "filename": result.get('filename', 'untitled'),
                "content": result.get('rewritten_text', '')
            }

            all_leaves.append(new_chunk)
            temp_group.append(new_chunk)

            # PUSH TO UI
            if queue:
                await queue.put(new_chunk)

            context_buffer["predecessor"] = new_chunk["content"]
            cursor += relative_break

            # PHASE II: AGGREGATION — fold every CHUNK_GROUP_SIZE leaves
            # into one summary node. (The redundant self-import of
            # generate_summary_block was removed; it lives in this module.)
            if len(temp_group) >= CHUNK_GROUP_SIZE:
                print(f"\nreached phase0102")
                summary_res = generate_summary_block(temp_group)

                summary_node = {
                    "type": "summary",
                    "name": summary_res['summary_name'],
                    "content": summary_res['synthesis'],
                    "children": [c['filename'] for c in temp_group]
                }
                summary_blocks.append(summary_node)
                context_buffer["latest_summary"] = summary_node["content"]

                if queue:
                    await queue.put(summary_node)

                temp_group = []

        except Exception as e:
            # Best-effort: log and skip ahead so one bad LLM response
            # doesn't kill the whole run.
            print(f"Error: {e}")
            cursor += 3000
            continue

    # Final Save
    timestamp = datetime.datetime.now().strftime("%m%d%Y_%H%M")
    final_data = {"leaves": all_leaves, "summaries": summary_blocks}
    with open(f"knowledge_tree_{timestamp}.json", "w", encoding="utf-8") as f:
        json.dump(final_data, f, indent=4)

    if queue:
        await queue.put("DONE")
# Helper for summary
def generate_summary_block(chunks):
    """Collapse a group of leaf chunks into one level-1 summary node dict."""
    parts = (f"{chunk['filename']}: {chunk['content']}" for chunk in chunks)
    merged = "\n\n".join(parts)
    prompt = "Synthesize these Jungian chunks into a dense Level-1 summary. JSON keys: 'summary_name', 'synthesis'."
    return call_groq_json(prompt, merged)
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Web app
fastapi
uvicorn
# .env loading (GROQ_API_KEY)
python-dotenv
# LLM client + tokenizer
groq
tiktoken
# PDF -> markdown extraction
pymupdf4llm
# Needed by FastAPI to parse multipart form uploads (UploadFile)
python-multipart