""" Rebuild the PageIndex (document_trees) for an already-ingested PDF. Why this exists: - Ingestion deletes the uploaded temp PDF after processing. - PageIndex behavior evolves (better TOC handling, page_numbers, etc.). - You may want to refresh only the structural index without re-embedding/re-uploading chunks. Usage (PowerShell): conda activate rag_env python scripts/rebuild_pageindex.py --pdf "C:\path\to\file.pdf" --access-token "" Notes: - This only rewrites `document_trees` (and optionally `identity_json` if you choose to extend it). - It does NOT touch the vector store, RAPTOR summaries, or ingested_files registry. """ from __future__ import annotations import argparse import os import sys from pathlib import Path # Ensure repo root is on sys.path so `import backend...` works when executed as a script. REPO_ROOT = Path(__file__).resolve().parents[1] if str(REPO_ROOT) not in sys.path: sys.path.insert(0, str(REPO_ROOT)) from backend.core.pipeline import ( _build_document_tree, _build_service_supabase_client, get_file_fingerprint, partition_document, ) from backend.core.auth_utils import extract_jwt_sub def main() -> int: parser = argparse.ArgumentParser(description="Rebuild PageIndex tree for a PDF.") parser.add_argument("--pdf", required=True, help="Path to local PDF file.") parser.add_argument( "--access-token", required=False, default=None, help="User JWT (same X-Auth-Token used by the API). Optional if --user-id is provided.", ) parser.add_argument( "--user-id", required=False, default=None, help="Supabase auth user_id (sub). Use this if you don't want to paste a JWT.", ) args = parser.parse_args() pdf_path = os.path.abspath(args.pdf) if not os.path.exists(pdf_path): raise SystemExit(f"PDF not found: {pdf_path}") if args.user_id: user_id = str(args.user_id).strip() elif args.access_token: user_id = extract_jwt_sub(args.access_token) else: raise SystemExit("Provide either --user-id or --access-token.") file_hash = get_file_fingerprint(pdf_path) elements = partition_document(pdf_path) doc_tree = _build_document_tree(elements) sb = _build_service_supabase_client() sb.table("document_trees").upsert( {"file_hash": file_hash, "user_id": user_id, "tree_json": doc_tree}, on_conflict="user_id,file_hash", ).execute() print(f"Rebuilt PageIndex tree for file_hash={file_hash} user_id={user_id}") return 0 if __name__ == "__main__": raise SystemExit(main())