Spaces:
Running
Running
r"""
Rebuild the PageIndex (document_trees) for an already-ingested PDF.

Why this exists:
- Ingestion deletes the uploaded temp PDF after processing.
- PageIndex behavior evolves (better TOC handling, page_numbers, etc.).
- You may want to refresh only the structural index without re-embedding/re-uploading chunks.

Usage (PowerShell):
    conda activate rag_env
    python scripts/rebuild_pageindex.py --pdf "C:\path\to\file.pdf" --access-token "<JWT>"
    # or, if you already know the Supabase user id and don't want to paste a JWT:
    python scripts/rebuild_pageindex.py --pdf "C:\path\to\file.pdf" --user-id "<USER_ID>"

Notes:
- This only rewrites `document_trees` (and optionally `identity_json` if you choose to extend it).
- It does NOT touch the vector store, RAPTOR summaries, or ingested_files registry.
- The docstring is a raw string so the backslashes in the Windows example path stay literal.
"""
| from __future__ import annotations | |
| import argparse | |
| import os | |
| import sys | |
| from pathlib import Path | |
# When this file is executed as a script, make sure the repository root (the
# parent of this script's directory) is importable so `import backend...` works.
REPO_ROOT = Path(__file__).resolve().parents[1]
if not any(entry == str(REPO_ROOT) for entry in sys.path):
    # Prepend rather than append so the repo's own packages take precedence.
    sys.path[:0] = [str(REPO_ROOT)]
| from backend.core.pipeline import ( | |
| _build_document_tree, | |
| _build_service_supabase_client, | |
| get_file_fingerprint, | |
| partition_document, | |
| ) | |
| from backend.core.auth_utils import extract_jwt_sub | |
def main(argv: "list[str] | None" = None) -> int:
    """Rebuild and store the PageIndex tree for one local PDF.

    Parses the command line, fingerprints and partitions the PDF, builds the
    document tree from the partitioned elements, and upserts the result into
    the ``document_trees`` table with ``user_id,file_hash`` as the conflict
    target.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``. ``None``
            (the default) keeps the normal command-line behaviour.

    Returns:
        ``0`` after the upsert has been executed.

    Raises:
        SystemExit: If ``--pdf`` does not point to an existing regular file,
            if no non-blank ``--user-id`` and no ``--access-token`` is given,
            or if no user id can be read from the access token. argparse
            also exits this way on malformed arguments.
    """
    parser = argparse.ArgumentParser(description="Rebuild PageIndex tree for a PDF.")
    parser.add_argument("--pdf", required=True, help="Path to local PDF file.")
    parser.add_argument(
        "--access-token",
        required=False,
        default=None,
        help="User JWT (same X-Auth-Token used by the API). Optional if --user-id is provided.",
    )
    parser.add_argument(
        "--user-id",
        required=False,
        default=None,
        help="Supabase auth user_id (sub). Use this if you don't want to paste a JWT.",
    )
    # parse_args(None) falls back to sys.argv[1:], so plain `main()` is unchanged.
    args = parser.parse_args(argv)

    pdf_path = os.path.abspath(args.pdf)
    if not os.path.exists(pdf_path):
        raise SystemExit(f"PDF not found: {pdf_path}")
    # A directory also "exists"; reject it here instead of failing later
    # inside fingerprinting/partitioning with a less helpful error.
    if not os.path.isfile(pdf_path):
        raise SystemExit(f"Not a file: {pdf_path}")

    # Strip before testing so a whitespace-only --user-id counts as "not
    # provided" rather than being written to the table as an empty user_id.
    explicit_user_id = str(args.user_id).strip() if args.user_id else ""
    if explicit_user_id:
        user_id = explicit_user_id
    elif args.access_token:
        user_id = extract_jwt_sub(args.access_token)
        # Never upsert a row keyed on a missing/empty user id.
        if not user_id:
            raise SystemExit("Could not read a user id (sub) from --access-token.")
    else:
        raise SystemExit("Provide either --user-id or --access-token.")

    file_hash = get_file_fingerprint(pdf_path)
    elements = partition_document(pdf_path)
    doc_tree = _build_document_tree(elements)

    sb = _build_service_supabase_client()
    # Upsert so an existing tree for this (user_id, file_hash) pair is replaced.
    sb.table("document_trees").upsert(
        {"file_hash": file_hash, "user_id": user_id, "tree_json": doc_tree},
        on_conflict="user_id,file_hash",
    ).execute()

    print(f"Rebuilt PageIndex tree for file_hash={file_hash} user_id={user_id}")
    return 0
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the process exit status.
    sys.exit(main())