# morpheus-rag/scripts/rebuild_pageindex.py
# nothex
# Harden ingestion and retrieval reliability across the pipeline
# 4abd98f
"""
Rebuild the PageIndex (document_trees) for an already-ingested PDF.
Why this exists:
- Ingestion deletes the uploaded temp PDF after processing.
- PageIndex behavior evolves (better TOC handling, page_numbers, etc.).
- You may want to refresh only the structural index without re-embedding/re-uploading chunks.
Usage (PowerShell):
conda activate rag_env
python scripts/rebuild_pageindex.py --pdf "C:\\path\\to\\file.pdf" --access-token "<JWT>"
Notes:
- This only rewrites `document_trees` (and optionally `identity_json` if you choose to extend it).
- It does NOT touch the vector store, RAPTOR summaries, or ingested_files registry.
"""
from __future__ import annotations
import argparse
import os
import sys
from pathlib import Path
# Ensure repo root is on sys.path so `import backend...` works when executed as a script.
# This must run before the `backend` imports below.
# parents[1] is the directory two levels above this file, i.e. the parent of
# the directory that contains this script.
REPO_ROOT = Path(__file__).resolve().parents[1]
# Skip the insert when the entry is already present so sys.path gets no duplicate.
if str(REPO_ROOT) not in sys.path:
    # Index 0 puts this directory ahead of every other import location.
    sys.path.insert(0, str(REPO_ROOT))
from backend.core.pipeline import (
_build_document_tree,
_build_service_supabase_client,
get_file_fingerprint,
partition_document,
)
from backend.core.auth_utils import extract_jwt_sub
def main(argv: list[str] | None = None) -> int:
    """Rebuild and upsert the PageIndex tree for one local PDF.

    Parses the command line, fingerprints and partitions the PDF, builds the
    document tree from the partitioned elements, and upserts it into the
    ``document_trees`` table with ``user_id,file_hash`` as the conflict key.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``. ``None``
            (the default) keeps the normal command-line behaviour.

    Returns:
        ``0`` on success, suitable as a process exit status.

    Raises:
        SystemExit: If ``--pdf`` is not an existing regular file, if neither
            ``--user-id`` nor ``--access-token`` is supplied, or if no
            non-empty user id can be derived from them.
    """
    parser = argparse.ArgumentParser(description="Rebuild PageIndex tree for a PDF.")
    parser.add_argument("--pdf", required=True, help="Path to local PDF file.")
    parser.add_argument(
        "--access-token",
        required=False,
        default=None,
        help="User JWT (same X-Auth-Token used by the API). Optional if --user-id is provided.",
    )
    parser.add_argument(
        "--user-id",
        required=False,
        default=None,
        help="Supabase auth user_id (sub). Use this if you don't want to paste a JWT.",
    )
    args = parser.parse_args(argv)

    pdf_path = os.path.abspath(args.pdf)
    # isfile rather than exists: a directory would pass an existence check and
    # only fail later, with a less helpful error, once it is read as a PDF.
    if not os.path.isfile(pdf_path):
        raise SystemExit(f"PDF not found: {pdf_path}")

    # --user-id takes precedence when both options are supplied.
    if args.user_id:
        user_id = str(args.user_id).strip()
    elif args.access_token:
        user_id = extract_jwt_sub(args.access_token)
    else:
        raise SystemExit("Provide either --user-id or --access-token.")

    # Refuse to write a row keyed on an empty or missing user id (for example
    # a whitespace-only --user-id, or a token that yielded no subject).
    if not user_id:
        raise SystemExit(
            "Could not determine a non-empty user_id from --user-id / --access-token."
        )

    file_hash = get_file_fingerprint(pdf_path)
    elements = partition_document(pdf_path)
    doc_tree = _build_document_tree(elements)

    sb = _build_service_supabase_client()
    sb.table("document_trees").upsert(
        {"file_hash": file_hash, "user_id": user_id, "tree_json": doc_tree},
        on_conflict="user_id,file_hash",
    ).execute()

    print(f"Rebuilt PageIndex tree for file_hash={file_hash} user_id={user_id}")
    return 0
if __name__ == "__main__":
    # Run only when executed as a script; main()'s return value becomes the
    # process exit status.
    sys.exit(main())