Spaces:

nothex
/

morpheus-rag

Running

morpheus-rag / scripts /rebuild_pageindex.py

nothex

Harden ingestion and retrieval reliability across the pipeline

4abd98f 11 days ago

2.64 kB

	"""
	Rebuild the PageIndex (document_trees) for an already-ingested PDF.

	Why this exists:
	- Ingestion deletes the uploaded temp PDF after processing.
	- PageIndex behavior evolves (better TOC handling, page_numbers, etc.).
	- You may want to refresh only the structural index without re-embedding/re-uploading chunks.

	Usage (PowerShell):
	conda activate rag_env
	python scripts/rebuild_pageindex.py --pdf "C:\\path\\to\\file.pdf" --access-token "<JWT>"

	Notes:
	- This only rewrites `document_trees` (and optionally `identity_json` if you choose to extend it).
	- It does NOT touch the vector store, RAPTOR summaries, or ingested_files registry.
	"""

	from __future__ import annotations

	import argparse
	import os
	import sys
	from pathlib import Path

	# Running this file directly as a script needs the repository root (the
	# directory one level above this file's folder) on sys.path, otherwise
	# `import backend...` cannot be resolved. Prepend it unless already listed.
	REPO_ROOT = Path(__file__).resolve().parents[1]
	if sys.path.count(str(REPO_ROOT)) == 0:
	    sys.path.insert(0, str(REPO_ROOT))

	from backend.core.pipeline import (
	_build_document_tree,
	_build_service_supabase_client,
	get_file_fingerprint,
	partition_document,
	)
	from backend.core.auth_utils import extract_jwt_sub


	def main() -> int:
	parser = argparse.ArgumentParser(description="Rebuild PageIndex tree for a PDF.")
	parser.add_argument("--pdf", required=True, help="Path to local PDF file.")
	parser.add_argument(
	"--access-token",
	required=False,
	default=None,
	help="User JWT (same X-Auth-Token used by the API). Optional if --user-id is provided.",
	)
	parser.add_argument(
	"--user-id",
	required=False,
	default=None,
	help="Supabase auth user_id (sub). Use this if you don't want to paste a JWT.",
	)
	args = parser.parse_args()

	pdf_path = os.path.abspath(args.pdf)
	if not os.path.exists(pdf_path):
	raise SystemExit(f"PDF not found: {pdf_path}")

	if args.user_id:
	user_id = str(args.user_id).strip()
	elif args.access_token:
	user_id = extract_jwt_sub(args.access_token)
	else:
	raise SystemExit("Provide either --user-id or --access-token.")
	file_hash = get_file_fingerprint(pdf_path)

	elements = partition_document(pdf_path)
	doc_tree = _build_document_tree(elements)

	sb = _build_service_supabase_client()
	sb.table("document_trees").upsert(
	{"file_hash": file_hash, "user_id": user_id, "tree_json": doc_tree},
	on_conflict="user_id,file_hash",
	).execute()

	print(f"Rebuilt PageIndex tree for file_hash={file_hash} user_id={user_id}")
	return 0


	# Script entry point: run main() and use its return value as the process
	# exit status.
	if __name__ == "__main__":
	    sys.exit(main())