cartographer / scripts /prebake_repos.py
umanggarg's picture
prebake: always re-ingest so contextual retrieval runs at premium
9f829fa
"""
scripts/prebake_repos.py — Generate the canonical artifact set for one
or more repos using the premium tier (Claude Sonnet 4.6).
For each repo the CLI ensures:
- the repo is (re-)ingested with contextual retrieval (always forced, even if already indexed)
- tour data is generated and persisted to Qdrant
- architecture and class diagrams are generated and persisted
- README is generated and persisted
- the repo_map is built and persisted
All generation calls go through the premium client when ANTHROPIC_API_KEY
is set, so the cached artifacts represent the highest quality this app
can produce. Once cached, every subsequent visitor reads them from Qdrant
without re-running an LLM.
Usage:
python -m scripts.prebake_repos # default Karpathy set
python -m scripts.prebake_repos owner/repo other/repo # specific repos
python -m scripts.prebake_repos --force karpathy/nanoGPT # rebuild even if cached
Environment:
ANTHROPIC_API_KEY — required for premium quality. Without it the script
runs against the free cascade with a warning.
"""
import argparse
import sys
import time
from pathlib import Path
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))
from backend.config import settings # noqa: E402
from backend.services.generation import GenerationService # noqa: E402
from backend.services.diagram_service import DiagramService # noqa: E402
from backend.services.readme_service import ReadmeService # noqa: E402
from backend.services.repo_map_service import RepoMapService # noqa: E402
from backend.services.ingestion_service import IngestionService # noqa: E402
from ingestion.embedder import Embedder # noqa: E402
from ingestion.qdrant_store import QdrantStore # noqa: E402
# Default set: Karpathy's well-known learning-oriented repos. These are the
# repos most users land on for tutorials / understanding fundamental code.
# Used only when no repos are given on the command line: positional CLI
# arguments replace this list entirely (they are not appended to it).
# Entries are "owner/repo" slugs; ingest() turns each one into a
# https://github.com/<owner>/<repo> URL.
DEFAULT_REPOS = [
    "karpathy/autoresearch",
    "karpathy/micrograd",
    "karpathy/nanochat",
    "karpathy/nanoGPT",
]
# Diagram types we cache. "architecture" + "class" cover the two non-tour
# diagram views surfaced in the UI.
# bake_one() iterates this list and calls bake_diagram() once per entry,
# in the order given here.
DIAGRAM_TYPES = ["architecture", "class"]
def repo_indexed(store: QdrantStore, repo: str) -> bool:
    """Report whether the store holds at least one chunk for ``repo``.

    This is a best-effort probe: any exception raised while counting (or
    while comparing the count) is swallowed and reported as ``False``.
    """
    try:
        chunk_total = store.count(repo=repo)
        has_chunks = chunk_total > 0
    except Exception:
        has_chunks = False
    return has_chunks
def ingest(repo: str, store: QdrantStore, gen: GenerationService, embedder: Embedder) -> bool:
    """Re-ingest ``repo`` from GitHub with ``force=True``.

    The repo is re-ingested even when it is already indexed: a premium
    prebake must end with premium-quality contextual descriptions on every
    chunk, not whatever a previous (possibly free-tier) ingestion left
    behind. Per the original author's notes, the embedding step is
    deduplicated by content hash inside the ingestion pipeline, so only
    changed/new chunks pay the embed cost and only chunks needing fresh
    contextual retrieval pay the LLM cost. That behaviour is not implemented
    in this function.

    Args:
        repo: ``owner/repo`` slug; expanded to ``https://github.com/<repo>``.
        store: Vector store, used for the "already indexed" probe and handed
            to the ingestion service.
        gen: Generation service handed to the ingestion service.
        embedder: Embedder handed to the ingestion service.

    Returns:
        ``True`` when ingestion finished without raising, ``False`` when it
        raised (the error is printed, not propagated).
    """
    if repo_indexed(store, repo):
        # The count is only used for the log line. A failed lookup here must
        # not abort the bake, so fall back to the same "?" placeholder that
        # the success message below uses for an unknown chunk count.
        try:
            existing = store.count(repo=repo)
        except Exception:
            existing = "?"
        print(f" ▸ re-ingesting {repo} ({existing} chunks already indexed)…")
    else:
        print(f" ▸ ingesting {repo}…")

    ingestion = IngestionService(store=store, embedder=embedder, gen=gen)
    repo_url = f"https://github.com/{repo}"
    last_step = ""

    def on_progress(step: str, detail: str) -> None:
        # Print sparse milestones: each step name once, when it changes.
        # ``detail`` is accepted to match the callback signature but is
        # deliberately not printed.
        nonlocal last_step
        if step != last_step:
            print(f" · {step}")
            last_step = step

    try:
        # force=True is what requests contextual-retrieval enrichment. With
        # gen.premium_mode switched on by main(), those LLM calls are expected
        # to be routed to the premium client (per the original author's note).
        result = ingestion.ingest(repo_url, force=True, progress=on_progress)
        print(f" ✓ ingested ({result.get('chunks_stored', '?')} chunks)")
        return True
    except Exception as e:
        print(f" ✗ ingestion crashed: {e}")
        return False
def bake_tour(repo: str, diagram_svc: DiagramService, force: bool) -> bool:
    """Build the tour for ``repo`` unless a cached one can be reused.

    Without ``force``, a non-``None`` result from the service's tour loader
    is treated as "already cached" and the build is skipped. Otherwise the
    tour stream is consumed, printing each stage once as it changes.
    Persistence is left to the service; this function never writes the cache
    itself.

    Returns:
        ``True`` when the tour was already cached or the stream completed
        without an ``"error"`` stage; ``False`` on an error event or an
        exception from the stream.
    """
    if not force:
        if diagram_svc._load_tour(repo) is not None:
            print(" ✓ tour cached — skipping (use --force to rebuild)")
            return True

    print(" ▸ tour…")
    previous_stage = None
    try:
        for update in diagram_svc.build_tour_stream(repo, force=force):
            current_stage = update.get("stage")
            if current_stage and current_stage != previous_stage:
                # A missing/None progress value is shown as 0%.
                percent = int((update.get('progress') or 0) * 100)
                print(f" · {current_stage} ({percent}%)")
                previous_stage = current_stage
            if current_stage == "error":
                print(f" ✗ tour failed: {update.get('error')}")
                return False
        print(" ✓ tour cached")
        return True
    except Exception as exc:
        print(f" ✗ tour crashed: {exc}")
        return False
def bake_diagram(repo: str, diagram_type: str, diagram_svc: DiagramService, force: bool) -> bool:
    """Build one diagram of ``diagram_type`` for ``repo`` unless cached.

    Without ``force``, a non-``None`` result from the service's diagram
    loader is treated as "already cached" and the build is skipped.
    Otherwise the diagram stream is consumed up to its first ``"error"``
    stage (or to the end if there is none).

    Returns:
        ``True`` when the diagram was already cached or the stream finished
        without an error event; ``False`` on an error event or an exception.
    """
    if not force:
        if diagram_svc._load_diagram(repo, diagram_type) is not None:
            print(f" ✓ {diagram_type} diagram cached — skipping (use --force to rebuild)")
            return True

    print(f" ▸ {diagram_type} diagram…")
    try:
        events = diagram_svc.build_diagram_stream(repo, diagram_type, force=force)
        # Pull events until the first error; the stream is fully drained when
        # no error event ever appears.
        failure = next((ev for ev in events if ev.get("stage") == "error"), None)
        if failure is not None:
            print(f" ✗ {diagram_type} diagram failed: {failure.get('error')}")
            return False
        print(f" ✓ {diagram_type} diagram cached")
        return True
    except Exception as exc:
        print(f" ✗ {diagram_type} crashed: {exc}")
        return False
def bake_readme(repo: str, readme_svc: ReadmeService, store: QdrantStore, force: bool) -> bool:
    """Build the README artifact for ``repo`` unless one is already stored.

    Without ``force``, a truthy ``"readme"`` artifact in the store counts as
    cached (so an empty artifact does not) and the build is skipped.
    Otherwise the README stream is consumed up to its first ``"error"``
    stage, or to the end if there is none.

    Returns:
        ``True`` when the README was already cached or the stream finished
        without an error event; ``False`` on an error event or an exception.
    """
    if not force:
        if store.load_artifact(repo, "readme"):
            print(" ✓ readme cached — skipping (use --force to rebuild)")
            return True

    print(" ▸ readme…")
    try:
        events = readme_svc.build_readme_stream(repo, force=force)
        # Stop at the first error event; otherwise drain the whole stream.
        failure = next((ev for ev in events if ev.get("stage") == "error"), None)
        if failure is not None:
            print(f" ✗ readme failed: {failure.get('error')}")
            return False
        print(" ✓ readme cached")
        return True
    except Exception as exc:
        print(f" ✗ readme crashed: {exc}")
        return False
def bake_repo_map(repo: str, repo_map_svc: RepoMapService, force: bool) -> bool:
    """Build (or reuse) the repo_map for ``repo``.

    There is no explicit "already cached" check here: the service's
    ``get_or_build`` is called unconditionally. With ``force`` the service
    is asked to invalidate its entry for ``repo`` first.

    Args:
        repo: ``owner/repo`` slug.
        repo_map_svc: Service exposing ``invalidate`` and ``get_or_build``.
        force: Invalidate any existing repo_map before building.

    Returns:
        ``True`` when the service calls completed, ``False`` when either the
        invalidation or the build raised (the error is printed, not
        propagated).
    """
    print(" ▸ repo_map…")
    try:
        # Invalidation lives inside the try so that a failure here is
        # reported like any other repo_map failure instead of escaping and
        # aborting the remaining repos in the batch.
        if force:
            repo_map_svc.invalidate(repo)
        repo_map_svc.get_or_build(repo)
        print(" ✓ repo_map cached")
        return True
    except Exception as e:
        print(f" ✗ repo_map crashed: {e}")
        return False
def bake_one(
    repo: str,
    store: QdrantStore,
    gen: GenerationService,
    embedder: Embedder,
    diagram_svc: DiagramService,
    readme_svc: ReadmeService,
    repo_map_svc: RepoMapService,
    force: bool,
) -> bool:
    """Run the full prebake pipeline for a single repo.

    Steps, in order: ingest, repo_map, tour, one diagram per entry in
    ``DIAGRAM_TYPES``, README. A failed ingestion stops the pipeline
    immediately. After that, every artifact step is attempted even if an
    earlier one failed, so one broken artifact does not block the rest.

    Args:
        repo: ``owner/repo`` slug to bake.
        store, gen, embedder, diagram_svc, readme_svc, repo_map_svc:
            Shared service instances created by ``main()``.
        force: Rebuild artifacts even when they are already cached.

    Returns:
        ``True`` only when ingestion and every artifact step succeeded.
        Previously the artifact results were discarded and ``True`` was
        returned after any successful ingestion, which made the caller's
        success tally and exit code report failures as successes.
    """
    print(f"\n=== {repo} ===")
    started = time.monotonic()

    if not ingest(repo, store, gen, embedder):
        return False

    # Each call is evaluated eagerly and in order; results are recorded by
    # name so nothing short-circuits and failures can be listed afterwards.
    outcomes = {
        "repo_map": bake_repo_map(repo, repo_map_svc, force),
        "tour": bake_tour(repo, diagram_svc, force),
    }
    for dtype in DIAGRAM_TYPES:
        outcomes[f"{dtype} diagram"] = bake_diagram(repo, dtype, diagram_svc, force)
    outcomes["readme"] = bake_readme(repo, readme_svc, store, force)

    elapsed = time.monotonic() - started
    print(f" ⏱ {elapsed:.1f}s")

    failed = [step for step, succeeded in outcomes.items() if not succeeded]
    if failed:
        print(f" ✗ incomplete: {', '.join(failed)}")
    return not failed
def main() -> int:
    """CLI entry point: parse arguments, build the services, bake each repo.

    Positional arguments name the repos to bake; when none are given,
    ``DEFAULT_REPOS`` is used. ``--force`` is passed through to every bake
    step. ``premium_mode`` is switched on for the generation service
    whether or not an Anthropic API key is configured; a missing key only
    produces a warning.

    Returns:
        ``0`` when every requested repo baked successfully, ``1`` otherwise.
    """
    cli = argparse.ArgumentParser(description="Pre-bake artifact cache for canonical repos.")
    cli.add_argument("repos", nargs="*", help="Specific repos to bake (default: Karpathy set).")
    cli.add_argument("--force", action="store_true", help="Rebuild artifacts even if already cached.")
    options = cli.parse_args()
    targets = options.repos if options.repos else DEFAULT_REPOS

    if settings.anthropic_api_key:
        print("Premium tier: enabled (model is configured in GenerationService).")
    else:
        print("⚠ ANTHROPIC_API_KEY not set — running against the free cascade.")
        print(" Cached artifacts will not represent premium quality.")

    # Construction order is kept as-is: later services receive earlier ones.
    store = QdrantStore()
    embedder = Embedder()
    gen = GenerationService()
    gen.premium_mode = True  # whole script runs at premium quality
    diagram_svc = DiagramService(store, gen)
    repo_map_svc = RepoMapService(store)
    readme_svc = ReadmeService(repo_map_svc, gen, store)

    print(f"\nBaking {len(targets)} repo(s) with premium_mode=True\n")

    # Repos are baked one at a time, in the order given; count the successes.
    baked = sum(
        1
        for repo in targets
        if bake_one(repo, store, gen, embedder, diagram_svc, readme_svc, repo_map_svc, options.force)
    )

    print(f"\nDone: {baked}/{len(targets)} baked.")
    return int(baked != len(targets))
# Script entry point: run the CLI and use main()'s return value (0 when all
# repos baked, 1 otherwise) as the process exit status.
if __name__ == "__main__":
    sys.exit(main())