Spaces:
Running
Running
| """ | |
| scripts/prebake_repos.py — Generate the canonical artifact set for one | |
| or more repos using the premium tier (Claude Sonnet 4.6). | |
| For each repo the CLI ensures: | |
| - the repo is re-ingested with contextual retrieval (always forced, even when it is already indexed) | |
| - tour data is generated and persisted to Qdrant | |
| - architecture and class diagrams are generated and persisted | |
| - README is generated and persisted | |
| - the repo_map is built and persisted | |
| All generation calls go through the premium client when ANTHROPIC_API_KEY | |
| is set, so the cached artifacts represent the highest quality this app | |
| can produce. Once cached, every subsequent visitor reads them from Qdrant | |
| without re-running an LLM. | |
| Usage: | |
| python -m scripts.prebake_repos # default Karpathy set | |
| python -m scripts.prebake_repos owner/repo other/repo # specific repos | |
| python -m scripts.prebake_repos --force karpathy/nanoGPT # rebuild even if cached | |
| Environment: | |
| ANTHROPIC_API_KEY — required for premium quality. Without it the script | |
| runs against the free cascade with a warning. | |
| """ | |
| import argparse | |
| import sys | |
| import time | |
| from pathlib import Path | |
| ROOT = Path(__file__).resolve().parent.parent | |
| sys.path.insert(0, str(ROOT)) | |
| from backend.config import settings # noqa: E402 | |
| from backend.services.generation import GenerationService # noqa: E402 | |
| from backend.services.diagram_service import DiagramService # noqa: E402 | |
| from backend.services.readme_service import ReadmeService # noqa: E402 | |
| from backend.services.repo_map_service import RepoMapService # noqa: E402 | |
| from backend.services.ingestion_service import IngestionService # noqa: E402 | |
| from ingestion.embedder import Embedder # noqa: E402 | |
| from ingestion.qdrant_store import QdrantStore # noqa: E402 | |
# Default set: Karpathy's well-known learning-oriented repos. These are the
# repos most users land on for tutorials / understanding fundamental code.
# Used only when no repos are passed on the command line; each entry is an
# "owner/repo" slug that is later expanded to a github.com URL for ingestion.
DEFAULT_REPOS = [
    "karpathy/autoresearch",
    "karpathy/micrograd",
    "karpathy/nanochat",
    "karpathy/nanoGPT",
]
# Diagram types we cache. "architecture" + "class" cover the two non-tour
# diagram views surfaced in the UI. One diagram is baked per entry, in the
# order listed here.
DIAGRAM_TYPES = ["architecture", "class"]
def repo_indexed(store: QdrantStore, repo: str) -> bool:
    """Return True if Qdrant has any chunks for this repo.

    This is a best-effort probe: if ``store.count`` raises, the error is
    reported on stdout and the repo is treated as not indexed, so a store
    outage is visible in the log instead of looking like an empty index.

    Args:
        store: Object exposing ``count(repo=...)``.
        repo: Repo identifier, passed through to ``store.count`` unchanged.

    Returns:
        True when the reported count is greater than zero, False otherwise
        (including when the count could not be obtained).
    """
    try:
        return store.count(repo=repo) > 0
    except Exception as exc:
        # Deliberately not re-raised: callers only use this for messaging.
        print(f" ⚠ could not count chunks for {repo}: {exc}")
        return False
def ingest(repo: str, store: QdrantStore, gen: GenerationService, embedder: Embedder) -> bool:
    """Re-ingest ``repo`` from GitHub with ``force=True``.

    The repo is re-ingested even when it is already indexed. Per the original
    design notes, a premium prebake must end with premium-quality contextual
    descriptions on every chunk rather than whatever a previous (possibly
    free-tier) ingestion left behind, and the cost is said to be bounded
    because embeddings are deduplicated by content hash (not verifiable from
    this file — confirm against IngestionService / Embedder).

    Everything, including the "already indexed" probe and the construction of
    the ingestion service, runs inside one guard so that any failure is
    reported and turned into ``False`` instead of aborting the whole run.

    Args:
        repo: ``owner/repo`` slug; expanded to ``https://github.com/<repo>``.
        store: Vector store handed to the ingestion service.
        gen: Generation service handed to the ingestion service.
        embedder: Embedder handed to the ingestion service.

    Returns:
        True if ingestion completed, False if any step raised.
    """
    try:
        if repo_indexed(store, repo):
            print(f" ▸ re-ingesting {repo} ({store.count(repo=repo)} chunks already indexed)…")
        else:
            print(f" ▸ ingesting {repo}…")

        ingestion = IngestionService(store=store, embedder=embedder, gen=gen)
        repo_url = f"https://github.com/{repo}"

        # force=True triggers contextual retrieval enrichment (per the
        # original notes; routing to the premium model happens inside
        # GenerationService when premium_mode is on). The progress callback
        # prints each step name once, when it changes, to keep stdout sparse.
        last_step = ""

        def on_progress(step: str, detail: str) -> None:
            # ``detail`` is part of the callback signature but not printed.
            nonlocal last_step
            if step != last_step:
                print(f" · {step}")
                last_step = step

        result = ingestion.ingest(repo_url, force=True, progress=on_progress)
        print(f" ✓ ingested ({result.get('chunks_stored', '?')} chunks)")
        return True
    except Exception as e:
        print(f" ✗ ingestion crashed: {e}")
        return False
def bake_tour(repo: str, diagram_svc: DiagramService, force: bool) -> bool:
    """Drive the tour pipeline for ``repo`` and report progress on stdout.

    When ``force`` is false and the service already has a tour for the repo,
    nothing is rebuilt. Otherwise the tour stream is consumed to completion;
    each new stage name is printed once together with its progress
    percentage. This function stores nothing itself — persistence is assumed
    to happen inside ``build_tour_stream`` (confirm against DiagramService).

    Returns:
        True if the tour was already cached or the stream finished without an
        ``"error"`` stage; False if an error stage was seen or the stream
        raised.
    """
    if not force:
        if diagram_svc._load_tour(repo) is not None:
            print(" ✓ tour cached — skipping (use --force to rebuild)")
            return True

    print(" ▸ tour…")
    previous = None
    try:
        for update in diagram_svc.build_tour_stream(repo, force=force):
            current = update.get("stage")
            is_new_stage = bool(current) and current != previous
            if is_new_stage:
                # A missing / None progress value is shown as 0%.
                fraction = update.get("progress") or 0
                print(f" · {current} ({int(fraction * 100)}%)")
                previous = current
            if current == "error":
                print(f" ✗ tour failed: {update.get('error')}")
                return False
        print(" ✓ tour cached")
        return True
    except Exception as exc:
        print(f" ✗ tour crashed: {exc}")
        return False
def bake_diagram(repo: str, diagram_type: str, diagram_svc: DiagramService, force: bool) -> bool:
    """Build one diagram of ``diagram_type`` for ``repo`` unless it is cached.

    When ``force`` is false and the service already has this diagram, the
    build is skipped. Otherwise the diagram stream is consumed until it ends
    or until the first event whose stage is ``"error"``.

    Returns:
        True if the diagram was cached or the stream finished without an
        error stage; False if an error stage was seen or the stream raised.
    """
    if not force:
        if diagram_svc._load_diagram(repo, diagram_type) is not None:
            print(f" ✓ {diagram_type} diagram cached — skipping (use --force to rebuild)")
            return True

    print(f" ▸ {diagram_type} diagram…")
    try:
        stream = diagram_svc.build_diagram_stream(repo, diagram_type, force=force)
        # Pull events until the first error event; None means the stream
        # ran to completion without one.
        failure = next((ev for ev in stream if ev.get("stage") == "error"), None)
        if failure is not None:
            print(f" ✗ {diagram_type} diagram failed: {failure.get('error')}")
            return False
        print(f" ✓ {diagram_type} diagram cached")
        return True
    except Exception as exc:
        print(f" ✗ {diagram_type} crashed: {exc}")
        return False
def bake_readme(repo: str, readme_svc: ReadmeService, store: QdrantStore, force: bool) -> bool:
    """Build the README artifact for ``repo`` unless one is already stored.

    When ``force`` is false and ``store.load_artifact(repo, "readme")``
    returns something truthy, the build is skipped (an empty artifact is
    therefore rebuilt). Otherwise the README stream is consumed until it
    ends or an event with stage ``"error"`` arrives.

    Returns:
        True if the README was cached or the stream finished without an
        error stage; False if an error stage was seen or the stream raised.
    """
    have_cached = (not force) and bool(store.load_artifact(repo, "readme"))
    if have_cached:
        print(" ✓ readme cached — skipping (use --force to rebuild)")
        return True

    print(" ▸ readme…")
    try:
        for update in readme_svc.build_readme_stream(repo, force=force):
            if update.get("stage") != "error":
                continue
            print(f" ✗ readme failed: {update.get('error')}")
            return False
        print(" ✓ readme cached")
        return True
    except Exception as exc:
        print(f" ✗ readme crashed: {exc}")
        return False
def bake_repo_map(repo: str, repo_map_svc: RepoMapService, force: bool) -> bool:
    """Build the repo_map for ``repo``, invalidating it first when forced.

    Both the optional ``invalidate`` call and ``get_or_build`` run inside the
    same guard, so a failure in either is reported and returned as ``False``
    rather than propagating and aborting the remaining repos.

    Args:
        repo: Repo identifier passed through to the service.
        repo_map_svc: Object exposing ``invalidate(repo)`` and
            ``get_or_build(repo)``.
        force: When true, ``invalidate`` is called before building.

    Returns:
        True if the calls completed, False if either raised.
    """
    print(" ▸ repo_map…")
    try:
        if force:
            repo_map_svc.invalidate(repo)
        repo_map_svc.get_or_build(repo)
        print(" ✓ repo_map cached")
        return True
    except Exception as e:
        print(f" ✗ repo_map crashed: {e}")
        return False
def bake_one(
    repo: str,
    store: QdrantStore,
    gen: GenerationService,
    embedder: Embedder,
    diagram_svc: DiagramService,
    readme_svc: ReadmeService,
    repo_map_svc: RepoMapService,
    force: bool,
) -> bool:
    """Ingest ``repo`` and bake every artifact for it.

    Steps run in a fixed order: ingestion, repo_map, tour, one diagram per
    entry in ``DIAGRAM_TYPES``, then the README. If ingestion fails nothing
    else is attempted. After that, every artifact step is attempted even when
    an earlier one failed, so one bad artifact does not block the others.

    Returns:
        True only if ingestion and every artifact step succeeded. A failed
        artifact step makes the result False, so the caller's success count
        and exit code reflect what was actually baked.
    """
    print(f"\n=== {repo} ===")
    started = time.monotonic()

    if not ingest(repo, store, gen, embedder):
        return False

    # Collect each step's outcome instead of discarding it; the list is built
    # eagerly so all steps run regardless of earlier failures.
    outcomes = [
        bake_repo_map(repo, repo_map_svc, force),
        bake_tour(repo, diagram_svc, force),
    ]
    outcomes.extend(bake_diagram(repo, dtype, diagram_svc, force) for dtype in DIAGRAM_TYPES)
    outcomes.append(bake_readme(repo, readme_svc, store, force))

    elapsed = time.monotonic() - started
    print(f" ⏱ {elapsed:.1f}s")
    return all(outcomes)
def main() -> int:
    """CLI entry point: bake the requested repos and summarise the outcome.

    Positional arguments select the repos; with none given, ``DEFAULT_REPOS``
    is used. ``--force`` is forwarded to every bake step. ``premium_mode`` is
    switched on for the shared GenerationService regardless of whether
    ``ANTHROPIC_API_KEY`` is configured; a missing key only produces a
    warning.

    Returns:
        0 when every repo baked successfully, 1 otherwise (suitable for
        ``sys.exit``).
    """
    parser = argparse.ArgumentParser(description="Pre-bake artifact cache for canonical repos.")
    parser.add_argument("repos", nargs="*", help="Specific repos to bake (default: Karpathy set).")
    parser.add_argument("--force", action="store_true", help="Rebuild artifacts even if already cached.")
    args = parser.parse_args()
    repos = args.repos or DEFAULT_REPOS

    if not settings.anthropic_api_key:
        print("⚠ ANTHROPIC_API_KEY not set — running against the free cascade.")
        print(" Cached artifacts will not represent premium quality.")
    else:
        print("Premium tier: enabled (model is configured in GenerationService).")

    # One set of service objects is shared across all repos.
    store = QdrantStore()
    embedder = Embedder()
    gen = GenerationService()
    gen.premium_mode = True  # whole script runs at premium quality
    diagram_svc = DiagramService(store, gen)
    repo_map_svc = RepoMapService(store)
    readme_svc = ReadmeService(repo_map_svc, gen, store)

    print(f"\nBaking {len(repos)} repo(s) with premium_mode=True\n")
    failed = []
    for repo in repos:
        if not bake_one(repo, store, gen, embedder, diagram_svc, readme_svc, repo_map_svc, args.force):
            failed.append(repo)

    ok = len(repos) - len(failed)
    print(f"\nDone: {ok}/{len(repos)} baked.")
    if failed:
        # Name the failures so they are not lost in the scrollback.
        print(f"Failed: {', '.join(failed)}")
    return 1 if failed else 0
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())