File size: 9,589 Bytes
e528876
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9f829fa
 
 
 
 
 
 
 
 
 
 
 
 
e528876
 
 
9f829fa
 
 
 
e528876
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5f313bc
 
 
 
 
 
e528876
5f313bc
 
e528876
5f313bc
 
e528876
5f313bc
e528876
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
"""
scripts/prebake_repos.py — Generate the canonical artifact set for one
or more repos using the premium tier (Claude Sonnet 4.6).

For each repo the CLI ensures:
  - the repo is ingested with contextual retrieval (force re-index if missing)
  - tour data is generated and persisted to Qdrant
  - architecture and class diagrams are generated and persisted
  - README is generated and persisted
  - the repo_map is built and persisted

All generation calls go through the premium client when ANTHROPIC_API_KEY
is set, so the cached artifacts represent the highest quality this app
can produce. Once cached, every subsequent visitor reads them from Qdrant
without re-running an LLM.

Usage:
  python -m scripts.prebake_repos                                # default Karpathy set
  python -m scripts.prebake_repos owner/repo other/repo          # specific repos
  python -m scripts.prebake_repos --force karpathy/nanoGPT       # rebuild even if cached

Environment:
  ANTHROPIC_API_KEY — required for premium quality. Without it the script
                       runs against the free cascade with a warning.
"""

import argparse
import sys
import time
from pathlib import Path

# Put the repository root (the parent of this script's directory) at the front
# of sys.path before the project imports below — presumably so the `backend`
# and `ingestion` packages resolve against this checkout. The path is derived
# from __file__, so it does not depend on the current working directory.
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))

from backend.config import settings  # noqa: E402
from backend.services.generation     import GenerationService     # noqa: E402
from backend.services.diagram_service import DiagramService        # noqa: E402
from backend.services.readme_service  import ReadmeService         # noqa: E402
from backend.services.repo_map_service import RepoMapService       # noqa: E402
from backend.services.ingestion_service import IngestionService    # noqa: E402
from ingestion.embedder      import Embedder                        # noqa: E402
from ingestion.qdrant_store  import QdrantStore                     # noqa: E402


# Default set: Karpathy's well-known learning-oriented repos. These are the
# repos most users land on for tutorials / understanding fundamental code.
# Entries are "owner/name" GitHub slugs. The list is used only when no repos
# are passed on the command line; any positional CLI args replace it entirely.
DEFAULT_REPOS = [
    "karpathy/autoresearch",
    "karpathy/micrograd",
    "karpathy/nanochat",
    "karpathy/nanoGPT",
]

# Diagram types we cache. "architecture" + "class" cover the two non-tour
# diagram views surfaced in the UI. Each entry is baked as its own step for
# every repo.
DIAGRAM_TYPES = ["architecture", "class"]


def repo_indexed(store: QdrantStore, repo: str) -> bool:
    """Report whether the store holds at least one chunk for ``repo``.

    Best-effort probe: any exception raised while counting is treated as
    "not indexed" rather than propagated.
    """
    try:
        has_chunks = store.count(repo=repo) > 0
    except Exception:
        has_chunks = False
    return has_chunks


def ingest(repo: str, store: QdrantStore, gen: GenerationService, embedder: Embedder) -> bool:
    """Re-ingest ``repo`` from GitHub with force=True so contextual retrieval runs.

    Even when the repo is already indexed we re-run — premium prebake
    must end with premium-quality contextual descriptions on every chunk,
    not just whatever the previous (possibly free-tier) ingestion left
    behind. The Voyage embeddings are deduplicated by content hash so this
    isn't as expensive as it sounds: only changed/new chunks pay the
    embed cost; only chunks needing fresh contextual retrieval pay the
    LLM cost.

    Args:
        repo: GitHub slug in "owner/name" form; expanded to
            ``https://github.com/<repo>``.
        store: Store queried for the existing chunk count and handed to the
            ingestion service.
        gen: Generation service handed to the ingestion service.
        embedder: Embedder handed to the ingestion service.

    Returns:
        True if ingestion finished, False if it raised (the error is printed).
    """
    # A single guarded count lookup drives both the indexed/not-indexed
    # decision and the progress message. A store error here is treated as
    # "not indexed" instead of aborting the whole run.
    try:
        existing_chunks = store.count(repo=repo)
        already = existing_chunks > 0
    except Exception:
        existing_chunks, already = 0, False

    if already:
        print(f"  ▸ re-ingesting {repo} ({existing_chunks} chunks already indexed)…")
    else:
        print(f"  ▸ ingesting {repo}…")

    ingestion = IngestionService(store=store, embedder=embedder, gen=gen)
    repo_url = f"https://github.com/{repo}"

    # Progress callback: print a milestone only when the step name changes,
    # so repeated updates within one step stay quiet.
    last_step = ""

    def on_progress(step: str, detail: str) -> None:
        nonlocal last_step
        if step != last_step:
            print(f"     · {step}")
            last_step = step

    try:
        # force=True triggers contextual retrieval enrichment. With
        # premium_mode enabled on `gen` (main() sets it), those generation
        # calls are expected to go to the premium client.
        result = ingestion.ingest(repo_url, force=True, progress=on_progress)
        print(f"  ✓ ingested ({result.get('chunks_stored', '?')} chunks)")
        return True
    except Exception as e:
        print(f"  ✗ ingestion crashed: {e}")
        return False


def bake_tour(repo: str, diagram_svc: DiagramService, force: bool) -> bool:
    """Generate the tour for ``repo`` by draining the service's tour stream.

    Skips the work when a tour can already be loaded and ``force`` is off.
    While streaming, each change of stage is echoed with its progress
    percentage. An "error" stage or any raised exception yields False.
    """
    if not force:
        if diagram_svc._load_tour(repo) is not None:
            print("  ✓ tour cached — skipping (use --force to rebuild)")
            return True

    print("  ▸ tour…")
    previous = None
    try:
        for update in diagram_svc.build_tour_stream(repo, force=force):
            current = update.get("stage")
            if current and current != previous:
                # A missing/None progress value is shown as 0%.
                percent = int((update.get("progress") or 0) * 100)
                print(f"     · {current} ({percent}%)")
                previous = current
            if current == "error":
                print(f"  ✗ tour failed: {update.get('error')}")
                return False
        print("  ✓ tour cached")
        return True
    except Exception as exc:
        print(f"  ✗ tour crashed: {exc}")
        return False


def bake_diagram(repo: str, diagram_type: str, diagram_svc: DiagramService, force: bool) -> bool:
    """Generate one diagram of ``diagram_type`` for ``repo``.

    Skips the work when that diagram can already be loaded and ``force`` is
    off. The build stream is consumed until it ends or until the first event
    whose stage is "error"; an error event or a raised exception yields False.
    """
    if not force:
        if diagram_svc._load_diagram(repo, diagram_type) is not None:
            print(f"  ✓ {diagram_type} diagram cached — skipping (use --force to rebuild)")
            return True

    print(f"  ▸ {diagram_type} diagram…")
    try:
        stream = diagram_svc.build_diagram_stream(repo, diagram_type, force=force)
        # Pull events until the first error event; None means the stream
        # completed without reporting one.
        failure = next((ev for ev in stream if ev.get("stage") == "error"), None)
        if failure is not None:
            print(f"  ✗ {diagram_type} diagram failed: {failure.get('error')}")
            return False
        print(f"  ✓ {diagram_type} diagram cached")
        return True
    except Exception as exc:
        print(f"  ✗ {diagram_type} crashed: {exc}")
        return False


def bake_readme(repo: str, readme_svc: ReadmeService, store: QdrantStore, force: bool) -> bool:
    """Generate the README artifact for ``repo``.

    Skips the work when the store already returns a truthy "readme" artifact
    and ``force`` is off. The build stream is consumed until it ends or until
    the first event whose stage is "error"; an error event or a raised
    exception yields False.
    """
    if not force:
        if store.load_artifact(repo, "readme"):
            print("  ✓ readme cached — skipping (use --force to rebuild)")
            return True

    print("  ▸ readme…")
    try:
        stream = readme_svc.build_readme_stream(repo, force=force)
        # Pull events until the first error event; None means the stream
        # completed without reporting one.
        failure = next((ev for ev in stream if ev.get("stage") == "error"), None)
        if failure is not None:
            print(f"  ✗ readme failed: {failure.get('error')}")
            return False
        print("  ✓ readme cached")
        return True
    except Exception as exc:
        print(f"  ✗ readme crashed: {exc}")
        return False


def bake_repo_map(repo: str, repo_map_svc: RepoMapService, force: bool) -> bool:
    """Build the repo_map for ``repo`` via the service's get_or_build().

    There is no explicit cache probe here: get_or_build() is called every
    time, and with ``force`` the existing entry is invalidated first so the
    call cannot be satisfied by a stale map.

    Args:
        repo: Repo identifier passed straight to the service.
        repo_map_svc: Service providing invalidate() and get_or_build().
        force: When True, invalidate before building.

    Returns:
        True if the map was built/loaded, False if invalidation or building
        raised (the error is printed).
    """
    print("  ▸ repo_map…")
    try:
        # Invalidation lives inside the guarded region so that a failure here
        # is reported as a failed step instead of aborting the whole batch.
        if force:
            repo_map_svc.invalidate(repo)
        repo_map_svc.get_or_build(repo)
        print("  ✓ repo_map cached")
        return True
    except Exception as e:
        print(f"  ✗ repo_map crashed: {e}")
        return False


def bake_one(
    repo: str,
    store: QdrantStore,
    gen: GenerationService,
    embedder: Embedder,
    diagram_svc: DiagramService,
    readme_svc: ReadmeService,
    repo_map_svc: RepoMapService,
    force: bool,
) -> bool:
    """Run the full bake pipeline for a single repo.

    Ingestion runs first; if it fails nothing else is attempted and False is
    returned immediately. Otherwise every artifact step runs in order
    (repo_map, tour, one diagram per DIAGRAM_TYPES entry, readme) even if an
    earlier step failed, and the names of the failed steps are reported.

    Returns:
        True only when ingestion and every artifact step succeeded.
    """
    print(f"\n=== {repo} ===")
    t0 = time.monotonic()

    if not ingest(repo, store, gen, embedder):
        return False

    # Ordered (label, thunk) table. Each thunk returns False on failure, so
    # the final result reflects whether the repo is *actually* fully baked,
    # not just whether ingestion succeeded.
    steps = [
        ("repo_map", lambda: bake_repo_map(repo, repo_map_svc, force)),
        ("tour", lambda: bake_tour(repo, diagram_svc, force)),
    ]
    steps.extend(
        (f"diagram:{kind}", lambda kind=kind: bake_diagram(repo, kind, diagram_svc, force))
        for kind in DIAGRAM_TYPES
    )
    steps.append(("readme", lambda: bake_readme(repo, readme_svc, store, force)))

    failures: list[str] = [label for label, run_step in steps if not run_step()]

    elapsed = time.monotonic() - t0
    if failures:
        print(f"  ⚠  partial: {len(failures)} step(s) failed → {', '.join(failures)}")
    print(f"  ⏱  {elapsed:.1f}s")
    return not failures


def main() -> int:
    """Parse CLI arguments, wire up the services and bake every requested repo.

    Positional arguments select the repos (DEFAULT_REPOS when none are
    given); --force is forwarded to every bake step. premium_mode is switched
    on for the shared GenerationService regardless of whether an API key is
    configured — a missing key only produces a warning.

    Returns:
        Process exit code: 0 when every repo baked completely, 1 otherwise.
    """
    cli = argparse.ArgumentParser(description="Pre-bake artifact cache for canonical repos.")
    cli.add_argument("repos", nargs="*", help="Specific repos to bake (default: Karpathy set).")
    cli.add_argument("--force", action="store_true", help="Rebuild artifacts even if already cached.")
    options = cli.parse_args()

    targets = options.repos if options.repos else DEFAULT_REPOS

    if settings.anthropic_api_key:
        print("Premium tier: enabled (model is configured in GenerationService).")
    else:
        print("⚠ ANTHROPIC_API_KEY not set — running against the free cascade.")
        print("  Cached artifacts will not represent premium quality.")

    # Shared infrastructure, built once and reused for every repo.
    store = QdrantStore()
    embedder = Embedder()
    gen = GenerationService()
    gen.premium_mode = True   # whole script runs at premium quality

    diagram_svc = DiagramService(store, gen)
    repo_map_svc = RepoMapService(store)
    readme_svc = ReadmeService(repo_map_svc, gen, store)

    total = len(targets)
    print(f"\nBaking {total} repo(s) with premium_mode=True\n")

    # Count the repos whose pipeline finished without any failed step.
    baked = sum(
        1
        for repo in targets
        if bake_one(repo, store, gen, embedder, diagram_svc, readme_svc, repo_map_svc, options.force)
    )

    print(f"\nDone: {baked}/{total} baked.")
    return 0 if baked == total else 1


# Script entry point: hand main()'s return value to the shell as the exit status.
if __name__ == "__main__":
    sys.exit(main())