File size: 15,494 Bytes
96926b4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23d5837
 
 
 
 
 
 
 
3ad88a4
 
96926b4
23d5837
 
 
 
3ad88a4
 
 
 
 
 
 
 
96926b4
 
 
 
 
 
 
23d5837
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3ad88a4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96926b4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3ad88a4
 
 
 
 
 
 
 
 
 
96926b4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3ad88a4
 
 
 
 
 
 
 
 
 
96926b4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3ad88a4
 
 
 
 
 
 
 
 
 
96926b4
 
 
 
 
 
3ad88a4
 
 
 
 
 
 
 
 
 
96926b4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3ad88a4
 
 
 
 
 
 
 
 
 
96926b4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a719c9c
96926b4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a719c9c
96926b4
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
"""
code_chunker.py — Split source files into semantically meaningful chunks.

Two strategies depending on file type:

1. AST chunking (Python only)
   Parse the file into an Abstract Syntax Tree. Extract each top-level
   function and class as its own chunk. Classes include all their methods.

   Why: A function is the natural unit of code — it has a name, inputs,
   outputs, and a single responsibility. Splitting mid-function loses context.

2. Character-window chunking (everything else)
   Split by character count with overlap β€” same approach used for prose.
   Works for markdown, YAML, config files, and languages without AST support.

   Why not AST for all languages? Python's `ast` module is in the stdlib.
   Multi-language AST (tree-sitter) adds complexity. For a learning project,
   Python AST + fallback covers 80% of cases cleanly.

Chunk shape (returned by both strategies):
  {
    "text":          str,        # the actual code/text content
    "language":      str,        # "python", "typescript", etc.
    "filepath":      str,        # "src/auth/middleware.py"
    "chunk_type":    str,        # "function", "class", "module", "text"
    "name":          str,        # function/class name (or "" for text chunks)
    "start_line":    int,        # 1-indexed line where chunk starts
    "end_line":      int,        # 1-indexed line where chunk ends
    "calls":         list[str],  # names called by this function (AST only)
    "imports":       list[str],  # imported module names (module chunks only; [] elsewhere)
    "base_classes":  list[str],  # base class names (class chunks only; [] elsewhere)
  }

The `calls` field is used to build the Code Knowledge Graph — an interactive
D3 visualization of how functions call each other across files. It's extracted
by the _CallExtractor visitor, which walks ast.Call nodes inside each function body.

The `imports` field enables file-level dependency edges in the Architecture diagram.
It records every module name imported at the top of the file (both "import X" and
"from X import Y" forms), extracted from the module-level chunk only.

The `base_classes` field enables real inheritance edges in the Class Hierarchy diagram.
It records the names of parent classes from "class Foo(Bar, Baz):" declarations,
extracted directly from each ClassDef node.
"""

import ast
import textwrap
from pathlib import Path


# ── Call extractor ────────────────────────────────────────────────────────────

class _CallExtractor(ast.NodeVisitor):
    """
    AST visitor that collects the names of all functions/methods called
    inside a function or class body.

    How ast.NodeVisitor works:
      - Subclass it and define visit_<NodeType> methods.
      - Call self.visit(node) to start traversal from any node.
      - self.generic_visit(node) continues the walk into child nodes.

    Two kinds of calls in Python's AST:
      ast.Name:      direct calls β€” foo(), bar()
                     β†’ node.func is an ast.Name, name is node.func.id
      ast.Attribute: method/attr calls β€” self.foo(), obj.method()
                     β†’ node.func is an ast.Attribute, name is node.func.attr

    We collect only the leaf name (not the full dotted path) because we match
    against function names in the index, not fully-qualified paths.
    """
    def __init__(self):
        self.calls: list[str] = []

    def visit_Call(self, node: ast.Call):
        if isinstance(node.func, ast.Attribute):
            self.calls.append(node.func.attr)          # self.embed() β†’ "embed"
        elif isinstance(node.func, ast.Name):
            self.calls.append(node.func.id)            # embed() β†’ "embed"
        self.generic_visit(node)                       # recurse into nested calls


def _extract_calls(node: ast.AST) -> list[str]:
    """Return the unique, noise-filtered call names found under *node*,
    in first-seen order."""
    visitor = _CallExtractor()
    visitor.visit(node)
    # Builtins and ubiquitous methods that would clutter the call graph.
    noise = {"print", "len", "range", "isinstance", "str", "int", "list",
             "dict", "set", "tuple", "super", "hasattr", "getattr", "setattr",
             "append", "extend", "format", "join", "split", "strip", "get",
             "items", "keys", "values", "zip", "enumerate", "map", "filter"}
    ordered_unique: list[str] = []
    seen: set[str] = set()
    for called in visitor.calls:
        if called in noise or called in seen:
            continue
        seen.add(called)
        ordered_unique.append(called)
    return ordered_unique


def _extract_imports(tree: ast.AST) -> list[str]:
    """
    Extract all imported module names from a parsed AST.
    Used to build file-level dependency edges for the Architecture diagram.

    Handles both forms:
      import os                  β†’ ["os"]
      from micrograd.engine import Value β†’ ["micrograd.engine"]
      from . import engine       β†’ [".engine"] (relative, handled by caller)
    """
    imports = []
    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            for alias in node.names:
                imports.append(alias.name)
        elif isinstance(node, ast.ImportFrom):
            module = node.module or ""
            level  = node.level or 0   # number of dots for relative imports
            if level > 0:
                # Relative import β€” prefix with dots so caller can resolve them
                imports.append("." * level + module)
            elif module:
                imports.append(module)
    return list(dict.fromkeys(imports))  # deduplicate, preserve order


def _extract_base_classes(node: ast.ClassDef) -> list[str]:
    """
    Extract base class names from a ClassDef node.
    Used to build real inheritance edges for the Class Hierarchy diagram.

    Example: class MLP(Module): β†’ ["Module"]
    Handles direct names (ast.Name) and dotted paths (ast.Attribute).
    """
    bases = []
    for base in node.bases:
        if isinstance(base, ast.Name):
            bases.append(base.id)
        elif isinstance(base, ast.Attribute):
            bases.append(base.attr)   # e.g. nn.Module β†’ "Module"
    # Filter trivial bases that add noise
    return [b for b in bases if b not in ("object", "ABC", "Enum")]


# ── AST Chunking (Python) ─────────────────────────────────────────────────────

def chunk_python(content: str, filepath: str) -> list[dict]:
    """
    Parse Python source and extract functions and classes as individual chunks.

    Algorithm:
      1. Parse content into an AST with ast.parse()
      2. Walk top-level nodes looking for FunctionDef, AsyncFunctionDef, ClassDef
      3. For each, extract the source lines using node.lineno / node.end_lineno
      4. If a class is too large (>80 lines), delegate to _split_class() for
         per-method sub-chunks (large functions are kept whole)

    Module-level code (imports, constants, global statements) is collected
    as a single "module" chunk. It's useful context for understanding what
    a file imports and configures.

    Args:
        content:  full Python source text.
        filepath: path used to tag each chunk; also prepended as a comment
                  header inside each chunk's "text".

    Returns:
        List of chunk dicts (shape described in the module docstring).
        Falls back to chunk_by_window() when parsing fails or when no
        chunks are produced.
    """
    try:
        tree = ast.parse(content)
    except SyntaxError as e:
        # Fall back to character-window if the file can't be parsed
        # (e.g. Python 2 syntax, encoding issues)
        print(f"  [ast parse failed for {filepath}: {e}] β†’ fallback chunking")
        return chunk_by_window(content, filepath, language="python")

    lines = content.splitlines()
    chunks = []

    # Collect line numbers covered by any definition. ast.walk also visits
    # nested defs/classes, so lines inside methods are marked as well.
    definition_lines = set()
    for node in ast.walk(tree):
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
            if hasattr(node, "lineno"):
                for ln in range(node.lineno, (node.end_lineno or node.lineno) + 1):
                    definition_lines.add(ln)

    # ── Module-level chunk ────────────────────────────────────────────────────
    # Lines not covered by any function/class (imports, constants, etc.)
    module_lines = [
        line for i, line in enumerate(lines, 1)
        if i not in definition_lines
    ]
    module_text = "\n".join(module_lines).strip()
    if module_text:
        chunks.append({
            "text":         f"# {filepath}\n{module_text}",
            "language":     "python",
            "filepath":     filepath,
            "chunk_type":   "module",
            "name":         "",
            # module lines are scattered, so the span is the whole file
            "start_line":   1,
            "end_line":     len(lines),
            "calls":        [],
            "imports":      _extract_imports(tree),
            "base_classes": [],
        })

    # ── Function and class chunks ─────────────────────────────────────────────
    for node in tree.body:
        if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
            continue

        # 1-indexed. NOTE(review): node.lineno excludes decorator lines —
        # confirm decorators are meant to be dropped from chunk text.
        start = node.lineno
        end   = node.end_lineno or node.lineno
        node_lines = lines[start - 1 : end]
        node_text  = "\n".join(node_lines)

        chunk_type = "class" if isinstance(node, ast.ClassDef) else "function"
        name = node.name

        # If the chunk is large, split into sub-chunks by method (for classes)
        # or by logical blocks (for large functions)
        if len(node_lines) > 80 and chunk_type == "class":
            sub_chunks = _split_class(node, lines, filepath)
            chunks.extend(sub_chunks)
        else:
            chunks.append({
                "text":         f"# {filepath}\n{node_text}",
                "language":     "python",
                "filepath":     filepath,
                "chunk_type":   chunk_type,
                "name":         name,
                "start_line":   start,
                "end_line":     end,
                "calls":        _extract_calls(node),
                "imports":      [],
                "base_classes": _extract_base_classes(node) if isinstance(node, ast.ClassDef) else [],
            })

    return chunks if chunks else chunk_by_window(content, filepath, language="python")


def _split_class(class_node: ast.ClassDef, lines: list[str], filepath: str) -> list[dict]:
    """
    Break an oversized class into one chunk per method, preceded by a
    truncated class-overview chunk.

    Each method chunk is prefixed with the class declaration line so the
    LLM knows which class the method belongs to:

      class MyClass:
          def __init__(self): ...
          (becomes)
      Chunk: "class MyClass:\n    def __init__(self): ..."

    NOTE(review): only the single line at class_node.lineno is used as the
    header, so multi-line base lists and decorators are not captured —
    confirm that is acceptable.
    """
    first = class_node.lineno
    last = class_node.end_lineno or class_node.lineno
    header = lines[first - 1]   # "class MyClass(Base):"

    method_chunks = []
    for member in class_node.body:
        if not isinstance(member, (ast.FunctionDef, ast.AsyncFunctionDef)):
            continue
        m_start = member.lineno
        m_end = member.end_lineno or member.lineno
        body_text = "\n".join(lines[m_start - 1 : m_end])
        method_chunks.append({
            "text":         f"# {filepath}\n{header}\n{body_text}",
            "language":     "python",
            "filepath":     filepath,
            "chunk_type":   "function",
            "name":         f"{class_node.name}.{member.name}",
            "start_line":   m_start,
            "end_line":     m_end,
            "calls":        _extract_calls(member),
            "imports":      [],
            "base_classes": [],
        })

    # Class-level overview (class variables, docstring), truncated to 800
    # chars, placed ahead of the per-method chunks.
    overview_text = "\n".join(lines[first - 1 : last])
    overview = {
        "text":         f"# {filepath}\n{overview_text[:800]}",
        "language":     "python",
        "filepath":     filepath,
        "chunk_type":   "class",
        "name":         class_node.name,
        "start_line":   first,
        "end_line":     last,
        "calls":        _extract_calls(class_node),
        "imports":      [],
        "base_classes": _extract_base_classes(class_node),
    }
    return [overview, *method_chunks]


# ── Character-window chunking (fallback) ──────────────────────────────────────

def chunk_by_window(
    content:   str,
    filepath:  str,
    language:  str = "text",
    chunk_size:  int = 1000,
    chunk_overlap: int = 200,
) -> list[dict]:
    """
    Split text into overlapping fixed-size character windows.

    Used for:
      - Markdown documentation (.md, .rst)
      - Config files (.yaml, .toml, .json)
      - Languages without AST support (TypeScript, Go, Rust, etc.)
      - Python files that failed to parse

    The overlap ensures that a concept spanning a chunk boundary isn't lost.
    With overlap=200, the last 200 chars of chunk N are the first 200 chars
    of chunk N+1.

    Args:
        content:       raw text to split.
        filepath:      path tag, also prepended as a comment header in "text".
        language:      language label stored on every chunk.
        chunk_size:    window width in characters; must be > 0.
        chunk_overlap: characters shared between consecutive windows;
                       must satisfy 0 <= chunk_overlap < chunk_size
                       (otherwise the window start would never advance).

    Returns:
        List of chunk dicts; empty list for blank/whitespace-only content.

    Raises:
        ValueError: on invalid chunk_size / chunk_overlap combinations.
    """
    # Validate up front: overlap >= size would make the loop step
    # (end - chunk_overlap) non-positive and spin forever.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError(
            f"chunk_overlap must be in [0, chunk_size), got {chunk_overlap}"
        )

    if not content.strip():
        return []

    chunks = []
    start  = 0

    while start < len(content):
        end  = min(start + chunk_size, len(content))
        text = content[start:end]

        # Approximate 1-indexed line numbers for this character range
        start_line = content[:start].count("\n") + 1
        end_line   = content[:end].count("\n") + 1

        chunks.append({
            "text":         f"# {filepath}\n{text}",
            "language":     language,
            "filepath":     filepath,
            "chunk_type":   "text",
            "name":         "",
            "start_line":   start_line,
            "end_line":     end_line,
            "calls":        [],
            "imports":      [],
            "base_classes": [],
        })

        if end == len(content):
            break
        start = end - chunk_overlap

    return chunks


# ── Main entry point ──────────────────────────────────────────────────────────

def chunk_file(file: dict) -> list[dict]:
    """
    Chunk one file dict (the shape produced by repo_fetcher).

    Args:
        file: {"path": str, "content": str, "size": int, "repo": str}

    Returns:
        Chunk dicts (text + metadata), each stamped with the file's repo.
    """
    # Imported lazily to avoid a module-level dependency cycle.
    from ingestion.file_filter import language_from_path

    filepath = file.get("path") or file.get("filepath", "")
    source   = file["content"]
    repo     = file.get("repo", "")
    language = language_from_path(filepath)

    if language == "python":
        result = chunk_python(source, filepath)
    else:
        result = chunk_by_window(source, filepath, language=language)

    # Stamp the repo onto every chunk so chunks are globally identifiable.
    for piece in result:
        piece["repo"] = repo

    return result


def chunk_files(files: list[dict]) -> list[dict]:
    """Chunk every file and return all resulting chunks as one flat list."""
    collected: list[dict] = []
    for entry in files:
        produced = chunk_file(entry)
        collected += produced
        print(f"  {entry.get('path') or entry.get('filepath', '?')} β†’ {len(produced)} chunks")
    print(f"Total: {len(collected)} chunks from {len(files)} files")
    return collected