""" code_chunker.py — Split source files into semantically meaningful chunks. Two strategies depending on file type: 1. AST chunking (Python only) Parse the file into an Abstract Syntax Tree. Extract each top-level function and class as its own chunk. Classes include all their methods. Why: A function is the natural unit of code — it has a name, inputs, outputs, and a single responsibility. Splitting mid-function loses context. 2. Character-window chunking (everything else) Split by character count with overlap — same approach used for prose. Works for markdown, YAML, config files, and languages without AST support. Why not AST for all languages? Python's `ast` module is in the stdlib. Multi-language AST (tree-sitter) adds complexity. For a learning project, Python AST + fallback covers 80% of cases cleanly. Chunk shape (returned by both strategies): { "text": str, # the actual code/text content "language": str, # "python", "typescript", etc. "filepath": str, # "src/auth/middleware.py" "chunk_type": str, # "function", "class", "module", "text" "name": str, # function/class name (or "" for text chunks) "start_line": int, # 1-indexed line where chunk starts "end_line": int, # 1-indexed line where chunk ends "calls": list[str], # names called by this function (AST only) "imports": list[str], # imported module names (module chunks only; [] elsewhere) "base_classes": list[str], # base class names (class chunks only; [] elsewhere) } The `calls` field is used to build the Code Knowledge Graph — an interactive D3 visualization of how functions call each other across files. It's extracted by the CallExtractor visitor which walks ast.Call nodes inside each function body. The `imports` field enables file-level dependency edges in the Architecture diagram. It records every module name imported at the top of the file (both "import X" and "from X import Y" forms), extracted from the module-level chunk only. The `base_classes` field enables real inheritance edges in the Class Hierarchy diagram. It records the names of parent classes from "class Foo(Bar, Baz):" declarations, extracted directly from each ClassDef node. """ import ast import textwrap from pathlib import Path # ── Call extractor ──────────────────────────────────────────────────────────── class _CallExtractor(ast.NodeVisitor): """ AST visitor that collects the names of all functions/methods called inside a function or class body. How ast.NodeVisitor works: - Subclass it and define visit_ methods. - Call self.visit(node) to start traversal from any node. - self.generic_visit(node) continues the walk into child nodes. Two kinds of calls in Python's AST: ast.Name: direct calls — foo(), bar() → node.func is an ast.Name, name is node.func.id ast.Attribute: method/attr calls — self.foo(), obj.method() → node.func is an ast.Attribute, name is node.func.attr We collect only the leaf name (not the full dotted path) because we match against function names in the index, not fully-qualified paths. """ def __init__(self): self.calls: list[str] = [] def visit_Call(self, node: ast.Call): if isinstance(node.func, ast.Attribute): self.calls.append(node.func.attr) # self.embed() → "embed" elif isinstance(node.func, ast.Name): self.calls.append(node.func.id) # embed() → "embed" self.generic_visit(node) # recurse into nested calls def _extract_calls(node: ast.AST) -> list[str]: """Extract unique called names from an AST node (function or class).""" extractor = _CallExtractor() extractor.visit(node) # Deduplicate while preserving order; filter builtins that add noise _NOISE = {"print", "len", "range", "isinstance", "str", "int", "list", "dict", "set", "tuple", "super", "hasattr", "getattr", "setattr", "append", "extend", "format", "join", "split", "strip", "get", "items", "keys", "values", "zip", "enumerate", "map", "filter"} seen = set() result = [] for name in extractor.calls: if name not in seen and name not in _NOISE: seen.add(name) result.append(name) return result def _extract_imports(tree: ast.AST) -> list[str]: """ Extract all imported module names from a parsed AST. Used to build file-level dependency edges for the Architecture diagram. Handles both forms: import os → ["os"] from micrograd.engine import Value → ["micrograd.engine"] from . import engine → [".engine"] (relative, handled by caller) """ imports = [] for node in ast.walk(tree): if isinstance(node, ast.Import): for alias in node.names: imports.append(alias.name) elif isinstance(node, ast.ImportFrom): module = node.module or "" level = node.level or 0 # number of dots for relative imports if level > 0: # Relative import — prefix with dots so caller can resolve them imports.append("." * level + module) elif module: imports.append(module) return list(dict.fromkeys(imports)) # deduplicate, preserve order def _extract_base_classes(node: ast.ClassDef) -> list[str]: """ Extract base class names from a ClassDef node. Used to build real inheritance edges for the Class Hierarchy diagram. Example: class MLP(Module): → ["Module"] Handles direct names (ast.Name) and dotted paths (ast.Attribute). """ bases = [] for base in node.bases: if isinstance(base, ast.Name): bases.append(base.id) elif isinstance(base, ast.Attribute): bases.append(base.attr) # e.g. nn.Module → "Module" # Filter trivial bases that add noise return [b for b in bases if b not in ("object", "ABC", "Enum")] # ── AST Chunking (Python) ───────────────────────────────────────────────────── def chunk_python(content: str, filepath: str) -> list[dict]: """ Parse Python source and extract functions and classes as individual chunks. Algorithm: 1. Parse content into an AST with ast.parse() 2. Walk top-level nodes looking for FunctionDef, AsyncFunctionDef, ClassDef 3. For each, extract the source lines using node.lineno / node.end_lineno 4. If a node is too large (>60 lines), split it further into sub-chunks What about module-level code (imports, constants, global statements)? We collect it as a single "module" chunk. It's useful context for understanding what a file imports and configures. """ try: tree = ast.parse(content) except SyntaxError as e: # Fall back to character-window if the file can't be parsed # (e.g. Python 2 syntax, encoding issues) print(f" [ast parse failed for {filepath}: {e}] → fallback chunking") return chunk_by_window(content, filepath, language="python") lines = content.splitlines() chunks = [] # Collect line numbers of all top-level definitions definition_lines = set() for node in ast.walk(tree): if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)): if hasattr(node, "lineno"): for ln in range(node.lineno, (node.end_lineno or node.lineno) + 1): definition_lines.add(ln) # ── Module-level chunk ──────────────────────────────────────────────────── # Lines not covered by any function/class (imports, constants, etc.) module_lines = [ line for i, line in enumerate(lines, 1) if i not in definition_lines ] module_text = "\n".join(module_lines).strip() if module_text: chunks.append({ "text": f"# {filepath}\n{module_text}", "language": "python", "filepath": filepath, "chunk_type": "module", "name": "", "start_line": 1, "end_line": len(lines), "calls": [], "imports": _extract_imports(tree), "base_classes": [], }) # ── Function and class chunks ───────────────────────────────────────────── for node in tree.body: if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)): continue start = node.lineno end = node.end_lineno or node.lineno node_lines = lines[start - 1 : end] node_text = "\n".join(node_lines) chunk_type = "class" if isinstance(node, ast.ClassDef) else "function" name = node.name # If the chunk is large, split into sub-chunks by method (for classes) # or by logical blocks (for large functions) if len(node_lines) > 80 and chunk_type == "class": sub_chunks = _split_class(node, lines, filepath) chunks.extend(sub_chunks) else: chunks.append({ "text": f"# {filepath}\n{node_text}", "language": "python", "filepath": filepath, "chunk_type": chunk_type, "name": name, "start_line": start, "end_line": end, "calls": _extract_calls(node), "imports": [], "base_classes": _extract_base_classes(node) if isinstance(node, ast.ClassDef) else [], }) return chunks if chunks else chunk_by_window(content, filepath, language="python") def _split_class(class_node: ast.ClassDef, lines: list[str], filepath: str) -> list[dict]: """ Split a large class into per-method chunks. Each method gets the class signature as a header so the LLM knows which class the method belongs to: class MyClass: def __init__(self): ... ↓ Chunk: "class MyClass:\n def __init__(self): ..." """ chunks = [] class_start = class_node.lineno class_header = lines[class_start - 1] # "class MyClass(Base):" for node in class_node.body: if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)): continue start = node.lineno end = node.end_lineno or node.lineno method_lines = lines[start - 1 : end] # Indent method lines if not already indented (should be) method_text = "\n".join(method_lines) chunks.append({ "text": f"# {filepath}\n{class_header}\n{method_text}", "language": "python", "filepath": filepath, "chunk_type": "function", "name": f"{class_node.name}.{node.name}", "start_line": start, "end_line": end, "calls": _extract_calls(node), "imports": [], "base_classes": [], }) # Also include the class-level code (class variables, docstring) class_end = class_node.end_lineno or class_node.lineno class_text = "\n".join(lines[class_start - 1 : class_end]) chunks.insert(0, { "text": f"# {filepath}\n{class_text[:800]}", # truncated overview "language": "python", "filepath": filepath, "chunk_type": "class", "name": class_node.name, "start_line": class_start, "end_line": class_end, "calls": _extract_calls(class_node), "imports": [], "base_classes": _extract_base_classes(class_node), }) return chunks # ── Character-window chunking (fallback) ────────────────────────────────────── def chunk_by_window( content: str, filepath: str, language: str = "text", chunk_size: int = 1000, chunk_overlap: int = 200, ) -> list[dict]: """ Split text into overlapping fixed-size character windows. Used for: - Markdown documentation (.md, .rst) - Config files (.yaml, .toml, .json) - Languages without AST support (TypeScript, Go, Rust, etc.) - Python files that failed to parse The overlap ensures that a concept spanning a chunk boundary isn't lost. With overlap=200, the last 200 chars of chunk N are the first 200 chars of chunk N+1. """ if not content.strip(): return [] lines = content.splitlines() chunks = [] start = 0 while start < len(content): end = min(start + chunk_size, len(content)) text = content[start:end] # Find approximate start/end line numbers for this character range start_line = content[:start].count("\n") + 1 end_line = content[:end].count("\n") + 1 chunks.append({ "text": f"# {filepath}\n{text}", "language": language, "filepath": filepath, "chunk_type": "text", "name": "", "start_line": start_line, "end_line": end_line, "calls": [], "imports": [], "base_classes": [], }) if end == len(content): break start = end - chunk_overlap return chunks # ── Main entry point ────────────────────────────────────────────────────────── def chunk_file(file: dict) -> list[dict]: """ Chunk a single file dict (as returned by repo_fetcher). Args: file: {"path": str, "content": str, "size": int, "repo": str} Returns: List of chunk dicts with text + metadata. """ from ingestion.file_filter import language_from_path filepath = file.get("path") or file.get("filepath", "") content = file["content"] language = language_from_path(filepath) repo = file.get("repo", "") if language == "python": chunks = chunk_python(content, filepath) else: chunks = chunk_by_window(content, filepath, language=language) # Attach repo to every chunk for chunk in chunks: chunk["repo"] = repo return chunks def chunk_files(files: list[dict]) -> list[dict]: """Chunk all files and return a flat list of all chunks.""" all_chunks = [] for file in files: file_chunks = chunk_file(file) all_chunks.extend(file_chunks) print(f" {file.get('path') or file.get('filepath', '?')} → {len(file_chunks)} chunks") print(f"Total: {len(all_chunks)} chunks from {len(files)} files") return all_chunks