| """ | |
| code_chunker.py β Split source files into semantically meaningful chunks. | |
| Two strategies depending on file type: | |
| 1. AST chunking (Python only) | |
| Parse the file into an Abstract Syntax Tree. Extract each top-level | |
| function and class as its own chunk. Classes include all their methods. | |
| Why: A function is the natural unit of code β it has a name, inputs, | |
| outputs, and a single responsibility. Splitting mid-function loses context. | |
| 2. Character-window chunking (everything else) | |
| Split by character count with overlap β same approach used for prose. | |
| Works for markdown, YAML, config files, and languages without AST support. | |
| Why not AST for all languages? Python's `ast` module is in the stdlib. | |
| Multi-language AST (tree-sitter) adds complexity. For a learning project, | |
| Python AST + fallback covers 80% of cases cleanly. | |
| Chunk shape (returned by both strategies): | |
| { | |
| "text": str, # the actual code/text content | |
| "language": str, # "python", "typescript", etc. | |
| "filepath": str, # "src/auth/middleware.py" | |
| "chunk_type": str, # "function", "class", "module", "text" | |
| "name": str, # function/class name (or "" for text chunks) | |
| "start_line": int, # 1-indexed line where chunk starts | |
| "end_line": int, # 1-indexed line where chunk ends | |
| "calls": list[str], # names called by this function (AST only) | |
| "imports": list[str], # imported module names (module chunks only; [] elsewhere) | |
| "base_classes": list[str], # base class names (class chunks only; [] elsewhere) | |
| } | |
| The `calls` field is used to build the Code Knowledge Graph β an interactive | |
| D3 visualization of how functions call each other across files. It's extracted | |
| by the CallExtractor visitor which walks ast.Call nodes inside each function body. | |
| The `imports` field enables file-level dependency edges in the Architecture diagram. | |
| It records every module name imported at the top of the file (both "import X" and | |
| "from X import Y" forms), extracted from the module-level chunk only. | |
| The `base_classes` field enables real inheritance edges in the Class Hierarchy diagram. | |
| It records the names of parent classes from "class Foo(Bar, Baz):" declarations, | |
| extracted directly from each ClassDef node. | |
| """ | |
| import ast | |
| import textwrap | |
| from pathlib import Path | |


# ── Call extractor ────────────────────────────────────────────────────────────

class _CallExtractor(ast.NodeVisitor):
    """
    AST visitor that collects the names of all functions/methods called
    inside a function or class body.

    How ast.NodeVisitor works:
    - Subclass it and define visit_<NodeType> methods.
    - Call self.visit(node) to start traversal from any node.
    - self.generic_visit(node) continues the walk into child nodes.

    Two kinds of calls in Python's AST:
        ast.Name:      direct calls       → foo(), bar()
                       → node.func is an ast.Name, name is node.func.id
        ast.Attribute: method/attr calls  → self.foo(), obj.method()
                       → node.func is an ast.Attribute, name is node.func.attr

    We collect only the leaf name (not the full dotted path) because we match
    against function names in the index, not fully-qualified paths.
    """

    def __init__(self):
        self.calls: list[str] = []

    def visit_Call(self, node: ast.Call):
        if isinstance(node.func, ast.Attribute):
            self.calls.append(node.func.attr)  # self.embed() → "embed"
        elif isinstance(node.func, ast.Name):
            self.calls.append(node.func.id)    # embed() → "embed"
        self.generic_visit(node)  # recurse into nested calls


def _extract_calls(node: ast.AST) -> list[str]:
    """Extract unique called names from an AST node (function or class)."""
    extractor = _CallExtractor()
    extractor.visit(node)
    # Deduplicate while preserving order; filter builtins that add noise
    _NOISE = {"print", "len", "range", "isinstance", "str", "int", "list",
              "dict", "set", "tuple", "super", "hasattr", "getattr", "setattr",
              "append", "extend", "format", "join", "split", "strip", "get",
              "items", "keys", "values", "zip", "enumerate", "map", "filter"}
    seen = set()
    result = []
    for name in extractor.calls:
        if name not in seen and name not in _NOISE:
            seen.add(name)
            result.append(name)
    return result
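
# Illustrative sketch of the extractor in action (the snippet and the
# fetch_remote/embed names below are made up; the exact output depends on
# the _NOISE filter above):
#
#     fn = ast.parse("def sync():\n    fetch_remote()\n    self.embed()\n    print('ok')").body[0]
#     _extract_calls(fn)   # → ["fetch_remote", "embed"]   ("print" is filtered as noise)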


def _extract_imports(tree: ast.AST) -> list[str]:
    """
    Extract all imported module names from a parsed AST.
    Used to build file-level dependency edges for the Architecture diagram.

    Handles both forms:
        import os                           → ["os"]
        from micrograd.engine import Value  → ["micrograd.engine"]
        from .engine import Value           → [".engine"]  (relative; dots resolved by the caller)
    """
    imports = []
    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            for alias in node.names:
                imports.append(alias.name)
        elif isinstance(node, ast.ImportFrom):
            module = node.module or ""
            level = node.level or 0  # number of leading dots for relative imports
            if level > 0:
                # Relative import → prefix with dots so the caller can resolve them
                imports.append("." * level + module)
            elif module:
                imports.append(module)
    return list(dict.fromkeys(imports))  # deduplicate, preserve order
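
# Quick illustration (hypothetical file head): a module that begins with
# "import os", "import os.path", and "from collections import OrderedDict"
# yields ["os", "os.path", "collections"] - order preserved, duplicates dropped.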


def _extract_base_classes(node: ast.ClassDef) -> list[str]:
    """
    Extract base class names from a ClassDef node.
    Used to build real inheritance edges for the Class Hierarchy diagram.

    Example: class MLP(Module):  → ["Module"]
    Handles direct names (ast.Name) and dotted paths (ast.Attribute).
    """
    bases = []
    for base in node.bases:
        if isinstance(base, ast.Name):
            bases.append(base.id)
        elif isinstance(base, ast.Attribute):
            bases.append(base.attr)  # e.g. nn.Module → "Module"
    # Filter trivial bases that add noise
    return [b for b in bases if b not in ("object", "ABC", "Enum")]


# ── AST Chunking (Python) ─────────────────────────────────────────────────────

def chunk_python(content: str, filepath: str) -> list[dict]:
    """
    Parse Python source and extract functions and classes as individual chunks.

    Algorithm:
    1. Parse content into an AST with ast.parse()
    2. Walk top-level nodes looking for FunctionDef, AsyncFunctionDef, ClassDef
    3. For each, extract the source lines using node.lineno / node.end_lineno
    4. If a class is too large (> 80 lines), split it further into per-method sub-chunks

    What about module-level code (imports, constants, global statements)?
    We collect it as a single "module" chunk. It's useful context for
    understanding what a file imports and configures.
    """
    try:
        tree = ast.parse(content)
    except SyntaxError as e:
        # Fall back to character-window chunking if the file can't be parsed
        # (e.g. Python 2 syntax, encoding issues)
        print(f"  [ast parse failed for {filepath}: {e}] → fallback chunking")
        return chunk_by_window(content, filepath, language="python")

    lines = content.splitlines()
    chunks = []

    # Collect the line numbers covered by any function/class definition
    definition_lines = set()
    for node in ast.walk(tree):
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
            if hasattr(node, "lineno"):
                for ln in range(node.lineno, (node.end_lineno or node.lineno) + 1):
                    definition_lines.add(ln)

    # ── Module-level chunk ─────────────────────────────────────────────────
    # Lines not covered by any function/class (imports, constants, etc.)
    module_lines = [
        line for i, line in enumerate(lines, 1)
        if i not in definition_lines
    ]
    module_text = "\n".join(module_lines).strip()
    if module_text:
        chunks.append({
            "text": f"# {filepath}\n{module_text}",
            "language": "python",
            "filepath": filepath,
            "chunk_type": "module",
            "name": "",
            "start_line": 1,
            "end_line": len(lines),
            "calls": [],
            "imports": _extract_imports(tree),
            "base_classes": [],
        })

    # ── Function and class chunks ────────────────────────────────────────────
    for node in tree.body:
        if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
            continue

        start = node.lineno
        end = node.end_lineno or node.lineno
        node_lines = lines[start - 1 : end]
        node_text = "\n".join(node_lines)
        chunk_type = "class" if isinstance(node, ast.ClassDef) else "function"
        name = node.name

        # If a class chunk is large, split it into per-method sub-chunks;
        # large functions are kept whole.
        if len(node_lines) > 80 and chunk_type == "class":
            sub_chunks = _split_class(node, lines, filepath)
            chunks.extend(sub_chunks)
        else:
            chunks.append({
                "text": f"# {filepath}\n{node_text}",
                "language": "python",
                "filepath": filepath,
                "chunk_type": chunk_type,
                "name": name,
                "start_line": start,
                "end_line": end,
                "calls": _extract_calls(node),
                "imports": [],
                "base_classes": _extract_base_classes(node) if isinstance(node, ast.ClassDef) else [],
            })

    return chunks if chunks else chunk_by_window(content, filepath, language="python")
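
# Rough sketch of chunk_python's output for a tiny made-up file (fields abridged):
#
#     chunk_python("import os\n\ndef save(path):\n    os.makedirs(path)\n", "util.py")
#     # → [{"chunk_type": "module", "name": "", "imports": ["os"], ...},
#     #    {"chunk_type": "function", "name": "save", "calls": ["makedirs"], ...}]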


def _split_class(class_node: ast.ClassDef, lines: list[str], filepath: str) -> list[dict]:
    """
    Split a large class into per-method chunks.

    Each method gets the class signature as a header so the LLM knows
    which class the method belongs to:

        class MyClass:
            def __init__(self): ...
        →
        Chunk: "class MyClass:\n    def __init__(self): ..."
    """
    chunks = []
    class_start = class_node.lineno
    class_header = lines[class_start - 1]  # "class MyClass(Base):"

    for node in class_node.body:
        if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            continue
        start = node.lineno
        end = node.end_lineno or node.lineno
        method_lines = lines[start - 1 : end]
        # Method lines keep their original indentation under the class header
        method_text = "\n".join(method_lines)
        chunks.append({
            "text": f"# {filepath}\n{class_header}\n{method_text}",
            "language": "python",
            "filepath": filepath,
            "chunk_type": "function",
            "name": f"{class_node.name}.{node.name}",
            "start_line": start,
            "end_line": end,
            "calls": _extract_calls(node),
            "imports": [],
            "base_classes": [],
        })

    # Also include the class-level code (class variables, docstring)
    class_end = class_node.end_lineno or class_node.lineno
    class_text = "\n".join(lines[class_start - 1 : class_end])
    chunks.insert(0, {
        "text": f"# {filepath}\n{class_text[:800]}",  # truncated overview
        "language": "python",
        "filepath": filepath,
        "chunk_type": "class",
        "name": class_node.name,
        "start_line": class_start,
        "end_line": class_end,
        "calls": _extract_calls(class_node),
        "imports": [],
        "base_classes": _extract_base_classes(class_node),
    })
    return chunks


# ── Character-window chunking (fallback) ──────────────────────────────────────

def chunk_by_window(
    content: str,
    filepath: str,
    language: str = "text",
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
) -> list[dict]:
    """
    Split text into overlapping fixed-size character windows.

    Used for:
    - Markdown documentation (.md, .rst)
    - Config files (.yaml, .toml, .json)
    - Languages without AST support (TypeScript, Go, Rust, etc.)
    - Python files that failed to parse

    The overlap ensures that a concept spanning a chunk boundary isn't lost.
    With overlap=200, the last 200 chars of chunk N are the first 200 chars
    of chunk N+1.
    """
    if not content.strip():
        return []

    chunks = []
    start = 0
    while start < len(content):
        end = min(start + chunk_size, len(content))
        text = content[start:end]

        # Find approximate start/end line numbers for this character range
        start_line = content[:start].count("\n") + 1
        end_line = content[:end].count("\n") + 1

        chunks.append({
            "text": f"# {filepath}\n{text}",
            "language": language,
            "filepath": filepath,
            "chunk_type": "text",
            "name": "",
            "start_line": start_line,
            "end_line": end_line,
            "calls": [],
            "imports": [],
            "base_classes": [],
        })
        if end == len(content):
            break
        start = end - chunk_overlap
    return chunks
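
# Sanity check of the window arithmetic (hypothetical numbers): with the default
# chunk_size=1000 and chunk_overlap=200, a 1,700-character file produces windows
# covering characters [0:1000] and [800:1700], so text straddling the first
# boundary appears intact in at least one of the two chunks.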


# ── Main entry point ──────────────────────────────────────────────────────────

def chunk_file(file: dict) -> list[dict]:
    """
    Chunk a single file dict (as returned by repo_fetcher).

    Args:
        file: {"path": str, "content": str, "size": int, "repo": str}

    Returns:
        List of chunk dicts with text + metadata.
    """
    from ingestion.file_filter import language_from_path

    filepath = file.get("path") or file.get("filepath", "")
    content = file["content"]
    language = language_from_path(filepath)
    repo = file.get("repo", "")

    if language == "python":
        chunks = chunk_python(content, filepath)
    else:
        chunks = chunk_by_window(content, filepath, language=language)

    # Attach the repo to every chunk
    for chunk in chunks:
        chunk["repo"] = repo
    return chunks


def chunk_files(files: list[dict]) -> list[dict]:
    """Chunk all files and return a flat list of all chunks."""
    all_chunks = []
    for file in files:
        file_chunks = chunk_file(file)
        all_chunks.extend(file_chunks)
        print(f"  {file.get('path') or file.get('filepath', '?')} → {len(file_chunks)} chunks")
    print(f"Total: {len(all_chunks)} chunks from {len(files)} files")
    return all_chunks
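

if __name__ == "__main__":
    # Minimal smoke test when the module is run directly. It calls chunk_python()
    # rather than chunk_file() so it doesn't depend on ingestion.file_filter.
    # The sample source below is made up purely for illustration.
    sample_source = (
        "import math\n"
        "\n"
        "class Circle:\n"
        "    def __init__(self, r):\n"
        "        self.r = r\n"
        "\n"
        "    def area(self):\n"
        "        return math.pi * self.r ** 2\n"
    )
    for c in chunk_python(sample_source, "example/sample.py"):
        name = c["name"] or "(module)"
        print(f"{c['chunk_type']:<8} {name:<12} lines {c['start_line']}-{c['end_line']}  calls={c['calls']}")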