"""Tree-sitter based source-code chunker with a line-based fallback.

Splits a source file into symbol-level chunks (functions, classes, methods,
type declarations, ...) suitable for indexing and search. When no tree-sitter
grammar applies — or the grammar finds no symbols — files are split into
fixed-size line blocks instead.
"""

import re
from collections import deque
from pathlib import Path
from typing import Dict, List, Optional

# Maps a lowercase file suffix to the tree-sitter grammar name used to parse it.
LANGUAGE_BY_EXTENSION = {
    ".py": "python",
    ".js": "javascript",
    ".jsx": "javascript",
    ".ts": "typescript",
    ".tsx": "tsx",
    ".java": "java",
    ".go": "go",
    ".rs": "rust",
}

# Declaration node types shared by the TypeScript and TSX grammars.
_TS_DECLARATION_TYPES = {
    "function_declaration",
    "class_declaration",
    "method_definition",
    "interface_declaration",
    "type_alias_declaration",
    "lexical_declaration",
    "variable_statement",
}

# AST node types that mark an indexable symbol, per language.
SYMBOL_NODE_TYPES = {
    "python": {"function_definition", "class_definition"},
    "javascript": {
        "function_declaration",
        "class_declaration",
        "method_definition",
        "generator_function_declaration",
        "lexical_declaration",
        "variable_declaration",
    },
    "typescript": set(_TS_DECLARATION_TYPES),
    "tsx": set(_TS_DECLARATION_TYPES),
    "java": {
        "class_declaration",
        "method_declaration",
        "interface_declaration",
        "enum_declaration",
    },
    "go": {
        "function_declaration",
        "method_declaration",
        "type_declaration",
    },
    "rust": {
        "function_item",
        "impl_item",
        "struct_item",
        "enum_item",
        "trait_item",
    },
}

# Node types that may carry a symbol's name, across grammars.
IDENTIFIER_TYPES = {
    "identifier",
    "property_identifier",
    "type_identifier",
    "field_identifier",
}

# Heuristic: a source line that begins a new symbol, used only by the
# line-based fallback chunker. Compiled once; reused for every line.
_SYMBOL_PREFIX_RE = re.compile(
    r"^\s*(def |class |function |const |export |interface |type )"
)


class CodeParser:
    """Chunks source files into symbol-level pieces using tree-sitter.

    Parsers are created lazily per language and cached on the instance, so
    repeated calls for the same language reuse one parser.
    """

    def __init__(self):
        # Cache of instantiated tree-sitter parsers, keyed by grammar name.
        self.parsers: Dict[str, object] = {}

    def detect_language(self, file_path: str) -> str:
        """Return the grammar name for *file_path*'s suffix, or "text" if unknown."""
        return LANGUAGE_BY_EXTENSION.get(Path(file_path).suffix.lower(), "text")

    def _get_parser(self, language: str):
        """Return a cached tree-sitter parser for *language*, or None for "text"."""
        if language == "text":
            return None
        if language not in self.parsers:
            # Imported lazily so the module stays importable (e.g. for the
            # fallback chunker) even when tree_sitter_languages is absent.
            from tree_sitter_languages import get_parser

            self.parsers[language] = get_parser(language)
        return self.parsers[language]

    def chunk_file(self, file_path: str, repo_root: str) -> List[Dict]:
        """Chunk one file into symbol dicts; empty files yield an empty list.

        Falls back to line-based chunking when the language has no grammar or
        the parse captures no symbols. Paths in the output are relative to
        *repo_root*.
        """
        language = self.detect_language(file_path)
        source = Path(file_path).read_text(encoding="utf-8", errors="ignore")
        # NOTE(review): relative_to raises ValueError when file_path lies
        # outside repo_root — callers are assumed to pass paths under the root.
        relative_path = str(
            Path(file_path).resolve().relative_to(Path(repo_root).resolve())
        )
        if not source.strip():
            return []
        parser = self._get_parser(language)
        if parser is None:
            return self._fallback_chunks(source, relative_path, language)
        tree = parser.parse(bytes(source, "utf-8"))
        lines = source.splitlines()
        chunks: List[Dict] = []
        capture_types = SYMBOL_NODE_TYPES.get(language, set())

        def visit(node):
            # Capture only the outermost symbol and do not descend into it,
            # so nested defs/methods stay part of their parent's chunk.
            if node.type in capture_types:
                chunk = self._build_chunk(node, source, lines, relative_path, language)
                if chunk:
                    chunks.append(chunk)
                return
            for child in node.children:
                visit(child)

        visit(tree.root_node)
        if not chunks:
            return self._fallback_chunks(source, relative_path, language)
        return chunks

    def _build_chunk(
        self,
        node,
        source: str,
        lines: List[str],
        relative_path: str,
        language: str,
    ) -> Optional[Dict]:
        """Build one chunk dict from an AST *node*; None for sub-2-line snippets."""
        start_line = node.start_point[0] + 1  # tree-sitter rows are 0-based
        end_line = node.end_point[0] + 1
        snippet = "\n".join(lines[start_line - 1 : end_line]).strip()
        # One-liners carry too little context to index as a symbol.
        if len(snippet.splitlines()) < 2:
            return None
        name_node = node.child_by_field_name("name")
        symbol_name: Optional[str] = None
        if name_node is not None:
            symbol_name = source[name_node.start_byte : name_node.end_byte].strip()
        if not symbol_name:
            # Grammars without a "name" field: take the first identifier token.
            symbol_name = self._find_identifier(node, source)
        signature = lines[start_line - 1].strip() if start_line - 1 < len(lines) else ""
        # Concatenate path, name, signature and body so a single text index
        # can match on any of them.
        searchable_text = "\n".join(
            part
            for part in [relative_path, symbol_name or "", signature, snippet]
            if part
        )
        return {
            "file_path": relative_path,
            "language": language,
            "symbol_name": symbol_name or relative_path.split("/")[-1],
            "symbol_type": node.type,
            "line_start": start_line,
            "line_end": end_line,
            "signature": signature,
            "content": snippet,
            "searchable_text": searchable_text,
            "metadata_json": {
                "parser": "tree-sitter",
            },
        }

    def _find_identifier(self, node, source: str) -> Optional[str]:
        """Breadth-first search for the first identifier-like token under *node*."""
        queue = deque(node.children)
        while queue:
            current = queue.popleft()  # deque: O(1) vs list.pop(0)'s O(n)
            if current.type in IDENTIFIER_TYPES:
                return source[current.start_byte : current.end_byte].strip()
            queue.extend(current.children)
        return None

    def _starts_boundary(self, line: str, language: str) -> bool:
        """True when *line* looks like the start of a new symbol or heading."""
        if language == "text":
            return line.startswith("#")
        return _SYMBOL_PREFIX_RE.match(line) is not None

    def _fallback_chunks(
        self, source: str, relative_path: str, language: str
    ) -> List[Dict]:
        """Line-based chunking for files without a usable tree-sitter parse.

        A chunk closes when it reaches the size cap (60 lines for plain text,
        80 for code), or — once it holds at least 8 lines — when the next line
        starts a new symbol ("def ", "class ", ... ) or a "#" heading in text
        mode. Boundary lines begin the NEXT chunk (fix: previously they were
        absorbed as the last line of the prior chunk, so "signature" did not
        hold the declaration line).
        """
        blocks: List[Dict] = []
        lines = source.splitlines()
        buffer: List[str] = []
        start_line = 1
        max_lines = 60 if language == "text" else 80

        def flush(end_line: int) -> None:
            # Emit the buffered lines as one chunk; all-blank buffers are dropped.
            chunk_text = "\n".join(buffer).strip()
            if chunk_text:
                blocks.append(
                    {
                        "file_path": relative_path,
                        "language": language,
                        "symbol_name": f"{Path(relative_path).name}:{start_line}",
                        "symbol_type": "fallback_chunk",
                        "line_start": start_line,
                        "line_end": end_line,
                        "signature": buffer[0].strip(),
                        "content": chunk_text,
                        "searchable_text": f"{relative_path}\n{chunk_text}",
                        "metadata_json": {
                            "parser": "fallback",
                        },
                    }
                )

        for index, line in enumerate(lines, start=1):
            if buffer and len(buffer) >= 8 and self._starts_boundary(line, language):
                # Close the current chunk BEFORE this line so the new
                # declaration/heading opens its own chunk with the correct
                # signature line.
                flush(index - 1)
                buffer = []
            if not buffer:
                start_line = index
            buffer.append(line)
            if len(buffer) >= max_lines:
                flush(index)
                buffer = []
        if buffer:
            flush(len(lines))
        return blocks