from pathlib import Path

from app.config import Settings
from app.schemas import CodeChunk, SourceFile


class Chunker:
    """Split source files into consecutive, line-aligned ``CodeChunk`` objects.

    Chunk boundaries always fall between lines; the size of each chunk is
    governed by ``settings.max_chars_per_chunk``.
    """

    def __init__(self, settings: Settings):
        """Store the settings object that supplies ``max_chars_per_chunk``."""
        self.settings = settings

    def chunk_files(self, files: list[SourceFile]) -> list[CodeChunk]:
        """Chunk every file in *files* and return the chunks as one flat list.

        Chunks appear in the order of *files*, and within a file in the
        order produced by :meth:`chunk_file`.
        """
        return [piece for entry in files for piece in self.chunk_file(entry)]

    def chunk_file(self, source_file: SourceFile) -> list[CodeChunk]:
        """Read one file from disk and cut it into line-aligned chunks.

        The file at ``source_file.absolute_path`` is decoded as UTF-8 with
        undecodable bytes dropped, then split into lines. Lines are
        accumulated until adding the next one would push the running
        character count past ``settings.max_chars_per_chunk``; at that point
        the accumulated lines are emitted as a chunk and a new one begins.

        A line is never split: a single line longer than the budget becomes
        a chunk of its own that exceeds the budget.

        Returns:
            The chunks in file order, with 1-based inclusive
            ``line_start``/``line_end``. Empty when the file has no lines.
        """
        raw_text = Path(source_file.absolute_path).read_text(
            encoding="utf-8", errors="ignore"
        )
        all_lines = raw_text.splitlines()
        if not all_lines:
            return []

        def build(first_no: int, last_no: int, body: list[str]) -> CodeChunk:
            # Single place where a run of buffered lines becomes a chunk.
            return CodeChunk(
                file_path=source_file.path,
                language=source_file.language,
                line_start=first_no,
                line_end=last_no,
                content="\n".join(body),
            )

        result: list[CodeChunk] = []
        pending: list[str] = []
        pending_start = 1
        pending_size = 0

        for line_no, text_line in enumerate(all_lines, start=1):
            # +1 charges each line for a newline separator.
            cost = len(text_line) + 1
            # Only flush a non-empty buffer, so an oversized line still lands
            # in a chunk instead of producing an empty one.
            overflow = (
                bool(pending)
                and pending_size + cost > self.settings.max_chars_per_chunk
            )
            if overflow:
                result.append(build(pending_start, line_no - 1, pending))
                pending = []
                pending_start = line_no
                pending_size = 0
            pending.append(text_line)
            pending_size += cost

        # Emit whatever is still buffered after the last line.
        if pending:
            result.append(build(pending_start, len(all_lines), pending))

        return result