File size: 1,903 Bytes
a3ecd30 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 | from pathlib import Path
from app.config import Settings
from app.schemas import CodeChunk, SourceFile
class Chunker:
    """Split source files into line-aligned chunks bounded by a character budget.

    The budget is read from ``settings.max_chars_per_chunk``.
    """

    def __init__(self, settings: Settings):
        """Store the settings object that supplies the per-chunk budget."""
        self.settings = settings

    def chunk_files(self, files: list[SourceFile]) -> list[CodeChunk]:
        """Chunk every file in ``files`` and return the chunks in input order."""
        return [piece for entry in files for piece in self.chunk_file(entry)]

    def chunk_file(self, source_file: SourceFile) -> list[CodeChunk]:
        """Read one file from disk and cut it into consecutive line ranges.

        The file at ``source_file.absolute_path`` is decoded as UTF-8 with
        undecodable bytes dropped. Lines are accumulated until adding the
        next one would push the running size past
        ``settings.max_chars_per_chunk``; each line is charged its length
        plus one for the newline. Lines are never split, so a single line
        longer than the budget ends up alone in an oversized chunk.

        Returns:
            Chunks whose ``line_start``/``line_end`` are 1-based and
            inclusive, or an empty list when the file has no lines.
        """
        file_text = Path(source_file.absolute_path).read_text(
            encoding="utf-8", errors="ignore"
        )
        all_lines = file_text.splitlines()
        if not all_lines:
            return []

        def build(first: int, last: int, body: list[str]) -> CodeChunk:
            # Single place that turns a buffered run of lines into a chunk.
            return CodeChunk(
                file_path=source_file.path,
                language=source_file.language,
                line_start=first,
                line_end=last,
                content="\n".join(body),
            )

        result: list[CodeChunk] = []
        pending: list[str] = []
        pending_start = 1
        pending_size = 0

        for line_no, text_line in enumerate(all_lines, start=1):
            cost = len(text_line) + 1  # +1 accounts for the newline separator
            # Only flush a non-empty buffer, so an oversized line still gets
            # emitted instead of producing an empty chunk.
            if pending and pending_size + cost > self.settings.max_chars_per_chunk:
                result.append(build(pending_start, line_no - 1, pending))
                pending, pending_start, pending_size = [], line_no, 0
            pending.append(text_line)
            pending_size += cost

        # Whatever is still buffered runs through the last line of the file.
        if pending:
            result.append(build(pending_start, len(all_lines), pending))
        return result
|