File size: 1,903 Bytes
a3ecd30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
from pathlib import Path

from app.config import Settings
from app.schemas import CodeChunk, SourceFile


class Chunker:
    """Split source files into consecutive, line-aligned chunks.

    The size budget for a chunk is read from ``settings.max_chars_per_chunk``.
    """

    def __init__(self, settings: Settings):
        """Keep the settings object that supplies ``max_chars_per_chunk``."""
        self.settings = settings

    def chunk_files(self, files: list[SourceFile]) -> list[CodeChunk]:
        """Chunk every file in *files* and return all chunks in input order."""
        return [chunk for source_file in files for chunk in self.chunk_file(source_file)]

    def chunk_file(self, source_file: SourceFile) -> list[CodeChunk]:
        """Read one file from disk and split it into line-aligned chunks.

        Lines are buffered until adding the next one would push the running
        size (each line counted as ``len(line) + 1`` to cover its newline) past
        ``settings.max_chars_per_chunk``; the buffer is then emitted and a new
        chunk starts at that line. Lines are never split, so a single line
        longer than the budget becomes a chunk of its own that exceeds it.

        Args:
            source_file: Supplies ``absolute_path`` (the file to read) and the
                ``path`` / ``language`` values copied onto every chunk.

        Returns:
            Chunks in file order with 1-based, inclusive ``line_start`` and
            ``line_end``. An empty file yields an empty list.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        # errors="ignore": undecodable bytes are dropped instead of failing the file.
        text = Path(source_file.absolute_path).read_text(encoding="utf-8", errors="ignore")
        lines = self._split_lines(text)
        if not lines:
            return []

        budget = self.settings.max_chars_per_chunk
        chunks: list[CodeChunk] = []
        buffered: list[str] = []
        first_line = 1
        buffered_chars = 0

        for line_no, line in enumerate(lines, start=1):
            line_chars = len(line) + 1
            # Flush before appending so the line that would overflow the budget
            # opens the next chunk. The `buffered` guard keeps an over-long
            # first line from producing an empty chunk.
            if buffered and buffered_chars + line_chars > budget:
                chunks.append(self._build_chunk(source_file, first_line, line_no - 1, buffered))
                buffered = []
                first_line = line_no
                buffered_chars = 0

            buffered.append(line)
            buffered_chars += line_chars

        if buffered:
            chunks.append(self._build_chunk(source_file, first_line, len(lines), buffered))

        return chunks

    @staticmethod
    def _split_lines(text: str) -> list[str]:
        """Split *text* on newline characters only, ignoring other separators.

        ``str.splitlines()`` also breaks on form feed, vertical tab and several
        Unicode separators, which makes chunk line numbers drift from the
        file's real line numbers. ``read_text`` has already translated CRLF/CR
        to a plain newline, so that is the only boundary needed here.
        """
        lines = text.split("\n")
        # A trailing newline (or empty text) leaves an empty final element that
        # is not a real line.
        if lines[-1] == "":
            lines.pop()
        return lines

    @staticmethod
    def _build_chunk(
        source_file: SourceFile,
        line_start: int,
        line_end: int,
        lines: list[str],
    ) -> CodeChunk:
        """Build a chunk for the inclusive line span holding *lines*."""
        return CodeChunk(
            file_path=source_file.path,
            language=source_file.language,
            line_start=line_start,
            line_end=line_end,
            content="\n".join(lines),
        )