SwarmAudit / app /services /chunker.py
Pranoy Mukherjee
Add crawler, security agent, API, and Gradio MVP
a3ecd30
from pathlib import Path
from app.config import Settings
from app.schemas import CodeChunk, SourceFile
class Chunker:
    """Split source files into line-aligned chunks bounded by a character budget.

    The budget is read from ``settings.max_chars_per_chunk``. A chunk always
    consists of whole lines, so a single line that is longer than the budget
    is emitted as its own (oversized) chunk rather than being cut.
    """

    def __init__(self, settings: Settings):
        """Keep *settings*; only ``max_chars_per_chunk`` is read by this class."""
        self.settings = settings

    def chunk_files(self, files: list[SourceFile]) -> list[CodeChunk]:
        """Chunk every file in *files* and return all chunks in input order."""
        return [
            chunk
            for source_file in files
            for chunk in self.chunk_file(source_file)
        ]

    def chunk_file(self, source_file: SourceFile) -> list[CodeChunk]:
        """Read one file from disk and split it into chunks of whole lines.

        Args:
            source_file: Supplies ``absolute_path`` (where to read from) plus
                ``path`` and ``language``, which are copied onto every chunk.

        Returns:
            Chunks in file order. ``line_start`` / ``line_end`` are 1-based and
            inclusive; ``content`` is the chunk's lines joined with ``"\\n"``
            (no trailing newline). An empty file yields an empty list.

        Raises:
            OSError: Propagated from ``Path.read_text`` if the file cannot be
                read.
        """
        # errors="ignore" drops undecodable bytes instead of raising, so files
        # that are not valid UTF-8 are still chunked (minus those bytes).
        text = Path(source_file.absolute_path).read_text(
            encoding="utf-8",
            errors="ignore",
        )

        # Text-mode reading has already normalised line endings to "\n".
        # Split on "\n" only: str.splitlines() would also break on form feeds,
        # vertical tabs and Unicode line/paragraph separators, which makes the
        # reported line numbers drift from what editors and compilers show.
        lines = text.split("\n")
        if lines[-1] == "":
            # A trailing newline (or an empty file) leaves one empty tail
            # element that is not a real line.
            lines.pop()
        if not lines:
            return []

        budget = self.settings.max_chars_per_chunk
        chunks: list[CodeChunk] = []
        pending: list[str] = []
        pending_start = 1
        pending_chars = 0

        for line_no, line in enumerate(lines, start=1):
            cost = len(line) + 1  # +1 accounts for the newline separator
            # Flush before adding a line that would overflow the budget. The
            # ``pending`` guard guarantees every chunk holds at least one line.
            if pending and pending_chars + cost > budget:
                chunks.append(
                    self._make_chunk(source_file, pending_start, line_no - 1, pending)
                )
                pending = []
                pending_start = line_no
                pending_chars = 0
            pending.append(line)
            pending_chars += cost

        if pending:
            chunks.append(
                self._make_chunk(source_file, pending_start, len(lines), pending)
            )
        return chunks

    @staticmethod
    def _make_chunk(
        source_file: SourceFile,
        line_start: int,
        line_end: int,
        lines: list[str],
    ) -> CodeChunk:
        """Build the chunk covering *line_start*..*line_end* (1-based, inclusive)."""
        return CodeChunk(
            file_path=source_file.path,
            language=source_file.language,
            line_start=line_start,
            line_end=line_end,
            content="\n".join(lines),
        )