File size: 769 Bytes
877add7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
"""Local retrieval index builder."""

from __future__ import annotations

import json
from pathlib import Path

from app.models.retrieval.chunker import chunk_text


def build_local_index(source_dir: Path, out_file: Path) -> int:
    """Chunk the text files under ``source_dir`` and write them to a JSON index.

    Every regular file below ``source_dir`` (searched recursively) whose
    suffix is ``.txt``, ``.md`` or ``.json`` (case-insensitive) is read as
    UTF-8 and split with ``chunk_text``. Each chunk becomes one record with
    the keys ``id`` (``"<file stem>_<chunk number>"``), ``path`` and ``text``.
    The records are written to ``out_file`` as an indented, ASCII-escaped JSON
    array; missing parent directories of ``out_file`` are created.

    Files are visited in sorted path order so repeated builds over the same
    tree produce the same record order. ``out_file`` itself is never indexed,
    which keeps a re-run from ingesting the previous index when the output
    lives inside ``source_dir``.

    Args:
        source_dir: Root directory to scan for source documents.
        out_file: Destination of the JSON index; overwritten if it exists.

    Returns:
        The number of chunk records written.
    """
    index_path = out_file.resolve()
    docs: list[dict[str, str]] = []
    # Sorting makes the output independent of the filesystem's listing order.
    for path in sorted(source_dir.rglob("*")):
        if not path.is_file() or path.suffix.lower() not in {".txt", ".md", ".json"}:
            continue
        if path.resolve() == index_path:
            # A previous run's output inside source_dir is not a source document.
            continue
        # errors="ignore" silently drops bytes that are not valid UTF-8.
        text = path.read_text(encoding="utf-8", errors="ignore")
        # NOTE: ids are built from the stem only, so files sharing a stem
        # (e.g. a.txt and a.md) yield identical ids; "path" tells them apart.
        docs.extend(
            {"id": f"{path.stem}_{idx}", "path": str(path), "text": chunk}
            for idx, chunk in enumerate(chunk_text(text))
        )
    out_file.parent.mkdir(parents=True, exist_ok=True)
    out_file.write_text(json.dumps(docs, ensure_ascii=True, indent=2), encoding="utf-8")
    return len(docs)