Spaces:
Running
Running
| """Local retrieval index builder.""" | |
| from __future__ import annotations | |
| import json | |
| from pathlib import Path | |
| from app.models.retrieval.chunker import chunk_text | |
def build_local_index(source_dir: Path, out_file: Path) -> int:
    """Build a JSON retrieval index from the text files under *source_dir*.

    Every regular file below ``source_dir`` (searched recursively) whose
    suffix is ``.txt``, ``.md`` or ``.json`` (case-insensitive) is read as
    UTF-8, passed through ``chunk_text``, and each resulting chunk is stored
    as one record with the keys ``"id"``, ``"path"`` and ``"text"``.

    Args:
        source_dir: Root directory that is scanned for source documents.
        out_file: Destination of the JSON index. Missing parent directories
            are created. If this file lives inside ``source_dir`` it is
            skipped during the scan so a rebuild never indexes its own
            previous output.

    Returns:
        The number of chunk records written to ``out_file``.
    """
    indexed_suffixes = {".txt", ".md", ".json"}
    # Resolve once so the self-exclusion check below is independent of how
    # the caller spelled the two paths (relative vs. absolute, symlinks).
    out_resolved = out_file.resolve()

    docs: list[dict[str, str]] = []
    # Sort so the record order does not depend on filesystem iteration order.
    for path in sorted(source_dir.rglob("*")):
        if not path.is_file() or path.suffix.lower() not in indexed_suffixes:
            continue
        if path.resolve() == out_resolved:
            # ".json" is an indexed suffix, so an index written below
            # source_dir would otherwise be ingested on the next rebuild.
            continue
        # Undecodable bytes are dropped on purpose (best-effort ingestion).
        text = path.read_text(encoding="utf-8", errors="ignore")
        # NOTE: ids are "<stem>_<chunk index>", so files that share a stem
        # (e.g. "a.txt" and "a.md") produce identical ids; "path" is what
        # distinguishes such records.
        docs.extend(
            {"id": f"{path.stem}_{idx}", "path": str(path), "text": chunk}
            for idx, chunk in enumerate(chunk_text(text))
        )

    out_file.parent.mkdir(parents=True, exist_ok=True)
    out_file.write_text(json.dumps(docs, ensure_ascii=True, indent=2), encoding="utf-8")
    return len(docs)