File size: 1,890 Bytes
a688aff 5f7dc7e a688aff 5f7dc7e a688aff 5f7dc7e a688aff | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 | import os
import hashlib
from backend.graph.state import BrainState
from backend.chunking import detect_doc_type
from backend.sse import emit
async def load_sources(state: BrainState) -> dict:
company_id = state["company_id"]
job_id = state["job_id"]
print(f"[{job_id}] Node load_sources started")
await emit(
job_id,
"stage",
{"name": "LOADING_DOCS", "detail": f"Reading sources for {company_id}"},
)
base = os.path.dirname(
os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
)
sources_dir = os.path.join(base, "data", "sources", company_id)
if not os.path.isdir(sources_dir):
await emit(
job_id,
"pipeline_error",
{"error": f"No source directory: data/sources/{company_id}/"},
)
print(f"[{job_id}] load_sources failed — missing dir: {sources_dir}")
return {"errors": [f"Missing directory: {sources_dir}"], "source_files": []}
source_files = []
for filename in sorted(os.listdir(sources_dir)):
filepath = os.path.join(sources_dir, filename)
if not os.path.isfile(filepath):
continue
with open(filepath, "r", encoding="utf-8") as f:
content = f.read()
doc_type = detect_doc_type(filename, content)
source_files.append(
{
"filename": filename,
"content": content,
"sha256": hashlib.sha256(content.encode("utf-8")).hexdigest(),
"doc_type": doc_type,
}
)
print(f"[{job_id}] load_sources finished: {len(source_files)} files")
await emit(
job_id,
"stage",
{
"name": "LOADING_DOCS_DONE",
"detail": f"Loaded {len(source_files)} source files",
},
)
return {"source_files": source_files}
|