File size: 1,181 Bytes
5f7dc7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
from backend.graph.state import BrainState
from backend.chunking import get_chunker
from backend.sse import emit


async def chunk_documents(state: BrainState) -> dict:
    """Chunk every source file listed in the job state.

    Reads ``job_id`` (required) and ``source_files`` (defaults to an empty
    list) from ``state``. For each source-file entry, the chunker returned
    by ``get_chunker`` for the entry's ``doc_type`` is called with the
    entry's ``content`` and ``filename``; the results are concatenated in
    input order. A "stage" event is emitted before and after the work, and
    progress is also printed.

    Returns:
        A dict with a single key, ``"all_chunks"``, holding the combined
        list of chunks.
    """
    job_id = state["job_id"]
    files = state.get("source_files", [])

    # Announce the start of the stage on stdout and to event subscribers.
    print(f"[{job_id}] Node chunk_documents: processing {len(files)} files")
    start_event = {
        "name": "CHUNKING",
        "detail": f"Chunking {len(files)} source files",
    }
    await emit(job_id, "stage", start_event)

    collected = []
    for entry in files:
        # Missing fields fall back to defaults rather than raising.
        kind = entry.get("doc_type", "plain_text")
        name = entry.get("filename", "unknown")
        text = entry.get("content", "")
        collected += get_chunker(kind)(text, name)

    # The log line and the completion event share the same tail text.
    summary = f"{len(collected)} chunks from {len(files)} files"
    print(f"[{job_id}] chunk_documents: produced {summary}")
    done_event = {
        "name": "CHUNKING_DONE",
        "detail": f"Produced {summary}",
    }
    await emit(job_id, "stage", done_event)

    return {"all_chunks": collected}