| |
|
|
| from langchain_text_splitters import RecursiveCharacterTextSplitter |
|
|
# Passed as `chunk_size` to the text splitter and recorded in each chunk's metadata.
CHUNK_SIZE: int = 1500
# Passed as `chunk_overlap` to the text splitter and recorded in each chunk's metadata.
CHUNK_OVERLAP: int = 200
|
|
def split_documents(docs):
    """Split documents into chunks and tag each chunk with the split settings.

    Args:
        docs: Documents to split; handed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.

    Returns:
        The chunks produced by the splitter. Each chunk's ``metadata`` has
        its ``"chunk_size"`` and ``"chunk_overlap"`` entries set to the
        module-level ``CHUNK_SIZE`` and ``CHUNK_OVERLAP`` values.
    """
    # Separator candidates given to the splitter: blank line, newline,
    # period + space, space, and finally the empty string.
    boundaries = ["\n\n", "\n", ". ", " ", ""]
    text_splitter = RecursiveCharacterTextSplitter(
        separators=boundaries,
        chunk_overlap=CHUNK_OVERLAP,
        chunk_size=CHUNK_SIZE,
    )
    pieces = text_splitter.split_documents(docs)

    # Stamp the settings used for this split onto every chunk's metadata.
    settings = {"chunk_size": CHUNK_SIZE, "chunk_overlap": CHUNK_OVERLAP}
    for piece in pieces:
        for key, value in settings.items():
            piece.metadata[key] = value

    return pieces
|
|
if __name__ == "__main__":
    # Manual smoke test: load the documents, split them, and print a summary.
    from load_documents import load_documents

    docs = load_documents()
    chunks = split_documents(docs)
    print("Docs:", len(docs), "Chunks:", len(chunks))
    # Preview the first chunk only when one exists; indexing an empty result
    # would raise IndexError (e.g. when no documents were loaded).
    if chunks:
        print(chunks[0].page_content[:300], chunks[0].metadata)
|
|