"""
STEP 4: VECTORSTORE (FAISS in-memory)
-------------------------------------
Build a FAISS index from the text CHUNKS.
- No .faiss file is written; everything stays in RAM.
- Embeddings are obtained from get_embeddings() (Step 3).
"""
|
|
| from langchain_community.vectorstores import FAISS |
| from embeddings import get_embeddings |
|
|
def build_vectorstore(chunks):
    """Build an in-memory FAISS vector store from already-split documents.

    Args:
        chunks: Sized collection of Document objects that have already been
            split into chunks; its length is reported in the progress output.

    Returns:
        The vector store returned by ``FAISS.from_documents``.

    Raises:
        ValueError: If ``chunks`` is empty.
    """
    # Fail fast, before the embedding model is initialised: there is nothing
    # to index. NOTE(review): FAISS.from_documents is expected to reject an
    # empty input with a far less helpful error -- confirm against the
    # installed langchain_community version.
    if not chunks:
        raise ValueError(
            "Cannot build a FAISS index from an empty collection of chunks."
        )

    print(">>> Initialising embedding model for FAISS index ...")
    embeddings = get_embeddings()

    print(f">>> Building FAISS index from {len(chunks)} chunks ...")
    vs = FAISS.from_documents(chunks, embeddings)
    print(">>> FAISS index built.\n")
    return vs
|
|
if __name__ == "__main__":
    # Manual smoke test: load -> split -> index -> run one similarity query.
    # The pipeline helpers are imported here rather than at module level so
    # they are only needed when this file is executed as a script.
    from load_documents import load_documents
    from split_documents import split_documents

    print("=== TEST: load_documents -> split_documents -> FAISS.similarity_search ===\n")

    # 1) Load the raw documents.
    docs = load_documents()
    print(f"Loaded {len(docs)} raw documents.")

    # 2) Split them into chunks.
    chunks = split_documents(docs)
    print(f"Split into {len(chunks)} chunks.\n")

    # 3) Build the in-memory FAISS index.
    vectorstore = build_vectorstore(chunks)

    # 4) Run a sample query against the index and show the best three hits.
    query = "Fristen für die Prüfungsanmeldung im Bachelorstudium"
    print("Test query:")
    print(" ", query, "\n")

    results = vectorstore.similarity_search(query, k=3)

    print("Top-3 ähnliche Chunks aus dem VectorStore:")
    for i, doc in enumerate(results, start=1):
        print(f"\n=== RESULT {i} ===")
        # Only the first 400 characters are shown to keep the output short.
        print(doc.page_content[:400], "...")
        print("Metadata:", doc.metadata)
|
|
|
|
|
|