# PDF -> Qdrant ingestion script: parse a PDF with LlamaParse, write it to
# markdown, chunk it with LangChain, embed with FastEmbed, and store in Qdrant.
import os
import pickle
from typing import List

import nest_asyncio
import nltk
from dotenv import load_dotenv
from llama_parse import LlamaParse
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders.directory import DirectoryLoader
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.vectorstores.qdrant import Qdrant

# One-time runtime setup: NLTK tokenizer data, and nested event-loop support
# (LlamaParse drives asyncio internally; nest_asyncio lets it run inside
# environments that already have a loop, e.g. notebooks).
nltk.download('punkt')
nest_asyncio.apply()

# Load environment variables from a local .env file.
load_dotenv()

# API credentials (None when the corresponding variable is unset).
llamaparse_api_key = os.getenv("LLAMA_CLOUD_API_KEY")
groq_api_key = os.getenv("GROQ_API_KEY")

# File-system layout: cached parse, generated markdown, and the local Qdrant store.
data_dir = "data"
parsed_data_file = os.path.join(data_dir, "parsed_data.pkl")
output_md = os.path.join(data_dir, "output.md")
qdrant_dir = os.path.join(data_dir, "local_qdrant")
collection_name = "rag"
# Helper: load the cached parse if present, otherwise parse the PDF and cache it.
def load_or_parse_data(pdf_path):
    """Return LlamaParse documents for *pdf_path*, caching the result on disk.

    The cache lives at the module-level ``parsed_data_file`` path and is keyed
    by location only. NOTE(review): calling this with a *different* PDF while
    a cache file exists returns the stale cached parse — delete the cache file
    to force a re-parse.

    Args:
        pdf_path: Path to the PDF handed to ``LlamaParse.load_data``.

    Returns:
        The list of parsed documents produced by LlamaParse (or the cache).
    """
    if os.path.exists(parsed_data_file):
        try:
            with open(parsed_data_file, "rb") as f:
                return pickle.load(f)
        except (pickle.UnpicklingError, EOFError) as e:
            # Corrupt or truncated cache: re-parse instead of crashing on a
            # stale artifact. (pickle.load on untrusted files is unsafe; this
            # cache is produced by this script only.)
            print(f"Warning: cache unreadable ({e}); re-parsing PDF.")

    parsing_instruction = """The provided document is a user guide or manual.
It contains many images and tables. Be precise while answering questions."""
    parser = LlamaParse(
        api_key=llamaparse_api_key,
        result_type="markdown",
        parsing_instruction=parsing_instruction,
    )  # type: ignore
    parsed_data = parser.load_data(pdf_path)

    # Persist so subsequent runs skip the slow (and metered) parse call.
    with open(parsed_data_file, "wb") as f:
        pickle.dump(parsed_data, f)
    return parsed_data
# Main vector DB builder
def create_vector_database(pdf_path):
    """Parse *pdf_path*, chunk it, embed it, and build a local Qdrant store.

    Pipeline: LlamaParse (cached) -> one combined markdown file -> LangChain
    document loading -> recursive character chunking -> FastEmbed embeddings
    -> on-disk Qdrant collection.

    Args:
        pdf_path: Path to the source PDF.

    Returns:
        The populated ``Qdrant`` vector store.

    Raises:
        ValueError: if LlamaParse returns no documents.
        RuntimeError: if the intermediate markdown file is missing/empty or
            no documents could be loaded from it.
    """
    print("Starting vector DB creation...")

    # Ensure working directories exist before any file I/O.
    os.makedirs(data_dir, exist_ok=True)
    os.makedirs(qdrant_dir, exist_ok=True)

    # Parse the PDF (result is cached across runs by load_or_parse_data).
    parsed_docs = load_or_parse_data(pdf_path)
    if not parsed_docs:
        raise ValueError("No parsed documents returned from LlamaParse!")

    # Concatenate every non-empty page text into a single markdown file.
    with open(output_md, 'w', encoding='utf-8') as f:
        for doc in parsed_docs:
            if hasattr(doc, "text") and doc.text.strip():
                f.write(doc.text.strip() + "\n\n")
    if not os.path.exists(output_md) or os.path.getsize(output_md) == 0:
        raise RuntimeError("Markdown file was not created or is empty!")

    # Load the markdown back as LangChain documents; DirectoryLoader needs
    # optional parser dependencies, so fall back to a plain TextLoader.
    # NOTE(review): the glob picks up *all* .md files under data_dir, not
    # just output.md — confirm that is the intended corpus.
    try:
        loader = DirectoryLoader(data_dir, glob="**/*.md", show_progress=True)
        documents = loader.load()
    except Exception as e:
        print(f"Warning: DirectoryLoader failed: {e}. Falling back to TextLoader...")
        documents = TextLoader(output_md, encoding='utf-8').load()
    if not documents:
        raise RuntimeError("No documents loaded from markdown!")

    # Chunk into overlapping windows for retrieval.
    splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
    docs = splitter.split_documents(documents)
    print(f"Loaded and split {len(docs)} chunks.")

    # Local embedding model via fastembed (no API key required).
    embeddings = FastEmbedEmbeddings()  # type: ignore

    # Build the on-disk Qdrant collection from the chunked documents.
    print("Creating Qdrant vector DB...")
    qdrant = Qdrant.from_documents(
        documents=docs,
        embedding=embeddings,
        path=qdrant_dir,
        collection_name=collection_name,
    )
    print("Vector DB created successfully.")
    return qdrant