| import os |
| import warnings |
| import pickle |
| import faiss |
| from sentence_transformers import SentenceTransformer |
| from PyPDF2 import PdfReader |
| import glob |
| from together import Together |
|
|
| warnings.filterwarnings("ignore") |
|
|
| |
| TOGETHER_API_KEY = "81da53aa3044c7ebead342fb048f016a4e593a86928a783a6fdcc1e3883054e4" |
| client = Together(api_key=TOGETHER_API_KEY) |
|
|
| |
| embedding_model = SentenceTransformer( |
| "sentence-transformers/all-MiniLM-L6-v2", |
| use_auth_token=os.environ.get("HUGGINGFACE_HUB_TOKEN"), |
| ) |
|
|
def extract_text_from_pdf(pdf_path):
    """Extract and concatenate the text of every page of a PDF.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        The extracted page texts joined by newlines and stripped, or ""
        when the file cannot be read or yields no extractable text.
    """
    try:
        reader = PdfReader(pdf_path)
        pages = []
        for page in reader.pages:
            # BUG FIX: PyPDF2's extract_text() may return None for
            # image-only/empty pages; the original `text += extract_text()
            # + "\n"` raised TypeError in that case.
            page_text = page.extract_text()
            if page_text:
                pages.append(page_text)
        return "\n".join(pages).strip()
    except Exception as e:
        # Best-effort: report the failure and skip this file rather than
        # aborting the whole indexing run.
        print(f"Error processing {pdf_path}: {str(e)}")
        return ""
|
|
def create_index():
    """Create and save the FAISS index and document metadata.

    Reads every PDF under ``Knowledge_base/``, splits the extracted text
    into fixed-size chunks, embeds the chunks with the module-level
    ``embedding_model``, and writes a FAISS inner-product index plus a
    pickle of the chunk texts/labels into ``knowledge_base/``.

    Raises:
        ValueError: If no PDF files are found, or no text could be
            extracted from any of them.
    """
    # Output directory for the index and metadata files.
    # NOTE(review): input is read from "Knowledge_base" (capital K) below;
    # on a case-sensitive filesystem these are two distinct directories —
    # confirm the differing cases are intentional.
    os.makedirs("knowledge_base", exist_ok=True)

    pdf_files = glob.glob("Knowledge_base/*.pdf")
    if not pdf_files:
        raise ValueError("No PDF files found in Knowledge_base directory!")

    print(f"Found {len(pdf_files)} PDF files. Processing...")

    documents = []  # chunk texts, aligned 1:1 with `filenames`
    filenames = []  # human-readable source label for each chunk

    for pdf_path in pdf_files:
        filename = os.path.basename(pdf_path)
        content = extract_text_from_pdf(pdf_path)

        if content:
            # Naive fixed-width chunking: 1000 characters, no overlap.
            chunks = [content[i:i+1000] for i in range(0, len(content), 1000)]

            for i, chunk in enumerate(chunks):
                if chunk.strip():
                    documents.append(chunk)
                    # BUG FIX: the label previously hard-coded the literal
                    # "(unknown)" even though `filename` was computed above
                    # and never used.
                    filenames.append(f"{filename} (chunk {i+1})")

    if not documents:
        raise ValueError("No valid content extracted from PDFs!")

    print(f"Successfully processed {len(documents)} chunks from {len(pdf_files)} PDFs")

    print("Creating embeddings...")
    embeddings = embedding_model.encode(documents)

    # Inner-product index over L2-normalised vectors == cosine similarity.
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)
    faiss.normalize_L2(embeddings)
    index.add(embeddings)

    print("Saving index and metadata...")
    faiss.write_index(index, "knowledge_base/faiss_index.bin")

    metadata = {
        "documents": documents,
        "filenames": filenames,
    }

    with open("knowledge_base/metadata.pkl", "wb") as f:
        pickle.dump(metadata, f)

    print("Index and metadata saved successfully!")
|
|
if __name__ == "__main__":
    # Build the index only when executed as a script, not on import.
    create_index()