| from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| from langchain.schema import Document
|
| from qdrant_client import QdrantClient
|
| from qdrant_client.models import PointStruct, Distance, VectorParams
|
| import fitz
|
| from qdrant_client import QdrantClient
|
| import numpy as np
|
| import streamlit as st
|
|
|
def pdfachunk(file, chunk_size_pages=20):
    """Extract the text of a PDF and group it into chunks of consecutive pages.

    Args:
        file: File-like object whose ``read()`` returns the raw bytes of a PDF.
        chunk_size_pages: Maximum number of pages concatenated into each chunk.

    Returns:
        A list of strings in document order, one per group of up to
        ``chunk_size_pages`` pages. The last chunk may cover fewer pages.

    Raises:
        ValueError: If ``chunk_size_pages`` is smaller than 1.
    """
    # Validate before touching the stream so a bad argument never opens a
    # document (0 used to fail inside range(); negatives returned [] silently).
    if chunk_size_pages < 1:
        raise ValueError("chunk_size_pages debe ser mayor o igual que 1.")

    # The context manager closes the document even if extraction raises.
    with fitz.open(stream=file.read(), filetype="pdf") as doc:
        total_pages = len(doc)
        return [
            "".join(
                doc[page_num].get_text()
                for page_num in range(
                    start, min(start + chunk_size_pages, total_pages)
                )
            )
            for start in range(0, total_pages, chunk_size_pages)
        ]
|
|
|
def split_chunks(raw_chunks, chunk_size=1024, chunk_overlap=100):
    """Wrap raw text chunks in ``Document`` objects and split them further.

    Args:
        raw_chunks: Iterable of strings; each becomes the ``page_content`` of
            one ``Document``.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents`` applied
        to the wrapped documents.
    """
    wrapped_docs = []
    for raw_text in raw_chunks:
        wrapped_docs.append(Document(page_content=raw_text))

    # Separator candidates, listed from coarsest (blank line) to finest (space).
    separator_candidates = ["\n\n", "\n", ".", " "]
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=separator_candidates,
    )
    pieces = text_splitter.split_documents(wrapped_docs)
    return pieces
|
|
|
def generaremben(model, texts):
    """Encode the non-blank entries of *texts* with *model*.

    Entries that are empty or whitespace-only are dropped before encoding, so
    the result can contain fewer items than *texts*; callers that pair the
    embeddings with the original texts by index must apply the same filter.

    Args:
        model: Object exposing ``encode(texts, batch_size=..., show_progress_bar=...)``.
        texts: Iterable of strings.

    Returns:
        Whatever ``model.encode`` returns for the remaining texts (called with
        ``batch_size=16`` and ``show_progress_bar=True``).

    Raises:
        ValueError: If no entry is left after discarding the blank ones.
    """
    non_blank = []
    for candidate in texts:
        if candidate.strip():
            non_blank.append(candidate)

    if len(non_blank) == 0:
        raise ValueError("No hay textos válidos para generar embeddings.")

    encoded = model.encode(non_blank, batch_size=16, show_progress_bar=True)
    return encoded
|
|
|
def insertarenqdra(embeddings, texts, nombre_coleccion, path="./data_v2"):
    """Replace a Qdrant collection with one point per (embedding, text) pair.

    Any existing collection named ``nombre_coleccion`` is dropped and created
    again with cosine distance and the dimensionality of the first embedding.
    Point ids are the positional indices of the embeddings and each payload is
    ``{"text": <matching text>}``.

    Args:
        embeddings: Sized sequence of vectors (NumPy arrays or plain sequences).
        texts: Sized sequence of strings, aligned index by index with
            ``embeddings``.
        nombre_coleccion: Name of the collection to (re)create.
        path: Folder of the local Qdrant storage.

    Raises:
        ValueError: If ``embeddings`` is empty or its length differs from that
            of ``texts``.
    """
    total = len(embeddings)
    if total == 0:
        raise ValueError("No hay embeddings para insertar en Qdrant.")
    if len(texts) != total:
        # A mismatch would otherwise store vectors next to the wrong text.
        raise ValueError(
            f"La cantidad de textos ({len(texts)}) no coincide con la de "
            f"embeddings ({total})."
        )

    dim = len(embeddings[0])
    # np.asarray lets plain lists through as well as NumPy vectors.
    points = [
        PointStruct(
            id=idx,
            vector=np.asarray(vector).tolist(),
            payload={"text": text},
        )
        for idx, (vector, text) in enumerate(zip(embeddings, texts))
    ]

    client = QdrantClient(path=path)
    try:
        # Explicit drop + create instead of the deprecated recreate_collection.
        if client.collection_exists(nombre_coleccion):
            client.delete_collection(nombre_coleccion)
        client.create_collection(
            collection_name=nombre_coleccion,
            vectors_config=VectorParams(size=dim, distance=Distance.COSINE),
        )
        client.upsert(collection_name=nombre_coleccion, points=points)
    finally:
        # Close explicitly so the storage folder is released right away
        # instead of whenever the client gets garbage-collected.
        client.close()

    print(f"✅ Insertados {len(points)} vectores en Qdrant.")
|
|
|
def query_qdrant(query, model, nombre_coleccion, top_k, umbral, path="./data_v2"):
    """Search a collection for the points closest to *query*, above a score threshold.

    Args:
        query: Query text; it is embedded with ``generaremben``.
        model: Embedding model handed to ``generaremben``.
        nombre_coleccion: Name of the collection to search.
        top_k: Forwarded to ``query_points`` as ``limit``.
        umbral: Forwarded to ``query_points`` as ``score_threshold``.
        path: Folder of the local Qdrant storage.

    Returns:
        The object returned by ``QdrantClient.query_points`` (payloads included).

    Raises:
        ValueError: Propagated from ``generaremben`` when *query* is blank.
    """
    # Embed first so a blank query fails before any storage is opened.
    query_embedding = np.array(generaremben(model, [query])[0]).tolist()

    client = QdrantClient(path=path)
    try:
        return client.query_points(
            collection_name=nombre_coleccion,
            query=query_embedding,
            limit=top_k,
            with_payload=True,
            score_threshold=umbral,
        )
    finally:
        # Close explicitly so the storage folder is released right away
        # instead of whenever the client gets garbage-collected.
        client.close()
|
|
|
def query_qdrant_sinumbral(query, model, nombre_coleccion, top_k=5, path="./data_v2"):
    """Search a collection for the points closest to *query*, with no score threshold.

    Args:
        query: Query text; it is embedded with ``generaremben``.
        model: Embedding model handed to ``generaremben``.
        nombre_coleccion: Name of the collection to search.
        top_k: Forwarded to ``query_points`` as ``limit``.
        path: Folder of the local Qdrant storage.

    Returns:
        The object returned by ``QdrantClient.query_points`` (payloads included).

    Raises:
        ValueError: Propagated from ``generaremben`` when *query* is blank.
    """
    # Embed first so a blank query fails before any storage is opened.
    query_embedding = np.array(generaremben(model, [query])[0]).tolist()

    client = QdrantClient(path=path)
    try:
        return client.query_points(
            collection_name=nombre_coleccion,
            query=query_embedding,
            limit=top_k,
            with_payload=True,
        )
    finally:
        # Close explicitly so the storage folder is released right away
        # instead of whenever the client gets garbage-collected.
        client.close()
|
|
|
|
|
def obtener_colecciones(path="./data_v2"):
    """List the collection names in a local Qdrant storage.

    Args:
        path: Folder of the local Qdrant storage.

    Returns:
        A list starting with the entry ``"Todas las colecciones"`` followed by
        the name of every collection reported by the client.
    """
    client = QdrantClient(path=path)
    try:
        names = [col.name for col in client.get_collections().collections]
    finally:
        # Close explicitly so the storage folder is released right away
        # instead of whenever the client gets garbage-collected.
        client.close()
    return ["Todas las colecciones"] + names
|
|
|
|
|
|
|
|
|
|
|