"""Lectorín 2026 — ask questions about a PDF via RAG (FAISS + Groq).

Streamlit app: upload a PDF, build a FAISS vector index over its text
chunks, then answer user questions with a retrieval-augmented chain
backed by a Groq-hosted LLM. Q&A history is appended to a local file.
"""

import os
from datetime import datetime
from pathlib import Path

import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# -------------------------
# General configuration
# -------------------------
st.set_page_config(
    page_title="Lectorín",
    page_icon="📄",
    layout="wide",
)

st.title("📄 Lectorín 2026")
st.caption("Pregunta a tu PDF con RAG, FAISS y Groq")

# Secrets / env vars — preference order:
#   1) st.secrets["GROQ_API_KEY"]
#   2) environment variable GROQ_API_KEY
GROQ_API_KEY = st.secrets.get("GROQ_API_KEY", os.getenv("GROQ_API_KEY", ""))

# Optional LangSmith tracing: enabled only when a key is available.
LANGCHAIN_API_KEY = st.secrets.get("LANGCHAIN_API_KEY", os.getenv("LANGCHAIN_API_KEY", ""))
if LANGCHAIN_API_KEY:
    os.environ["LANGCHAIN_TRACING_V2"] = "true"
    os.environ["LANGCHAIN_API_KEY"] = LANGCHAIN_API_KEY
    os.environ["LANGCHAIN_PROJECT"] = "qpdf-2026"

# Local data folder for the persisted Q&A history.
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)
HISTORIAL_PATH = DATA_DIR / "historial.txt"

# -------------------------
# Session state
# -------------------------
# logs: list of {"Pregunta": ..., "Respuesta": ...} dicts for this session.
if "logs" not in st.session_state:
    st.session_state.logs = []
# knowledge_base: FAISS vector store built from the current PDF (or None).
if "knowledge_base" not in st.session_state:
    st.session_state.knowledge_base = None
# current_pdf_name: name of the last uploaded PDF, used to detect changes.
if "current_pdf_name" not in st.session_state:
    st.session_state.current_pdf_name = None

# -------------------------
# Models
# -------------------------
# label -> (HuggingFace model id, max sequence length used to size chunks)
modelos_embeddings = {
    "multilingual-e5-small (rápido)": ("intfloat/multilingual-e5-small", 512),
    "multi-qa-MiniLM-L6-cos-v1 (ligero)": ("multi-qa-MiniLM-L6-cos-v1", 256),
    "bge-m3 (mejor multilingüe, más pesado)": ("BAAI/bge-m3", 2048),
}

# label -> Groq model id
modelos_llm = {
    "Llama 3.3 70B Versatile": "llama-3.3-70b-versatile",
    "openai/gpt-oss-120b": "openai/gpt-oss-120b",
    "moonshotai/kimi-k2-instruct-0905": "moonshotai/kimi-k2-instruct-0905",
}

with st.sidebar:
    st.header("Configuración")
    embedding_label = st.selectbox("Modelo de embeddings", list(modelos_embeddings.keys()))
    embedding_model_name, sequence = modelos_embeddings[embedding_label]
    llm_label = st.selectbox("Modelo LLM", list(modelos_llm.keys()))
    llm_model_name = modelos_llm[llm_label]
    k_docs = st.slider("Chunks recuperados", min_value=2, max_value=8, value=4)
    # Default chunk size scales with the embedding model's sequence length,
    # capped at 2000 characters.
    chunk_size = st.slider(
        "Chunk size",
        min_value=500,
        max_value=3000,
        value=min(sequence * 4, 2000),
        step=100,
    )
    chunk_overlap = st.slider("Chunk overlap", min_value=50, max_value=400, value=150, step=25)
    st.divider()
    st.write("Para producción, configura `GROQ_API_KEY` en secretos o variables de entorno.")


# -------------------------
# Utilities
# -------------------------
def extract_text_from_pdf(uploaded_file) -> str:
    """Extract plain text from every non-empty page of *uploaded_file*.

    Pages that yield no text (e.g. scanned images) are skipped; page
    texts are joined with blank lines.
    """
    reader = PdfReader(uploaded_file)
    pages = []
    for page in reader.pages:
        text = page.extract_text() or ""
        if text.strip():
            pages.append(text)
    return "\n\n".join(pages)


@st.cache_resource(show_spinner=False)
def load_embeddings_model(model_name: str):
    """Load (and cache across reruns) a HuggingFace embeddings model."""
    return HuggingFaceEmbeddings(model_name=model_name)


@st.cache_data(show_spinner=False)
def split_text_to_chunks(text: str, chunk_size: int, chunk_overlap: int):
    """Split *text* into overlapping chunks; cached on its arguments."""
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        # Prefer splitting on paragraph, then line, then sentence boundaries.
        separators=["\n\n", "\n", ". ", " ", ""],
    )
    return splitter.split_text(text)


def build_knowledge_base(uploaded_file, embedding_model_name: str, chunk_size: int, chunk_overlap: int):
    """Build a FAISS vector store from a PDF.

    Returns:
        (vectorstore, number_of_chunks)

    Raises:
        ValueError: if no text could be extracted from the PDF.
    """
    text = extract_text_from_pdf(uploaded_file)
    if not text.strip():
        raise ValueError("No se pudo extraer texto del PDF.")
    chunks = split_text_to_chunks(text, chunk_size, chunk_overlap)
    embeddings = load_embeddings_model(embedding_model_name)
    vectorstore = FAISS.from_texts(chunks, embeddings)
    return vectorstore, len(chunks)


def save_to_file(file_name: str, question: str, answer: str):
    """Append one timestamped Q&A entry to the history file."""
    with open(HISTORIAL_PATH, "a", encoding="utf-8") as f:
        fecha_hora_actual = datetime.now().strftime("%Y-%m-%d %H:%M")
        f.write("-" * 25)
        f.write(f" {fecha_hora_actual} ")
        f.write(f" ({file_name}) ")
        f.write("-" * 25 + "\n")
        f.write(f"Pregunta: {question}\n")
        f.write(f"Respuesta: {answer}\n\n")


def build_rag_chain(vectorstore, groq_api_key: str, model_name: str, k: int = 4):
    """Assemble a retrieval chain: FAISS retriever + Groq LLM + stuff prompt.

    *k* controls how many chunks the retriever returns per query.
    """
    retriever = vectorstore.as_retriever(search_kwargs={"k": k})
    llm = ChatGroq(
        groq_api_key=groq_api_key,
        model=model_name,
        temperature=0,  # deterministic answers for document Q&A
    )
    prompt = ChatPromptTemplate.from_messages([
        (
            "system",
            "Responde usando solo el contexto recuperado. "
            "Si la respuesta no está en el documento, di claramente que no aparece en el PDF. "
            "Contesta en español y de forma precisa.\n\nContexto:\n{context}"
        ),
        ("human", "{input}")
    ])
    qa_chain = create_stuff_documents_chain(llm, prompt)
    rag_chain = create_retrieval_chain(retriever, qa_chain)
    return rag_chain


def render_logs():
    """Render the session's Q&A history in the sidebar, newest first."""
    with st.sidebar:
        st.subheader("Historial de preguntas")
        if not st.session_state.logs:
            st.caption("Todavía no hay preguntas.")
        else:
            for i, entry in enumerate(reversed(st.session_state.logs), start=1):
                with st.expander(f"{i}. {entry['Pregunta'][:60]}"):
                    st.write(entry["Respuesta"])


# -------------------------
# Main interface
# -------------------------
pdf_obj = st.file_uploader("Carga tu documento PDF", type="pdf")

if pdf_obj is not None:
    # A different file was uploaded: reset per-document session state.
    if st.session_state.current_pdf_name != pdf_obj.name:
        st.session_state.current_pdf_name = pdf_obj.name
        st.session_state.logs = []
        st.session_state.knowledge_base = None

    col1, col2 = st.columns([1, 1])
    with col1:
        if st.button("Procesar PDF", type="primary", use_container_width=True):
            with st.spinner("Procesando PDF y creando índice vectorial..."):
                try:
                    kb, n_chunks = build_knowledge_base(
                        pdf_obj,
                        embedding_model_name,
                        chunk_size,
                        chunk_overlap
                    )
                    st.session_state.knowledge_base = kb
                    st.success(f"PDF procesado correctamente. Chunks generados: {n_chunks}")
                except Exception as e:
                    st.error(f"Error procesando el PDF: {e}")
    with col2:
        if st.session_state.knowledge_base is not None:
            st.success("Base vectorial lista.")
        else:
            st.info("Sube un PDF y pulsa 'Procesar PDF'.")

    if not GROQ_API_KEY:
        st.warning("Falta GROQ_API_KEY. Añádela en Streamlit secrets o en variables de entorno.")
    elif st.session_state.knowledge_base is not None:
        user_question = st.text_input("Haz una pregunta sobre tu PDF")
        if user_question:
            with st.spinner("Consultando el documento..."):
                try:
                    rag_chain = build_rag_chain(
                        st.session_state.knowledge_base,
                        GROQ_API_KEY,
                        llm_model_name,
                        k=k_docs
                    )
                    result = rag_chain.invoke({"input": user_question})
                    answer = result["answer"]
                    context_docs = result.get("context", [])

                    st.subheader("Respuesta")
                    st.write(answer)

                    with st.expander("Ver fragmentos recuperados"):
                        if context_docs:
                            for i, doc in enumerate(context_docs, start=1):
                                st.markdown(f"**Chunk {i}**")
                                st.write(doc.page_content)
                                st.markdown("---")
                        else:
                            st.caption("No se devolvieron fragmentos.")

                    st.session_state.logs.append({
                        "Pregunta": user_question,
                        "Respuesta": answer
                    })
                    save_to_file(pdf_obj.name, user_question, answer)
                except Exception as e:
                    st.error(f"Error al consultar el PDF: {e}")

render_logs()