import streamlit as st
import PyPDF2
import faiss
from sentence_transformers import SentenceTransformer
from openai import OpenAI
import os
from docx import Document  # For handling Word files

# Initialize OpenAI API
api_key = "5f10d5fec0b440f6b97956dc7630e148"  # Replace with your API key
base_url = "https://api.aimlapi.com/v1"
api = OpenAI(api_key=api_key, base_url=base_url)

# Initialize Sentence Transformer
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# FAISS Index
dimension = 384  # Embedding dimension of the model
index = faiss.IndexFlatL2(dimension)

# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    text = ""
    for page in pdf_reader.pages:
        text += page.extract_text()
    return text

# Function to extract text from Word file
def extract_text_from_word(word_file):
    document = Document(word_file)
    text = ""
    for paragraph in document.paragraphs:
        text += paragraph.text + "\n"
    return text

# Function to extract text from Text file
def extract_text_from_txt(txt_file):
    return txt_file.read().decode("utf-8")

# Function to chunk text
def chunk_text(text, max_length=500):
    words = text.split()
    chunks = []
    chunk = []
    for word in words:
        if len(" ".join(chunk)) + len(word) <= max_length:
            chunk.append(word)
        else:
            chunks.append(" ".join(chunk))
            chunk = [word]
    if chunk:
        chunks.append(" ".join(chunk))
    return chunks

# Function to embed text and add to FAISS index
def embed_and_store(chunks):
    embeddings = embedding_model.encode(chunks)
    index.add(embeddings)

# Query handling
def query_llm(prompt):
    completion = api.chat.completions.create(
        model="mistralai/Mistral-7B-Instruct-v0.2",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        temperature=0.7,
        max_tokens=256,
    )
    return completion.choices[0].message.content

# Streamlit App
st.title("RAG-based Document Query App")
uploaded_file = st.file_uploader("Upload a file", type=["pdf", "docx", "txt"])

if uploaded_file:
    file_type = uploaded_file.name.split(".")[-1].lower()

    # Handle file based on type
    if file_type == "pdf":
        text = extract_text_from_pdf(uploaded_file)
        st.write("PDF Extracted Successfully!")
    elif file_type == "docx":
        text = extract_text_from_word(uploaded_file)
        st.write("Word Document Extracted Successfully!")
    elif file_type == "txt":
        text = extract_text_from_txt(uploaded_file)
        st.write("Text File Extracted Successfully!")
    else:
        st.error("Unsupported file type. Please upload a PDF, Word document, or Text file.")
        text = ""

    # Process the text if extraction was successful
    if text:
        # Chunk and embed text
        chunks = chunk_text(text)
        embed_and_store(chunks)
        st.write(f"{len(chunks)} chunks added to the FAISS index.")

        # Query Interface
        user_query = st.text_input("Ask a question about the document:")
        if user_query:
            # Embed query and search FAISS
            query_embedding = embedding_model.encode([user_query])
            distances, indices = index.search(query_embedding, k=5)  # Top 5 results
            relevant_chunks = [chunks[i] for i in indices[0]]

            # Combine chunks for context
            context = " ".join(relevant_chunks)
            final_prompt = f"Context: {context}\n\nQuestion: {user_query}"

            # Get response from OpenAI API
            response = query_llm(final_prompt)
            st.write("### Answer")
            st.write(response)