# Source: Hugging Face Space app.py by engrrifatullah (commit 1f03038, verified)
import numpy # Ensure NumPy is loaded first to avoid FAISS issues
import faiss # Load FAISS after NumPy
import os
import streamlit as st
import pandas as pd
import pdfplumber
from sentence_transformers import SentenceTransformer
from groq import Groq
import numpy as np
# --- Configuration ----------------------------------------------------------
# SECURITY FIX: the Groq API key was hard-coded (and therefore leaked) in
# source. Read it from the environment instead — set GROQ_API_KEY in the
# deployment's secrets/env before running the app.
API_KEY = os.environ.get("GROQ_API_KEY", "")
client = Groq(api_key=API_KEY)

# Sentence-embedding model, shared by both document chunks and user questions
# (the same model must encode both sides for the FAISS search to be meaningful).
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of all pages in *pdf_file*.

    FIX: ``page.extract_text()`` returns ``None`` for pages with no
    extractable text (e.g. scanned/image-only pages), and joining ``None``
    raises ``TypeError``. Such pages are skipped instead.

    Parameters
    ----------
    pdf_file : file-like object or path accepted by ``pdfplumber.open``.

    Returns
    -------
    str : page texts joined with single spaces ('' if nothing extractable).
    """
    with pdfplumber.open(pdf_file) as pdf:
        return ' '.join(
            page_text
            for page in pdf.pages
            if (page_text := page.extract_text())
        )
# Function to create embeddings and store them in FAISS
def create_embeddings(text):
    """Split *text* into 500-character chunks, embed them, and index them.

    FIX: FAISS requires contiguous float32 input. The query path already
    casts to float32 (see ``get_relevant_chunk``); cast here as well so
    ``index.add`` never receives a mismatched dtype.

    Parameters
    ----------
    text : str — full document text to index.

    Returns
    -------
    (chunks, embeddings, index) : list[str], np.ndarray (float32), faiss.IndexFlatL2
    """
    chunks = [text[i:i + 500] for i in range(0, len(text), 500)]
    embeddings = np.asarray(embed_model.encode(chunks), dtype=np.float32)
    index = faiss.IndexFlatL2(embeddings.shape[1])  # L2 (Euclidean) flat index
    index.add(embeddings)
    return chunks, embeddings, index
# Function to find the most relevant chunk for the user's question
def get_relevant_chunk(question, embeddings, index, chunks):
    """Return the single document chunk nearest to *question* in embedding space.

    The question is embedded with the same model used for the chunks, then a
    top-1 nearest-neighbour search is run against the FAISS index.
    """
    query_vec = np.asarray(embed_model.encode([question])).astype(np.float32)
    _distances, neighbour_ids = index.search(query_vec, 1)  # top-1 lookup
    best_id = neighbour_ids[0][0]
    return chunks[best_id]
# Function to get the model's response from Groq API
def get_answer_from_groq(question, context):
    """Ask the Groq-hosted LLM *question* grounded on *context*; return its reply text."""
    prompt = (
        f"Answer the following question based on the context:\n"
        f"Context: {context}\nQuestion: {question}"
    )
    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model="llama3-8b-8192",
    )
    return response.choices[0].message.content
# Streamlit app
def main():
    """Streamlit entry point: upload a document, index it, and answer questions.

    Flow: upload (PDF/CSV/XLSX) -> extract text -> chunk + embed into FAISS
    -> retrieve top-1 chunk for a user question -> answer via Groq LLM.

    FIX: the original had no fallback when the uploaded file's MIME type
    matched neither branch (browsers occasionally report nonstandard types,
    especially for .xlsx), leaving ``text`` unbound and crashing with a
    NameError further down. An explicit else-guard now reports the problem
    and returns early.
    """
    st.set_page_config(
        page_title="RAG Based Application",
        page_icon="📄",
        layout="centered",
    )

    # Custom CSS for styling
    st.markdown(
        """
        <style>
        body {
        background-color: #f4f7f9;
        }
        .main-header {
        font-size: 2.5rem;
        color: #1d3557;
        text-align: center;
        margin-bottom: 1rem;
        }
        .upload-box {
        border: 2px dashed #457b9d;
        border-radius: 10px;
        padding: 1rem;
        text-align: center;
        background-color: #f1faee;
        }
        </style>
        """,
        unsafe_allow_html=True,
    )

    # App title and description
    st.markdown('<div class="main-header">RAG Based Application</div>', unsafe_allow_html=True)
    st.write("Upload your document (PDF, CSV, or Excel) to process and generate embeddings stored in a FAISS index.")

    # File upload section
    uploaded_file = st.file_uploader("Drag and drop or browse files", type=["pdf", "csv", "xlsx"])
    if uploaded_file:
        # Identify file type (MIME type reported by the browser)
        file_type = uploaded_file.type
        st.markdown('<div class="upload-box">File Uploaded Successfully!</div>', unsafe_allow_html=True)

        # Extract content
        if file_type == "application/pdf":
            text = extract_text_from_pdf(uploaded_file)
        elif file_type in ["text/csv", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"]:
            df = pd.read_csv(uploaded_file) if file_type == "text/csv" else pd.read_excel(uploaded_file)
            text = df.to_string()
        else:
            # Guard: unrecognized MIME type — without this, `text` would be
            # unbound and every use below would raise NameError.
            st.error(f"Unsupported file type: {file_type}")
            return

        # Display content
        st.subheader("Document Content:")
        st.text_area("Extracted Text", text, height=300)

        # Create embeddings (re-runs on every Streamlit rerun; acceptable for small docs)
        st.write("🔄 Creating embeddings... This may take a moment.")
        chunks, embeddings, index = create_embeddings(text)
        st.success("✅ Embeddings created and stored in FAISS index!")

        # Question Section
        question = st.text_input("Ask a question based on the uploaded document:")
        if question:
            # Retrieve the most relevant chunk for the question
            relevant_chunk = get_relevant_chunk(question, embeddings, index, chunks)
            # Get the model's answer based on the relevant chunk
            st.write("🔄 Retrieving the answer...")
            answer = get_answer_from_groq(question, relevant_chunk)
            # Display the answer
            st.subheader("Answer:")
            st.write(answer)

        # Summary Section
        st.subheader("Process Summary:")
        st.write("- Uploaded file type:", file_type)
        st.write("- Number of chunks processed:", len(text) // 500 + 1)
# Run the app only when executed as a script (not when imported as a module).
if __name__ == "__main__":
    main()