| import numpy |
| import faiss |
| import os |
| import streamlit as st |
| import pandas as pd |
| import pdfplumber |
| from sentence_transformers import SentenceTransformer |
| from groq import Groq |
| import numpy as np |
|
|
| |
# SECURITY: an API key was previously hard-coded here and committed to source
# control — it must be considered leaked and revoked. Read the key from the
# environment instead (set GROQ_API_KEY before launching the app).
API_KEY = os.environ.get("GROQ_API_KEY", "")
# Groq client used by get_answer_from_groq() for chat completions.
client = Groq(api_key=API_KEY)

# Sentence-transformer model used to embed both document chunks and questions.
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
|
|
| |
def extract_text_from_pdf(pdf_file):
    """Extract and concatenate the text of every page in a PDF.

    Args:
        pdf_file: A path or file-like object accepted by ``pdfplumber.open``
            (here, the Streamlit ``UploadedFile``).

    Returns:
        str: All page texts joined with single spaces. Pages with no text
        layer (e.g. scanned images) are skipped.
    """
    with pdfplumber.open(pdf_file) as pdf:
        # extract_text() returns None for pages without extractable text;
        # the original code crashed with TypeError inside ' '.join then.
        return ' '.join(
            page_text
            for page in pdf.pages
            if (page_text := page.extract_text())
        )
|
|
| |
def create_embeddings(text, chunk_size=500):
    """Split *text* into fixed-size chunks, embed them, and build a FAISS index.

    Args:
        text: Full document text to index.
        chunk_size: Characters per chunk. Defaults to 500, matching the
            original hard-coded behavior.

    Returns:
        tuple: ``(chunks, embeddings, index)`` — the list of text slices,
        the float32 embedding matrix, and a FAISS flat L2 index over it.
    """
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    # FAISS requires contiguous float32 input; SentenceTransformer usually
    # returns float32 already, but cast defensively so index.add never fails.
    embeddings = np.asarray(embed_model.encode(chunks), dtype=np.float32)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return chunks, embeddings, index
|
|
| |
def get_relevant_chunk(question, embeddings, index, chunks, top_k=1):
    """Return the chunk(s) most semantically similar to *question*.

    Args:
        question: The user's natural-language question.
        embeddings: Unused here (kept for interface compatibility with callers).
        index: FAISS index built over ``chunks``.
        chunks: The text chunks the index was built from.
        top_k: Number of nearest chunks to retrieve (default 1, preserving
            the original single-chunk behavior).

    Returns:
        str: The single best chunk when ``top_k == 1``; otherwise the top
        chunks joined with spaces.
    """
    query = np.asarray(embed_model.encode([question]), dtype=np.float32)
    D, I = index.search(query, top_k)
    # FAISS reports -1 for positions with no neighbor (e.g. empty index);
    # the original chunks[I[0][0]] would silently index chunks[-1] then.
    hits = [chunks[i] for i in I[0] if 0 <= i < len(chunks)]
    if top_k == 1:
        return hits[0] if hits else ''
    return ' '.join(hits)
|
|
| |
def get_answer_from_groq(question, context):
    """Ask the Groq chat model *question*, grounded in *context*, and return its reply."""
    prompt = (
        f"Answer the following question based on the context:\n"
        f"Context: {context}\nQuestion: {question}"
    )
    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model="llama3-8b-8192",
    )
    return response.choices[0].message.content
|
|
| |
def main():
    """Streamlit entry point: upload a document, index it, and answer questions.

    Flow: configure the page, accept a PDF/CSV/XLSX upload, extract its text,
    chunk + embed it into a FAISS index, then answer a free-form question by
    retrieving the most relevant chunk and passing it to the Groq LLM.
    """
    st.set_page_config(
        page_title="RAG Based Application",
        # NOTE(review): the original icon/emoji strings were mojibake-garbled
        # in this file; replaced with a sensible document emoji.
        page_icon="📄",
        layout="centered",
    )

    st.markdown(
        """
        <style>
        body {
            background-color: #f4f7f9;
        }
        .main-header {
            font-size: 2.5rem;
            color: #1d3557;
            text-align: center;
            margin-bottom: 1rem;
        }
        .upload-box {
            border: 2px dashed #457b9d;
            border-radius: 10px;
            padding: 1rem;
            text-align: center;
            background-color: #f1faee;
        }
        </style>
        """,
        unsafe_allow_html=True,
    )

    st.markdown('<div class="main-header">RAG Based Application</div>', unsafe_allow_html=True)
    st.write("Upload your document (PDF, CSV, or Excel) to process and generate embeddings stored in a FAISS index.")

    uploaded_file = st.file_uploader("Drag and drop or browse files", type=["pdf", "csv", "xlsx"])

    if uploaded_file:
        file_type = uploaded_file.type
        st.markdown('<div class="upload-box">File Uploaded Successfully!</div>', unsafe_allow_html=True)

        # Extract raw text according to the reported MIME type. 'text' stays
        # None for unrecognized types so we fail gracefully below instead of
        # hitting a NameError (the original left 'text' unbound in that case).
        text = None
        if file_type == "application/pdf":
            text = extract_text_from_pdf(uploaded_file)
        elif file_type in ("text/csv", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"):
            df = pd.read_csv(uploaded_file) if file_type == "text/csv" else pd.read_excel(uploaded_file)
            text = df.to_string()

        if not text:
            st.error("Could not extract any text from the uploaded file.")
            return

        st.subheader("Document Content:")
        st.text_area("Extracted Text", text, height=300)

        st.write("Creating embeddings... This may take a moment.")
        chunks, embeddings, index = create_embeddings(text)
        st.success("Embeddings created and stored in FAISS index!")

        question = st.text_input("Ask a question based on the uploaded document:")

        if question:
            relevant_chunk = get_relevant_chunk(question, embeddings, index, chunks)

            st.write("Retrieving the answer...")
            answer = get_answer_from_groq(question, relevant_chunk)

            st.subheader("Answer:")
            st.write(answer)

        st.subheader("Process Summary:")
        st.write("- Uploaded file type:", file_type)
        # Report the real chunk count: the original 'len(text) // 500 + 1'
        # overcounts by one whenever len(text) is an exact multiple of 500.
        st.write("- Number of chunks processed:", len(chunks))
|
|
| |
| if __name__ == "__main__": |
| main() |
|
|