Nitish-py committed on
Commit
38e9abf
·
1 Parent(s): f4bf677

basic improvements

Browse files
Files changed (1) hide show
  1. app.py +210 -209
app.py CHANGED
@@ -2,263 +2,264 @@ import os
2
  from dotenv import load_dotenv
3
  from PyPDF2 import PdfReader
4
  from langchain.text_splitter import CharacterTextSplitter
5
- from langchain import vectorstores as vs
6
  from langchain import chains
7
- import pinecone
8
  from goose3 import Goose
9
  import streamlit as st
10
  import whisper
11
- from langchain.embeddings import HuggingFaceEmbeddings
12
- from langchain.llms import AI21
13
  from pytube import YouTube
14
  import moviepy.editor
15
  import time
16
 
 
 
17
 
 
 
 
 
 
 
 
18
  load_dotenv()
19
- api_key=os.getenv('PINECONE_API_KEY')
20
- env=os.getenv('PINECONE_ENVIRONMENT')
21
- ai21_api_key=os.getenv('AI21_API_KEY')
22
- pinecone.init(api_key=api_key, environment=env)
23
-
24
- def txtread(txt_content):
25
- texts = ""
26
- texts += txt_content.decode('utf-8')
27
- text_splitter = CharacterTextSplitter(
28
- separator="\n",
29
- chunk_size = 1000,
30
- chunk_overlap = 0)
31
- chunks = text_splitter.split_text(texts)
32
- process.success("Chunking of the data is done")
33
- embeddings = HuggingFaceEmbeddings()
34
- pinecone.init(api_key=api_key, environment=env)
35
- process.warning("Starting Upload of the vector data in the Pinecone VectoreDB")
36
- db = vs.pinecone.Pinecone.from_texts(chunks, embeddings,index_name="multigpt",namespace="txt")
37
- process.success("Data is securly Uploaded")
38
-
39
- def pdfread(pdf):
40
- pdf_reader = PdfReader(pdf)
41
- texts = ""
42
- for page in pdf_reader.pages:
43
- texts += page.extract_text()
44
- text_splitter = CharacterTextSplitter(
45
- separator="\n",
46
- chunk_size = 4000,
47
- chunk_overlap = 0)
48
- chunks = text_splitter.split_text(texts)
49
- process.success("Chunking of the data is done")
50
- embeddings = HuggingFaceEmbeddings()
51
- pinecone.init(api_key=api_key, environment=env)
52
- process.warning("Starting Upload of the vector data in the Pinecone VectoreDB")
53
- db = vs.pinecone.Pinecone.from_texts(chunks, embeddings,index_name="multigpt",namespace="pdf")
54
- process.success("Data is securly Uploaded")
55
-
56
- def urlread(url_path):
57
- g = Goose({'browser_user_agent': 'Mozilla', 'parser_class': 'soup'})
58
- texts = g.extract(url=url_path).cleaned_text
59
- text_splitter = CharacterTextSplitter(
60
- separator="\n",
61
- chunk_size = 2000,
62
- chunk_overlap = 0)
63
- chunks = text_splitter.split_text(texts)
64
- process.success("Chunking of the data is done")
65
- embeddings = HuggingFaceEmbeddings()
66
- pinecone.init(api_key=api_key, environment=env)
67
- process.warning("Starting Upload of the vector data in the Pinecone VectoreDB")
68
- db = vs.pinecone.Pinecone.from_texts(chunks, embeddings,index_name="multigpt",namespace="url")
69
- process.success("Data is securly Uploaded")
70
-
71
- def scrape(vidlink):
72
- youtubeObject = YouTube(vidlink)
73
- youtubeObject = youtubeObject.streams.get_highest_resolution()
74
- youtubeObject.download(filename='video.mp4')
75
- process.success('Downloading Video')
76
- done=False
77
- while not done:
78
- time.sleep(10)
79
- done=os.path.exists("video.mp4")
80
- video = moviepy.editor.VideoFileClip("video.mp4")
81
- process.warning('Extracting Audio')
82
- audio = video.audio
83
- audio.write_audiofile("audio.mp3")
84
- process.warning('Trancscribing the Audio')
85
- model = whisper.load_model('base')
86
- result=model.transcribe('audio.mp3')
87
- texts=(result['text'])
88
- process.success('Transcription is done')
89
- text_splitter = CharacterTextSplitter(
90
- separator="\n",
91
- chunk_size = 1000,
92
- chunk_overlap = 0)
93
- chunks = text_splitter.split_text(texts)
94
- process.success("Chunking of the data is done")
95
- embeddings = HuggingFaceEmbeddings()
96
- pinecone.init(api_key=api_key, environment=env)
97
- process.warning("Starting Upload of the vector data in the Pinecone VectoreDB")
98
- db = vs.pinecone.Pinecone.from_texts(chunks, embeddings,index_name="multigpt",namespace="vid")
99
- process.success("Data is securly Uploaded")
100
 
101
- def chain(name):
102
- process.warning("Your Chain is running")
103
- embeddings = HuggingFaceEmbeddings()
104
- pinecone.init(api_key=api_key, environment=env)
105
- db=vs.pinecone.Pinecone.from_existing_index(index_name='multigpt',namespace=name, embedding=embeddings)
106
- retriever = db.as_retriever(search_type="similarity", search_kwargs={"k":10})
107
- llm = AI21(ai21_api_key=ai21_api_key)
108
- qa = chains.ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever)
109
- return qa
110
-
111
- def ai(qa,prompt):
112
- chat_history=[]
113
- result = qa({"question": prompt, "chat_history": chat_history})
114
- process.success("Search Complete!")
115
- return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
 
117
- def intro():
118
- placeholder.title('____________👨🏻‍💻 MINOR PROJECT 👨🏻‍💻____________\n')
119
- data.subheader('🚀 Introducing "KnowledgeHub" Web App! 🌐🧠')
120
- process.write('___________________________________________')
121
- intro=('''
122
 
123
- Welcome to the future of knowledge interaction! 🚀 With our groundbreaking web app, "KnowledgeHub," you can effortlessly infuse intelligence into our platform through various mediums. 📚💻
124
 
125
- How It Works:
126
 
127
- 📁 File Magic: Upload your knowledge-packed text files or PDFs to seamlessly share insights and wisdom with the world! 🚀
 
128
 
129
- 🌐 URL Wizardry: Simply paste a website URL, and watch as the KnowledgeHub transforms online information into a dynamic source of intelligence! 🤯
 
130
 
131
- 🎥 YouTube Brilliance: Share video insights by dropping those mind-blowing YouTube links! Transforming video content into knowledge gold has never been easier! 🌟
 
132
 
133
- Why use KnowledgeHub:
134
 
135
- 🚀 Instant Interaction: Say goodbye to static data! Engage with your knowledge instantly and turn information into actionable insights. 🚀
 
136
 
137
- 🌐 Universal Accessibility: Access your knowledge from anywhere, anytime, and empower your audience to dive into your insights effortlessly. 🌍
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
 
139
- 🤖 AI-Powered Conversations: Leverage cutting-edge AI for interactive conversations based on your knowledge repository! It's like having a brilliant virtual assistant at your fingertips! 🤖💡
140
 
141
- 📊 Data-Driven Decisions: Turn raw data into actionable intelligence. Make informed decisions backed by the power of your knowledge repository. 📈
 
 
 
142
 
143
- Embrace the future of knowledge sharing with KnowledgeHub – Where ideas come to life, and intelligence knows no bounds! 🚀🔥🔍''')
144
- ph=st.empty()
145
- x=''
146
- for i in intro:
147
- x+=i
148
- time.sleep(0.005)
149
- ph.markdown(x)
150
 
151
  def upload():
152
- placeholder.title("Let's create the Knowledge Base")
153
- process.error('Here you will be notified regarding the status of the upload')
154
- page = ['','TEXT','PDF','URL','VIDEO']
155
- choice = st.sidebar.radio("Choose your mode",page)
156
 
157
- if choice=='':
158
- data.subheader('Choose what type of data you wanna upload')
159
 
160
- elif choice == 'TEXT':
161
- text = data.file_uploader("Upload your txt file", type="txt")
162
- if text:
163
- txtread(text)
164
 
165
  elif choice == 'PDF':
166
- pdf = data.file_uploader("Upload your PDF file", type="pdf")
167
- if pdf:
168
- pdfread(pdf)
169
 
170
  elif choice == 'URL':
171
- url_path = data.text_input('Enter the url')
172
- if url_path:
173
- urlread(url_path)
174
-
175
 
176
  elif choice == 'VIDEO':
177
- link = data.text_input('Enter link to the youtube video')
178
  if link:
179
  scrape(link)
180
- time.sleep(10)
181
- process.success('You can go to the chat section or upload more data')
182
 
183
  def chat():
184
- placeholder.title("Let's go!!")
185
- process.error('Here you will be notified regarding the retrival of your answers')
186
- page = ['','TEXT','PDF','URL','VIDEO']
187
- choice = st.sidebar.radio("Choose your mode",page)
188
-
189
- if choice=='':
190
- data.subheader('Choose from which data you want answers from')
191
-
192
- elif choice == 'TEXT':
193
- name='txt'
194
- query = st.text_input("Ask a question based on the txt file",value="")
195
- if query:
196
- qa=chain(name)
197
- result=ai(qa,query)
198
- ph=st.empty()
199
- x=''
200
- for i in result["answer"]:
201
- x+=i
202
- time.sleep(0.01)
203
- ph.markdown(x)
204
 
205
- elif choice == 'PDF':
206
- name='pdf'
207
- query = st.text_input("Ask a question based on the PDF",value="")
208
- if query:
209
- qa=chain(name)
210
- result=ai(qa,query)
211
- ph=st.empty()
212
- x=''
213
- for i in result["answer"]:
214
- x+=i
215
- time.sleep(0.01)
216
- ph.markdown(x)
217
 
218
- elif choice == 'URL':
219
- name='url'
220
- query = st.text_input("Ask a question based on the data from the url",value="")
221
- if query:
222
- qa=chain(name)
223
- result=ai(qa,query)
224
- ph=st.empty()
225
- x=''
226
- for i in result["answer"]:
227
- x+=i
228
- time.sleep(0.01)
229
- ph.markdown(x)
230
-
231
 
232
- elif choice == 'VIDEO':
233
- name='vid'
234
- query = st.text_input("Ask a question from based on the YouTube video",value="")
235
  if query:
236
- qa=chain(name)
237
- result=ai(qa,query)
238
- ph=st.empty()
239
- x=''
 
240
  for i in result["answer"]:
241
- x+=i
242
  time.sleep(0.01)
243
  ph.markdown(x)
244
-
245
 
 
246
 
247
- def main():
248
  global placeholder, process, data
249
- placeholder=st.empty()
250
- data=st.empty()
251
- process=st.empty()
252
- page = ['HOME','Upload','Chat']
253
- choice = st.sidebar.radio("Choose upload or chat",page)
254
- if choice=='HOME':
255
- intro()
256
-
257
- elif choice=='Upload':
258
- upload()
259
 
260
- elif choice=='Chat':
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
  chat()
 
 
 
262
 
263
  if __name__ == "__main__":
264
- main()
 
2
  from dotenv import load_dotenv
3
  from PyPDF2 import PdfReader
4
  from langchain.text_splitter import CharacterTextSplitter
 
5
  from langchain import chains
 
6
  from goose3 import Goose
7
  import streamlit as st
8
  import whisper
 
 
9
  from pytube import YouTube
10
  import moviepy.editor
11
  import time
12
 
13
+ from langchain_community.vectorstores import Milvus
14
+ from pymilvus import connections
15
 
16
+ # HF
17
+ from huggingface_hub import InferenceClient
18
+ from langchain.embeddings.base import Embeddings
19
+ from langchain.llms.base import LLM
20
+ from typing import Optional, List
21
+
22
+ # -------------------- INIT --------------------
23
  load_dotenv()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
# Connect to the local Milvus instance once at import time.
# NOTE(review): host/port are hard-coded here and duplicated in store_data()
# and chain() — consider centralising them in MILVUS_HOST/MILVUS_PORT env vars.
connections.connect(alias="default", host="localhost", port="19530")

# Hugging Face Inference API token; None if the env var is unset — TODO confirm
# downstream calls fail loudly in that case.
HF_TOKEN = os.getenv("HF_TOKEN")
29
+ # -------------------- HF EMBEDDINGS --------------------
30
+
31
class HFInferenceEmbeddings(Embeddings):
    """LangChain Embeddings implementation backed by the HF Inference API.

    Uses the hosted sentence-transformers/all-MiniLM-L6-v2 model for both
    document and query embeddings.
    """

    def __init__(self):
        # NOTE(review): every instance opens its own InferenceClient with the
        # module-level HF_TOKEN — verify the token is set before first use.
        self.client = InferenceClient(api_key=HF_TOKEN)
        self.model = "sentence-transformers/all-MiniLM-L6-v2"

    def embed_documents(self, texts):
        # Batch embedding: forwards the whole list in one API call.
        return self.client.feature_extraction(texts, model=self.model)

    def embed_query(self, text):
        # Single-string embedding for retrieval queries.
        return self.client.feature_extraction(text, model=self.model)
41
+
42
+ # -------------------- HF LLM --------------------
43
+
44
class HFChatLLM(LLM):
    """LangChain LLM wrapper around the HF chat-completions endpoint.

    BUG FIX: LangChain's ``LLM`` base class is a pydantic model, so assigning
    undeclared attributes (``self.client`` / ``self.model``) inside a custom
    ``__init__`` raises at construction time ("object has no field ...").
    The model name is now a declared pydantic field with the original default,
    and the InferenceClient is created lazily inside ``_call`` instead of
    being stored on the instance.
    """

    # Declared as a pydantic field so instantiation works; same default model.
    model: str = "deepseek-ai/DeepSeek-V3.2:novita"

    @property
    def _llm_type(self) -> str:
        return "hf_chat"

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Send *prompt* to the chat model and return the reply text.

        ``stop`` is accepted for interface compatibility but not forwarded —
        the original implementation ignored it too.
        """
        client = InferenceClient(api_key=HF_TOKEN)
        completion = client.chat.completions.create(
            model=self.model,
            messages=[
                {
                    "role": "system",
                    "content": "Answer only from the given context. Be concise and accurate."
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
        )
        return completion.choices[0].message.content
68
+
69
+
70
def get_embeddings():
    """Factory for the HF Inference-API embeddings wrapper."""
    return HFInferenceEmbeddings()


def get_llm():
    """Factory for the HF chat LLM wrapper."""
    return HFChatLLM()


def get_collection(user_id, name):
    """Build the per-user, per-source Milvus collection name."""
    return "multigpt_{}_{}".format(user_id, name)
78
+
79
+ # -------------------- AUTH --------------------
80
+
81
def login():
    """Render a minimal username-only login form.

    On submit, stores the normalised username in ``st.session_state["user_id"]``
    and reruns the script so main() routes to the app pages.
    """
    st.title("🔐 Login")

    user = st.text_input("Enter username")

    if st.button("Login"):
        if user:
            # Normalise so the same user always maps to the same collections.
            st.session_state["user_id"] = user.strip().lower()
            st.success(f"Logged in as {user}")
            st.rerun()
        else:
            st.error("Enter username")
93
+
94
+ # -------------------- INGESTION --------------------
95
+
96
def store_data(chunks, collection_name):
    """Embed *chunks* and insert them into the given Milvus collection.

    NOTE(review): host/port duplicate the module-level connections.connect()
    call — consider centralising the Milvus connection settings.
    """
    Milvus.from_texts(
        chunks,
        embedding=get_embeddings(),
        collection_name=collection_name,
        connection_args={"host": "localhost", "port": "19530"}
    )
103
+
104
def txtread(file):
    """Read an uploaded UTF-8 text file, chunk it, and store it in Milvus.

    Chunks go to the current user's "txt" collection.
    """
    user_id = st.session_state["user_id"]

    text = file.read().decode("utf-8")

    # BUG FIX: chunk_size/chunk_overlap must be keyword arguments —
    # CharacterTextSplitter's second positional parameter is
    # is_separator_regex, so ("\n", 1000, 0) raises a TypeError.
    splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=0)
    chunks = splitter.split_text(text)

    process.success("Chunking done")

    store_data(chunks, get_collection(user_id, "txt"))
    process.success("Uploaded")
115
+
116
def pdfread(file):
    """Extract text from an uploaded PDF, chunk it, and store it in Milvus.

    Chunks go to the current user's "pdf" collection.
    """
    user_id = st.session_state["user_id"]

    reader = PdfReader(file)
    # BUG FIX: extract_text() can return None for image-only pages, which
    # would break the join — substitute the empty string.
    text = "".join([p.extract_text() or "" for p in reader.pages])

    # BUG FIX: chunk_size/chunk_overlap must be keyword arguments; passing
    # them positionally raises a TypeError.
    splitter = CharacterTextSplitter(separator="\n", chunk_size=4000, chunk_overlap=0)
    chunks = splitter.split_text(text)

    process.success("Chunking done")

    store_data(chunks, get_collection(user_id, "pdf"))
    process.success("Uploaded")
128
+
129
def urlread(url):
    """Extract the main article text from *url*, chunk it, and store in Milvus.

    Chunks go to the current user's "url" collection.
    """
    user_id = st.session_state["user_id"]

    g = Goose()
    text = g.extract(url=url).cleaned_text

    # BUG FIX: chunk_size/chunk_overlap must be keyword arguments; passing
    # them positionally raises a TypeError.
    splitter = CharacterTextSplitter(separator="\n", chunk_size=2000, chunk_overlap=0)
    chunks = splitter.split_text(text)

    process.success("Chunking done")

    store_data(chunks, get_collection(user_id, "url"))
    process.success("Uploaded")
141
 
142
def scrape(link):
    """Download a YouTube video, transcribe its audio, and store the chunks.

    Pipeline: pytube download -> moviepy audio extraction -> Whisper
    transcription -> chunking -> current user's "vid" collection.
    """
    user_id = st.session_state["user_id"]

    yt = YouTube(link).streams.get_highest_resolution()
    # download() is synchronous, so the file exists once it returns — the
    # old existence-polling loop was redundant and is removed.
    yt.download(filename="video.mp4")

    process.success("Downloading video")

    video = moviepy.editor.VideoFileClip("video.mp4")

    process.warning("Extracting audio")
    audio = video.audio
    audio.write_audiofile("audio.mp3")

    process.warning("Transcribing")
    model = whisper.load_model("base")
    result = model.transcribe("audio.mp3")

    # BUG FIX: chunk_size/chunk_overlap must be keyword arguments; passing
    # them positionally raises a TypeError.
    splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=0)
    chunks = splitter.split_text(result["text"])

    process.success("Chunking done")

    store_data(chunks, get_collection(user_id, "vid"))
    process.success("Uploaded")
169
+
170
+ # -------------------- QA --------------------
171
+
172
def chain(name):
    """Build a ConversationalRetrievalChain over the user's *name* collection.

    *name* is the short source tag ("txt", "pdf", "url", "vid") used when the
    data was ingested.
    """
    user_id = st.session_state["user_id"]

    # Open the existing per-user collection; embeddings must match ingestion.
    db = Milvus(
        embedding_function=get_embeddings(),
        collection_name=get_collection(user_id, name),
        connection_args={"host": "localhost", "port": "19530"}
    )

    # Top-10 similarity retrieval.
    retriever = db.as_retriever(search_kwargs={"k": 10})

    return chains.ConversationalRetrievalChain.from_llm(
        llm=get_llm(),
        retriever=retriever
    )

def ai(qa, query):
    """Run *query* through chain *qa* with an empty chat history.

    Returns the chain's result dict (the answer is under result["answer"]).
    """
    result = qa({"question": query, "chat_history": []})
    process.success("Answer ready")
    return result
192
+
193
+ # -------------------- UI --------------------
 
194
 
195
def upload():
    """Upload page: ingest TEXT / PDF / URL / VIDEO data into Milvus."""
    placeholder.title("Upload Data")

    mode = st.sidebar.radio("Mode", ['', 'TEXT', 'PDF', 'URL', 'VIDEO'])

    if mode == 'TEXT':
        uploaded = st.file_uploader("Upload txt")
        if uploaded:
            txtread(uploaded)

    elif mode == 'PDF':
        uploaded = st.file_uploader("Upload PDF")
        if uploaded:
            pdfread(uploaded)

    elif mode == 'URL':
        target = st.text_input("Enter URL")
        if target:
            urlread(target)

    elif mode == 'VIDEO':
        video_link = st.text_input("YouTube link")
        if video_link:
            scrape(video_link)
 
 
219
 
220
def chat():
    """Chat page: pick a data source and ask questions against its collection."""
    placeholder.title("Chat with your data")

    choice = st.sidebar.radio("Mode", ['', 'TEXT', 'PDF', 'URL', 'VIDEO'])

    # BUG FIX: ingestion stores collections under the short tags "txt" and
    # "vid" (see txtread/scrape), but choice.lower() produced "text" and
    # "video", so TEXT/VIDEO chats queried collections that do not exist.
    # Map UI labels to the ingestion tags explicitly.
    names = {'TEXT': 'txt', 'PDF': 'pdf', 'URL': 'url', 'VIDEO': 'vid'}

    if choice:
        query = st.text_input("Ask your question")

        if query:
            qa = chain(names[choice])
            result = ai(qa, query)

            # Typewriter-style streaming of the answer text.
            ph = st.empty()
            x = ""
            for i in result["answer"]:
                x += i
                time.sleep(0.01)
                ph.markdown(x)
 
238
 
239
+ # -------------------- MAIN --------------------
240
 
241
def main():
    """App entry point: require login, then route Upload / Chat / Logout."""
    global placeholder, process, data

    placeholder = st.empty()
    data = st.empty()
    process = st.empty()

    # Not logged in yet — show the login form and stop this render pass.
    if "user_id" not in st.session_state:
        login()
        return

    st.sidebar.write(f"👤 {st.session_state['user_id']}")

    nav = st.sidebar.radio("Navigate", ['Upload', 'Chat', 'Logout'])

    if nav == "Upload":
        upload()
    elif nav == "Chat":
        chat()
    elif nav == "Logout":
        # Drop all session state and restart the script run.
        st.session_state.clear()
        st.rerun()
263
 
264
  if __name__ == "__main__":
265
+ main()