Nitish-py committed on
Commit
38e9abf
·
1 Parent(s): f4bf677

basic improvements

Browse files
Files changed (1) hide show
  1. app.py +210 -209
app.py CHANGED
@@ -2,263 +2,264 @@ import os
2
  from dotenv import load_dotenv
3
  from PyPDF2 import PdfReader
4
  from langchain.text_splitter import CharacterTextSplitter
5
- from langchain import vectorstores as vs
6
  from langchain import chains
7
- import pinecone
8
  from goose3 import Goose
9
  import streamlit as st
10
  import whisper
11
- from langchain.embeddings import HuggingFaceEmbeddings
12
- from langchain.llms import AI21
13
  from pytube import YouTube
14
  import moviepy.editor
15
  import time
16
 
 
 
17
 
 
 
 
 
 
 
 
18
  load_dotenv()
19
- api_key=os.getenv('PINECONE_API_KEY')
20
- env=os.getenv('PINECONE_ENVIRONMENT')
21
- ai21_api_key=os.getenv('AI21_API_KEY')
22
- pinecone.init(api_key=api_key, environment=env)
23
-
24
- def txtread(txt_content):
25
- texts = ""
26
- texts += txt_content.decode('utf-8')
27
- text_splitter = CharacterTextSplitter(
28
- separator="\n",
29
- chunk_size = 1000,
30
- chunk_overlap = 0)
31
- chunks = text_splitter.split_text(texts)
32
- process.success("Chunking of the data is done")
33
- embeddings = HuggingFaceEmbeddings()
34
- pinecone.init(api_key=api_key, environment=env)
35
- process.warning("Starting Upload of the vector data in the Pinecone VectoreDB")
36
- db = vs.pinecone.Pinecone.from_texts(chunks, embeddings,index_name="multigpt",namespace="txt")
37
- process.success("Data is securly Uploaded")
38
-
39
- def pdfread(pdf):
40
- pdf_reader = PdfReader(pdf)
41
- texts = ""
42
- for page in pdf_reader.pages:
43
- texts += page.extract_text()
44
- text_splitter = CharacterTextSplitter(
45
- separator="\n",
46
- chunk_size = 4000,
47
- chunk_overlap = 0)
48
- chunks = text_splitter.split_text(texts)
49
- process.success("Chunking of the data is done")
50
- embeddings = HuggingFaceEmbeddings()
51
- pinecone.init(api_key=api_key, environment=env)
52
- process.warning("Starting Upload of the vector data in the Pinecone VectoreDB")
53
- db = vs.pinecone.Pinecone.from_texts(chunks, embeddings,index_name="multigpt",namespace="pdf")
54
- process.success("Data is securly Uploaded")
55
-
56
- def urlread(url_path):
57
- g = Goose({'browser_user_agent': 'Mozilla', 'parser_class': 'soup'})
58
- texts = g.extract(url=url_path).cleaned_text
59
- text_splitter = CharacterTextSplitter(
60
- separator="\n",
61
- chunk_size = 2000,
62
- chunk_overlap = 0)
63
- chunks = text_splitter.split_text(texts)
64
- process.success("Chunking of the data is done")
65
- embeddings = HuggingFaceEmbeddings()
66
- pinecone.init(api_key=api_key, environment=env)
67
- process.warning("Starting Upload of the vector data in the Pinecone VectoreDB")
68
- db = vs.pinecone.Pinecone.from_texts(chunks, embeddings,index_name="multigpt",namespace="url")
69
- process.success("Data is securly Uploaded")
70
-
71
- def scrape(vidlink):
72
- youtubeObject = YouTube(vidlink)
73
- youtubeObject = youtubeObject.streams.get_highest_resolution()
74
- youtubeObject.download(filename='video.mp4')
75
- process.success('Downloading Video')
76
- done=False
77
- while not done:
78
- time.sleep(10)
79
- done=os.path.exists("video.mp4")
80
- video = moviepy.editor.VideoFileClip("video.mp4")
81
- process.warning('Extracting Audio')
82
- audio = video.audio
83
- audio.write_audiofile("audio.mp3")
84
- process.warning('Trancscribing the Audio')
85
- model = whisper.load_model('base')
86
- result=model.transcribe('audio.mp3')
87
- texts=(result['text'])
88
- process.success('Transcription is done')
89
- text_splitter = CharacterTextSplitter(
90
- separator="\n",
91
- chunk_size = 1000,
92
- chunk_overlap = 0)
93
- chunks = text_splitter.split_text(texts)
94
- process.success("Chunking of the data is done")
95
- embeddings = HuggingFaceEmbeddings()
96
- pinecone.init(api_key=api_key, environment=env)
97
- process.warning("Starting Upload of the vector data in the Pinecone VectoreDB")
98
- db = vs.pinecone.Pinecone.from_texts(chunks, embeddings,index_name="multigpt",namespace="vid")
99
- process.success("Data is securly Uploaded")
100
 
101
- def chain(name):
102
- process.warning("Your Chain is running")
103
- embeddings = HuggingFaceEmbeddings()
104
- pinecone.init(api_key=api_key, environment=env)
105
- db=vs.pinecone.Pinecone.from_existing_index(index_name='multigpt',namespace=name, embedding=embeddings)
106
- retriever = db.as_retriever(search_type="similarity", search_kwargs={"k":10})
107
- llm = AI21(ai21_api_key=ai21_api_key)
108
- qa = chains.ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever)
109
- return qa
110
-
111
- def ai(qa,prompt):
112
- chat_history=[]
113
- result = qa({"question": prompt, "chat_history": chat_history})
114
- process.success("Search Complete!")
115
- return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
 
117
- def intro():
118
- placeholder.title('____________👨🏻‍💻 MINOR PROJECT 👨🏻‍💻____________\n')
119
- data.subheader('🚀 Introducing "KnowledgeHub" Web App! 🌐🧠')
120
- process.write('___________________________________________')
121
- intro=('''
122
 
123
- Welcome to the future of knowledge interaction! 🚀 With our groundbreaking web app, "KnowledgeHub," you can effortlessly infuse intelligence into our platform through various mediums. 📚💻
124
 
125
- How It Works:
126
 
127
- 📁 File Magic: Upload your knowledge-packed text files or PDFs to seamlessly share insights and wisdom with the world! 🚀
 
128
 
129
- 🌐 URL Wizardry: Simply paste a website URL, and watch as the KnowledgeHub transforms online information into a dynamic source of intelligence! 🤯
 
130
 
131
- 🎥 YouTube Brilliance: Share video insights by dropping those mind-blowing YouTube links! Transforming video content into knowledge gold has never been easier! 🌟
 
132
 
133
- Why use KnowledgeHub:
134
 
135
- 🚀 Instant Interaction: Say goodbye to static data! Engage with your knowledge instantly and turn information into actionable insights. 🚀
 
136
 
137
- 🌐 Universal Accessibility: Access your knowledge from anywhere, anytime, and empower your audience to dive into your insights effortlessly. 🌍
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
 
139
- 🤖 AI-Powered Conversations: Leverage cutting-edge AI for interactive conversations based on your knowledge repository! It's like having a brilliant virtual assistant at your fingertips! 🤖💡
140
 
141
- 📊 Data-Driven Decisions: Turn raw data into actionable intelligence. Make informed decisions backed by the power of your knowledge repository. 📈
 
 
 
142
 
143
- Embrace the future of knowledge sharing with KnowledgeHub – Where ideas come to life, and intelligence knows no bounds! 🚀🔥🔍''')
144
- ph=st.empty()
145
- x=''
146
- for i in intro:
147
- x+=i
148
- time.sleep(0.005)
149
- ph.markdown(x)
150
 
151
  def upload():
152
- placeholder.title("Let's create the Knowledge Base")
153
- process.error('Here you will be notified regarding the status of the upload')
154
- page = ['','TEXT','PDF','URL','VIDEO']
155
- choice = st.sidebar.radio("Choose your mode",page)
156
 
157
- if choice=='':
158
- data.subheader('Choose what type of data you wanna upload')
159
 
160
- elif choice == 'TEXT':
161
- text = data.file_uploader("Upload your txt file", type="txt")
162
- if text:
163
- txtread(text)
164
 
165
  elif choice == 'PDF':
166
- pdf = data.file_uploader("Upload your PDF file", type="pdf")
167
- if pdf:
168
- pdfread(pdf)
169
 
170
  elif choice == 'URL':
171
- url_path = data.text_input('Enter the url')
172
- if url_path:
173
- urlread(url_path)
174
-
175
 
176
  elif choice == 'VIDEO':
177
- link = data.text_input('Enter link to the youtube video')
178
  if link:
179
  scrape(link)
180
- time.sleep(10)
181
- process.success('You can go to the chat section or upload more data')
182
 
183
  def chat():
184
- placeholder.title("Let's go!!")
185
- process.error('Here you will be notified regarding the retrival of your answers')
186
- page = ['','TEXT','PDF','URL','VIDEO']
187
- choice = st.sidebar.radio("Choose your mode",page)
188
-
189
- if choice=='':
190
- data.subheader('Choose from which data you want answers from')
191
-
192
- elif choice == 'TEXT':
193
- name='txt'
194
- query = st.text_input("Ask a question based on the txt file",value="")
195
- if query:
196
- qa=chain(name)
197
- result=ai(qa,query)
198
- ph=st.empty()
199
- x=''
200
- for i in result["answer"]:
201
- x+=i
202
- time.sleep(0.01)
203
- ph.markdown(x)
204
 
205
- elif choice == 'PDF':
206
- name='pdf'
207
- query = st.text_input("Ask a question based on the PDF",value="")
208
- if query:
209
- qa=chain(name)
210
- result=ai(qa,query)
211
- ph=st.empty()
212
- x=''
213
- for i in result["answer"]:
214
- x+=i
215
- time.sleep(0.01)
216
- ph.markdown(x)
217
 
218
- elif choice == 'URL':
219
- name='url'
220
- query = st.text_input("Ask a question based on the data from the url",value="")
221
- if query:
222
- qa=chain(name)
223
- result=ai(qa,query)
224
- ph=st.empty()
225
- x=''
226
- for i in result["answer"]:
227
- x+=i
228
- time.sleep(0.01)
229
- ph.markdown(x)
230
-
231
 
232
- elif choice == 'VIDEO':
233
- name='vid'
234
- query = st.text_input("Ask a question from based on the YouTube video",value="")
235
  if query:
236
- qa=chain(name)
237
- result=ai(qa,query)
238
- ph=st.empty()
239
- x=''
 
240
  for i in result["answer"]:
241
- x+=i
242
  time.sleep(0.01)
243
  ph.markdown(x)
244
-
245
 
 
246
 
247
- def main():
248
  global placeholder, process, data
249
- placeholder=st.empty()
250
- data=st.empty()
251
- process=st.empty()
252
- page = ['HOME','Upload','Chat']
253
- choice = st.sidebar.radio("Choose upload or chat",page)
254
- if choice=='HOME':
255
- intro()
256
-
257
- elif choice=='Upload':
258
- upload()
259
 
260
- elif choice=='Chat':
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
  chat()
 
 
 
262
 
263
  if __name__ == "__main__":
264
- main()
 
2
  from dotenv import load_dotenv
3
  from PyPDF2 import PdfReader
4
  from langchain.text_splitter import CharacterTextSplitter
 
5
  from langchain import chains
 
6
  from goose3 import Goose
7
  import streamlit as st
8
  import whisper
 
 
9
  from pytube import YouTube
10
  import moviepy.editor
11
  import time
12
 
13
+ from langchain_community.vectorstores import Milvus
14
+ from pymilvus import connections
15
 
16
+ # HF
17
+ from huggingface_hub import InferenceClient
18
+ from langchain.embeddings.base import Embeddings
19
+ from langchain.llms.base import LLM
20
+ from typing import Optional, List
21
+
22
+ # -------------------- INIT --------------------
23
  load_dotenv()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
# Connect to the local Milvus instance once at import time.
# NOTE(review): host/port are hard-coded here and duplicated in store_data()
# and chain() — consider centralising them in MILVUS_HOST/MILVUS_PORT env vars.
connections.connect(alias="default", host="localhost", port="19530")

# Hugging Face Inference API token; None if the env var is unset — TODO confirm
# downstream calls fail loudly in that case.
HF_TOKEN = os.getenv("HF_TOKEN")
29
+ # -------------------- HF EMBEDDINGS --------------------
30
+
31
class HFInferenceEmbeddings(Embeddings):
    """LangChain Embeddings implementation backed by the HF Inference API.

    Uses the hosted sentence-transformers/all-MiniLM-L6-v2 model for both
    document and query embeddings.
    """

    def __init__(self):
        # NOTE(review): every instance opens its own InferenceClient with the
        # module-level HF_TOKEN — verify the token is set before first use.
        self.client = InferenceClient(api_key=HF_TOKEN)
        self.model = "sentence-transformers/all-MiniLM-L6-v2"

    def embed_documents(self, texts):
        # Batch embedding: forwards the whole list in one API call.
        return self.client.feature_extraction(texts, model=self.model)

    def embed_query(self, text):
        # Single-string embedding for retrieval queries.
        return self.client.feature_extraction(text, model=self.model)
41
+
42
+ # -------------------- HF LLM --------------------
43
+
44
class HFChatLLM(LLM):
    """LangChain LLM wrapper around the HF chat-completions endpoint.

    BUG FIX: LangChain's ``LLM`` base class is a pydantic model, so assigning
    undeclared attributes (``self.client`` / ``self.model``) inside a custom
    ``__init__`` raises at construction time ("object has no field ...").
    The model name is now a declared pydantic field with the original default,
    and the InferenceClient is created lazily inside ``_call`` instead of
    being stored on the instance.
    """

    # Declared as a pydantic field so instantiation works; same default model.
    model: str = "deepseek-ai/DeepSeek-V3.2:novita"

    @property
    def _llm_type(self) -> str:
        return "hf_chat"

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Send *prompt* to the chat model and return the reply text.

        ``stop`` is accepted for interface compatibility but not forwarded —
        the original implementation ignored it too.
        """
        client = InferenceClient(api_key=HF_TOKEN)
        completion = client.chat.completions.create(
            model=self.model,
            messages=[
                {
                    "role": "system",
                    "content": "Answer only from the given context. Be concise and accurate."
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
        )
        return completion.choices[0].message.content
68
+
69
+
70
def get_embeddings():
    """Factory for the HF Inference-API embeddings wrapper."""
    return HFInferenceEmbeddings()


def get_llm():
    """Factory for the HF chat LLM wrapper."""
    return HFChatLLM()


def get_collection(user_id, name):
    """Build the per-user, per-source Milvus collection name."""
    return "multigpt_{}_{}".format(user_id, name)
78
+
79
+ # -------------------- AUTH --------------------
80
+
81
def login():
    """Render a minimal username-only login form.

    On submit, stores the normalised username in ``st.session_state["user_id"]``
    and reruns the script so main() routes to the app pages.
    """
    st.title("🔐 Login")

    user = st.text_input("Enter username")

    if st.button("Login"):
        if user:
            # Normalise so the same user always maps to the same collections.
            st.session_state["user_id"] = user.strip().lower()
            st.success(f"Logged in as {user}")
            st.rerun()
        else:
            st.error("Enter username")
93
+
94
+ # -------------------- INGESTION --------------------
95
+
96
def store_data(chunks, collection_name):
    """Embed *chunks* and insert them into the given Milvus collection.

    NOTE(review): host/port duplicate the module-level connections.connect()
    call — consider centralising the Milvus connection settings.
    """
    Milvus.from_texts(
        chunks,
        embedding=get_embeddings(),
        collection_name=collection_name,
        connection_args={"host": "localhost", "port": "19530"}
    )
103
+
104
def txtread(file):
    """Read an uploaded UTF-8 text file, chunk it, and store it in Milvus.

    Chunks go to the current user's "txt" collection.
    """
    user_id = st.session_state["user_id"]

    text = file.read().decode("utf-8")

    # BUG FIX: chunk_size/chunk_overlap must be keyword arguments —
    # CharacterTextSplitter's second positional parameter is
    # is_separator_regex, so ("\n", 1000, 0) raises a TypeError.
    splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=0)
    chunks = splitter.split_text(text)

    process.success("Chunking done")

    store_data(chunks, get_collection(user_id, "txt"))
    process.success("Uploaded")
115
+
116
def pdfread(file):
    """Extract text from an uploaded PDF, chunk it, and store it in Milvus.

    Chunks go to the current user's "pdf" collection.
    """
    user_id = st.session_state["user_id"]

    reader = PdfReader(file)
    # BUG FIX: extract_text() can return None for image-only pages, which
    # would break the join — substitute the empty string.
    text = "".join([p.extract_text() or "" for p in reader.pages])

    # BUG FIX: chunk_size/chunk_overlap must be keyword arguments; passing
    # them positionally raises a TypeError.
    splitter = CharacterTextSplitter(separator="\n", chunk_size=4000, chunk_overlap=0)
    chunks = splitter.split_text(text)

    process.success("Chunking done")

    store_data(chunks, get_collection(user_id, "pdf"))
    process.success("Uploaded")
128
+
129
def urlread(url):
    """Extract the main article text from *url*, chunk it, and store in Milvus.

    Chunks go to the current user's "url" collection.
    """
    user_id = st.session_state["user_id"]

    g = Goose()
    text = g.extract(url=url).cleaned_text

    # BUG FIX: chunk_size/chunk_overlap must be keyword arguments; passing
    # them positionally raises a TypeError.
    splitter = CharacterTextSplitter(separator="\n", chunk_size=2000, chunk_overlap=0)
    chunks = splitter.split_text(text)

    process.success("Chunking done")

    store_data(chunks, get_collection(user_id, "url"))
    process.success("Uploaded")
141
 
142
def scrape(link):
    """Download a YouTube video, transcribe its audio, and store the chunks.

    Pipeline: pytube download -> moviepy audio extraction -> Whisper
    transcription -> chunking -> current user's "vid" collection.
    """
    user_id = st.session_state["user_id"]

    yt = YouTube(link).streams.get_highest_resolution()
    # download() is synchronous, so the file exists once it returns — the
    # old existence-polling loop was redundant and is removed.
    yt.download(filename="video.mp4")

    process.success("Downloading video")

    video = moviepy.editor.VideoFileClip("video.mp4")

    process.warning("Extracting audio")
    audio = video.audio
    audio.write_audiofile("audio.mp3")

    process.warning("Transcribing")
    model = whisper.load_model("base")
    result = model.transcribe("audio.mp3")

    # BUG FIX: chunk_size/chunk_overlap must be keyword arguments; passing
    # them positionally raises a TypeError.
    splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=0)
    chunks = splitter.split_text(result["text"])

    process.success("Chunking done")

    store_data(chunks, get_collection(user_id, "vid"))
    process.success("Uploaded")
169
+
170
+ # -------------------- QA --------------------
171
+
172
def chain(name):
    """Build a ConversationalRetrievalChain over the user's *name* collection.

    *name* is the short source tag ("txt", "pdf", "url", "vid") used when the
    data was ingested.
    """
    user_id = st.session_state["user_id"]

    # Open the existing per-user collection; embeddings must match ingestion.
    db = Milvus(
        embedding_function=get_embeddings(),
        collection_name=get_collection(user_id, name),
        connection_args={"host": "localhost", "port": "19530"}
    )

    # Top-10 similarity retrieval.
    retriever = db.as_retriever(search_kwargs={"k": 10})

    return chains.ConversationalRetrievalChain.from_llm(
        llm=get_llm(),
        retriever=retriever
    )

def ai(qa, query):
    """Run *query* through chain *qa* with an empty chat history.

    Returns the chain's result dict (the answer is under result["answer"]).
    """
    result = qa({"question": query, "chat_history": []})
    process.success("Answer ready")
    return result
192
+
193
+ # -------------------- UI --------------------
 
194
 
195
def upload():
    """Upload page: ingest TEXT / PDF / URL / VIDEO data into Milvus."""
    placeholder.title("Upload Data")

    mode = st.sidebar.radio("Mode", ['', 'TEXT', 'PDF', 'URL', 'VIDEO'])

    if mode == 'TEXT':
        uploaded = st.file_uploader("Upload txt")
        if uploaded:
            txtread(uploaded)

    elif mode == 'PDF':
        uploaded = st.file_uploader("Upload PDF")
        if uploaded:
            pdfread(uploaded)

    elif mode == 'URL':
        target = st.text_input("Enter URL")
        if target:
            urlread(target)

    elif mode == 'VIDEO':
        video_link = st.text_input("YouTube link")
        if video_link:
            scrape(video_link)
 
 
219
 
220
def chat():
    """Chat page: pick a data source and ask questions against its collection."""
    placeholder.title("Chat with your data")

    choice = st.sidebar.radio("Mode", ['', 'TEXT', 'PDF', 'URL', 'VIDEO'])

    # BUG FIX: ingestion stores collections under the short tags "txt" and
    # "vid" (see txtread/scrape), but choice.lower() produced "text" and
    # "video", so TEXT/VIDEO chats queried collections that do not exist.
    # Map UI labels to the ingestion tags explicitly.
    names = {'TEXT': 'txt', 'PDF': 'pdf', 'URL': 'url', 'VIDEO': 'vid'}

    if choice:
        query = st.text_input("Ask your question")

        if query:
            qa = chain(names[choice])
            result = ai(qa, query)

            # Typewriter-style streaming of the answer text.
            ph = st.empty()
            x = ""
            for i in result["answer"]:
                x += i
                time.sleep(0.01)
                ph.markdown(x)
 
238
 
239
+ # -------------------- MAIN --------------------
240
 
241
def main():
    """App entry point: require login, then route Upload / Chat / Logout."""
    global placeholder, process, data

    placeholder = st.empty()
    data = st.empty()
    process = st.empty()

    # Not logged in yet — show the login form and stop this render pass.
    if "user_id" not in st.session_state:
        login()
        return

    st.sidebar.write(f"👤 {st.session_state['user_id']}")

    nav = st.sidebar.radio("Navigate", ['Upload', 'Chat', 'Logout'])

    if nav == "Upload":
        upload()
    elif nav == "Chat":
        chat()
    elif nav == "Logout":
        # Drop all session state and restart the script run.
        st.session_state.clear()
        st.rerun()
263
 
264
  if __name__ == "__main__":
265
+ main()