| from langchain_huggingface import HuggingFaceEmbeddings |
| from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM |
| from langchain_community.vectorstores import Chroma |
| from langchain.schema import Document |
| from langchain.text_splitter import RecursiveCharacterTextSplitter |
| from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline |
| import torch |
|
|
# Hugging Face checkpoint used to embed document chunks.
embedding_model_name = "l3cube-pune/punjabi-sentence-similarity-sbert"

# Keyword arguments forwarded to the embedding model: run on the GPU when
# torch reports CUDA support, otherwise fall back to the CPU.
model_kwargs = {
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "trust_remote_code": True,
}

# Shared embedding function used when building the vector store.
embeddings = HuggingFaceEmbeddings(
    model_kwargs=model_kwargs,
    model_name=embedding_model_name,
)

# Module-level placeholder; starts out empty (no store has been built yet).
vectorstore = None
|
|
|
|
|
|
def read_file(data: str) -> Document:
    """Load a text file into a single ``Document``.

    Args:
        data: Path of the file to read.

    Returns:
        A ``Document`` whose ``page_content`` is the whole file and whose
        ``metadata["name"]`` is the last ``/``-separated component of *data*.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # The context manager closes the handle even if read() raises. The
    # encoding is pinned so non-ASCII text does not depend on the platform's
    # locale default (assumes the input files are UTF-8 -- TODO confirm).
    with open(data, "r", encoding="utf-8") as f:
        content = f.read()
    # Splitting on '/' (rather than os.path) keeps the stored name identical
    # to what the name-based deletion helper compares against.
    return Document(page_content=content, metadata={"name": data.split('/')[-1]})
|
|
# Shared splitter for all ingested documents: chunk_size=800 with
# chunk_overlap=100 between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=800,
)
|
|
def add_doc(data, vectorstore):
    """Read a file, split it into chunks and index the chunks.

    Args:
        data: Path of the file to ingest.
        vectorstore: Existing vector store to extend, or ``None`` to build a
            new Chroma store from this document.

    Returns:
        A ``(retriever, vectorstore)`` tuple; the retriever is configured
        with ``search_kwargs={'k': 1}``.
    """
    doc = read_file(data)
    splits = text_splitter.split_documents([doc])

    if vectorstore is None:
        # First document: create the store together with its embedding function.
        vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)
    elif splits:
        # Extend the caller's store instead of discarding it; skip the call
        # when the file produced no chunks so nothing empty is submitted.
        vectorstore.add_documents(splits)

    retriever = vectorstore.as_retriever(search_kwargs={'k': 1})
    return retriever, vectorstore
|
|
def delete_doc(delete_name, vectorstore):
    """Remove every stored chunk whose metadata ``name`` equals *delete_name*.

    Args:
        delete_name: Value to match against each entry's ``metadata["name"]``.
        vectorstore: Store exposing ``get()``, ``delete(ids=...)`` and
            ``as_retriever(...)``.

    Returns:
        A ``(retriever, vectorstore)`` tuple; the retriever is configured
        with ``search_kwargs={'k': 1}``.
    """
    # Fetch the store contents once instead of once per entry.
    records = vectorstore.get()
    delete_doc_ids = [
        doc_id
        for doc_id, meta in zip(records['ids'], records['metadatas'])
        # Entries with missing/None metadata or no "name" key never match.
        if meta and meta.get('name') == delete_name
    ]

    # One batched call with a list of ids; skipped when nothing matched so
    # the store is never asked to delete an empty id list.
    if delete_doc_ids:
        vectorstore.delete(ids=delete_doc_ids)

    retriever = vectorstore.as_retriever(search_kwargs={'k': 1})
    return retriever, vectorstore
|
|
def delete_all_doc(vectorstore):
    """Remove every entry currently held in *vectorstore*.

    Args:
        vectorstore: Store exposing ``get()``, ``delete(ids=...)`` and
            ``as_retriever(...)``.

    Returns:
        A ``(retriever, vectorstore)`` tuple; the retriever is configured
        with ``search_kwargs={'k': 1}``.
    """
    delete_doc_ids = list(vectorstore.get()['ids'])

    # One batched call with a list of ids; skipped for an already-empty
    # store so the store is never asked to delete an empty id list.
    if delete_doc_ids:
        vectorstore.delete(ids=delete_doc_ids)

    retriever = vectorstore.as_retriever(search_kwargs={'k': 1})
    return retriever, vectorstore