import os
import pickle
from json import dumps, loads
import time
from typing import Any, List, Mapping, Optional

import numpy as np
import openai
import pandas as pd
import streamlit as st
from dotenv import load_dotenv
from huggingface_hub import HfFileSystem

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, Pipeline

from assets.prompts import custom_prompts

from llama_index.core import (
    StorageContext,
    SimpleDirectoryReader,
    VectorStoreIndex,
    load_index_from_storage,
    PromptHelper,
    PromptTemplate,
)
from llama_index.core.llms import (
    CustomLLM,
    CompletionResponse,
    LLMMetadata,
)
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core.llms.callbacks import llm_completion_callback
from llama_index.core.base.llms.types import ChatMessage
from llama_index.core import Settings
|
|
# Load environment variables (e.g. API keys) from a local .env file.
load_dotenv()

# fsspec-style file system handle for reading/writing files on the Hugging Face Hub.
fs = HfFileSystem()

# Generation and prompt-budget settings for the local model.
CONTEXT_WINDOW = 2048
NUM_OUTPUT = 525
CHUNK_OVERLAP_RATION = 0.2  # fraction of overlap between adjacent text chunks
|
|
ANSWER_FORMAT = """
Use the following example format for your answer:
[FORMAT]
Answer:
The answer to the user question.
Reference:
The list of references to the specific sections of the documents that support your answer.
[END_FORMAT]
"""
|
|
QUERY_ENGINE_QA_TEMPLATE = """
We have provided context information below:
[CONTEXT]
{context_str}
[END_CONTEXT]
Given this information, please answer the following question:
[QUESTION]
{query_str}
[END_QUESTION]
"""
|
|
QUERY_ENGINE_REFINE_TEMPLATE = """
The original query is as follows:
[QUESTION]
{query_str}
[END_QUESTION]

We have provided an existing answer:
[ANSWER]
{existing_answer}
[END_ANSWER]

We have the opportunity to refine the existing answer (only if needed) with some more
context below.
[CONTEXT]
{context_msg}
[END_CONTEXT]

Given the new context, refine the original answer to include more details like references \
to the specific sections of the documents that support your answer.

Refined Answer:
"""
|
|
|
|
CHAT_ENGINE_CONTEXT_PROMPT_TEMPLATE = """
The following is a friendly conversation between a user and an AI assistant.
The assistant is talkative and provides lots of specific details from its context.
If the assistant does not know the answer to a question, it truthfully says it
does not know.

Here are the relevant documents for the context:

{context_str}

Instruction: Based on the above documents, provide a detailed answer for the user question below. \
Include references to the specific sections of the documents that support your answer. \
Answer "don't know" if the answer is not present in the documents.
"""
|
|
CHAT_ENGINE_CONDENSE_PROMPT_TEMPLATE = """
Given the following conversation between a user and an AI assistant and a follow-up question from the user,
rephrase the follow-up question to be a standalone question.

Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:
"""
|
|
|
|
@st.cache_resource
def load_model(model_name: str) -> Pipeline:
    """Load a Hugging Face causal-LM checkpoint and wrap it in a text-generation pipeline."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    pipe = pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        do_sample=True,
        top_p=0.95,
        top_k=50,
        temperature=0.7,
    )

    return pipe
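
# Usage sketch (not part of the app flow; "gpt2" is only an assumed example
# checkpoint, not necessarily the model this app is configured with):
#
#   pipe = load_model("gpt2")
#   print(pipe("Hello, world!", max_new_tokens=20)[0]["generated_text"])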
|
|
|
|
class OurLLM(CustomLLM):
    """llama_index CustomLLM wrapper around a local transformers text-generation pipeline."""

    context_window: int = 3900
    num_output: int = 256
    model_name: str = ""
    pipeline: Optional[Pipeline] = None

    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata."""
        return LLMMetadata(
            context_window=CONTEXT_WINDOW,
            num_output=NUM_OUTPUT,
            model_name=self.model_name,
        )

    @llm_completion_callback()
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        prompt_length = len(prompt)
        response = self.pipeline(prompt, max_new_tokens=NUM_OUTPUT)[0]["generated_text"]

        # The pipeline echoes the prompt, so strip it and return only the newly generated text.
        text = response[prompt_length:]
        return CompletionResponse(text=text)

    @llm_completion_callback()
    def stream_complete(self, prompt: str, **kwargs: Any):
        # The pipeline does not stream natively, so generate the full completion
        # once and yield it back word by word.
        full_text = self.pipeline(prompt, max_new_tokens=NUM_OUTPUT)[0]["generated_text"][len(prompt):]
        response = ""
        for token in full_text.split(" "):
            token += " "
            response += token
            yield CompletionResponse(text=response, delta=token)
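
# Sketch of wiring OurLLM into llama_index globals (same assumed example
# checkpoint as above; Settings is imported at the top of this module):
#
#   Settings.llm = OurLLM(model_name="gpt2", pipeline=load_model("gpt2"))
#   Settings.context_window = CONTEXT_WINDOW
#   Settings.num_output = NUM_OUTPUT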
|
|
|
|
class LlamaCustom:
    """Thin wrapper that answers questions against a VectorStoreIndex."""

    def __init__(self, model_name: str, index: VectorStoreIndex):
        self.model_name = model_name
        self.index = index
        self.chat_mode = "condense_plus_context"
        self.memory = ChatMemoryBuffer.from_defaults()
        self.verbose = True

    def get_response(self, query_str: str, chat_history: List[ChatMessage]):
        # Query-engine path: answers from the index only; chat_history is accepted
        # for interface compatibility but not used here.
        query_engine = self.index.as_query_engine(
            text_qa_template=PromptTemplate(QUERY_ENGINE_QA_TEMPLATE),
            refine_template=PromptTemplate(QUERY_ENGINE_REFINE_TEMPLATE),
            verbose=self.verbose,
        )
        response = query_engine.query(query_str)
        return str(response)

    def get_stream_response(self, query_str: str, chat_history: List[ChatMessage]):
        # Simulate streaming by yielding the full response back word by word.
        response = self.get_response(query_str=query_str, chat_history=chat_history)
        for word in response.split():
            yield word + " "
            time.sleep(0.05)
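
# End-to-end sketch (assumptions: a local "./docs" folder with source documents
# and a Streamlit chat UI; st.write_stream consumes the word-by-word generator):
#
#   documents = SimpleDirectoryReader("./docs").load_data()
#   index = VectorStoreIndex.from_documents(documents)
#   bot = LlamaCustom(model_name="gpt2", index=index)
#   st.write_stream(bot.get_stream_response("What are these documents about?", chat_history=[]))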
|
|