Arcpolar committed on
Commit
98cc856
·
1 Parent(s): b128f46

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -75
app.py CHANGED
@@ -1,104 +1,78 @@
 
1
  from transformers import AutoTokenizer
2
- import transformers
3
- import torch
4
  from transformers import pipeline
 
5
  import gradio as gr
 
 
 
 
6
 
7
- model = "arcpolar/Ubuntu_Llama_Chat_7B" # test
8
- tokenizer = AutoTokenizer.from_pretrained(model)#, use_auth_token=True)
9
-
10
- llama_pipeline = pipeline(
11
- "text-generation", # LLM task
12
- model=model,
13
- torch_dtype=torch.float16,
14
- device_map="auto",
 
 
 
 
 
15
  )
16
 
17
- def get_response(prompt: str) -> None:
18
- """
19
- Generate a response from the Llama model.
20
-
21
- Parameters:
22
- prompt (str): The user's input/question for the model.
23
-
24
- Returns:
25
- None: Prints the model's response.
26
- """
27
- sequences = llama_pipeline(
28
- prompt,
29
- do_sample=True,
30
- top_k=10,
31
- num_return_sequences=1,
32
- eos_token_id=tokenizer.eos_token_id,
33
- max_length=256,
34
- )
35
- print("Chatbot:", sequences[0]['generated_text'])
36
-
37
  SYSTEM_PROMPT = """<s>[INST] <<SYS>>
38
  You are a helpful bot. Your answers are clear and concise.
39
  <</SYS>>
40
 
41
  """
42
-
43
  # Formatting function for message and history
44
- def format_message(message: str, history: list, memory_limit: int = 3) -> str:
45
- """
46
- Formats the message and history for the Llama model.
47
-
48
- Parameters:
49
- message (str): Current message to send.
50
- history (list): Past conversation history.
51
- memory_limit (int): Limit on how many past interactions to consider.
52
-
53
- Returns:
54
- str: Formatted message string
55
- """
56
- # always keep len(history) <= memory_limit
57
  if len(history) > memory_limit:
58
  history = history[-memory_limit:]
59
-
60
  if len(history) == 0:
61
  return SYSTEM_PROMPT + f"{message} [/INST]"
62
-
63
  formatted_message = SYSTEM_PROMPT + f"{history[0][0]} [/INST] {history[0][1]} </s>"
64
-
65
- # Handle conversation history
66
  for user_msg, model_answer in history[1:]:
67
  formatted_message += f"<s>[INST] {user_msg} [/INST] {model_answer} </s>"
68
-
69
- # Handle the current message
70
  formatted_message += f"<s>[INST] {message} [/INST]"
71
-
72
  return formatted_message
73
 
74
- # Generate a response from the Llama model
75
- def get_llama_response(message: str, history: list) -> str:
76
- """
77
- Generates a conversational response from the Llama model.
78
-
79
- Parameters:
80
- message (str): User's input message.
81
- history (list): Past conversation history.
82
-
83
- Returns:
84
- str: Generated response from the Llama model.
85
- """
86
  query = format_message(message, history)
87
- response = ""
88
 
89
- sequences = llama_pipeline(
 
90
  query,
91
- do_sample=True,
92
- top_k=10,
93
- num_return_sequences=1,
94
- eos_token_id=tokenizer.eos_token_id,
95
- max_length=1024,
96
  )
97
 
98
- generated_text = sequences[0]['generated_text']
99
- response = generated_text[len(query):] # Remove the prompt from the output
 
 
 
100
 
101
- print("Chatbot:", response.strip())
102
- return response.strip()
103
 
104
- gr.ChatInterface(get_llama_response).launch()
 
 
1
+ # Import the AutoTokenizer function from the transformers library
2
  from transformers import AutoTokenizer
3
+ # Import the pipeline function from the transformers library
 
4
  from transformers import pipeline
5
+ # Import Gradio
6
  import gradio as gr
7
+ # Import the transformers library
8
+ import transformers
9
+ # Import PyTorch
10
+ import torch
11
 
12
+ # Define Model Name
13
# Hugging Face Hub id of the fine-tuned chat model used by this app
model = "arcpolar/Ubuntu_Llama_Chat_7B"
# Tokenizer matching the model; its EOS token id is used to stop generation
tokenizer = AutoTokenizer.from_pretrained(model)

# Llama pipeline learned from Ograbek, K. YouTube video and Colab notebook
# Code from https://colab.research.google.com/drive/1SSv6lzX3Byu50PooYogmiwHqf5PQN68E
# Build the text-generation pipeline around Ubuntu_Llama_Chat_7B
Ubuntu_Llama_Chat_pipeline = pipeline(
    task="text-generation",     # the pipeline task
    model=model,                # model id defined above
    torch_dtype=torch.float16,  # request float16 weights
    device_map="auto",          # let the library pick device placement
)
26
 
27
+ # Format Message and System Prompt learned from Ograbek, K. YouTube video and Colab notebook
28
+ # Code from https://colab.research.google.com/drive/1SSv6lzX3Byu50PooYogmiwHqf5PQN68E
29
+ # Define the initial prompt for the Llama 2 model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
SYSTEM_PROMPT = """<s>[INST] <<SYS>>
You are a helpful bot. Your answers are clear and concise.
<</SYS>>

"""


# Formatting function for message and history
def format_message(message: str, history: list, memory_limit: int = 5) -> str:
    """Build the chat prompt from the system prompt, past turns and a new message.

    Args:
        message: The current user message.
        history: Past turns, oldest first; each item is a pair of
            (user message, model answer). ``None`` is treated as empty.
        memory_limit: Maximum number of most recent turns to include.
            Zero or a negative value means no history is included.

    Returns:
        The prompt string: ``SYSTEM_PROMPT``, then each kept turn wrapped in
        ``[INST] ... [/INST] answer </s>`` markers, then the current message
        left open with a trailing ``[/INST]``.
    """
    # A bare history[-memory_limit:] is wrong for memory_limit == 0 because
    # history[-0:] is the whole list, so non-positive limits are handled
    # explicitly as "keep nothing".
    recent = history[-memory_limit:] if history and memory_limit > 0 else []

    # No usable history: the system prompt is followed directly by the message.
    if not recent:
        return SYSTEM_PROMPT + f"{message} [/INST]"

    # The oldest kept turn shares the opening "<s>[INST]" with SYSTEM_PROMPT.
    parts = [SYSTEM_PROMPT + f"{recent[0][0]} [/INST] {recent[0][1]} </s>"]

    # Every later turn opens its own "<s>[INST]" segment.
    parts.extend(
        f"<s>[INST] {user_msg} [/INST] {model_answer} </s>"
        for user_msg, model_answer in recent[1:]
    )

    # The current message is left open for the model to answer.
    parts.append(f"<s>[INST] {message} [/INST]")
    return "".join(parts)
52
 
53
+ # Generate response learned from Ograbek, K. YouTube video and Colab notebook
54
+ # Code from https://colab.research.google.com/drive/1SSv6lzX3Byu50PooYogmiwHqf5PQN68E
55
def get_response(message: str, history: list) -> str:
    """Produce the chatbot's reply to ``message`` given the chat ``history``.

    The prompt is built with ``format_message``, passed to
    ``Ubuntu_Llama_Chat_pipeline`` with sampling enabled, and the prompt
    prefix is cut off the generated text before the reply is returned.

    Args:
        message: The user's current message.
        history: Past conversation turns, passed through to ``format_message``.

    Returns:
        The generated reply with the prompt removed and surrounding
        whitespace stripped. The reply is also printed to stdout.
    """
    prompt = format_message(message, history)

    # Generation settings, kept exactly as configured for this app.
    generation_options = {
        "do_sample": True,                       # sample instead of greedy decoding
        "top_k": 10,                             # sample from the 10 most likely tokens
        "num_return_sequences": 1,               # a single candidate is enough
        "eos_token_id": tokenizer.eos_token_id,  # stop at the tokenizer's EOS token
        "max_length": 1024,                      # length cap passed to generation
    }
    outputs = Ubuntu_Llama_Chat_pipeline(prompt, **generation_options)

    # The generated text starts with the prompt; keep only what follows it.
    full_text = outputs[0]["generated_text"]
    reply = full_text[len(prompt):].strip()

    print("Chatbot:", reply)
    return reply
 
76
 
77
+ # Launch a chat interface using the `get_response` function
78
+ gr.ChatInterface(get_response).launch()