Eeppa committed on
Commit
006c9c1
·
verified ·
1 Parent(s): d9d2c9f

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -0
app.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from llama_cpp import Llama
3
+
4
# Load the quantized model we built (a GGUF file in the working directory).
# The thread count matters a lot for inference speed on CPU, so it is set
# explicitly here rather than left to the library default.
llm = Llama(
    model_path="./my_custom_model_q4.gguf",
    n_ctx=2048,
    n_threads=2,
)
7
+
8
def generate_text(prompt):
    """Stream the model's reply to ``prompt``.

    The prompt is wrapped in a ``User: ...`` / ``AI:`` template and passed to
    the module-level ``llm`` with streaming enabled, a 128-token cap, and
    ``"User:"`` as the stop sequence.

    Yields:
        The reply text accumulated so far, once per streamed chunk. Each
        yielded value is the whole reply up to that point, not just the
        newly generated piece.
    """
    full_prompt = f"User: {prompt}\nAI:"
    chunks = llm(full_prompt, max_tokens=128, stop=["User:"], stream=True)

    reply = ""
    for chunk in chunks:
        # Each streamed chunk carries its new text under choices[0]['text'].
        first_choice = chunk['choices'][0]
        reply = reply + first_choice['text']
        yield reply
14
+
15
# Wire the streaming generator into a minimal text-in / text-out Gradio UI
# and start serving it as soon as this script runs.
demo = gr.Interface(
    fn=generate_text,
    inputs="text",
    outputs="text",
)
demo.launch()