| import json |
| import requests |
| import time |
|
|
# Chat-completions endpoint that every test request is posted to.
API_URL = "http://localhost:8000/v1/chat/completions"

# Headers sent with every request: a JSON body plus a bearer token.
# NOTE(review): "0" looks like a placeholder credential -- confirm the server
# does not validate it.
HEADERS = {
    "Content-Type": "application/json",
    "Authorization": "Bearer 0",
}
|
|
def run_test(prompt: str, max_tokens: int = 500) -> None:
    """Send ``prompt`` to the chat endpoint and print the streamed reply.

    Posts a streaming chat-completions request to ``API_URL`` with ``HEADERS``,
    echoes each content token to stdout as it arrives, and finally prints the
    ``repr`` of the concatenated reply.

    Args:
        prompt: Text sent as the user message.
        max_tokens: Generation limit placed in the request payload. The
            payload carries this value only; it is not overridden by a
            hard-coded limit.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    payload = {
        "model": "custom-model",
        "messages": [
            {"role": "system", "content": "Answer the user question about Markie Voss."},
            {"role": "user", "content": prompt},
        ],
        # Set exactly once: a duplicate key in a dict literal silently
        # replaces the earlier value, which would discard the argument.
        "max_tokens": max_tokens,
        "do_sample": True,
        "temperature": 0.6,
        "top_p": 0.8,
        # NOTE(review): presumably tokenizer-specific stop-token ids --
        # confirm against the model actually being served.
        "eos_token_id": [151645, 151643, 151668],
        "enable_thinking": True,
        "stream": True,
    }

    print("=" * 80)
    print("Prompt:", prompt)
    print("Streaming response:\n")

    pieces = []
    with requests.post(
        API_URL,
        headers=HEADERS,
        json=payload,
        stream=True,
        timeout=60,
    ) as r:
        print("HTTP status:", r.status_code)
        r.raise_for_status()

        # Event streams are UTF-8. Without this, requests falls back to
        # ISO-8859-1 for text/* responses that declare no charset, which
        # garbles every non-ASCII token.
        r.encoding = "utf-8"

        for line in r.iter_lines(decode_unicode=True):
            # Skip keep-alive blanks and any non-data fields.
            if not line or not line.startswith("data:"):
                continue

            data = line[len("data:"):].strip()
            if data == "[DONE]":
                break

            try:
                chunk = json.loads(data)
            except json.JSONDecodeError:
                # Best effort: ignore payloads that are not valid JSON.
                continue

            # Some chunks carry no choices at all; tolerate them instead of
            # raising.
            choices = chunk.get("choices") if isinstance(chunk, dict) else None
            if not choices:
                continue

            # "content" may be absent or null on some deltas.
            token = (choices[0].get("delta") or {}).get("content")
            if not token:
                continue

            pieces.append(token)
            print(token, end="", flush=True)

    full_text = "".join(pieces)
    print("\n\n--- END OF STREAM ---")
    print("✅ Full content repr:", repr(full_text))
|
|
|
|
if __name__ == "__main__":
    print("Warming up...")
    time.sleep(1)

    # Interactive loop: read a prompt, stream the reply, repeat.
    # Ctrl-D / Ctrl-C at the prompt ends the session without a traceback.
    while True:
        try:
            p = input("User: ")
        except (EOFError, KeyboardInterrupt):
            print()
            break

        # Nothing to ask: do not send an empty user message.
        if not p.strip():
            continue

        try:
            run_test(p)
        except requests.RequestException as exc:
            # A failed request should not end the whole session.
            print(f"\nRequest failed: {exc}")
        except KeyboardInterrupt:
            # Ctrl-C while streaming abandons this reply only.
            print("\n[interrupted]")
|
|