An NF4 version converted from Jackrong's work by g023 for the triattention example (https://github.com/g023/triattention_nf4). Original citation:
@misc{jackrong_qwen35_9b_v3,
title = {Jackrong/Qwopus3.5-9B-v3},
author = {Jackrong},
year = {2026},
publisher = {Hugging Face},
howpublished = {\url{https://huggingface.co/Jackrong/Qwopus3.5-9B-v3}}
}
Here is how to run it from a local folder:
"""
Test the NF4 model.
"""
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
MODEL_PATH = "./qwop-v3-9b-nf4/" # Path to the NF4 model directory
# Generation hyperparameters forwarded verbatim to model.generate() below.
MAX_NEW_TOKENS = 8192  # upper bound on newly generated tokens per call
TEMPERATURE = 0.7  # softmax temperature; only effective when DO_SAMPLE is True
DO_SAMPLE = True  # sample from the distribution instead of greedy decoding
TOP_P = 0.9  # nucleus-sampling cumulative-probability cutoff
TOP_K = 50  # restrict sampling to the 50 most likely tokens
REPETITION_PENALTY = 1.0  # 1.0 means no repetition penalty applied
class CapturingTextStreamer(TextStreamer):
    """A ``TextStreamer`` that prints tokens as they arrive AND accumulates
    the decoded text in ``self.generated_text`` so the caller can retrieve
    the full completion after ``generate()`` returns.

    Fix: the original ``put`` override ignored the base class's
    ``skip_prompt`` machinery, so the prompt was echoed and captured even
    when ``skip_prompt=True`` was requested (as it is at the call site).
    The prompt-skipping check is reinstated here, mirroring
    ``TextStreamer.put``.
    """

    def __init__(self, tokenizer, **kwargs):
        super().__init__(tokenizer, **kwargs)
        self.generated_text = ""  # everything decoded so far

    def put(self, value):
        # Honor skip_prompt: the first chunk generate() pushes is the prompt.
        # TextStreamer.__init__ sets both attributes used here.
        if self.skip_prompt and self.next_tokens_are_prompt:
            self.next_tokens_are_prompt = False
            return
        # The prompt chunk arrives with a batch dimension; use the first row.
        if len(value.shape) > 1:
            value = value[0]
        skip_special = self.decode_kwargs.get('skip_special_tokens', False)
        decoded = self.tokenizer.decode(value, skip_special_tokens=skip_special)
        # Print incrementally, then capture the same text.
        print(decoded, end="", flush=True)
        self.generated_text += decoded
# NOTE(review): redundant — shadowed by the identical assignment inside the
# __main__ block below; kept so any importer of this module still sees it.
messages = [{"role": "user", "content": "What is the capital of France?"}]
def load_model():
    """Load the NF4-quantized causal LM and its tokenizer from MODEL_PATH.

    Returns:
        A ``(model, tokenizer)`` pair ready for generation.
    """
    print("Loading model...")
    # device_map="auto" was deliberately left disabled by the author.
    lm = AutoModelForCausalLM.from_pretrained(MODEL_PATH)
    tok = AutoTokenizer.from_pretrained(MODEL_PATH)
    print("Model loaded.")
    return lm, tok
def inference_non_streaming(model, tokenizer, messages):
    """Generate a reply for ``messages`` and return the decoded completion.

    Applies the chat template (with thinking enabled), runs ``generate`` with
    the module-level sampling constants, and decodes only the newly
    generated tokens.
    """
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True, enable_thinking=True
    )
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    generated = model.generate(
        **encoded,
        max_new_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        do_sample=DO_SAMPLE,
        top_p=TOP_P,
        top_k=TOP_K,
        repetition_penalty=REPETITION_PENALTY,
    )
    # Slice off the prompt tokens so only the completion is decoded.
    prompt_len = encoded["input_ids"].shape[1]
    response = tokenizer.decode(generated[0][prompt_len:], skip_special_tokens=True)
    print("Response:", response)
    return response
def inference_streaming(model, tokenizer, messages):
    """Generate a reply for ``messages``, streaming tokens to stdout.

    The streamer echoes tokens as they are produced and also records them,
    so the full completion is returned after generation finishes.
    """
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True, enable_thinking=True
    )
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Capture while printing; skip the prompt and special tokens.
    capture = CapturingTextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    model.generate(
        **encoded,
        max_new_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        do_sample=DO_SAMPLE,
        top_p=TOP_P,
        top_k=TOP_K,
        repetition_penalty=REPETITION_PENALTY,
        streamer=capture,
    )
    return capture.generated_text
if __name__ == "__main__":
    # Single-turn demo prompt.
    messages = [{"role": "user", "content": "What is the capital of France?"}]
    model, tokenizer = load_model()
    # Exercise the streaming path and report how much text was captured.
    print("Testing streaming inference...")
    result = inference_streaming(model, tokenizer, messages)
    print(f"\nCaptured text length: {len(result)}")
- Downloads last month
- 44
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support