An NF4-quantized version of Jackrong's model, converted by g023 for the triattention example (https://github.com/g023/triattention_nf4). Original model citation:

@misc{jackrong_qwen35_9b_v3,
  title        = {Jackrong/Qwopus3.5-9B-v3},
  author       = {Jackrong},
  year         = {2026},
  publisher    = {Hugging Face},
  howpublished = {\url{https://huggingface.co/Jackrong/Qwopus3.5-9B-v3}}
}
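
For reference, an NF4 checkpoint like this one is typically produced by loading the original weights through bitsandbytes 4-bit quantization and saving them back to disk. A minimal sketch of such a conversion; the exact settings g023 used are an assumption, not documented here:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Hypothetical reconversion -- the values below are common NF4 defaults, not confirmed settings.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # quantize linear weights to 4 bits on load
    bnb_4bit_quant_type="nf4",              # NormalFloat4, the data type named in the title
    bnb_4bit_compute_dtype=torch.bfloat16,  # compute dtype for matmuls (assumption)
    bnb_4bit_use_double_quant=True,         # also quantize the quantization constants (assumption)
)

model = AutoModelForCausalLM.from_pretrained(
    "Jackrong/Qwopus3.5-9B-v3",
    quantization_config=bnb_config,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("Jackrong/Qwopus3.5-9B-v3")

# Recent transformers/bitsandbytes releases can serialize 4-bit weights directly.
model.save_pretrained("./qwop-v3-9b-nf4/")
tokenizer.save_pretrained("./qwop-v3-9b-nf4/")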

HERE IS HOW TO PLAY WITH IT IN A LOCAL FOLDER:
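
(Dependency note, an assumption based on how bitsandbytes NF4 checkpoints are normally loaded: you will need torch, transformers, and bitsandbytes installed, e.g. pip install torch transformers bitsandbytes accelerate. accelerate is only needed if you enable device_map="auto".)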

"""
Test the NF4 model.
"""

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

MODEL_PATH = "./qwop-v3-9b-nf4/"  # path to the local NF4 model directory
MAX_NEW_TOKENS = 8192
TEMPERATURE = 0.7
DO_SAMPLE = True                  # sample instead of greedy decoding
TOP_P = 0.9                       # nucleus sampling cutoff
TOP_K = 50
REPETITION_PENALTY = 1.0          # 1.0 disables the penalty

class CapturingTextStreamer(TextStreamer):
    """TextStreamer that prints tokens as they arrive and also keeps a copy."""

    def __init__(self, tokenizer, **kwargs):
        super().__init__(tokenizer, **kwargs)
        self.generated_text = ""

    def put(self, value):
        # Overriding put() bypasses TextStreamer's built-in logic, so honor
        # skip_prompt manually: the first chunk generate() pushes is the prompt.
        if self.skip_prompt and self.next_tokens_are_prompt:
            self.next_tokens_are_prompt = False
            return
        # generate() hands over a batch; take the first (only) sequence.
        if len(value.shape) > 1:
            value = value[0]
        skip_special = self.decode_kwargs.get("skip_special_tokens", False)
        decoded = self.tokenizer.decode(value, skip_special_tokens=skip_special)
        # Print incrementally and capture the same text.
        print(decoded, end="", flush=True)
        self.generated_text += decoded


def load_model():
    print("Loading model...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        # device_map="auto",  # uncomment to auto-place layers across devices (requires accelerate)
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    print("Model loaded.")
    return model, tokenizer

def inference_non_streaming(model, tokenizer, messages):
    # enable_thinking is a Qwen-style chat-template kwarg; templates that don't define it ignore it.
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking=True)
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        do_sample=DO_SAMPLE,
        top_p=TOP_P,
        top_k=TOP_K,
        repetition_penalty=REPETITION_PENALTY,
    )
    # Decode only the newly generated tokens, dropping the echoed prompt.
    response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
    print("Response:", response)
    return response

def inference_streaming(model, tokenizer, messages):
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking=True)
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    streamer = CapturingTextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    model.generate(  # generated ids aren't needed; the streamer captures the text
        **inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        do_sample=DO_SAMPLE,
        top_p=TOP_P,
        top_k=TOP_K,
        repetition_penalty=REPETITION_PENALTY,
        streamer=streamer,
    )

    return streamer.generated_text


if __name__ == "__main__":
    messages = [{"role": "user", "content": "What is the capital of France?"}]

    # load
    model, tokenizer = load_model()

    # test streaming inference
    print("Testing streaming inference...")
    result = inference_streaming(model, tokenizer, messages)
    print(f"\nCaptured text length: {len(result)}")