Spaces:

artificialguybr
/

Qwen3.6-27B-zero

Running on Zero

artificialguybr committed 15 days ago

Commit

99a0cab

verified ·

1 Parent(s): e8e5451

Switch to text-only causal LM loading

Files changed (1) hide show

app.py CHANGED Viewed

@@ -5,7 +5,7 @@ import gradio as gr
 import spaces
 import torch
 from transformers import (
-    AutoModelForImageTextToText,
     AutoTokenizer,
     BitsAndBytesConfig,
     TextIteratorStreamer,
@@ -29,6 +29,7 @@ PLACEHOLDER = (
 MAX_INPUT_TOKENS = 16384
 DEFAULT_MAX_NEW_TOKENS = 4096
 MAX_NEW_TOKENS = 8192
 os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
 torch.backends.cuda.matmul.allow_tf32 = True
@@ -40,21 +41,27 @@ BNB_CONFIG = BitsAndBytesConfig(
     bnb_4bit_compute_dtype=torch.bfloat16,
 )
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
 if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token
-model = AutoModelForImageTextToText.from_pretrained(
     MODEL_ID,
     trust_remote_code=True,
-    device_map="auto",
-    torch_dtype=torch.bfloat16,
     quantization_config=BNB_CONFIG,
     attn_implementation="sdpa",
 )
 model.eval()
 def estimate_duration(
     message,
     history,
@@ -113,7 +120,7 @@ def stream_chat(
         return_tensors="pt",
         truncation=True,
         max_length=MAX_INPUT_TOKENS,
-    ).to(model.device)
     streamer = TextIteratorStreamer(
         tokenizer,

 import spaces
 import torch
 from transformers import (
+    AutoModelForCausalLM,
     AutoTokenizer,
     BitsAndBytesConfig,
     TextIteratorStreamer,
 MAX_INPUT_TOKENS = 16384
 DEFAULT_MAX_NEW_TOKENS = 4096
 MAX_NEW_TOKENS = 8192
+HF_TOKEN = os.environ.get("HF_TOKEN")
 os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
 torch.backends.cuda.matmul.allow_tf32 = True
     bnb_4bit_compute_dtype=torch.bfloat16,
 )
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, token=HF_TOKEN)
 if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token
+model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID,
     trust_remote_code=True,
+    token=HF_TOKEN,
+    device_map={"": 0},
+    dtype=torch.bfloat16,
     quantization_config=BNB_CONFIG,
     attn_implementation="sdpa",
+    low_cpu_mem_usage=True,
 )
 model.eval()
+def model_input_device():
+    return next(model.parameters()).device
 def estimate_duration(
     message,
     history,
         return_tensors="pt",
         truncation=True,
         max_length=MAX_INPUT_TOKENS,
+    ).to(model_input_device())
     streamer = TextIteratorStreamer(
         tokenizer,