ashirato commited on
Commit
2686e51
·
verified ·
1 Parent(s): 03211d1

fix: 14B + bnb int4 (AWQ build failed; bnb proven, no compile)

Browse files
Files changed (1) hide show
  1. app.py +10 -3
app.py CHANGED
@@ -13,7 +13,7 @@ import gradio as gr
13
  import spaces
14
  import torch
15
 
16
- BASE_MODEL = os.environ.get("BASE_MODEL", "Qwen/Qwen2.5-Coder-32B-Instruct-AWQ")
17
  HF_TOKEN = os.environ.get("HF_TOKEN", "")
18
 
19
  SYSTEM = (
@@ -36,10 +36,17 @@ def _load_lazy():
36
  BASE_MODEL, token=HF_TOKEN or None, trust_remote_code=True)
37
  if _tok.pad_token_id is None:
38
  _tok.pad_token_id = _tok.eos_token_id
39
- # AWQ pre-quantized — no quant config needed at load time
 
 
 
 
 
 
 
40
  _model = AutoModelForCausalLM.from_pretrained(
41
  BASE_MODEL, token=HF_TOKEN or None, trust_remote_code=True,
42
- device_map="cuda", torch_dtype=torch.bfloat16)
43
  return _model, _tok
44
 
45
 
 
13
  import spaces
14
  import torch
15
 
16
+ BASE_MODEL = os.environ.get("BASE_MODEL", "Qwen/Qwen2.5-Coder-14B-Instruct")
17
  HF_TOKEN = os.environ.get("HF_TOKEN", "")
18
 
19
  SYSTEM = (
 
36
  BASE_MODEL, token=HF_TOKEN or None, trust_remote_code=True)
37
  if _tok.pad_token_id is None:
38
  _tok.pad_token_id = _tok.eos_token_id
39
+ # bitsandbytes 4-bit — proven, no compile (AWQ build was failing)
40
+ from transformers import BitsAndBytesConfig
41
+ bnb = BitsAndBytesConfig(
42
+ load_in_4bit=True,
43
+ bnb_4bit_compute_dtype=torch.bfloat16,
44
+ bnb_4bit_quant_type="nf4",
45
+ bnb_4bit_use_double_quant=True,
46
+ )
47
  _model = AutoModelForCausalLM.from_pretrained(
48
  BASE_MODEL, token=HF_TOKEN or None, trust_remote_code=True,
49
+ device_map="cuda", quantization_config=bnb)
50
  return _model, _tok
51
 
52