Fix: load LoRA adapter via PEFT on top of Qwen3.5-9B base
app.py CHANGED

@@ -210,17 +210,27 @@ model_load_error: Optional[str] = None
 
 try:
     import torch
+    from peft import PeftModel
     from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 
+    # The repo contains a LoRA adapter — read base model from adapter_config
+    from huggingface_hub import hf_hub_download
+    import json as _json
+    _adapter_cfg_path = hf_hub_download(MODEL_PATH, "adapter_config.json")
+    _adapter_cfg = _json.loads(open(_adapter_cfg_path).read())
+    BASE_MODEL_PATH = _adapter_cfg.get("base_model_name_or_path", MODEL_PATH)
+    print(f"LoRA adapter detected. Base model: {BASE_MODEL_PATH}")
+
     print(f"Loading tokenizer from: {MODEL_PATH}")
     _tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
     if _tokenizer.pad_token is None:
         _tokenizer.pad_token = _tokenizer.eos_token
 
-    print(f"Loading
+    print(f"Loading base model: {BASE_MODEL_PATH}")
     bnb_available = importlib.util.find_spec("bitsandbytes") is not None
+    cuda_available = torch.cuda.is_available()
 
-    if bnb_available and
+    if bnb_available and cuda_available:
         from transformers import BitsAndBytesConfig
         bnb_config = BitsAndBytesConfig(
             load_in_4bit=True,
@@ -228,23 +238,27 @@ try:
             bnb_4bit_compute_dtype=torch.bfloat16,
             bnb_4bit_use_double_quant=True,
         )
-        _model = AutoModelForCausalLM.from_pretrained(
-            MODEL_PATH,
+        _base = AutoModelForCausalLM.from_pretrained(
+            BASE_MODEL_PATH,
             quantization_config=bnb_config,
             device_map="auto",
             trust_remote_code=True,
         )
-        print("
+        print(" Base loaded with 4-bit NF4 quantization")
     else:
-        cuda_available = torch.cuda.is_available()
         dtype = torch.float16 if cuda_available else torch.float32
-        _model = AutoModelForCausalLM.from_pretrained(
-            MODEL_PATH,
+        _base = AutoModelForCausalLM.from_pretrained(
+            BASE_MODEL_PATH,
             torch_dtype=dtype,
             device_map="auto" if cuda_available else None,
             trust_remote_code=True,
         )
-        print(f"
+        print(f" Base loaded in {'fp16 (GPU)' if cuda_available else 'fp32 (CPU)'}")
+
+    print(f"Applying LoRA adapter from: {MODEL_PATH}")
+    _model = PeftModel.from_pretrained(_base, MODEL_PATH)
+    _model.eval()
+    print(" LoRA adapter applied")
 
     _pipe = pipeline(
         "text-generation",