Abrar55 committed
Commit 950e401 · verified · 1 Parent(s): 5eb6967

Fix: load LoRA adapter via PEFT on top of Qwen3.5-9B base

Files changed (1)
  1. app.py +23 -9
app.py CHANGED

@@ -210,17 +210,27 @@ model_load_error: Optional[str] = None
 
 try:
     import torch
+    from peft import PeftModel
     from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 
+    # The repo contains a LoRA adapter — read base model from adapter_config
+    from huggingface_hub import hf_hub_download
+    import json as _json
+    _adapter_cfg_path = hf_hub_download(MODEL_PATH, "adapter_config.json")
+    _adapter_cfg = _json.loads(open(_adapter_cfg_path).read())
+    BASE_MODEL_PATH = _adapter_cfg.get("base_model_name_or_path", MODEL_PATH)
+    print(f"LoRA adapter detected. Base model: {BASE_MODEL_PATH}")
+
     print(f"Loading tokenizer from: {MODEL_PATH}")
     _tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
     if _tokenizer.pad_token is None:
         _tokenizer.pad_token = _tokenizer.eos_token
 
-    print(f"Loading model from: {MODEL_PATH}")
+    print(f"Loading base model: {BASE_MODEL_PATH}")
     bnb_available = importlib.util.find_spec("bitsandbytes") is not None
+    cuda_available = torch.cuda.is_available()
 
-    if bnb_available and torch.cuda.is_available():
+    if bnb_available and cuda_available:
         from transformers import BitsAndBytesConfig
         bnb_config = BitsAndBytesConfig(
             load_in_4bit=True,
@@ -228,23 +238,27 @@ try:
             bnb_4bit_compute_dtype=torch.bfloat16,
             bnb_4bit_use_double_quant=True,
         )
-        _model = AutoModelForCausalLM.from_pretrained(
-            MODEL_PATH,
+        _base = AutoModelForCausalLM.from_pretrained(
+            BASE_MODEL_PATH,
             quantization_config=bnb_config,
             device_map="auto",
             trust_remote_code=True,
         )
-        print(" Loaded with 4-bit NF4 quantization")
+        print(" Base loaded with 4-bit NF4 quantization")
     else:
-        cuda_available = torch.cuda.is_available()
         dtype = torch.float16 if cuda_available else torch.float32
-        _model = AutoModelForCausalLM.from_pretrained(
-            MODEL_PATH,
+        _base = AutoModelForCausalLM.from_pretrained(
+            BASE_MODEL_PATH,
             torch_dtype=dtype,
             device_map="auto" if cuda_available else None,
             trust_remote_code=True,
         )
-        print(f" Loaded in {'fp16 (GPU)' if cuda_available else 'fp32 (CPU)'}")
+        print(f" Base loaded in {'fp16 (GPU)' if cuda_available else 'fp32 (CPU)'}")
+
+    print(f"Applying LoRA adapter from: {MODEL_PATH}")
+    _model = PeftModel.from_pretrained(_base, MODEL_PATH)
+    _model.eval()
+    print(" LoRA adapter applied")
 
     _pipe = pipeline(
         "text-generation",