Fix: load LoRA adapter via PEFT on top of Qwen3.5-9B base
app.py CHANGED

@@ -210,17 +210,27 @@ model_load_error: Optional[str] = None
 
 try:
     import torch
+    from peft import PeftModel
     from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 
+    # The repo contains a LoRA adapter — read base model from adapter_config
+    from huggingface_hub import hf_hub_download
+    import json as _json
+    _adapter_cfg_path = hf_hub_download(MODEL_PATH, "adapter_config.json")
+    _adapter_cfg = _json.loads(open(_adapter_cfg_path).read())
+    BASE_MODEL_PATH = _adapter_cfg.get("base_model_name_or_path", MODEL_PATH)
+    print(f"LoRA adapter detected. Base model: {BASE_MODEL_PATH}")
+
     print(f"Loading tokenizer from: {MODEL_PATH}")
     _tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
     if _tokenizer.pad_token is None:
         _tokenizer.pad_token = _tokenizer.eos_token
 
-    print(f"Loading
+    print(f"Loading base model: {BASE_MODEL_PATH}")
     bnb_available = importlib.util.find_spec("bitsandbytes") is not None
+    cuda_available = torch.cuda.is_available()
 
-    if bnb_available and
+    if bnb_available and cuda_available:
         from transformers import BitsAndBytesConfig
         bnb_config = BitsAndBytesConfig(
             load_in_4bit=True,
@@ -228,23 +238,27 @@ try:
             bnb_4bit_compute_dtype=torch.bfloat16,
             bnb_4bit_use_double_quant=True,
         )
-        _model = AutoModelForCausalLM.from_pretrained(
-            MODEL_PATH,
+        _base = AutoModelForCausalLM.from_pretrained(
+            BASE_MODEL_PATH,
             quantization_config=bnb_config,
             device_map="auto",
             trust_remote_code=True,
         )
-        print("
+        print(" Base loaded with 4-bit NF4 quantization")
     else:
-        cuda_available = torch.cuda.is_available()
         dtype = torch.float16 if cuda_available else torch.float32
-        _model = AutoModelForCausalLM.from_pretrained(
-            MODEL_PATH,
+        _base = AutoModelForCausalLM.from_pretrained(
+            BASE_MODEL_PATH,
             torch_dtype=dtype,
             device_map="auto" if cuda_available else None,
             trust_remote_code=True,
         )
-        print(f"
+        print(f" Base loaded in {'fp16 (GPU)' if cuda_available else 'fp32 (CPU)'}")
+
+    print(f"Applying LoRA adapter from: {MODEL_PATH}")
+    _model = PeftModel.from_pretrained(_base, MODEL_PATH)
+    _model.eval()
+    print(" LoRA adapter applied")
 
     _pipe = pipeline(
         "text-generation",