ChevalierJoseph
/

typtop4

Text Generation

text-generation-inference

4-bit precision

Model card Files Files and versions

ChevalierJoseph committed on Jul 29, 2025

Commit

267fefd

·

verified ·

1 Parent(s): ee72852

Update handler.py

Files changed (1) hide show

handler.py +21 -6

handler.py CHANGED Viewed

@@ -19,12 +19,27 @@ class EndpointHandler:
         # Load tokenizer and model
         self.tokenizer = AutoTokenizer.from_pretrained(path)
-        self.model = AutoModelForCausalLM.from_pretrained(
-            path,
-            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-            device_map="auto" if torch.cuda.is_available() else None,
-            trust_remote_code=True
-        )
         # Set pad token if it doesn't exist
         if self.tokenizer.pad_token is None:

         # Load tokenizer and model
         self.tokenizer = AutoTokenizer.from_pretrained(path)
+        # Try to load without quantization first
+        try:
+            self.model = AutoModelForCausalLM.from_pretrained(
+                path,
+                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+                device_map="auto" if torch.cuda.is_available() else None,
+                trust_remote_code=True,
+                load_in_8bit=False,
+                load_in_4bit=False
+            )
+        except Exception as e:
+            logger.warning(f"Failed to load without quantization: {e}")
+            # Fallback: try with different settings
+            self.model = AutoModelForCausalLM.from_pretrained(
+                path,
+                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+                device_map="auto" if torch.cuda.is_available() else None,
+                trust_remote_code=True,
+                use_safetensors=True
+            )
         # Set pad token if it doesn't exist
         if self.tokenizer.pad_token is None: