Qwen3-Reranker-0.6B

@@ -1,4 +1,5 @@
 import os
 from typing import Any, Dict, List
 import torch
@@ -7,34 +8,75 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 class EndpointHandler:
     def __init__(self, path: str = ""):
         # Pre-change side of the diff: lines marked "-" are the ones this change removed.
         # Model directory: the explicit `path`, else $HF_MODEL_DIR, else the current directory.
-        model_dir = path if path else os.getenv("HF_MODEL_DIR", ".")
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         # Half precision when running on CUDA, full precision on CPU.
-        self.dtype = torch.float16 if self.device == "cuda" else torch.float32
         # Tokenizer is configured to pad on the left.
         self.tokenizer = AutoTokenizer.from_pretrained(
-            model_dir,
             padding_side="left",
             trust_remote_code=True,
         )
         # Model is moved to the selected device and switched to eval mode.
         self.model = AutoModelForCausalLM.from_pretrained(
-            model_dir,
-            dtype=self.dtype,
             trust_remote_code=True,
         ).to(self.device).eval()
         # Vocabulary ids of the literal tokens "no" and "yes" (no leading space).
         # NOTE(review): presumably the two answer tokens compared at scoring time;
         # the scoring code is outside this view - confirm.
-        self.token_false_id = self.tokenizer.convert_tokens_to_ids("no")
-        self.token_true_id = self.tokenizer.convert_tokens_to_ids("yes")
         self.max_length = 8192
         # Chat-template text placed before every formatted pair: system turn, then the
         # opening of the user turn.
         self.prefix = (
-            '<|im_start|>system\n'
             'Judge whether the Document meets the requirements based on the Query '
             'and the Instruct provided. Note that the answer can only be "yes" or "no".'
-            '<|im_end|>\n'
-            '<|im_start|>user\n'
         )
         # Text placed after every pair: closes the user turn and opens the assistant
         # turn with an empty <think> block.
-        self.suffix = '<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n'
         # Prefix and suffix are tokenized once here, without special tokens added.
         self.prefix_tokens = self.tokenizer.encode(self.prefix, add_special_tokens=False)
         self.suffix_tokens = self.tokenizer.encode(self.suffix, add_special_tokens=False)
@@ -43,27 +85,21 @@ class EndpointHandler:
         return f"<Instruct>: {instruction}\n<Query>: {query}\n<Document>: {document}"
     def _process_inputs(self, pairs: List[str]) -> Dict[str, torch.Tensor]:
-        # 1. First, encode the text and handle truncation properly
         inputs = self.tokenizer(
             pairs,
             padding=False,
-            truncation=True, # Change from "longest_first" to True for simpler logic
             return_attention_mask=False,
-            # Subtract the length of your prefix and suffix from the limit
             max_length=self.max_length - len(self.prefix_tokens) - len(self.suffix_tokens),
         )
-        # 2. Manually prepend/append your special tokens
         for i, ids in enumerate(inputs["input_ids"]):
             inputs["input_ids"][i] = self.prefix_tokens + ids + self.suffix_tokens
-        # 3. FIX: When padding, use 'max_length' if you want a fixed size,
-        # or just padding=True to pad to the longest in the batch.
         padded = self.tokenizer.pad(
             inputs,
-            padding=True, # This will pad to the longest sequence in the current batch
             return_tensors="pt",
-            # Remove max_length here to stop the warning
         )
         for k in padded:

 import os
+from pathlib import Path
 from typing import Any, Dict, List
 import torch
 class EndpointHandler:
     def __init__(self, path: str = ""):
         """Load the reranker tokenizer and model from a local directory.

         Args:
             path: Model directory. When empty, the HF_MODEL_DIR environment
                 variable is used, and failing that the current directory.

         Raises:
             FileNotFoundError: The directory is missing, has no config.json,
                 or contains no recognised weight file.
             ValueError: "yes" or "no" is not a token in the tokenizer's
                 vocabulary.
         """
         model_dir = Path(path or os.getenv("HF_MODEL_DIR", ".")).resolve()
         # is_dir() rather than exists(): a plain file here would otherwise
         # crash in iterdir() below with a less helpful error.
         if not model_dir.is_dir():
             raise FileNotFoundError(f"Model directory does not exist: {model_dir}")

         # Helpful debug info in endpoint logs
         print(f"[handler] loading model from: {model_dir}")
         print(f"[handler] files: {[p.name for p in model_dir.iterdir()]}")

         # Every file listed here must be present.
         required_files = [
             "config.json",
         ]
         missing_required = [f for f in required_files if not (model_dir / f).exists()]
         if missing_required:
             raise FileNotFoundError(
                 f"Missing required model files in {model_dir}: {missing_required}"
             )

         # At least one weight layout (single file or sharded) must be present.
         has_weights = (
             (model_dir / "model.safetensors").exists()
             or (model_dir / "pytorch_model.bin").exists()
             or any(model_dir.glob("model-*.safetensors"))
             or any(model_dir.glob("pytorch_model-*.bin"))
         )
         if not has_weights:
             raise FileNotFoundError(
                 f"No model weight file found in {model_dir}. "
                 f"Expected model.safetensors, pytorch_model.bin, or sharded weights."
             )

         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         # Half precision on GPU, full precision on CPU.
         self.torch_dtype = torch.float16 if self.device == "cuda" else torch.float32

         # local_files_only=True keeps loading restricted to the validated directory.
         self.tokenizer = AutoTokenizer.from_pretrained(
             str(model_dir),
             padding_side="left",
             trust_remote_code=True,
             local_files_only=True,
         )
         self.model = AutoModelForCausalLM.from_pretrained(
             str(model_dir),
             torch_dtype=self.torch_dtype,
             trust_remote_code=True,
             local_files_only=True,
         ).to(self.device).eval()

         # The prompt suffix ends with "\n\n", so the answer token that follows
         # it carries no leading space. Look up the bare "yes"/"no" tokens; the
         # space-prefixed variants are different vocabulary entries and would
         # read the wrong logits.
         yes_id = self.tokenizer.convert_tokens_to_ids("yes")
         no_id = self.tokenizer.convert_tokens_to_ids("no")
         unk_id = self.tokenizer.unk_token_id
         for label, token_id in (("yes", yes_id), ("no", no_id)):
             # A missing token comes back as None or as the unknown-token id.
             if token_id is None or token_id == unk_id:
                 raise ValueError(
                     f'Token "{label}" is not in the tokenizer vocabulary '
                     f"(yes={yes_id}, no={no_id}). "
                     "You may need a different scoring method for this tokenizer."
                 )
         self.token_true_id = yes_id
         self.token_false_id = no_id

         self.max_length = 8192

         # Chat-template text wrapped around every formatted pair.
         self.prefix = (
             "<|im_start|>system\n"
             'Judge whether the Document meets the requirements based on the Query '
             'and the Instruct provided. Note that the answer can only be "yes" or "no".'
             "<|im_end|>\n"
             "<|im_start|>user\n"
         )
         self.suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
         # Tokenized once so each request only has to concatenate id lists.
         self.prefix_tokens = self.tokenizer.encode(self.prefix, add_special_tokens=False)
         self.suffix_tokens = self.tokenizer.encode(self.suffix, add_special_tokens=False)
         return f"<Instruct>: {instruction}\n<Query>: {query}\n<Document>: {document}"
     def _process_inputs(self, pairs: List[str]) -> Dict[str, torch.Tensor]:
         inputs = self.tokenizer(
             pairs,
             padding=False,
+            truncation=True,
             return_attention_mask=False,
             max_length=self.max_length - len(self.prefix_tokens) - len(self.suffix_tokens),
         )
         for i, ids in enumerate(inputs["input_ids"]):
             inputs["input_ids"][i] = self.prefix_tokens + ids + self.suffix_tokens
         padded = self.tokenizer.pad(
             inputs,
+            padding=True,
             return_tensors="pt",
         )
         for k in padded: