Qwen3-Reranker-0.6B

@@ -18,7 +18,7 @@ class EndpointHandler:
         )
         self.model = AutoModelForCausalLM.from_pretrained(
             model_dir,
-            torch_dtype=self.dtype,
             trust_remote_code=True,
         ).to(self.device).eval()
@@ -43,22 +43,27 @@ class EndpointHandler:
         return f"<Instruct>: {instruction}\n<Query>: {query}\n<Document>: {document}"
     def _process_inputs(self, pairs: List[str]) -> Dict[str, torch.Tensor]:
         inputs = self.tokenizer(
             pairs,
             padding=False,
-            truncation="longest_first",
             return_attention_mask=False,
             max_length=self.max_length - len(self.prefix_tokens) - len(self.suffix_tokens),
         )
         for i, ids in enumerate(inputs["input_ids"]):
             inputs["input_ids"][i] = self.prefix_tokens + ids + self.suffix_tokens
         padded = self.tokenizer.pad(
             inputs,
-            padding=True,
             return_tensors="pt",
-            max_length=self.max_length,
         )
         for k in padded:

         )
         self.model = AutoModelForCausalLM.from_pretrained(
             model_dir,
+            dtype=self.dtype,
             trust_remote_code=True,
         ).to(self.device).eval()
         return f"<Instruct>: {instruction}\n<Query>: {query}\n<Document>: {document}"
     def _process_inputs(self, pairs: List[str]) -> Dict[str, torch.Tensor]:
+        # 1. First, encode the text and handle truncation properly
         inputs = self.tokenizer(
             pairs,
             padding=False,
+            truncation=True, # True is equivalent to "longest_first"; truncation behavior is unchanged
             return_attention_mask=False,
+            # Subtract the length of your prefix and suffix from the limit
             max_length=self.max_length - len(self.prefix_tokens) - len(self.suffix_tokens),
         )
+        # 2. Manually prepend/append your special tokens
         for i, ids in enumerate(inputs["input_ids"]):
             inputs["input_ids"][i] = self.prefix_tokens + ids + self.suffix_tokens
+        # 3. FIX: Pad with padding="max_length" (together with max_length=...) for a fixed size,
+        # or with padding=True to pad to the longest sequence in the batch.
         padded = self.tokenizer.pad(
             inputs,
+            padding=True, # This will pad to the longest sequence in the current batch
             return_tensors="pt",
+            # max_length is intentionally omitted: it is ignored when padding=True and only triggers a warning
         )
         for k in padded: