Instructions to use hutlim/Qwen3-Reranker-0.6B with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use hutlim/Qwen3-Reranker-0.6B with Transformers:
# Load model directly from transformers import AutoTokenizer, AutoModelForCausalLM tokenizer = AutoTokenizer.from_pretrained("hutlim/Qwen3-Reranker-0.6B") model = AutoModelForCausalLM.from_pretrained("hutlim/Qwen3-Reranker-0.6B") - Notebooks
- Google Colab
- Kaggle
File size: 7,088 Bytes
43d8db7 6157eb9 43d8db7 2f3cbcd a4af8e6 6157eb9 a4af8e6 6157eb9 a4af8e6 e2f2cfb 43d8db7 2f3cbcd e2f2cfb f775139 6157eb9 2f3cbcd 6157eb9 2f3cbcd 6157eb9 2f3cbcd 43d8db7 6157eb9 2f3cbcd 6157eb9 43d8db7 6157eb9 2f3cbcd 6157eb9 2f3cbcd 6157eb9 2f3cbcd 6157eb9 2f3cbcd 6157eb9 2f3cbcd 6157eb9 a4af8e6 6157eb9 a4af8e6 6157eb9 a4af8e6 6157eb9 a4af8e6 43d8db7 a4af8e6 6157eb9 a4af8e6 6157eb9 e2f2cfb 6157eb9 a4af8e6 6157eb9 a4af8e6 43d8db7 6157eb9 43d8db7 a4af8e6 6157eb9 a4af8e6 6157eb9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 | import gc
import os
import threading
from pathlib import Path
from typing import Any, Dict, List
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
class EndpointHandler:
def __init__(self, path: str = ""):
model_dir = Path(path or os.getenv("HF_MODEL_DIR", "")).resolve()
if not model_dir.exists():
raise FileNotFoundError(f"Model directory does not exist: {model_dir}")
# Helpful debug info in endpoint logs
print(f"[handler] loading model from: {model_dir}")
print(f"[handler] files: {[p.name for p in model_dir.iterdir()]}")
required_any = [
"config.json",
]
missing_required = [f for f in required_any if not (model_dir / f).exists()]
if missing_required:
raise FileNotFoundError(
f"Missing required model files in {model_dir}: {missing_required}"
)
has_weights = any([
(model_dir / "model.safetensors").exists(),
(model_dir / "pytorch_model.bin").exists(),
any(model_dir.glob("model-*.safetensors")),
any(model_dir.glob("pytorch_model-*.bin")),
])
if not has_weights:
raise FileNotFoundError(
f"No model weight file found in {model_dir}. "
f"Expected model.safetensors, pytorch_model.bin, or sharded weights."
)
self.device = "cuda" if torch.cuda.is_available() else "cpu"
self.torch_dtype = torch.float16 if self.device == "cuda" else torch.float32
self.tokenizer = AutoTokenizer.from_pretrained(
str(model_dir),
padding_side="left",
trust_remote_code=True,
)
self.model = AutoModelForCausalLM.from_pretrained(
str(model_dir),
dtype=self.torch_dtype,
trust_remote_code=True,
).to(self.device).eval()
# Safer token lookup for decoder LMs: include leading space variants if needed
yes_ids = self.tokenizer.encode(" yes", add_special_tokens=False)
no_ids = self.tokenizer.encode(" no", add_special_tokens=False)
if len(yes_ids) != 1 or len(no_ids) != 1:
raise ValueError(
f'Expected single-token " yes"/" no", got yes={yes_ids}, no={no_ids}. '
"You may need a different scoring method for this tokenizer."
)
self.token_true_id = yes_ids[0]
self.token_false_id = no_ids[0]
self.max_length = int(os.getenv("HANDLER_MAX_LENGTH", "8192"))
self.batch_size = int(os.getenv("HANDLER_BATCH_SIZE", "8"))
self.max_documents = int(os.getenv("HANDLER_MAX_DOCUMENTS", "64"))
self._semaphore = threading.Semaphore(int(os.getenv("HANDLER_MAX_CONCURRENT", "5")))
self.prefix = (
"<|im_start|>system\n"
'Judge whether the Document meets the requirements based on the Query '
'and the Instruct provided. Note that the answer can only be "yes" or "no".'
"<|im_end|>\n"
"<|im_start|>user\n"
)
self.suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
self.prefix_tokens = self.tokenizer.encode(self.prefix, add_special_tokens=False)
self.suffix_tokens = self.tokenizer.encode(self.suffix, add_special_tokens=False)
def _format_one(self, instruction: str, query: str, document: str) -> str:
return f"<Instruct>: {instruction}\n<Query>: {query}\n<Document>: {document}"
def _process_inputs(self, pairs: List[str]) -> Dict[str, torch.Tensor]:
inputs = self.tokenizer(
pairs,
padding=False,
truncation=True,
return_attention_mask=False,
max_length=self.max_length - len(self.prefix_tokens) - len(self.suffix_tokens),
)
for i, ids in enumerate(inputs["input_ids"]):
inputs["input_ids"][i] = self.prefix_tokens + ids + self.suffix_tokens
padded = self.tokenizer.pad(
inputs,
padding=True,
return_tensors="pt",
)
for k in padded:
padded[k] = padded[k].to(self.device)
return padded
@torch.no_grad()
def _score(self, model_inputs: Dict[str, torch.Tensor]) -> List[float]:
logits = self.model(**model_inputs).logits[:, -1, :]
false_scores = logits[:, self.token_false_id]
true_scores = logits[:, self.token_true_id]
pair_scores = torch.stack([false_scores, true_scores], dim=1)
probs = torch.nn.functional.softmax(pair_scores, dim=1)[:, 1]
return probs.tolist()
def _score_in_batches(self, pairs: List[str]) -> List[float]:
all_scores = []
for i in range(0, len(pairs), self.batch_size):
batch = pairs[i : i + self.batch_size]
model_inputs = self._process_inputs(batch)
scores = self._score(model_inputs)
all_scores.extend(scores)
del model_inputs
gc.collect()
return all_scores
def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
payload = data.get("inputs", data)
instruction = payload.get(
"instruction",
"Given a web search query, retrieve relevant passages that answer the query",
)
query = payload["query"]
documents = payload["documents"]
return_documents = payload.get("return_documents", True)
if not isinstance(documents, list) or len(documents) == 0:
raise ValueError("`documents` must be a non-empty list of strings.")
if len(documents) > self.max_documents:
raise ValueError(
f"`documents` exceeds max allowed ({self.max_documents}). "
f"Got {len(documents)}."
)
pairs = [self._format_one(instruction, query, doc) for doc in documents]
acquired = self._semaphore.acquire(timeout=int(os.getenv("HANDLER_QUEUE_TIMEOUT", "60")))
if not acquired:
raise RuntimeError(
"Server is busy. Another request is being processed. Please retry."
)
try:
scores = self._score_in_batches(pairs)
except MemoryError:
gc.collect()
raise RuntimeError(
"Out of memory while scoring. Try sending fewer or shorter documents."
)
finally:
self._semaphore.release()
results = []
for i, (doc, score) in enumerate(zip(documents, scores)):
item = {
"index": i,
"relevance_score": float(score),
}
if return_documents:
item["document"] = doc
results.append(item)
results.sort(key=lambda x: x["relevance_score"], reverse=True)
return {
"results": results,
"meta": {
"instruction": instruction,
"query": query,
"count": len(documents),
},
} |