mpalinski committed on
Commit
7eb969d
·
verified ·
1 Parent(s): a32bec1

Upload handler.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. handler.py +47 -21
handler.py CHANGED
@@ -1,21 +1,35 @@
1
  """
2
  Custom handler for BERT-OJA-SkillLess on HF Inference Endpoints.
3
- Processes large input batches efficiently on GPU with internal micro-batching.
4
  """
5
  from typing import Dict, List, Any
6
- import torch
7
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
8
 
9
 
10
class EndpointHandler:
    """Handler for BERT-OJA-SkillLess on HF Inference Endpoints.

    Loads the model once at startup and serves large request payloads by
    splitting them into fixed-size micro-batches on the GPU.
    """

    def __init__(self, path=""):
        """Load tokenizer and classifier from *path*; move to GPU (FP16) if available."""
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        self.model = AutoModelForSequenceClassification.from_pretrained(path)
        self.model.eval()
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        if self.device == "cuda":
            # Half precision on GPU: less memory, faster inference.
            self.model = self.model.to(self.device).half()
        # Internal micro-batch size for chunking large inputs.
        self.batch_size = 512

    def __call__(self, data: Dict[str, Any]) -> List[List[Dict[str, float]]]:
        """Classify one string or a list of strings.

        Returns, per input, a list of {"label", "score"} dicts for the two classes.
        """
        texts = data.get("inputs", data.get("input", ""))
        # NOTE(review): this guard is an unchanged context line not visible in
        # the diff hunks — reconstructed; confirm against the repository.
        if isinstance(texts, str):
            texts = [texts]

        results: List[List[Dict[str, float]]] = []
        with torch.no_grad():
            start = 0
            while start < len(texts):
                chunk = texts[start : start + self.batch_size]
                start += self.batch_size
                enc = self.tokenizer(
                    chunk,
                    padding=True,
                    truncation=True,
                    max_length=128,
                    return_tensors="pt",
                )
                enc = {name: tensor.to(self.device) for name, tensor in enc.items()}
                probs = torch.softmax(self.model(**enc).logits, dim=-1)
                results.extend(
                    [
                        {"label": "LABEL_0", "score": round(row[0].item(), 6)},
                        {"label": "LABEL_1", "score": round(row[1].item(), 6)},
                    ]
                    for row in probs
                )
        return results
 
1
  """
2
  Custom handler for BERT-OJA-SkillLess on HF Inference Endpoints.
3
+ Uses ONNX Runtime with CUDA for 2-4x faster inference.
4
  """
5
  from typing import Dict, List, Any
6
+ import numpy as np
7
+ from transformers import AutoTokenizer
8
 
9
 
10
class EndpointHandler:
    """Handler for BERT-OJA-SkillLess on HF Inference Endpoints.

    Prefers an ONNX Runtime model on the CUDA execution provider for speed;
    on any failure it transparently falls back to a PyTorch model (FP16 on
    GPU when available). Large payloads are processed in micro-batches.
    """

    def __init__(self, path=""):
        """Load tokenizer and model from *path* (ONNX first, PyTorch fallback)."""
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        # Micro-batch size used to chunk large request payloads.
        self.batch_size = 1024

        try:
            from optimum.onnxruntime import ORTModelForSequenceClassification
            self.model = ORTModelForSequenceClassification.from_pretrained(
                path, export=True, provider="CUDAExecutionProvider",
            )
            self._use_ort = True
            print(f"[handler] Loaded ONNX model on CUDA (batch_size={self.batch_size})")
        except Exception as e:
            # Deliberate broad catch: any ONNX problem (missing optimum,
            # export failure, unavailable CUDA EP) degrades to PyTorch.
            print(f"[handler] ONNX failed ({e}), falling back to PyTorch FP16")
            import torch
            from transformers import AutoModelForSequenceClassification
            self.model = AutoModelForSequenceClassification.from_pretrained(path)
            self.model.eval()
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
            if self.device == "cuda":
                self.model = self.model.to(self.device).half()
            self._use_ort = False
            # Keep a module handle: torch is only imported on this path.
            self._torch = torch

    @staticmethod
    def _scores(row):
        """Format one probability row (numpy array or torch tensor) as the
        standard HF text-classification payload."""
        return [
            {"label": "LABEL_0", "score": round(float(row[0]), 6)},
            {"label": "LABEL_1", "score": round(float(row[1]), 6)},
        ]

    def __call__(self, data: Dict[str, Any]) -> List[List[Dict[str, float]]]:
        """Classify one string or a list of strings.

        Returns, per input, a list of {"label", "score"} dicts for the two classes.
        """
        inputs = data.get("inputs", data.get("input", ""))
        # NOTE(review): this guard is an unchanged context line not visible in
        # the diff hunks — reconstructed; confirm against the repository.
        if isinstance(inputs, str):
            inputs = [inputs]

        all_results = []
        for i in range(0, len(inputs), self.batch_size):
            batch = inputs[i : i + self.batch_size]
            encoded = self.tokenizer(
                batch, padding=True, truncation=True,
                max_length=128,
                # ONNX Runtime consumes numpy arrays; PyTorch needs tensors.
                return_tensors="np" if self._use_ort else "pt",
            )

            if self._use_ort:
                # Fixed: no pointless dict copy around encoded.
                logits = self.model(**encoded).logits
                if hasattr(logits, 'numpy'):
                    # Some ORT wrappers hand back torch tensors.
                    logits = logits.numpy()
                # Numerically stable softmax over the class axis.
                exp = np.exp(logits - logits.max(axis=-1, keepdims=True))
                probs = exp / exp.sum(axis=-1, keepdims=True)
            else:
                torch = self._torch
                encoded = {k: v.to(self.device) for k, v in encoded.items()}
                with torch.no_grad():
                    probs = torch.softmax(self.model(**encoded).logits, dim=-1)

            # Single formatting path for both backends (was duplicated).
            all_results.extend(self._scores(probs[j]) for j in range(len(batch)))

        return all_results