Create handler.py
handler.py (new file)
# handler.py
from typing import Any, Dict, List, Union

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

MAX_INPUT_TOKENS = 512


class EndpointHandler:
    """
    HF Inference Endpoints custom handler that reproduces the exact style of
    your shared Colab code:
      - slow tokenizer (use_fast=False)
      - Seq2Seq model
      - deterministic generation by default (do_sample=False)
      - decode with skip_special_tokens=True
      - if input > 512 tokens, keep only the MOST RECENT tokens (left-truncate)
    """

    def __init__(self, path: str = ""):
        # Match your working code path and avoid fast tokenizer init issues on HF endpoints.
        self.tokenizer = AutoTokenizer.from_pretrained(path, use_fast=False)
        # Pad on the LEFT so batched prompts stay right-aligned. With the default
        # right padding, the manual left-truncation below would slice real tokens
        # off the front of shorter sequences in a mixed-length batch while keeping
        # their trailing pad tokens.
        self.tokenizer.padding_side = "left"
        self.model = AutoModelForSeq2SeqLM.from_pretrained(path)

        self.model.eval()
        self.device = torch.device("cpu")
        self.model.to(self.device)

    @torch.inference_mode()
    def __call__(self, data: Dict[str, Any]) -> Union[Dict[str, str], List[Dict[str, str]]]:
        """
        Request schema:
        {
          "inputs": "<full prompt string>" OR ["<prompt1>", "<prompt2>", ...],
          "parameters": { ... optional generate() kwargs ... }
        }

        Response schema (kept simple):
          - single input -> {"generated_text": "..."}
          - list inputs  -> [{"generated_text": "..."}, ...]
        """
        if "inputs" not in data:
            raise ValueError("Missing required field 'inputs'.")

        inputs = data["inputs"]
        params = data.get("parameters") or {}

        # Normalize to a batch of prompts
        if isinstance(inputs, str):
            prompts = [inputs]
            single = True
        else:
            prompts = list(inputs)
            single = False

        # --- Tokenize WITHOUT truncation first so we can left-truncate manually ---
        enc = self.tokenizer(
            prompts,
            return_tensors="pt",
            padding=True,
            truncation=False,
        )

        input_ids = enc["input_ids"]
        attention_mask = enc["attention_mask"]

        # Left-truncate to keep the most recent tokens (right side)
        if input_ids.shape[1] > MAX_INPUT_TOKENS:
            input_ids = input_ids[:, -MAX_INPUT_TOKENS:]
            attention_mask = attention_mask[:, -MAX_INPUT_TOKENS:]

        input_ids = input_ids.to(self.device)
        attention_mask = attention_mask.to(self.device)

        # Defaults that match your code: model.generate(**inputs, do_sample=False).
        # Keep them overridable via "parameters".
        gen_kwargs = {
            "do_sample": params.pop("do_sample", False),
        }

        # Optional knobs (only applied if provided)
        for key in ("max_new_tokens", "num_beams", "temperature", "top_p", "top_k"):
            if key in params:
                gen_kwargs[key] = params.pop(key)

        # Allow any remaining generate() kwargs through, in case you pass them
        gen_kwargs.update(params)

        outputs = self.model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            **gen_kwargs,
        )

        texts = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)

        result = [{"generated_text": t} for t in texts]
        return result[0] if single else result
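For a quick sanity check before deploying, the handler can be exercised locally with the same payload shape the endpoint receives, and the deployed endpoint can then be hit over HTTP. A minimal sketch; the checkpoint directory, endpoint URL, and token below are placeholders, not values from this repo:

# smoke_test.py -- local sanity check for EndpointHandler (sketch).
# "./model" is a placeholder for any local seq2seq checkpoint directory.
from handler import EndpointHandler

handler = EndpointHandler(path="./model")

# Single prompt -> a single {"generated_text": ...} dict
out = handler({
    "inputs": "Summarize: The quick brown fox jumps over the lazy dog.",
    "parameters": {"max_new_tokens": 64, "num_beams": 4},
})
print(out["generated_text"])

# List of prompts -> a list of dicts, in input order
batch = handler({"inputs": ["Question one?", "Question two?"]})
for item in batch:
    print(item["generated_text"])

# invoke_endpoint.py -- calling the deployed endpoint over HTTP (sketch;
# the URL and <HF_TOKEN> are placeholders you substitute for your endpoint).
import requests

resp = requests.post(
    "https://<your-endpoint>.endpoints.huggingface.cloud",
    headers={
        "Authorization": "Bearer <HF_TOKEN>",
        "Content-Type": "application/json",
    },
    json={"inputs": "Summarize: ...", "parameters": {"max_new_tokens": 64}},
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["generated_text"])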