"""Text tokenize API: no model inference; only returns each token's character offsets and raw text."""
from backend.prediction_attributor import _slot_for_prediction_attr_model
from backend.model_manager import ensure_slot_weights_loaded


def tokenize(tokenize_request):
    """
    Tokenize ``context`` with the specified ``model``'s tokenizer and return each
    token's character offsets and raw text.

    Holds no inference lock and performs no forward / gradient computation.
    """
    context = tokenize_request.get("context")
    model = tokenize_request.get("model")
    if not isinstance(context, str) or context == "":
        return {"success": False, "message": "Missing required field: context"}, 400
    if not isinstance(model, str):
        return {"success": False, "message": "Missing required field: model"}, 400
    try:
        slot = _slot_for_prediction_attr_model(model)
    except ValueError as e:
        return {"success": False, "message": str(e)}, 400
    tokenizer, _, _ = ensure_slot_weights_loaded(slot)
    enc = tokenizer(context, return_offsets_mapping=True)
    token_ids = enc["input_ids"]
    offsets = enc["offset_mapping"]
    if token_ids and isinstance(token_ids[0], list):
        # Some call paths return batched (nested) lists; unwrap the single sample
        # for both token ids and offsets so they stay aligned.
        token_ids = token_ids[0]
        offsets = offsets[0]
    spans = [
        {"offset": [s, e], "raw": context[s:e], "token_id": int(tid)}
        for (s, e), tid in zip(offsets, token_ids)
        if s < e  # filter BOS/EOS and other zero-length special tokens
    ]
    return {"success": True, "spans": spans}, 200
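

# Illustrative usage sketch (assumption: "example-model" is a placeholder; valid
# names are whatever _slot_for_prediction_attr_model accepts in this repo).
# Each returned span has the shape
# {"offset": [start, end], "raw": context[start:end], "token_id": int}.
if __name__ == "__main__":
    body, status = tokenize({"context": "Hello world", "model": "example-model"})
    print(status, body)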