RL / model /EasyR1 /verl /reward_function /onethinker_reward.py

Upload folder using huggingface_hub

d65b589 verified 19 days ago

29.4 kB

	# -*- coding: utf-8 -*-
	# Rewards for multimodal tasks with <think>...</think><answer>...</answer> outputs.
	import re
	import json
	import math
	import itertools
	from typing import Any, Dict, List, Optional
	import random

	import torch
	from rouge_score import rouge_scorer
	from math_verify import parse as math_parse, verify as math_verify
	from mathruler.grader import grade_answer

	# ===================== Model-based reward configuration =====================
	# When True, "open-ended" samples are queued by compute_score and scored by the
	# external reward model (RM) in evaluate_open_ended_with_rm; when False they are
	# scored with ROUGE inside accuracy_reward.
	USE_MODEL_FOR_OPEN_ENDED: bool = False

	# Model identifier and server address handed to RewardModelClient.
	# NOTE(review): the address below is a placeholder - replace it with a real
	# host:port before enabling USE_MODEL_FOR_OPEN_ENDED.
	RM_MODEL_PATH = "internlm/POLAR-7B"
	RM_SERVER_ADDRESS = "xx.xx.xx.xx:xxxx"
	# ==========================================================

	# ===================== External RM evaluation dependencies =====================
	from verl.workers.reward.model_reward import RewardModelClient
	import numpy as np
	# =========================================================


	# -------------------------
	# Patterns for format check
	# -------------------------
	# Whole-response format check: optional whitespace, a <think>...</think> block,
	# optional whitespace, an <answer>...</answer> block, optional whitespace.
	# DOTALL lets the lazy `.*?` bodies span newlines.
	THINK_ANSWER_PATTERN = re.compile(
	    r"\A\s*<think>.*?</think>\s*<answer>.*?</answer>\s*\Z",
	    re.DOTALL,
	)

	# Captures the body of the first <answer>...</answer> pair, excluding the
	# whitespace directly inside the tags.
	ANSWER_CAPTURE_PATTERN = re.compile(
	    r"<answer>\s*(.*?)\s*</answer>",
	    re.DOTALL,
	)


	# -------------------------
	# Utilities
	# -------------------------
	def extract_answer(text: str) -> Optional[str]:
	    """
	    Return the stripped text captured by ANSWER_CAPTURE_PATTERN in `text`.

	    Returns None when `text` is not a string or when the pattern does not
	    match anywhere in it.
	    """
	    if isinstance(text, str):
	        found = ANSWER_CAPTURE_PATTERN.search(text)
	        if found:
	            return found.group(1).strip()
	    return None


	def normalize_number(num_str: str) -> Optional[float]:
	    """
	    Convert a numeric answer to float, ignoring thousands separators.

	    Accepts a string such as "1,234.5" (commas are removed before parsing) or
	    an int/float value, which is converted directly. Returns None for
	    anything that cannot be interpreted as a number: None, empty or
	    non-numeric strings, booleans, and other types.
	    """
	    # bool is a subclass of int; a True/False answer is not a number here.
	    if isinstance(num_str, bool):
	        return None
	    try:
	        if isinstance(num_str, (int, float)):
	            return float(num_str)
	        if isinstance(num_str, str):
	            return float(num_str.replace(",", ""))
	    except (ValueError, OverflowError):
	        # ValueError: unparsable text; OverflowError: int too large for float.
	        return None
	    return None


	def mean_relative_accuracy(pred, target, start=0.5, end=0.95, interval=0.05) -> float:
	    """
	    Fraction of confidence levels at which `pred` is close enough to `target`.

	    Levels run from `start` to `end` in steps of `interval`. For each level t
	    the prediction passes when |pred - target| / (|target| + 1e-8) < 1 - t.
	    The mean of the pass/fail flags is returned as a Python float.
	    """
	    prediction = torch.tensor(pred, dtype=torch.float32)
	    reference = torch.tensor(target, dtype=torch.float32)
	    # Half a step is added to the stop value so that `end` itself is generated.
	    levels = torch.arange(start, end + interval/2, interval, dtype=torch.float32)
	    relative_error = (prediction - reference).abs() / (reference.abs() + 1e-8)
	    passed = relative_error < (1 - levels)
	    return passed.float().mean().item()


	def wer(reference: str, hypothesis: str) -> float:
	    """
	    Word error rate: word-level edit distance divided by the reference length.

	    Both inputs are split on whitespace (None/empty is treated as no words).
	    The divisor is max(1, number of reference words), so an empty reference
	    yields the raw edit distance.
	    """
	    ref_tokens = (reference or "").split()
	    hyp_tokens = (hypothesis or "").split()

	    # Rolling-row Levenshtein: prev_row holds distances for the previous
	    # reference prefix against every hypothesis prefix.
	    prev_row = list(range(len(hyp_tokens) + 1))
	    for row_idx, ref_tok in enumerate(ref_tokens, start=1):
	        cur_row = [row_idx]
	        for col_idx, hyp_tok in enumerate(hyp_tokens, start=1):
	            if ref_tok == hyp_tok:
	                cost = prev_row[col_idx - 1]
	            else:
	                cost = 1 + min(prev_row[col_idx], cur_row[col_idx - 1], prev_row[col_idx - 1])
	            cur_row.append(cost)
	        prev_row = cur_row

	    return prev_row[-1] / max(1, len(ref_tokens))


	# Shared scorer, created on first use. Building a stemming RougeScorer for
	# every call is wasted work when a whole batch is scored.
	_ROUGE_SCORER = None


	def compute_rouge_score(reference: str, hypothesis: str) -> float:
	    """
	    Mean of the ROUGE-1, ROUGE-2 and ROUGE-L F-measures.

	    `reference` is passed as the scorer's target and `hypothesis` as its
	    prediction; None is treated as an empty string.
	    """
	    global _ROUGE_SCORER
	    if _ROUGE_SCORER is None:
	        _ROUGE_SCORER = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
	    scores = _ROUGE_SCORER.score(reference or "", hypothesis or "")
	    return (scores['rouge1'].fmeasure + scores['rouge2'].fmeasure + scores['rougeL'].fmeasure) / 3.0


	# ---------- IoU helpers (strict format: must be numeric lists) ----------
	def _is_list_of_numbers(x, n=None):
	if not isinstance(x, list):
	return False
	if n is not None and len(x) != n:
	return False
	try:
	for v in x:
	float(v)
	return True
	except Exception:
	return False


	def iou_1d(pred: List[float], gt: List[float]) -> float:
	    """
	    Intersection-over-union of two intervals given as [start, end].

	    Returns 0.0 when either argument is not a two-item numeric list, or when
	    the union length is not above 1e-12.
	    """
	    if not (_is_list_of_numbers(pred, 2) and _is_list_of_numbers(gt, 2)):
	        return 0.0
	    try:
	        pred_start, pred_end = float(pred[0]), float(pred[1])
	        gt_start, gt_end = float(gt[0]), float(gt[1])
	    except Exception:
	        return 0.0

	    overlap = max(0.0, min(pred_end, gt_end) - max(pred_start, gt_start))
	    span = max(pred_end, gt_end) - min(pred_start, gt_start)
	    if span > 1e-12:
	        return overlap / span
	    return 0.0


	def iou_2d(box1: List[float], box2: List[float]) -> float:
	    """
	    Intersection-over-union of two boxes given as [x1, y1, x2, y2].

	    Returns 0.0 when either argument is not a four-item numeric list, or when
	    the union area is not above 1e-12. Negative widths/heights are clamped to
	    zero, so an inverted box contributes no area.
	    """
	    if not (_is_list_of_numbers(box1, 4) and _is_list_of_numbers(box2, 4)):
	        return 0.0
	    try:
	        ax1, ay1, ax2, ay2 = [float(c) for c in box1]
	        bx1, by1, bx2, by2 = [float(c) for c in box2]
	    except Exception:
	        return 0.0

	    overlap_w = max(0.0, min(ax2, bx2) - max(ax1, bx1))
	    overlap_h = max(0.0, min(ay2, by2) - max(ay1, by1))
	    overlap_area = overlap_w * overlap_h

	    area_a = max(0.0, ax2 - ax1) * max(0.0, ay2 - ay1)
	    area_b = max(0.0, bx2 - bx1) * max(0.0, by2 - by1)
	    union_area = area_a + area_b - overlap_area

	    if union_area > 1e-12:
	        return overlap_area / union_area
	    return 0.0


	def mean_iou_over_gt_frames(pred_boxes: Dict[str, List[float]], gt_boxes: Dict[str, List[float]]) -> float:
	    """
	    For tracking: box IoU averaged over every ground-truth frame.

	    pred_boxes, gt_boxes: {frame_str: [x1, y1, x2, y2]}
	    A frame missing from `pred_boxes` scores 0 for that frame. Returns 0.0
	    when `gt_boxes` is not a non-empty dict, or when `pred_boxes` is not a
	    dict at all (every frame is then missing).
	    """
	    if not isinstance(gt_boxes, dict) or not gt_boxes:
	        return 0.0
	    # Mirror mean_iou_over_intersection: a malformed prediction scores 0
	    # instead of raising on `.get`.
	    if not isinstance(pred_boxes, dict):
	        return 0.0

	    total = 0.0
	    for frame, gt_box in gt_boxes.items():
	        total += iou_2d(pred_boxes.get(frame, []), gt_box)
	    return total / len(gt_boxes)


	def mean_iou_over_intersection(pred_boxes: Dict[str, List[float]], gt_boxes: Dict[str, List[float]]) -> float:
	    """
	    For spatial-temporal tasks: box IoU averaged over the frame keys present
	    in both dicts, so frames missing on either side are not penalized.

	    Returns 0.0 when either argument is not a dict or no key is shared.
	    """
	    if not (isinstance(pred_boxes, dict) and isinstance(gt_boxes, dict)):
	        return 0.0

	    shared_frames = [frame for frame in pred_boxes.keys() if frame in gt_boxes]
	    if not shared_frames:
	        return 0.0

	    scores = [iou_2d(pred_boxes[frame], gt_boxes[frame]) for frame in shared_frames]
	    return sum(scores) / len(scores)


	# -------------------------
	# Segmentation point matching: strict 3↔3 optimal assignment
	# -------------------------
	def _pairwise_l2(p, q) -> float:
	try:
	dx = float(p[0]) - float(q[0])
	dy = float(p[1]) - float(q[1])
	return math.hypot(dx, dy)
	except Exception:
	return float("inf")


	def assignment_similarity_3(pred_pts: List[List[float]],
	                            gt_pts: List[List[float]],
	                            sigma: float = 50.0) -> float:
	    """
	    Similarity of two 3-point sets under the best one-to-one matching.

	    All 3! pairings are tried; the one with the smallest total Euclidean
	    distance is kept and scored with a Gaussian kernel:
	        sim = exp(-(avg_dist^2) / (2 * sigma^2)), clamped to [0, 1]

	    Parameters:
	    - pred_pts, gt_pts: [[x, y], [x, y], [x, y]], exactly 3 points each
	    - sigma: kernel width (smaller -> similarity falls off faster)

	    Returns 0.0 when either input is not a list of exactly three two-item
	    numeric lists, or when no pairing has a finite total distance.
	    """
	    # ----------- Input validation -----------
	    shapes_ok = (
	        isinstance(pred_pts, list)
	        and isinstance(gt_pts, list)
	        and len(pred_pts) == 3
	        and len(gt_pts) == 3
	    )
	    if not shapes_ok:
	        return 0.0
	    if not all(_is_list_of_numbers(pt, 2) for pt in pred_pts + gt_pts):
	        return 0.0

	    # ----------- Exhaustive search for the cheapest pairing -----------
	    best_total = float('inf')
	    for order in itertools.permutations(range(3)):
	        total = 0.0
	        for gt_idx, pred_idx in enumerate(order):
	            dist = _pairwise_l2(pred_pts[pred_idx], gt_pts[gt_idx])
	            if math.isinf(dist):
	                break
	            total += dist
	        else:
	            # Only pairings with all-finite distances compete.
	            best_total = min(best_total, total)

	    if math.isinf(best_total):
	        return 0.0

	    # ----------- Mean distance -> Gaussian similarity -----------
	    mean_dist = best_total / 3.0
	    similarity = math.exp(- (mean_dist ** 2) / (2 * sigma ** 2))
	    return max(0.0, min(1.0, similarity))


	# -------------------------
	# Format reward + structure reward
	# -------------------------
	def tag_format_reward(response: str) -> float:
	    """
	    Format reward: 1.0 when the whole response matches THINK_ANSWER_PATTERN
	    (the <think>...</think><answer>...</answer> layout), otherwise 0.0.

	    A None or empty response is checked as the empty string.
	    """
	    text = response or ""
	    if THINK_ANSWER_PATTERN.fullmatch(text):
	        return 1.0
	    return 0.0


	def answer_structure_bonus(answer: str, ground_truth: str, data_type: str, problem_type: str) -> float:
	    """
	    Structure reward: checks that the answer JSON has the shape the task expects.

	    - spatial-temporal grounding:
	        {"time": [s, e], "boxes": {"frame_id": [x1, y1, x2, y2], ...}}
	        +0.25 if the structure is valid (time must be two numbers);
	        plus 0.25 * key overlap ratio, where
	        overlap ratio = |pred.keys ∩ gt.keys| / |gt.keys|.
	    - tracking:
	        {"boxes": {"frame_id": [x1, y1, x2, y2], ...}}
	        +0.25 if the structure is valid; plus 0.25 * key overlap ratio.
	    - temporal grounding:
	        {"time": [s, e]} gets 0.5 if valid, otherwise 0.
	    - spatial grounding:
	        {"boxes": [x1, y1, x2, y2]} gets 0.5 if valid, otherwise 0.
	    - segmentation (any problem type containing "segmentation"):
	        * image:
	          {"boxes": [..], "positive_points": [[x,y],[x,y],[x,y]], "negative_points": [[x,y],[x,y],[x,y]]}
	        * video: the same plus a numeric "time"
	        0.5 if the structure matches, otherwise 0; other data types get 0.
	    - All other task types: 0.5 unconditionally.

	    `answer` and `ground_truth` are JSON strings; text that fails to parse is
	    treated as having no structure. The two type arguments are compared
	    case-insensitively.
	    """
	    ptype = (problem_type or "").lower()
	    dtype = (data_type or "").lower()

	    def _json(s):
	        try:
	            return json.loads(s)
	        except Exception:
	            return None

	    def _is_point_triplet(pts):
	        # Exactly three [x, y] numeric pairs.
	        return (
	            isinstance(pts, list)
	            and len(pts) == 3
	            and all(_is_list_of_numbers(p, 2) for p in pts)
	        )

	    if ptype in {"spatial-temporal grounding", "tracking"}:
	        obj_pred = _json(answer)
	        obj_gt = _json(ground_truth)
	        pred_is_dict = isinstance(obj_pred, dict)
	        pboxes = obj_pred.get("boxes") if pred_is_dict else None

	        json_ok = isinstance(pboxes, dict) and all(
	            _is_list_of_numbers(v, 4) for v in pboxes.values()
	        )
	        if ptype == "spatial-temporal grounding":
	            # Same numeric check the temporal-grounding branch applies; a
	            # length-only check would reward e.g. "time": ["a", "b"].
	            json_ok = json_ok and _is_list_of_numbers(obj_pred.get("time"), 2)
	        part_json = 0.25 if json_ok else 0.0

	        part_overlap = 0.0
	        if pred_is_dict and isinstance(obj_gt, dict):
	            gboxes = obj_gt.get("boxes")
	            if isinstance(pboxes, dict) and isinstance(gboxes, dict) and gboxes:
	                shared = set(pboxes) & set(gboxes)
	                part_overlap = 0.25 * (len(shared) / float(len(gboxes)))

	        return part_json + part_overlap

	    if ptype == "temporal grounding":
	        obj = _json(answer)
	        ok = isinstance(obj, dict) and _is_list_of_numbers(obj.get("time"), 2)
	        return 0.5 if ok else 0.0

	    if ptype == "spatial grounding":
	        obj = _json(answer)
	        ok = isinstance(obj, dict) and _is_list_of_numbers(obj.get("boxes"), 4)
	        return 0.5 if ok else 0.0

	    if "segmentation" in ptype:
	        if dtype not in {"image", "video"}:
	            return 0.0
	        obj = _json(answer)
	        ok = (
	            isinstance(obj, dict)
	            and _is_list_of_numbers(obj.get("boxes"), 4)
	            and _is_point_triplet(obj.get("positive_points"))
	            and _is_point_triplet(obj.get("negative_points"))
	        )
	        if dtype == "video":
	            # Video answers additionally carry a numeric timestamp.
	            ok = ok and isinstance(obj.get("time"), (int, float))
	        return 0.5 if ok else 0.0

	    # Non-structured tasks: default +0.5
	    return 0.5


	# -------------------------
	# Math equivalence helper
	# -------------------------
	def _math_equivalent(gt: str, pred: str) -> bool:
	    """
	    Decide whether `pred` is mathematically equivalent to `gt`.

	    Both strings are parsed and compared with math_verify; if that raises,
	    the decision is delegated to grade_answer(pred, gt) instead.
	    """
	    try:
	        parsed_gt = math_parse(gt)
	        parsed_pred = math_parse(pred)
	        return bool(math_verify(parsed_gt, parsed_pred))
	    except Exception:
	        return grade_answer(pred, gt)


	# -------------------------
	# Accuracy reward (normalized to [0,1])
	# -------------------------
	def accuracy_reward(response: str,
	                    ground_truth: str,
	                    data_type: str,
	                    problem_type: str) -> float:
	    """
	    Accuracy of one response for its task type, as a value in [0, 1].

	    The answer text is whatever extract_answer finds in `response`; when that
	    yields nothing, the whole stripped response is used instead. The two type
	    arguments are compared case-insensitively. Scoring per problem type:

	    - multiple choice: grade_answer match (0 or 1)
	    - numerical: equal after rounding to 2 decimals (0 or 1)
	    - regression: mean_relative_accuracy
	    - ocr: 1 - WER, clamped to [0, 1]
	    - open-ended: ROUGE score, clamped to [0, 1]
	    - math: symbolic equivalence (0 or 1)
	    - temporal grounding: 1-D IoU of "time"
	    - spatial grounding: box IoU of "boxes"
	    - spatial-temporal grounding: 0.5 * tIoU + 0.5 * mIoU over shared frames
	    - tracking: mean IoU over ground-truth frames (missing frames = 0)
	    - *segmentation*: image -> box IoU; video -> 0.3 * IoU + 0.3 * time
	      similarity + 0.2 * positive-point + 0.2 * negative-point similarity
	    - *localization*: box IoU, for either a bare list or {"boxes": [...]}
	    - *recognition*: comma-separated label sets must be equal (0 or 1)
	    - surgical instrument count: integer equality, else string equality
	    - critical view safety: fraction of the 3 criteria that match

	    Unknown types, malformed JSON and any raised exception score 0.0.
	    """
	    def _load_json(s: str):
	        try:
	            return json.loads(s)
	        except Exception:
	            return None

	    try:
	        ans = extract_answer(response) or response.strip()
	        ptype = (problem_type or "").lower()
	        dtype = (data_type or "").lower()
	        gt = ground_truth or ""

	        def _load_dict_pair():
	            # (prediction, ground truth) parsed as JSON objects, or None
	            # when either side is not a JSON object.
	            pred_obj, gt_obj = _load_json(ans), _load_json(gt)
	            if isinstance(pred_obj, dict) and isinstance(gt_obj, dict):
	                return pred_obj, gt_obj
	            return None

	        # ------ Pure QA types ------
	        if ptype == "multiple choice":
	            return 1.0 if grade_answer(ans.strip(), gt.strip()) else 0.0

	        if ptype == "numerical":
	            gt_num, pr_num = normalize_number(gt), normalize_number(ans)
	            if gt_num is None or pr_num is None:
	                return 0.0
	            return 1.0 if round(gt_num, 2) == round(pr_num, 2) else 0.0

	        if ptype == "regression":
	            gt_num, pr_num = normalize_number(gt), normalize_number(ans)
	            if gt_num is None or pr_num is None:
	                return 0.0
	            return mean_relative_accuracy(pr_num, gt_num)

	        if ptype == "ocr":
	            return max(0.0, min(1.0, 1.0 - wer(gt, ans)))

	        if ptype == "open-ended":
	            return max(0.0, min(1.0, compute_rouge_score(gt, ans)))

	        if ptype == "math":
	            return 1.0 if _math_equivalent(gt, ans) else 0.0

	        # ------ JSON types ------
	        # temporal grounding: tIoU in [0, 1]
	        if ptype == "temporal grounding":
	            pair = _load_dict_pair()
	            if pair is None:
	                return 0.0
	            pred, gtj = pair
	            return iou_1d(pred.get("time"), gtj.get("time"))

	        # spatial grounding: box IoU in [0, 1]
	        if ptype == "spatial grounding":
	            pair = _load_dict_pair()
	            if pair is None:
	                return 0.0
	            pred, gtj = pair
	            return iou_2d(pred.get("boxes"), gtj.get("boxes"))

	        # spatial-temporal grounding: 0.5 * tIoU + 0.5 * mIoU(intersection)
	        if ptype == "spatial-temporal grounding":
	            pair = _load_dict_pair()
	            if pair is None:
	                return 0.0
	            pred, gtj = pair
	            tiou = iou_1d(pred.get("time"), gtj.get("time"))
	            # mean_iou_over_intersection returns 0.0 for non-dict boxes.
	            miou_inter = mean_iou_over_intersection(pred.get("boxes"), gtj.get("boxes"))
	            return 0.5 * tiou + 0.5 * miou_inter

	        # tracking: mean IoU over GT frames (missing frames = 0)
	        if ptype == "tracking":
	            pair = _load_dict_pair()
	            if pair is None:
	                return 0.0
	            pred, gtj = pair
	            pboxes = pred.get("boxes")
	            gboxes = gtj.get("boxes")
	            if not isinstance(pboxes, dict) or not isinstance(gboxes, dict):
	                return 0.0
	            return mean_iou_over_gt_frames(pboxes, gboxes)

	        # segmentation (image/video)
	        if "segmentation" in ptype:
	            pair = _load_dict_pair()
	            if pair is None:
	                return 0.0
	            pred, gtj = pair
	            iou = iou_2d(pred.get("boxes"), gtj.get("boxes"))

	            if dtype == "image":
	                # Image segmentation is scored on the box alone.
	                return iou

	            if dtype == "video":
	                # time must be numeric (strict)
	                t_pred = pred.get("time")
	                t_gt = gtj.get("time")
	                if not isinstance(t_pred, (int, float)) or not isinstance(t_gt, (int, float)):
	                    return 0.0
	                time_sim = math.exp(-abs(float(t_pred) - float(t_gt)) / 2.0)  # tau = 2s
	                # assignment_similarity_3 yields 0.0 for missing or malformed
	                # point lists, so bad points only forfeit their own share.
	                pos_sim = assignment_similarity_3(pred.get("positive_points"), gtj.get("positive_points"))
	                neg_sim = assignment_similarity_3(pred.get("negative_points"), gtj.get("negative_points"))
	                return 0.3 * iou + 0.3 * time_sim + 0.2 * pos_sim + 0.2 * neg_sim

	            return 0.0

	        if "localization" in ptype:
	            # Accept both {"boxes": [...]} and a bare [x1, y1, x2, y2] list.
	            pred = _load_json(ans)
	            gtj = _load_json(gt)

	            if isinstance(pred, list) and isinstance(gtj, list):
	                return iou_2d(pred, gtj)

	            if isinstance(pred, dict) and isinstance(gtj, dict):
	                return iou_2d(pred.get("boxes"), gtj.get("boxes"))

	            return 0.0

	        if "recognition" in ptype:
	            # Multi-label: compare as sets, so order does not matter.
	            gt_labels = {label.strip().lower() for label in gt.split(",")}
	            ans_labels = {label.strip().lower() for label in ans.split(",")}
	            return 1.0 if gt_labels == ans_labels else 0.0

	        if ptype == "surgical instrument count":
	            try:
	                return 1.0 if int(gt.strip()) == int(ans.strip()) else 0.0
	            except ValueError:
	                # Not valid integers: fall back to a string comparison.
	                return 1.0 if gt.lower().strip() == ans.lower().strip() else 0.0

	        if ptype == "critical view safety":
	            # Both sides are list literals such as ['Yes', 'Yes', 'No'].
	            try:
	                import ast
	                gt_list = ast.literal_eval(gt.strip())
	                ans_list = ast.literal_eval(ans.strip())
	            except (ValueError, SyntaxError):
	                return 0.0
	            if not isinstance(gt_list, list) or not isinstance(ans_list, list):
	                return 0.0
	            if len(gt_list) != 3 or len(ans_list) != 3:
	                return 0.0
	            correct = sum(
	                1 for g, a in zip(gt_list, ans_list)
	                if g.strip().lower() == a.strip().lower()
	            )
	            return correct / 3.0

	        # Unknown type
	        return 0.0
	    except Exception:
	        # Outer fallback: any exception is scored as 0
	        return 0.0


	# ===================== Wrapper: batch call external model for open-ended =====================
	def evaluate_open_ended_with_rm(
	    open_ended_queue: List[Dict[str, Any]],
	    results: List[Dict[str, float]],
	    format_weight: float,
	    rm_server_type: str,
	    rm_batch_size: int,
	    normalize_model_reward_by_problem_id: bool
	) -> None:
	    """
	    Score the queued open-ended samples with the external reward model (RM).

	    Does nothing when USE_MODEL_FOR_OPEN_ENDED is False or the queue is empty.
	    Otherwise the queue is sent to the RM in batches of `rm_batch_size`
	    (values below 1 are treated as 1). A batch falls back to clamped ROUGE
	    scores when the RM call raises, returns the wrong number of scores, or
	    returns a non-finite score.

	    When `normalize_model_reward_by_problem_id` is set, scores are rescaled
	    to [0, 1] within each problem_id group (z-score followed by min-max); a
	    group whose scores are all equal is set to 1.0.

	    Finally results[idx]['accuracy'] is overwritten in place with the score
	    clamped to [0, 1], and results[idx]['overall'] is recomputed as
	    (1 - format_weight) * accuracy + format_weight * format + structure_reward.

	    Each queue item is a dict with keys idx, prompt, reference, output and
	    optionally problem_id.
	    """
	    if not USE_MODEL_FOR_OPEN_ENDED or not open_ended_queue:
	        return

	    client = RewardModelClient(
	        RM_MODEL_PATH,
	        server_type=rm_server_type,
	        server_address=RM_SERVER_ADDRESS
	    )

	    # range() rejects a zero step; never let a bad setting abort scoring.
	    batch_size = max(1, int(rm_batch_size))
	    model_scores: List[float] = [0.0] * len(open_ended_queue)

	    for start in range(0, len(open_ended_queue), batch_size):
	        batch = open_ended_queue[start:start + batch_size]
	        data = [{"prompt": b["prompt"], "reference": b["reference"], "output": b["output"]} for b in batch]
	        try:
	            rewards = [float(sc) for sc in client(data)]  # expected: one score per item
	            # A short reply would leave silent zeros, and NaN would later be
	            # clamped to 1.0 by min(1.0, nan); treat both as a failed batch.
	            if len(rewards) != len(batch) or not all(math.isfinite(r) for r in rewards):
	                raise ValueError("reward model returned a wrong number of scores or a non-finite score")
	        except Exception as exc:
	            print(f"[reward] RM batch at offset {start} failed ({exc!r}); falling back to ROUGE")
	            rewards = [
	                float(max(0.0, min(1.0, compute_rouge_score(b["reference"], b["output"]))))
	                for b in batch
	            ]
	        model_scores[start:start + len(batch)] = rewards

	    if normalize_model_reward_by_problem_id:
	        groups: Dict[Any, List[int]] = {}
	        for k, b in enumerate(open_ended_queue):
	            groups.setdefault(b.get("problem_id", None), []).append(k)

	        for indices in groups.values():
	            vals = np.array([model_scores[k] for k in indices], dtype=np.float32)
	            mean, std = vals.mean(), vals.std()
	            if std == 0:
	                norm_vals = np.ones_like(vals)
	            else:
	                z = (vals - mean) / (std + 1e-6)
	                norm_vals = (z - z.min()) / (z.max() - z.min() + 1e-12)
	            for t, k in enumerate(indices):
	                model_scores[k] = float(norm_vals[t])

	    # Fill back accuracy, and recompute overall
	    for k, b in enumerate(open_ended_queue):
	        entry = results[b["idx"]]
	        entry["accuracy"] = float(max(0.0, min(1.0, model_scores[k])))
	        entry["overall"] = (
	            (1.0 - format_weight) * entry["accuracy"]
	            + format_weight * entry["format"]
	            + entry["structure_reward"]
	        )
	# ==================================================================


	# -------------------------
	# Public API
	# -------------------------
	def compute_score(
	    reward_inputs: List[Dict[str, Any]],
	    format_weight: float = 0.1,
	    # ===== Still kept as configurable parameters =====
	    rm_server_type: str = "vllm",
	    rm_batch_size: int = 64,
	    normalize_model_reward_by_problem_id: bool = True,
	) -> List[Dict[str, float]]:
	    """
	    Batch interface.
	    Each item:
	    {
	        "response": str,
	        "response_length": int,
	        "ground_truth": str,   # may also be wrapped in <answer>...</answer>; it is extracted first
	        "data_type": str,      # "image" | "video" | ...
	        "problem_type": str    # see the branches of accuracy_reward
	        # Optional additional fields:
	        # "problem": str       # used as prompt for the external RM in open-ended tasks
	        # "problem_id": Any    # grouping key for normalization
	    }
	    Returns: list of dict with keys {overall, format, accuracy, structure_reward}
	        overall = (1 - format_weight) * accuracy + format_weight * format + structure_reward
	    - format: 1.0 if <think>...</think><answer>...</answer>, otherwise 0.0
	    - structure_reward (forced to 0 whenever format is 0):
	        * spatial-temporal / tracking: 0.25 (JSON valid) + 0.25 * (key overlap ratio)
	        * other structured tasks: +0.5 if valid
	        * non-structured tasks: default +0.5

	    A sample whose processing raises gets 0.0 in all four fields. With a
	    probability of 1% per call, every sample and its scores are printed for
	    debugging.

	    Raises:
	        ValueError: if `reward_inputs` is not a list.
	    """
	    if not isinstance(reward_inputs, list):
	        raise ValueError("Please use `reward_type=batch` for this reward function.")

	    results: List[Dict[str, float]] = []
	    # Open-ended samples awaiting RM scoring: {idx, prompt, reference, output, problem_id}
	    open_ended_queue = []

	    for idx, item in enumerate(reward_inputs):
	        try:
	            # Normalize tag whitespace, e.g. "< / think >" -> "</think>".
	            # This removes whitespace around every '<', '>' and '/' in the
	            # response, not only inside the tags.
	            raw_response = item.get("response", "") or ""
	            response = re.sub(r"\s*([<>/])\s*", r"\1", raw_response)

	            data_type = item.get("data_type", "") or ""
	            problem_type = item.get("problem_type", "") or ""

	            # ground_truth may also be wrapped in <answer>...</answer>; extract it first
	            raw_gt = item.get("ground_truth", "") or ""
	            gt_extracted = extract_answer(raw_gt) or raw_gt

	            # 1) format reward: strict <think>...</think><answer>...</answer>
	            f_score = tag_format_reward(response)

	            # 2) structure reward: JSON shape required by the task type
	            ans = extract_answer(response) or ""
	            s_reward = answer_structure_bonus(ans, gt_extracted, data_type, problem_type)

	            # 3) accuracy (normalized to [0, 1])
	            if USE_MODEL_FOR_OPEN_ENDED and (problem_type or "").lower() == "open-ended":
	                # Placeholder; the external model fills the real value in later.
	                a_score = 0.0
	                open_ended_queue.append({
	                    "idx": idx,
	                    "prompt": item.get("problem", "") or "",
	                    "reference": gt_extracted or "",
	                    "output": ans or "",
	                    "problem_id": item.get("problem_id", None),
	                })
	            else:
	                a_score = accuracy_reward(response, gt_extracted, data_type, problem_type)

	            # No structure credit without the required tag format.
	            if f_score == 0:
	                s_reward = 0.0

	            overall = (1.0 - format_weight) * a_score + format_weight * f_score + s_reward

	            results.append({
	                "overall": float(overall),
	                "format": float(f_score),
	                "accuracy": float(a_score),
	                "structure_reward": float(s_reward),
	            })
	        except Exception:
	            # Fallback for the entire sample: all four fields are set to 0
	            results.append({
	                "overall": 0.0,
	                "format": 0.0,
	                "accuracy": 0.0,
	                "structure_reward": 0.0,
	            })

	    # Score queued open-ended samples with the external RM and fill results in place
	    evaluate_open_ended_with_rm(
	        open_ended_queue=open_ended_queue,
	        results=results,
	        format_weight=format_weight,
	        rm_server_type=rm_server_type,
	        rm_batch_size=rm_batch_size,
	        normalize_model_reward_by_problem_id=normalize_model_reward_by_problem_id
	    )

	    # Occasional debug dump (about 1% of calls).
	    if random.random() < 0.01:
	        for idx, item in enumerate(reward_inputs):
	            # Malformed items were already scored 0 above; they must not
	            # crash the debug output either.
	            if not isinstance(item, dict):
	                continue
	            print('type', item.get("problem_type", ""))
	            print('gt', extract_answer(item.get("ground_truth", "")))
	            print('ans', extract_answer(item.get("response", "")))
	            print({
	                "overall": results[idx]["overall"],
	                "format": results[idx]["format"],
	                "accuracy": results[idx]["accuracy"],
	                "structure_reward": results[idx]["structure_reward"],
	            })

	    return results