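"""Reward functions for OSINT adversarial self-play training.

Covers completion parsing, swarm_v2 replay validation, and the generator and
answerer GRPO reward functions, with defensive handling of malformed model
output so a single bad completion cannot crash the training loop.
"""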
from __future__ import annotations
import json
import re
from collections import Counter
from dataclasses import dataclass, field
from functools import lru_cache
from typing import Any
from osint_env.data.generator import (
build_swarm_v2_tool_trace,
emit_swarm_v2_question,
enumerate_swarm_v2_neighbors,
select_swarm_v2_answer,
trace_swarm_v2_path,
)
from osint_env.domain.models import CanonicalGraph, Edge, TaskInstance
from osint_env.env.reward import build_reward_model, compute_answer_reward
from osint_env.env.spawn_reward_hooks import parl_reward_breakdown
from osint_env.training.config import (
GeneratorRewardWeights,
SwarmV2SharedContextConfig,
SwarmV2ValidationConfig,
)
def _iter_text_fragments(value: Any) -> list[str]:
if value is None:
return []
if isinstance(value, str):
token = value.strip()
return [token] if token else []
if isinstance(value, list):
out: list[str] = []
for item in value:
out.extend(_iter_text_fragments(item))
return out
if isinstance(value, dict):
out: list[str] = []
for key in ("content", "text", "output_text", "message", "choices"):
if key in value:
out.extend(_iter_text_fragments(value.get(key)))
return out
return [str(value)]
def decode_completion_text(completion: Any) -> str:
parts = _iter_text_fragments(completion)
return "\n".join(part for part in parts if part)
def _extract_json_candidates(text: str) -> list[Any]:
candidate = str(text or "").strip()
if not candidate:
return []
out: list[Any] = []
try:
parsed = json.loads(candidate)
except json.JSONDecodeError:
parsed = None
if isinstance(parsed, dict):
out.append(parsed)
for start_idx, ch in enumerate(candidate):
if ch != "{":
continue
depth = 0
in_string = False
escape = False
for end_idx in range(start_idx, len(candidate)):
current = candidate[end_idx]
if escape:
escape = False
continue
if current == "\\":
escape = True
continue
if current == '"':
in_string = not in_string
continue
if in_string:
continue
if current == "{":
depth += 1
elif current == "}":
depth -= 1
if depth < 0:
break
if depth != 0:
continue
snippet = candidate[start_idx : end_idx + 1]
try:
parsed = json.loads(snippet)
except json.JSONDecodeError:
break
if isinstance(parsed, dict) and parsed not in out:
out.append(parsed)
break
return out
def _extract_json_blob(text: str, preferred_keys: tuple[str, ...] = ()) -> Any:
blobs = _extract_json_candidates(text)
if not blobs:
return None
if not preferred_keys:
return blobs[0]
best_blob = blobs[0]
best_score = -1
preferred = set(preferred_keys)
for blob in blobs:
if not isinstance(blob, dict):
continue
score = sum(1 for key in preferred if key in blob)
if score > best_score:
best_blob = blob
best_score = score
return best_blob
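# Selection sketch (hypothetical input): given
#   text = 'noise {"note": "x"} noise {"question": "q", "answer": "a"}'
# _extract_json_blob(text, preferred_keys=("question", "answer")) returns the
# second object because it matches two preferred keys while the first matches
# none. Without preferred_keys, the first balanced object wins.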
def normalize_answer(text: str) -> str:
value = str(text or "").strip()
value = value.strip('"').strip("'")
value = re.sub(r"\s+", " ", value)
value = value.rstrip(".\n ")
return value
def extract_answer_from_completion(completion_text: str) -> str:
blob = _extract_json_blob(completion_text, preferred_keys=("answer",))
if isinstance(blob, dict):
answer = str(blob.get("answer", "")).strip()
if answer:
return normalize_answer(answer)
match = re.search(r"answer\s*[:=]\s*(.+)", completion_text, flags=re.IGNORECASE)
if match:
return normalize_answer(match.group(1))
lines = [line.strip() for line in completion_text.splitlines() if line.strip()]
if not lines:
return ""
return normalize_answer(lines[-1])
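# Fallback ladder, illustrated (hypothetical completions):
#   '{"answer": "Acme Corp"}'          -> "Acme Corp"  (JSON blob)
#   'Reasoning...\nAnswer: Acme Corp.' -> "Acme Corp"  (regex on "answer:", trailing dot stripped)
#   'Acme Corp'                        -> "Acme Corp"  (last non-empty line)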
@dataclass(slots=True)
class SwarmReplayToolCall:
tool_name: str
args: dict[str, Any] = field(default_factory=dict)
output: dict[str, Any] = field(default_factory=dict)
@dataclass(slots=True)
class SwarmOrchestratorTelemetry:
spawn_count: int = 0
finished_subtasks: int = 0
critical_steps: int = 1
breadth: int = 0
depth: int = 0
@dataclass(slots=True)
class ReplayValidationResult:
is_valid: bool
reasons: list[str] = field(default_factory=list)
duplicate_similarity: float = 0.0
context_nodes: int = 0
context_edges: int = 0
unique_path_count: int = 0
replayed_question: str = ""
replayed_answer: str = ""
replayed_edges: list[Edge] = field(default_factory=list)
def to_dict(self) -> dict[str, Any]:
return {
"is_valid": self.is_valid,
"reasons": list(self.reasons),
"duplicate_similarity": float(self.duplicate_similarity),
"context_nodes": int(self.context_nodes),
"context_edges": int(self.context_edges),
"unique_path_count": int(self.unique_path_count),
"replayed_question": self.replayed_question,
"replayed_answer": self.replayed_answer,
"replayed_edges": [
{
"src": edge.src,
"rel": edge.rel,
"dst": edge.dst,
"confidence": float(edge.confidence),
}
for edge in self.replayed_edges
],
}
def _parse_edge_rows(value: Any, max_support_edges: int) -> list[Edge]:
if not isinstance(value, list):
return []
out: list[Edge] = []
for row in value[:max_support_edges]:
if not isinstance(row, dict):
continue
src = str(row.get("src", "")).strip()
rel = str(row.get("rel", "")).strip()
dst = str(row.get("dst", "")).strip()
if not src or not rel or not dst:
continue
try:
confidence = float(row.get("confidence", 1.0))
except (TypeError, ValueError):
confidence = 1.0
out.append(Edge(src=src, rel=rel, dst=dst, confidence=confidence))
return out
def _parse_tool_trace(value: Any) -> list[SwarmReplayToolCall]:
if not isinstance(value, list):
return []
out: list[SwarmReplayToolCall] = []
for row in value:
if not isinstance(row, dict):
continue
tool_name = str(row.get("tool_name", row.get("tool", ""))).strip()
args = row.get("args", {})
output = row.get("output", row.get("result", {}))
if not tool_name:
continue
out.append(
SwarmReplayToolCall(
tool_name=tool_name,
args=dict(args) if isinstance(args, dict) else {},
output=dict(output) if isinstance(output, dict) else {},
)
)
return out
def _parse_subagent_outputs(value: Any) -> list[str]:
if not isinstance(value, list):
return []
out: list[str] = []
for row in value:
if isinstance(row, str):
token = row.strip()
elif isinstance(row, dict):
token = str(row.get("content", row.get("summary", ""))).strip()
else:
token = str(row).strip()
if token:
out.append(token)
return out
def _coerce_int(value: Any, default: int) -> int:
"""Best-effort int coercion that NEVER raises.
Models routinely emit garbage like ``"none"``, ``"N/A"``, ``true``,
``"2 agents"`` for fields the schema requires to be integers. The
reward function runs inside the GRPO training loop, so a single
``ValueError`` here crashes the entire training job. Be defensive.
"""
if value is None:
return default
if isinstance(value, bool):
return int(value)
if isinstance(value, int):
return value
if isinstance(value, float):
if value != value or value in (float("inf"), float("-inf")):
return default
return int(value)
if isinstance(value, str):
token = value.strip()
if not token:
return default
try:
return int(token)
except ValueError:
try:
return int(float(token))
except ValueError:
match = re.search(r"[-+]?\d+(?:\.\d+)?", token)
if match:
try:
return int(float(match.group(0)))
except ValueError:
return default
return default
return default
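# Behaviour sketch for the coercion ladder above (all of these return, never raise):
#   _coerce_int(None, 3)          -> 3  (default)
#   _coerce_int(True, 0)          -> 1  (bool handled before the int check)
#   _coerce_int(float("nan"), 5)  -> 5  (NaN/inf guard)
#   _coerce_int("2 agents", 0)    -> 2  (regex digit rescue)
#   _coerce_int("N/A", 7)         -> 7  (nothing numeric; default)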
def _parse_orchestrator(value: Any) -> SwarmOrchestratorTelemetry:
if not isinstance(value, dict):
return SwarmOrchestratorTelemetry()
return SwarmOrchestratorTelemetry(
spawn_count=max(0, _coerce_int(value.get("spawn_count"), 0)),
finished_subtasks=max(0, _coerce_int(value.get("finished_subtasks"), 0)),
critical_steps=max(1, _coerce_int(value.get("critical_steps"), 1)),
breadth=max(0, _coerce_int(value.get("breadth"), 0)),
depth=max(0, _coerce_int(value.get("depth"), 0)),
)
@dataclass(slots=True)
class GeneratedTaskCandidate:
question: str
answer: str
supporting_edges: list[Edge]
task_type: str
is_valid: bool
tool_trace: list[SwarmReplayToolCall] = field(default_factory=list)
subagent_outputs: list[str] = field(default_factory=list)
canonical_edges: list[Edge] = field(default_factory=list)
canonical_nodes: list[str] = field(default_factory=list)
orchestrator: SwarmOrchestratorTelemetry = field(default_factory=SwarmOrchestratorTelemetry)
validation: dict[str, Any] = field(default_factory=dict)
def parse_generated_task_completion(completion_text: str, max_support_edges: int = 8) -> GeneratedTaskCandidate:
blob = _extract_json_blob(
completion_text,
preferred_keys=("question", "answer", "supporting_edges", "tool_trace", "canonical_graph"),
)
question = ""
answer = ""
task_type = "adversarial_trace"
supporting_edges: list[Edge] = []
tool_trace: list[SwarmReplayToolCall] = []
subagent_outputs: list[str] = []
canonical_edges: list[Edge] = []
canonical_nodes: list[str] = []
orchestrator = SwarmOrchestratorTelemetry()
validation: dict[str, Any] = {}
if isinstance(blob, dict):
question = str(blob.get("question", "")).strip()
answer = normalize_answer(str(blob.get("answer", "")).strip())
task_type = str(blob.get("task_type", "adversarial_trace")).strip() or "adversarial_trace"
supporting_edges = _parse_edge_rows(blob.get("supporting_edges", []), max_support_edges=max_support_edges)
tool_trace = _parse_tool_trace(blob.get("tool_trace", []))
subagent_outputs = _parse_subagent_outputs(blob.get("subagent_outputs", []))
orchestrator = _parse_orchestrator(blob.get("orchestrator"))
validation = dict(blob.get("validation", {})) if isinstance(blob.get("validation"), dict) else {}
canonical_graph = blob.get("canonical_graph", {})
if isinstance(canonical_graph, dict):
canonical_nodes = [
str(node_id).strip()
for node_id in canonical_graph.get("nodes", [])
if str(node_id).strip()
]
canonical_edges = _parse_edge_rows(
canonical_graph.get("edges", []),
max_support_edges=max(1, max_support_edges * 4),
)
if not question:
line_match = re.search(r"question\s*[:=]\s*(.+)", completion_text, flags=re.IGNORECASE)
if line_match:
question = line_match.group(1).strip()
if not answer:
answer = extract_answer_from_completion(completion_text)
is_valid = bool(question and answer)
return GeneratedTaskCandidate(
question=question,
answer=answer,
supporting_edges=supporting_edges,
task_type=task_type,
is_valid=is_valid,
tool_trace=tool_trace,
subagent_outputs=subagent_outputs,
canonical_edges=canonical_edges,
canonical_nodes=canonical_nodes,
orchestrator=orchestrator,
validation=validation,
)
def _token_set(text: str) -> set[str]:
return set(re.findall(r"[a-zA-Z0-9_]+", str(text).lower()))
def _jaccard_similarity(left: str, right: str) -> float:
a = _token_set(left)
b = _token_set(right)
if not a and not b:
return 1.0
if not a or not b:
return 0.0
return len(a & b) / max(1, len(a | b))
_SWARM_V2_QUESTION_RE = re.compile(
r"^If you start at (?P<start>.+?) and follow the relation path "
r"(?P<relations>.+?), which entity do you reach after (?P<hops>\d+) hops\?$"
)
def _swarm_v2_question_signature(question: str) -> tuple[str, tuple[str, ...], int] | None:
match = _SWARM_V2_QUESTION_RE.match(str(question or "").strip())
if not match:
return None
start = normalize_answer(match.group("start")).lower()
relations = tuple(
token.strip().lower()
for token in match.group("relations").split("->")
if token.strip()
)
if not start or not relations:
return None
return start, relations, int(match.group("hops"))
def _swarm_v2_question_similarity(left: str, right: str) -> float:
left_sig = _swarm_v2_question_signature(left)
right_sig = _swarm_v2_question_signature(right)
if left_sig is None or right_sig is None:
return _jaccard_similarity(left, right)
left_start, left_relations, left_hops = left_sig
right_start, right_relations, right_hops = right_sig
start_score = 1.0 if left_start == right_start else 0.0
    if left_relations == right_relations:
        path_score = 1.0
    else:
        path_score = _jaccard_similarity(
            " ".join(left_relations),
            " ".join(right_relations),
        )
hop_score = 1.0 if left_hops == right_hops else 0.0
return (0.55 * start_score) + (0.35 * path_score) + (0.10 * hop_score)
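# Worked example (hypothetical node/relation names): for two swarm_v2 questions
# sharing start "Org_A" and hop count 2, with relation paths "owns -> funds"
# versus "owns -> employs", the structural score is
#   0.55 * 1.0 (same start) + 0.35 * (1/3) (relation Jaccard) + 0.10 * 1.0 (same hops)
#   ~= 0.767
# i.e. same-start rewrites score high even when the path text barely overlaps.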
def _distinct_ngram_ratio(texts: list[str], n: int = 2) -> float:
tokens: list[str] = []
for text in texts:
tokens.extend(re.findall(r"[a-zA-Z0-9_]+", text.lower()))
if len(tokens) < n:
return 0.0 if texts else 1.0
ngrams = [tuple(tokens[idx : idx + n]) for idx in range(0, len(tokens) - n + 1)]
if not ngrams:
return 0.0
return len(set(ngrams)) / max(1, len(ngrams))
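# Worked example: _distinct_ngram_ratio(["alpha beta alpha beta"], n=2) yields
# bigrams (alpha, beta), (beta, alpha), (alpha, beta): 2 unique / 3 total = 2/3.
# Pure repetition drives the ratio toward 0; varied outputs stay near 1.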
class SwarmV2ReplayValidator:
"""Hard-gated replay validator for deterministic swarm_v2 generation."""
def __init__(
self,
graph: CanonicalGraph,
validation: SwarmV2ValidationConfig,
shared_context: SwarmV2SharedContextConfig,
seen_questions: list[str] | None = None,
):
self.graph = graph
self.validation = validation
self.shared_context = shared_context
self.seen_questions = list(seen_questions or [])
self.graph_nodes = set(graph.nodes.keys())
self.graph_edges = {(edge.src, edge.rel, edge.dst) for edge in graph.edges}
self.outgoing: dict[str, list[Edge]] = {}
for edge in graph.edges:
self.outgoing.setdefault(edge.src, []).append(edge)
def remember(self, question: str) -> None:
token = str(question).strip()
if not token:
return
self.seen_questions.append(token)
if len(self.seen_questions) > 4096:
self.seen_questions = self.seen_questions[-2048:]
def _count_matching_paths(self, start: str, relations: list[str], answer: str, limit: int = 4) -> int:
if not start or not relations:
return 0
count = 0
stack: list[tuple[str, int, tuple[str, ...]]] = [(start, 0, (start,))]
while stack:
node_id, rel_idx, seen_nodes = stack.pop()
if rel_idx >= len(relations):
if node_id == answer:
count += 1
if count >= limit:
return count
continue
relation = relations[rel_idx]
for edge in self.outgoing.get(node_id, []):
if edge.rel != relation:
continue
if edge.dst in seen_nodes:
continue
stack.append((edge.dst, rel_idx + 1, seen_nodes + (edge.dst,)))
return count
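    # Uniqueness intuition (hypothetical graph): with edges a-owns->b, a-owns->c,
    # b-funds->d, and c-funds->d, the relation path ["owns", "funds"] from "a"
    # reaches "d" via two distinct simple paths, so _count_matching_paths("a",
    # ["owns", "funds"], "d") returns 2 and the candidate fails the
    # unique-derivation gate in validate().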
def _replay_tool_trace(self, candidate: GeneratedTaskCandidate) -> tuple[list[str], list[Edge], str, str]:
reasons: list[str] = []
replayed_edges: list[Edge] = []
replayed_answer = ""
replayed_question = ""
declared_answer = ""
declared_question = ""
tool_trace = list(candidate.tool_trace)
trace_path_source: Any = candidate.supporting_edges
if not tool_trace and candidate.supporting_edges:
synthesized_trace = build_swarm_v2_tool_trace(self.graph, candidate.supporting_edges)
tool_trace = _parse_tool_trace(synthesized_trace)
for call in tool_trace:
if call.tool_name == "enumerate_neighbors":
node_id = str(call.args.get("node_id", "")).strip()
expected_edge = call.args.get("expected_edge", {})
if not node_id:
reasons.append("non_replayable_tool_calls")
continue
neighbors = enumerate_swarm_v2_neighbors(self.graph, node_id)
if not neighbors:
reasons.append("non_replayable_tool_calls")
if isinstance(expected_edge, dict):
expected_key = (
str(expected_edge.get("src", "")).strip(),
str(expected_edge.get("rel", "")).strip(),
str(expected_edge.get("dst", "")).strip(),
)
if expected_key not in {(edge.src, edge.rel, edge.dst) for edge in neighbors}:
reasons.append("non_replayable_tool_calls")
elif call.tool_name == "trace_path":
trace_path_source = call.args.get("path", trace_path_source)
replayed_edges = trace_swarm_v2_path(self.graph, trace_path_source)
if not replayed_edges:
reasons.append("non_replayable_tool_calls")
elif call.tool_name == "select_answer":
declared_answer = normalize_answer(str(call.output.get("answer", "")).strip())
elif call.tool_name == "emit_question":
declared_question = str(call.output.get("question", "")).strip()
else:
reasons.append("non_replayable_tool_calls")
if not replayed_edges:
replayed_edges = trace_swarm_v2_path(self.graph, trace_path_source)
if not replayed_edges and candidate.supporting_edges:
replayed_edges = trace_swarm_v2_path(self.graph, candidate.supporting_edges)
if not replayed_edges:
reasons.append("non_replayable_tool_calls")
return reasons, replayed_edges, replayed_answer, replayed_question
replayed_answer = select_swarm_v2_answer(replayed_edges)
replayed_question = emit_swarm_v2_question(replayed_edges)
if declared_answer and declared_answer != normalize_answer(replayed_answer):
reasons.append("non_replayable_tool_calls")
if declared_question and declared_question != replayed_question:
reasons.append("non_replayable_tool_calls")
return reasons, replayed_edges, replayed_answer, replayed_question
def validate(self, candidate: GeneratedTaskCandidate) -> ReplayValidationResult:
reasons: list[str] = []
if not candidate.question or not candidate.answer:
reasons.append("missing_question_or_answer")
if not candidate.supporting_edges:
reasons.append("malformed_support_edges")
if len(candidate.supporting_edges) > self.validation.max_support_edges:
reasons.append("context_or_support_budget_overflow")
edge_keys = [(edge.src, edge.rel, edge.dst) for edge in candidate.supporting_edges]
if len(set(edge_keys)) != len(edge_keys):
reasons.append("malformed_support_edges")
for edge in candidate.supporting_edges:
if edge.src not in self.graph_nodes or edge.dst not in self.graph_nodes:
reasons.append("unseen_nodes_or_edges")
break
if (edge.src, edge.rel, edge.dst) not in self.graph_edges:
reasons.append("unseen_nodes_or_edges")
break
replay_reasons, replayed_edges, replayed_answer, replayed_question = self._replay_tool_trace(candidate)
reasons.extend(replay_reasons)
if replayed_edges:
expected_keys = [(edge.src, edge.rel, edge.dst) for edge in replayed_edges]
if expected_keys != edge_keys:
reasons.append("non_replayable_tool_calls")
relations = [edge.rel for edge in replayed_edges]
unique_path_count = self._count_matching_paths(
start=replayed_edges[0].src,
relations=relations,
answer=replayed_answer or candidate.answer,
)
else:
unique_path_count = 0
if unique_path_count != 1:
reasons.append("non_unique_derivation_path")
if replayed_answer and normalize_answer(replayed_answer) != normalize_answer(candidate.answer):
reasons.append("non_replayable_tool_calls")
if replayed_question and replayed_question != candidate.question:
reasons.append("non_replayable_tool_calls")
if candidate.answer and normalize_answer(candidate.answer).lower() in candidate.question.lower():
reasons.append("answer_leakage")
duplicate_similarity = 0.0
if candidate.question and self.seen_questions:
duplicate_similarity = max(
_swarm_v2_question_similarity(candidate.question, seen_question)
for seen_question in self.seen_questions
)
if duplicate_similarity >= self.validation.duplicate_similarity_threshold:
reasons.append("duplicate_or_near_duplicate")
        context_nodes = len(
            {edge.src for edge in candidate.supporting_edges}
            | {edge.dst for edge in candidate.supporting_edges}
        )
context_edges = len(candidate.supporting_edges)
max_context_nodes = min(self.validation.max_context_nodes, self.shared_context.max_nodes)
max_context_edges = min(self.validation.max_context_edges, self.shared_context.max_edges)
if context_nodes > max_context_nodes or context_edges > max_context_edges:
reasons.append("context_or_support_budget_overflow")
if len(candidate.supporting_edges) > self.validation.max_path_hops:
reasons.append("context_or_support_budget_overflow")
return ReplayValidationResult(
is_valid=not reasons,
reasons=sorted(set(reasons)),
duplicate_similarity=duplicate_similarity,
context_nodes=context_nodes,
context_edges=context_edges,
unique_path_count=unique_path_count,
replayed_question=replayed_question,
replayed_answer=replayed_answer,
replayed_edges=replayed_edges,
)
class AnswererJudge:
"""Lightweight frozen answerer used to score adversarial hardness."""
def __init__(self, model_name_or_path: str, max_new_tokens: int = 48):
self.model_name_or_path = model_name_or_path
self.max_new_tokens = max_new_tokens
self._model = None
self._tokenizer = None
def _ensure_loaded(self) -> None:
if self._model is not None and self._tokenizer is not None:
return
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
if tokenizer.pad_token is None and tokenizer.eos_token is not None:
tokenizer.pad_token = tokenizer.eos_token
model_kwargs: dict[str, Any] = {}
if torch.cuda.is_available():
model_kwargs["device_map"] = "auto"
model_kwargs["torch_dtype"] = torch.bfloat16
model = AutoModelForCausalLM.from_pretrained(self.model_name_or_path, **model_kwargs)
model.eval()
self._model = model
self._tokenizer = tokenizer
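    # NOTE: ``lru_cache`` on an instance method keys the cache on
    # ``(self, question)`` and holds a strong reference to ``self`` for the
    # process lifetime; acceptable here because a single frozen judge lives
    # per training run.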
@lru_cache(maxsize=2048)
def answer(self, question: str) -> str:
self._ensure_loaded()
assert self._model is not None
assert self._tokenizer is not None
import torch
prompt = (
"You are an OSINT answering model. "
"Answer with only the final entity string.\n"
f"Question: {question}\n"
"Answer:"
)
tokenizer = self._tokenizer
model = self._model
encoded = tokenizer(prompt, return_tensors="pt")
device = next(model.parameters()).device
encoded = {k: v.to(device) for k, v in encoded.items()}
with torch.no_grad():
output = model.generate(
**encoded,
max_new_tokens=max(1, int(self.max_new_tokens)),
                # Greedy decode; passing a temperature alongside
                # do_sample=False only triggers a transformers warning,
                # so it is omitted.
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id,
)
generated = output[0][encoded["input_ids"].shape[1] :]
completion = tokenizer.decode(generated, skip_special_tokens=True)
return normalize_answer(extract_answer_from_completion(completion))
class GeneratorRewardFunction:
"""Reward for the graph/question generation swarm in adversarial self-play."""
def __init__(
self,
graph: CanonicalGraph,
answerer_judge: AnswererJudge,
weights: GeneratorRewardWeights,
max_support_edges: int = 8,
pipeline_mode: str = "legacy",
swarm_v2_validation: SwarmV2ValidationConfig | None = None,
swarm_v2_shared_context: SwarmV2SharedContextConfig | None = None,
parl_max_parallel_hint: int = 0,
):
self.graph = graph
self.answerer_judge = answerer_judge
self.weights = weights
self.max_support_edges = max_support_edges
self.pipeline_mode = str(pipeline_mode).strip().lower() or "legacy"
self.graph_nodes = set(graph.nodes.keys())
self.graph_edges = {(edge.src, edge.rel, edge.dst) for edge in graph.edges}
self._seen_questions: list[str] = []
self.swarm_v2_validation = swarm_v2_validation or SwarmV2ValidationConfig(
max_support_edges=max_support_edges
)
self.swarm_v2_shared_context = swarm_v2_shared_context or SwarmV2SharedContextConfig()
self.parl_max_parallel_hint = max(0, int(parl_max_parallel_hint or 0))
self._swarm_v2_validator = SwarmV2ReplayValidator(
graph=graph,
validation=self.swarm_v2_validation,
shared_context=self.swarm_v2_shared_context,
seen_questions=self._seen_questions,
)
self._debug_batches_seen = 0
self._debug_reason_counter: Counter[str] = Counter()
self._debug_reward_window: list[float] = []
self._debug_last_batch: dict[str, Any] = {}
@staticmethod
def _std(values: list[float]) -> float:
if len(values) <= 1:
return 0.0
mean = sum(values) / len(values)
variance = sum((value - mean) ** 2 for value in values) / len(values)
return variance ** 0.5
def _invalid_swarm_v2_reward(
self,
candidate: GeneratedTaskCandidate,
validation_result: ReplayValidationResult,
completion_text: str = "",
) -> float:
# Avoid a constant hard penalty. Keep invalid samples negative but
# graded so GRPO still gets reward variance/advantages when quality
# differs. Three layers of signal:
# (1) per-reason penalty (caps how bad it can get)
# (2) partial credit for any parseable structural element
# (3) tiny text-level signal so completely-collapsed completions
# differ from completions that at least *attempt* JSON.
reason_penalty = {
"missing_question_or_answer": 0.35,
"malformed_support_edges": 0.25,
"non_replayable_tool_calls": 0.25,
"non_unique_derivation_path": 0.20,
"unseen_nodes_or_edges": 0.25,
"answer_leakage": 0.30,
"duplicate_or_near_duplicate": 0.15,
"context_or_support_budget_overflow": 0.15,
}
penalty = 0.10
for reason in validation_result.reasons:
penalty += reason_penalty.get(reason, 0.10)
partial_credit = 0.0
if candidate.question:
partial_credit += 0.25
if candidate.answer:
partial_credit += 0.25
if candidate.supporting_edges:
partial_credit += min(0.36, 0.12 * len(candidate.supporting_edges))
if candidate.tool_trace:
partial_credit += min(0.20, 0.05 * len(candidate.tool_trace))
if candidate.subagent_outputs:
partial_credit += 0.10
if candidate.canonical_edges or candidate.canonical_nodes:
partial_credit += 0.12
text_signal = self._completion_text_signal(completion_text)
reward = partial_credit - penalty + text_signal
return float(max(-1.25, min(-0.02, reward)))
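    # Worked contrast (hedged, illustrative numbers): a near-miss duplicate with
    # a parseable question, answer, two edges, and a four-call tool trace earns
    # partial credit ~0.94 against penalty 0.10 + 0.15 = 0.25, clamping to the
    # ceiling of -0.02. A fully collapsed completion (no question/answer/edges,
    # failed replay) accrues penalty 0.10 + 0.35 + 0.25 + 0.25 + 0.20 = 1.15
    # with zero credit, landing at -1.15. That ~1.1 spread is what feeds GRPO
    # non-zero advantages even on all-invalid groups.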
@staticmethod
def _completion_text_signal(completion_text: str) -> float:
"""Small [0, 0.25] bonus that grades how 'JSON-like' a raw completion is.
Important for GRPO: if every sample in a group is unparseable garbage
but the *raw text* differs in JSON-likeness, we still produce non-zero
advantages. Without this the reward collapses to a flat floor and
``frac_reward_zero_std`` stays at 1.0 forever.
"""
if not completion_text:
return 0.0
text = completion_text.strip()
if not text:
return 0.0
signal = 0.0
# Brace cues (model is trying to emit JSON).
signal += 0.03 * min(2, text.count("{"))
signal += 0.03 * min(2, text.count("}"))
signal += 0.01 * min(4, text.count("["))
signal += 0.01 * min(4, text.count("]"))
# Schema-keyword cues. Each keyword bumps the signal a tiny amount.
cues = (
"\"question\"",
"\"answer\"",
"\"supporting_edges\"",
"\"tool_trace\"",
"\"task_type\"",
"\"orchestrator\"",
)
signal += 0.015 * sum(1 for cue in cues if cue in text)
# Diversity proxy: number of unique short tokens. Pure repetition
# collapses this; varied output keeps it nonzero. Caps very fast.
sample = text[:512]
unique_words = len(set(sample.split())) if sample else 0
signal += min(0.04, unique_words / 800.0)
# Length proxy capped — purely-empty vs anything-emitted differs.
length_bump = min(0.03, len(text) / 8000.0)
signal += length_bump
return min(0.25, signal)
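    # Worked example: for the bare-bones completion '{"question": "q", "answer": "a"}'
    # the signal is roughly
    #   0.03 ({) + 0.03 (}) + 2 * 0.015 (two schema cues) + ~0.005 (unique words)
    #   + ~0.004 (length) ~= 0.10
    # versus exactly 0.0 for an empty completion; enough to separate the two
    # within a GRPO group.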
def _validity_score(self, candidate: GeneratedTaskCandidate) -> float:
score = 0.0
if candidate.question:
score += 0.4
if candidate.answer:
score += 0.4
if len(candidate.supporting_edges) <= self.max_support_edges:
score += 0.2
return min(1.0, score)
def _consistency_score(self, candidate: GeneratedTaskCandidate) -> float:
if not candidate.question or not candidate.answer:
return 0.0
edge_consistency = 0.0
if candidate.supporting_edges:
matches = sum(
1
for edge in candidate.supporting_edges
if (edge.src, edge.rel, edge.dst) in self.graph_edges
)
edge_consistency = matches / max(1, len(candidate.supporting_edges))
answer_in_graph = 1.0 if candidate.answer in self.graph_nodes else 0.0
answer_in_edges = 1.0 if any(
candidate.answer in {edge.src, edge.dst} for edge in candidate.supporting_edges
) else 0.0
question_mentions_graph_symbol = 1.0 if any(
node_id in candidate.question for node_id in self.graph_nodes
) else 0.0
return (
0.45 * edge_consistency
+ 0.30 * max(answer_in_graph, answer_in_edges)
+ 0.25 * question_mentions_graph_symbol
)
def _diversity_score(self, question: str) -> float:
if not self._seen_questions:
return 1.0
max_similarity = max(_jaccard_similarity(question, prior) for prior in self._seen_questions)
return max(0.0, 1.0 - max_similarity)
def _hardness_score(self, candidate: GeneratedTaskCandidate) -> float:
if not candidate.is_valid:
return -1.0
predicted_answer = normalize_answer(self.answerer_judge.answer(candidate.question))
target_answer = normalize_answer(candidate.answer)
return 1.0 if predicted_answer != target_answer else -0.4
@staticmethod
def _support_path_coverage(candidate: GeneratedTaskCandidate) -> float:
if not candidate.supporting_edges:
return 0.0
keys = {(edge.src, edge.rel, edge.dst) for edge in candidate.supporting_edges}
return len(keys) / max(1, len(candidate.supporting_edges))
def _swarm_diversity_score(self, candidate: GeneratedTaskCandidate) -> float:
if not candidate.subagent_outputs:
return 0.0
distinct_ratio = _distinct_ngram_ratio(candidate.subagent_outputs, n=2)
path_coverage = self._support_path_coverage(candidate)
return max(0.0, min(1.0, (0.7 * distinct_ratio) + (0.3 * path_coverage)))
def _context_pressure_score(self, validation_result: ReplayValidationResult) -> float:
if not validation_result.is_valid:
return 0.0
node_util = validation_result.context_nodes / max(1, self.swarm_v2_shared_context.max_nodes)
edge_util = validation_result.context_edges / max(1, self.swarm_v2_shared_context.max_edges)
utilization = max(node_util, edge_util)
target = max(0.05, float(self.swarm_v2_shared_context.target_pressure))
if utilization > 1.0:
return 0.0
gap = abs(utilization - target)
return max(0.0, 1.0 - (gap / max(target, 1.0 - target)))
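    # Worked example (hypothetical config: max_nodes=10, max_edges=12,
    # target_pressure=0.5): 4 context nodes and 3 edges give utilization
    # max(0.4, 0.25) = 0.4, gap |0.4 - 0.5| = 0.1, and a score of
    # 1.0 - 0.1 / max(0.5, 0.5) = 0.8. Utilization above 1.0 scores 0.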
def _parl_scores(self, candidate: GeneratedTaskCandidate) -> tuple[float, float]:
breakdown = parl_reward_breakdown(
task_outcome_reward=0.0,
spawn_count=candidate.orchestrator.spawn_count,
finished_subtasks=candidate.orchestrator.finished_subtasks,
critical_steps=candidate.orchestrator.critical_steps,
lambda_parallel=0.15,
lambda_finish=0.20,
anneal=1.0,
breadth=candidate.orchestrator.breadth,
depth=candidate.orchestrator.depth,
max_parallel_hint=self.parl_max_parallel_hint,
)
return breakdown.parallel, breakdown.finish
def _swarm_v2_reward(
self,
candidate: GeneratedTaskCandidate,
completion_text: str = "",
) -> tuple[float, ReplayValidationResult]:
validator = self._swarm_v2_validator
validator.seen_questions = list(self._seen_questions)
validation_result = validator.validate(candidate)
if not validation_result.is_valid:
return (
self._invalid_swarm_v2_reward(candidate, validation_result, completion_text),
validation_result,
)
hardness = self._hardness_score(candidate)
swarm_diversity = self._swarm_diversity_score(candidate)
context_pressure = self._context_pressure_score(validation_result)
parl_parallel, parl_finish = self._parl_scores(candidate)
hardness_component = max(0.0, min(1.0, (hardness + 0.4) / 1.4))
consistency_component = max(
0.0,
min(
1.0,
(0.55 * context_pressure)
+ (0.25 * parl_parallel)
+ (0.20 * parl_finish),
),
)
completion_component = max(0.0, min(1.0, self._completion_text_signal(completion_text) / 0.25))
reward = (
self.weights.validity
+ (self.weights.hardness * hardness_component)
+ (self.weights.diversity * swarm_diversity)
+ (self.weights.consistency * consistency_component)
+ (0.05 * completion_component)
)
return reward, validation_result
def __call__(
self,
prompts: list[Any] | None = None,
completions: list[Any] | None = None,
**kwargs: Any,
) -> list[float]:
del prompts
        # ``completions`` is a named parameter, so it can never arrive via
        # ``kwargs``; just guard against an explicit ``None``.
        if completions is None:
            completions = []
rewards: list[float] = []
batch_reasons: Counter[str] = Counter()
valid_count = 0
for completion in completions:
try:
text = decode_completion_text(completion)
except Exception:
text = ""
try:
candidate = parse_generated_task_completion(
text, max_support_edges=self.max_support_edges
)
except Exception as exc:
# Hard guard: a single malformed completion must NEVER take
# down GRPO. Fall through to the invalid-floor with a tiny
# text signal so the group still has reward variance.
print(
f"[reward_debug][parse_error] {type(exc).__name__}: {exc}; "
f"text_head={text[:120]!r}"
)
rewards.append(
float(max(-1.8, -0.6 + self._completion_text_signal(text)))
)
batch_reasons["parse_error"] += 1
continue
if self.pipeline_mode == "swarm_v2":
try:
reward, validation_result = self._swarm_v2_reward(
candidate, completion_text=text
)
except Exception as exc:
print(
f"[reward_debug][reward_error] {type(exc).__name__}: {exc}"
)
rewards.append(
float(max(-1.8, -0.6 + self._completion_text_signal(text)))
)
batch_reasons["reward_error"] += 1
continue
rewards.append(float(max(-1.8, min(1.2, reward))))
if validation_result.is_valid and candidate.question:
valid_count += 1
self._seen_questions.append(candidate.question)
if len(self._seen_questions) > 4096:
self._seen_questions = self._seen_questions[-2048:]
else:
for reason in validation_result.reasons:
batch_reasons[reason] += 1
else:
validity = self._validity_score(candidate)
consistency = self._consistency_score(candidate)
diversity = self._diversity_score(candidate.question) if candidate.question else 0.0
hardness = self._hardness_score(candidate)
reward = (
self.weights.validity * validity
+ self.weights.hardness * hardness
+ self.weights.diversity * diversity
+ self.weights.consistency * consistency
)
rewards.append(float(max(-2.0, min(1.2, reward))))
if self.pipeline_mode != "swarm_v2" and candidate.question:
self._seen_questions.append(candidate.question)
if len(self._seen_questions) > 4096:
self._seen_questions = self._seen_questions[-2048:]
self._debug_batches_seen += 1
self._debug_reward_window.extend(rewards)
self._debug_reward_window = self._debug_reward_window[-512:]
self._debug_reason_counter.update(batch_reasons)
batch_mean = float(sum(rewards) / max(1, len(rewards)))
batch_std = float(self._std(rewards))
advantages = [float(value - batch_mean) for value in rewards]
self._debug_last_batch = {
"batch_rewards": list(rewards),
"batch_reward_mean": batch_mean,
"batch_reward_std": batch_std,
"advantage_proxy_min": min(advantages) if advantages else 0.0,
"advantage_proxy_max": max(advantages) if advantages else 0.0,
"advantage_proxy_std": float(self._std(advantages)),
"valid_count": int(valid_count),
"invalid_count": int(max(0, len(rewards) - valid_count)),
"valid_output_ratio": float(valid_count / max(1, len(rewards))),
"top_invalid_reasons": batch_reasons.most_common(5),
}
if self.pipeline_mode == "swarm_v2" and (self._debug_batches_seen % 10 == 0):
window_std = self._std(self._debug_reward_window)
print(
"[reward_debug][generator] "
f"batches={self._debug_batches_seen} "
f"window_reward_std={window_std:.6f} "
f"last_batch_valid={valid_count}/{len(rewards)} "
f"top_invalid_reasons={batch_reasons.most_common(3)}"
)
return rewards
class AnswererRewardFunction:
"""Answer-swarm reward wrapper that reuses the environment answer reward logic."""
def __init__(
self,
graph: CanonicalGraph,
pipeline_mode: str = "legacy",
parl_max_parallel_hint: int = 0,
):
self.reward_model = build_reward_model(graph)
self.pipeline_mode = str(pipeline_mode).strip().lower() or "legacy"
self.parl_max_parallel_hint = max(0, int(parl_max_parallel_hint or 0))
# Mirror GeneratorRewardFunction observability: TRL's GRPOTrainer
# already logs `rewards/AnswererRewardFunction/{mean,std}` to W&B
# at every `logging_steps`, but we ALSO publish a per-batch debug
# snapshot so the [reward_debug][last_batch] line appears in stdout
# for the answerer phase, exactly like it does for the generator.
self._debug_batches_seen = 0
self._debug_reward_window: list[float] = []
self._debug_last_batch: dict[str, Any] = {}
@staticmethod
def _std(values: list[float]) -> float:
if len(values) <= 1:
return 0.0
mean = sum(values) / len(values)
variance = sum((value - mean) ** 2 for value in values) / len(values)
return variance ** 0.5
@staticmethod
def _parse_support_edges(value: Any) -> list[Edge]:
payload = value
if isinstance(value, str):
try:
payload = json.loads(value)
except json.JSONDecodeError:
payload = []
out: list[Edge] = []
if not isinstance(payload, list):
return out
for row in payload:
if not isinstance(row, dict):
continue
src = str(row.get("src", "")).strip()
rel = str(row.get("rel", "")).strip()
dst = str(row.get("dst", "")).strip()
if not src or not rel or not dst:
continue
try:
confidence = float(row.get("confidence", 1.0))
except (TypeError, ValueError):
confidence = 1.0
out.append(Edge(src=src, rel=rel, dst=dst, confidence=confidence))
return out
@staticmethod
def _value_at(column: Any, index: int, default: Any) -> Any:
if isinstance(column, list) and index < len(column):
return column[index]
return default
@staticmethod
def _extract_predicted_edges(completion_text: str, support_edges: list[Edge]) -> list[Edge]:
blob = _extract_json_blob(completion_text)
if isinstance(blob, dict):
structured_edges = _parse_edge_rows(blob.get("supporting_edges", []), max_support_edges=len(support_edges))
if structured_edges:
return structured_edges
text = completion_text.lower()
matched: list[Edge] = []
for edge in support_edges:
if edge.src.lower() in text and edge.rel.lower() in text and edge.dst.lower() in text:
matched.append(edge)
return matched
def _extract_orchestrator_reward(self, completion_text: str, base_reward: float) -> float:
if self.pipeline_mode != "swarm_v2":
return float(base_reward)
blob = _extract_json_blob(completion_text)
        if isinstance(blob, dict):
            orchestrator = _parse_orchestrator(blob.get("orchestrator"))
        else:
            orchestrator = SwarmOrchestratorTelemetry()
breakdown = parl_reward_breakdown(
task_outcome_reward=base_reward,
spawn_count=orchestrator.spawn_count,
finished_subtasks=orchestrator.finished_subtasks,
critical_steps=orchestrator.critical_steps,
lambda_parallel=0.15,
lambda_finish=0.20,
anneal=1.0,
breadth=orchestrator.breadth,
depth=orchestrator.depth,
max_parallel_hint=self.parl_max_parallel_hint,
)
return float(breakdown.total)
def __call__(
self,
prompts: list[Any],
completions: list[Any],
answer: list[Any] | None = None,
question: list[Any] | None = None,
supporting_edges_json: list[Any] | None = None,
difficulty: list[Any] | None = None,
**kwargs: Any,
) -> list[float]:
rewards: list[float] = []
success_count = 0
graph_f1_sum = 0.0
for idx, completion in enumerate(completions):
completion_text = decode_completion_text(completion)
predicted_answer = extract_answer_from_completion(completion_text)
target_answer = normalize_answer(str(self._value_at(answer, idx, "")))
question_text = str(self._value_at(question, idx, "")).strip()
if not question_text:
question_text = str(self._value_at(prompts, idx, "")).strip()
support_payload = self._value_at(supporting_edges_json, idx, [])
support_edges = self._parse_support_edges(support_payload)
difficulty_level = str(self._value_at(difficulty, idx, "hard")).strip() or "hard"
task = TaskInstance(
task_id=f"train_task_{idx}",
task_type="adversarial_trace",
question=question_text,
answer=target_answer,
supporting_edges=support_edges,
metadata={"difficulty": difficulty_level},
)
pred_edges = self._extract_predicted_edges(completion_text, support_edges)
breakdown = compute_answer_reward(
proposed_answer=predicted_answer,
task=task,
pred_edges=pred_edges,
tool_outputs=[],
step_count=1,
model=self.reward_model,
difficulty=difficulty_level,
)
final_reward = self._extract_orchestrator_reward(completion_text, breakdown.total)
rewards.append(final_reward)
if predicted_answer and target_answer and normalize_answer(predicted_answer) == target_answer:
success_count += 1
graph_f1_sum += float(getattr(breakdown, "graph_f1", 0.0) or 0.0)
# Mirror GeneratorRewardFunction debug surface so the answerer reward
# is visible to the same downstream tooling (printed by
# `_train_grpo_phase` and forwarded to W&B by TRL).
self._debug_batches_seen += 1
self._debug_reward_window.extend(rewards)
self._debug_reward_window = self._debug_reward_window[-512:]
batch_size = max(1, len(rewards))
batch_mean = float(sum(rewards) / batch_size)
batch_std = float(self._std(rewards))
advantages = [float(value - batch_mean) for value in rewards]
self._debug_last_batch = {
"batch_rewards": list(rewards),
"batch_reward_mean": batch_mean,
"batch_reward_std": batch_std,
"advantage_proxy_min": min(advantages) if advantages else 0.0,
"advantage_proxy_max": max(advantages) if advantages else 0.0,
"advantage_proxy_std": float(self._std(advantages)),
"exact_match_count": int(success_count),
"exact_match_ratio": float(success_count / batch_size),
"avg_graph_f1": float(graph_f1_sum / batch_size),
}
if self._debug_batches_seen % 10 == 0:
window_std = self._std(self._debug_reward_window)
print(
"[reward_debug][answerer] "
f"batches={self._debug_batches_seen} "
f"window_reward_std={window_std:.6f} "
f"last_batch_mean={batch_mean:.6f} "
f"last_batch_std={batch_std:.6f} "
f"exact_match_ratio={self._debug_last_batch['exact_match_ratio']:.3f} "
f"avg_graph_f1={self._debug_last_batch['avg_graph_f1']:.3f}",
flush=True,
)
return rewards
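if __name__ == "__main__":
    # Minimal smoke check of the pure parsing helpers (hedged: a synthetic
    # completion, not real model output; no graph or model is loaded).
    demo = json.dumps(
        {
            "question": "Which entity does a reach?",
            "answer": "b",
            "supporting_edges": [{"src": "a", "rel": "owns", "dst": "b"}],
        }
    )
    candidate = parse_generated_task_completion(demo)
    assert candidate.is_valid
    assert candidate.answer == "b"
    assert len(candidate.supporting_edges) == 1
    assert _coerce_int("2 agents", 0) == 2
    print("rewards.py: parsing helpers smoke test passed")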