"""Live (mid-training) evidence callback for the trainer Space dashboard.
Bolt-on `transformers.TrainerCallback` that streams two CSVs while a
GRPO run is in flight:
* ``evidence/training_log.csv`` β€” one row per logging step, with all
numeric values surfaced by TRL's log-history dict (``loss``,
``reward``, ``reward_std``, ``kl``, ``grad_norm``, ``learning_rate``,
``epoch``, ``frac_reward_zero_std``, ``completions/mean_length``, …).
The columns are discovered dynamically from ``state.log_history`` β€”
we never hard-code TRL's exact key set because it has shifted across
releases.
* ``evidence/checkpoint_evals.csv`` β€” every ``checkpoint_eval_steps``
GRPO updates we run a short held-out rollout against
``DrugTargetEnvironment`` (using the same heuristic policy
``training/training_script.py::heuristic_next_action`` would pick if
GRPO were silent), score it through DrugEnv's existing
``RewardComputer`` / ``RuleEngine`` / ``TransitionEngine`` so the
reward column names match ``RewardBreakdown.to_dict()`` 1-to-1, and
append a row.
On train-end, the existing post-hoc ``save_training_plots`` from
``training/training_script.py`` is invoked again so the plots
``training_dashboard.png`` / ``training_loss.png`` /
``training_reward.png`` / ``training_metric.png`` are refreshed for the
final state too. We deliberately do NOT replace that post-hoc plotter
β€” both paths coexist.
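
Example (a minimal sketch of how a dashboard might tail the stream;
pandas here is purely illustrative and not a dependency of this
module)::

    import pandas as pd

    log = pd.read_csv("evidence/training_log.csv")
    log.plot(x="step", y="loss")    # loss curve so far
    evals = pd.read_csv("evidence/checkpoint_evals.csv")
    evals.tail(1)                   # latest held-out checkpoint row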
"""
from __future__ import annotations
import csv
import logging
import threading
import time
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence
from models import ActionType, DrugTargetAction, ValidationObservation
logger = logging.getLogger(__name__)
# Reward component column names mirror ``server.rewards.reward
# .RewardBreakdown.to_dict()`` so a reviewer can pivot the CSV against
# the dashboard cards without renaming columns. ``term_*`` prefixed
# duplicates are emitted by ``DrugTargetEnvironment.step`` to
# disambiguate per-step vs terminal contributions; we keep both shapes.
_REWARD_COLUMNS = (
"evidence_coverage",
"decision_accuracy",
"credit_efficiency",
"reasoning_coherence",
"novelty",
"penalty",
"shaping",
"terminal",
"total",
)
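# Hypothetical shape of a terminal step's ``metadata["reward_breakdown"]``
# (values invented; note the ``term_*`` duplicates next to the plain keys):
#   {"decision_accuracy": 1.0, "term_decision_accuracy": 1.0,
#    "evidence_coverage": 0.8, "term_evidence_coverage": 0.8, "total": 2.3}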
def _ensure_dir(p: Path) -> None:
p.parent.mkdir(parents=True, exist_ok=True)
class _AppendOnlyCsv:
"""Minimal append-only CSV with a thread-safe header rewrite when
new fields show up after the file was first opened.
GRPO's ``state.log_history`` adds keys as different code paths fire
(e.g. ``frac_reward_zero_std`` only appears once a saturated batch
is logged), so we cannot fix the header at construction time the
way ``csv.DictWriter`` would prefer. Instead we accept the union
of seen-so-far keys, rewrite the header in place when it grows,
and pad missing fields with empty strings.
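
    Example (hypothetical file ``run.csv``; shows the header growing)::

        log = _AppendOnlyCsv(Path("run.csv"), base_fields=("step",))
        log.append({"step": 1, "loss": 0.50})  # header rewritten: step,loss
        log.append({"step": 2, "kl": 0.02})    # header grows: step,loss,kl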
"""
def __init__(self, path: Path, base_fields: Sequence[str]) -> None:
self.path = Path(path)
_ensure_dir(self.path)
self._lock = threading.Lock()
self._fields: List[str] = list(base_fields)
if not self.path.exists():
with open(self.path, "w", newline="") as f:
csv.DictWriter(f, fieldnames=self._fields).writeheader()
def append(self, row: Dict[str, Any]) -> None:
with self._lock:
new_keys = [k for k in row if k not in self._fields]
if new_keys:
                # Header grew; rewrite the whole file in place.
self._fields.extend(new_keys)
existing: List[Dict[str, str]] = []
if self.path.exists():
with open(self.path, newline="") as f:
existing = list(csv.DictReader(f))
with open(self.path, "w", newline="") as f:
w = csv.DictWriter(f, fieldnames=self._fields)
w.writeheader()
for r in existing:
w.writerow({k: r.get(k, "") for k in self._fields})
with open(self.path, "a", newline="") as f:
w = csv.DictWriter(f, fieldnames=self._fields)
w.writerow({k: row.get(k, "") for k in self._fields})
def _try_callback_class():
try:
from transformers import TrainerCallback # type: ignore
return TrainerCallback
except Exception as exc: # pragma: no cover - tested only inside Spaces
logger.warning(
"transformers.TrainerCallback unavailable (%s); LiveTrainingCallback "
"will degrade to a no-op.",
exc,
)
return object # plain stand-in; trainer will simply not call our hooks
_TrainerCallback = _try_callback_class()
class LiveTrainingCallback(_TrainerCallback): # type: ignore[misc]
"""Streams ``training_log.csv`` + ``checkpoint_evals.csv`` during GRPO.
Wire it into a TRL ``GRPOTrainer`` via ``callbacks=[LiveTrainingCallback(...)]``
after building the trainer; see ``training/training_script.py`` for the
    canonical wiring. The callback sticks to DrugEnv's existing surface
    (``DrugTargetEnvironment``, ``RewardBreakdown.to_dict``,
    ``training/training_script.py``'s heuristic policy) so future tweaks
    to the env or reward shape do not require touching this file; the one
    private read is ``env._latent``, used only for ground-truth flags
    during checkpoint evals.
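
    Example (sketch; trainer construction elided, mirroring the wiring
    in ``training/training_script.py``)::

        callback = LiveTrainingCallback(
            evidence_dir="evidence",
            checkpoint_eval_steps=50,
            checkpoint_eval_episodes=4,
        )
        trainer = GRPOTrainer(..., callbacks=[callback])
        trainer.train()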
"""
def __init__(
self,
*,
evidence_dir: str = "evidence",
checkpoint_eval_steps: int = 50,
checkpoint_eval_episodes: int = 4,
difficulty: Optional[str] = None,
scenario_name: Optional[str] = None,
) -> None:
self.evidence_dir = Path(evidence_dir)
self.checkpoint_eval_steps = max(1, int(checkpoint_eval_steps))
self.checkpoint_eval_episodes = max(1, int(checkpoint_eval_episodes))
self.difficulty = difficulty
self.scenario_name = scenario_name
self.evidence_dir.mkdir(parents=True, exist_ok=True)
self._training_log = _AppendOnlyCsv(
self.evidence_dir / "training_log.csv",
base_fields=("step", "wall_time_s"),
)
self._checkpoint_log = _AppendOnlyCsv(
self.evidence_dir / "checkpoint_evals.csv",
base_fields=(
"step", "fraction_done", "episodes",
"mean_reward", "success_rate",
"decision_accuracy_rate", "evidence_coverage_rate",
"report_submitted_rate",
),
)
self._t0 = time.time()
self._last_eval_step = -1
self._latest_log_keys: List[str] = []
# ── transformers TrainerCallback hooks ──────────────────────────
def on_log(self, _args, state, control, logs=None, **kw):
logs = logs or {}
row: Dict[str, Any] = {
"step": getattr(state, "global_step", None),
"wall_time_s": round(time.time() - self._t0, 2),
}
for k, v in logs.items():
if isinstance(v, (int, float)) and not isinstance(v, bool):
# CSV-friendly: replace '/' in TRL keys (e.g. 'rewards/mean')
# with a flat dotted name so spreadsheet tools don't choke.
row[k.replace("/", ".")] = v
self._latest_log_keys = list(row.keys())
self._training_log.append(row)
return control
def on_step_end(self, _args, state, control, **kw):
step = getattr(state, "global_step", 0)
if step <= 0 or step == self._last_eval_step:
return control
if step % self.checkpoint_eval_steps != 0:
return control
self._last_eval_step = step
try:
self._run_checkpoint_eval(step, state)
except Exception as exc:
logger.warning("checkpoint eval failed at step %d: %s", step, exc)
return control
def on_train_end(self, _args, state, control, **kw):
# Best-effort: ask DrugEnv's existing post-hoc plotter to refresh
# the static PNG dashboard. Imported lazily so import failures do
# not break the in-flight run.
try:
from training.training_script import save_training_plots
save_training_plots(
getattr(state, "log_history", []) or [],
str(self.evidence_dir),
)
except Exception as exc: # pragma: no cover - plotting is optional
logger.warning("post-hoc plot refresh failed: %s", exc)
return control
# ── checkpoint-eval rollout ─────────────────────────────────────
def _run_checkpoint_eval(self, step: int, state) -> None:
"""Run a tiny held-out rollout using the heuristic policy and
score the final episode through DrugEnv's RewardBreakdown."""
from server.hackathon_environment import DrugTargetEnvironment, MAX_STEPS
from training.training_script import (
HEURISTIC_SEQUENCE,
build_drug_target_action,
heuristic_next_action,
)
episodes_summary: List[Dict[str, Any]] = []
held_out_seed_base = 900_000
for i in range(self.checkpoint_eval_episodes):
env = DrugTargetEnvironment(
scenario_name=self.scenario_name,
domain_randomise=False,
)
obs: ValidationObservation = env.reset(seed=held_out_seed_base + i)
history: List[ActionType] = []
cumulative = 0.0
steps = 0
while not obs.done and steps < MAX_STEPS:
next_at = heuristic_next_action(history, steps)
action: DrugTargetAction = build_drug_target_action(next_at, obs)
obs = env.step(action)
history.append(action.action_type)
cumulative += float(obs.reward or 0.0)
steps += 1
            # Pull ground-truth flags off the latent; the heuristic
# only "succeeds" if it ended in a submitted report whose
# decision matched the hidden ``correct_decision``.
latent = env._latent
correct = (
latent is not None
and latent.progress.report_submitted
and any(
h == ActionType.SUBMIT_VALIDATION_REPORT for h in history
)
)
terminal_breakdown = (obs.metadata or {}).get("reward_breakdown", {}) or {}
            # Prefer the ``term_*`` key whenever it is present (even at 0.0);
            # plain ``or``-chaining would let a legitimate zero fall through
            # to the per-step column.
            evidence_cov = float(
                terminal_breakdown.get(
                    "term_evidence_coverage",
                    terminal_breakdown.get("evidence_coverage", 0.0),
                )
                or 0.0
            )
            decision_acc = float(
                terminal_breakdown.get(
                    "term_decision_accuracy",
                    terminal_breakdown.get("decision_accuracy", 0.0),
                )
                or 0.0
            )
episodes_summary.append({
"reward": cumulative,
"decision_accuracy": decision_acc,
"evidence_coverage": evidence_cov,
"submitted": bool(latent and latent.progress.report_submitted),
"correct": bool(correct and decision_acc >= 0.5),
})
n = len(episodes_summary)
if n == 0:
return
mean_reward = sum(e["reward"] for e in episodes_summary) / n
success_rate = sum(1 for e in episodes_summary if e["correct"]) / n
decision_rate = sum(e["decision_accuracy"] for e in episodes_summary) / n
coverage_rate = sum(e["evidence_coverage"] for e in episodes_summary) / n
submitted_rate = sum(1 for e in episodes_summary if e["submitted"]) / n
max_steps = getattr(state, "max_steps", None) or step
self._checkpoint_log.append({
"step": step,
"fraction_done": round(step / max(max_steps, 1), 4),
"episodes": n,
"mean_reward": round(mean_reward, 4),
"success_rate": round(success_rate, 4),
"decision_accuracy_rate": round(decision_rate, 4),
"evidence_coverage_rate": round(coverage_rate, 4),
"report_submitted_rate": round(submitted_rate, 4),
})
logger.info(
"[checkpoint-eval step=%d] reward=%.3f success=%.2f decision=%.2f "
"coverage=%.2f submitted=%.2f (heuristic policy, n=%d)",
step, mean_reward, success_rate, decision_rate,
coverage_rate, submitted_rate, n,
)
__all__ = ["LiveTrainingCallback"]