# AxiomForgeAI — src/rl/value_network.py
# (Hugging Face Space deployment metadata, preserved as comments so the
#  module parses: uploaded by jampuramprem, "Initial Space deployment",
#  commit ec4ae03.)
"""
Value Network (Critic) for PPO.
ValueHead wraps a frozen copy of the base language model backbone and
appends a small MLP to regress a scalar value V(s_t) ∈ ℝ.
Design notes
------------
- The backbone is loaded once with bfloat16 to fit on GPU.
- Only the MLP head (value_head) is updated during training; the
backbone can optionally be unfrozen for fine-grained critic learning.
- The forward pass returns a 1-D tensor of shape (batch_size,) so the
caller can do .item() for single inputs.
"""
from __future__ import annotations
import logging
from typing import Any, Optional
import torch
import torch.nn as nn
from transformers import AutoConfig, AutoModel
from src.utils.attn_backend import select_attn_implementation
logger = logging.getLogger(__name__)
class ValueHead(nn.Module):
    """
    Critic network V_φ(s).

    Architecture
    ------------
    backbone (LM encoder, frozen by default)
        ↓ last-token hidden state [hidden_size]
    Linear(hidden_size, 256) + ReLU
        ↓
    Linear(256, 1)
        ↓ squeeze → scalar V(s)

    Args:
        base_model_path  : HuggingFace model id or local checkpoint path.
        freeze_backbone  : If True, backbone weights are not updated.
                           Defaults to True (only head is trained).
        hidden_size      : Override backbone hidden size (auto-detected
                           from config when None).
        model_device_map : Forwarded to ``from_pretrained`` as
                           ``device_map`` (default "auto" lets accelerate
                           place the weights).
        max_memory       : Optional per-device memory budget dict,
                           forwarded to ``from_pretrained`` so accelerate
                           can respect it during placement.
    """

    def __init__(
        self,
        base_model_path: str,
        freeze_backbone: bool = True,
        hidden_size: Optional[int] = None,
        model_device_map: Optional[Any] = "auto",
        max_memory: Optional[dict] = None,
    ) -> None:
        super().__init__()
        logger.info("Loading ValueHead backbone from %s", base_model_path)
        config = AutoConfig.from_pretrained(
            base_model_path, trust_remote_code=True
        )
        # Auto-detect the hidden width from the config unless the caller
        # overrides it explicitly.
        h = hidden_size or config.hidden_size
        # bfloat16 keeps the (large) backbone within GPU memory; device
        # placement is delegated to accelerate via `device_map`.
        load_kwargs = {
            "torch_dtype": torch.bfloat16,
            "device_map": model_device_map,
            "low_cpu_mem_usage": True,
            "trust_remote_code": True,
            "attn_implementation": select_attn_implementation(),
        }
        # BUGFIX: `max_memory` was accepted but silently ignored — forward
        # it so callers can actually cap per-device memory usage.
        if max_memory is not None:
            load_kwargs["max_memory"] = max_memory
        self.backbone = AutoModel.from_pretrained(
            base_model_path,
            **load_kwargs,
        )
        if freeze_backbone:
            for param in self.backbone.parameters():
                param.requires_grad_(False)
            logger.info("Backbone frozen; only ValueHead MLP will be trained.")
        # Small trainable regression head: hidden → 256 → 1 scalar.
        # Created in default fp32; forward() casts gathered hidden states
        # to the head's weight dtype, so a bf16 backbone + fp32 head works.
        self.value_head = nn.Sequential(
            nn.Linear(h, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
        )

    # ------------------------------------------------------------------
    # Forward
    # ------------------------------------------------------------------
    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """
        Compute V(s) for a batch of states.

        Args:
            input_ids      : [batch, seq_len]
            attention_mask : [batch, seq_len] (ones if None)

        Returns:
            values : [batch] — scalar value estimate per sequence
        """
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)
        outputs = self.backbone(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        # Last *non-pad* token (right-padded batches: last valid index per row)
        last_hidden = outputs.last_hidden_state  # [B, T, H]
        last_idx = attention_mask.long().sum(dim=1) - 1
        # Guard against an all-zero mask row producing index -1.
        last_idx = last_idx.clamp(min=0)
        b = torch.arange(last_hidden.size(0), device=last_hidden.device)
        # Cast to the head's weight dtype so a bf16 backbone feeds an
        # fp32 head without a dtype mismatch.
        cls_hidden = last_hidden[b, last_idx].to(self.value_head[0].weight.dtype)
        values = self.value_head(cls_hidden).squeeze(-1)  # [B]
        return values

    @torch.no_grad()
    def values_at_positions(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """
        Compute V(s_t) for many states in a SINGLE backbone forward pass.

        The naive rollout loop calls ``self.value(...)`` once per generated
        token, which does one full backbone forward over the growing
        sequence each step — that's O(T^2) work for T tokens. This helper
        lets the caller run the backbone exactly once on the full
        trajectory and then pluck hidden states at the positions that
        correspond to each state s_t.

        For a trajectory with prompt length P and T generated tokens,
        state s_t (= prompt + generated[:t], t=0..T-1) is a "last token"
        at position P + t - 1 in the full sequence, so callers pass
        ``positions = torch.arange(P - 1, P + T - 1)``.

        Args:
            input_ids:
                [1, L] full trajectory (prompt + generated). A single
                un-padded sequence — callers that need batched different-
                length trajectories should loop over them (cheap because
                each call is O(L), not O(L^2)).
            positions:
                [N] long tensor of indices into the L-axis. Hidden states
                at these positions will be fed through the value MLP.
            attention_mask:
                Optional [1, L] mask. Defaults to all-ones.

        Returns:
            values: [N] scalar value estimates, one per requested position,
                on the same device as ``input_ids`` and already in float32
                (so callers can safely ``.tolist()`` them for the buffer).
        """
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)
        outputs = self.backbone(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        hidden = outputs.last_hidden_state  # [1, L, H]
        positions = positions.to(device=hidden.device, dtype=torch.long)
        # Clamp just in case the caller requests an out-of-range position
        # (e.g. T=0 edge cases). clamp is a no-op for valid indices.
        positions = positions.clamp(min=0, max=hidden.size(1) - 1)
        # Gather → [N, H]. Cast to the value_head's weight dtype so
        # bf16 backbone + fp32 head works regardless of how torch
        # autocast is configured on the caller side.
        gathered = hidden[0, positions].to(self.value_head[0].weight.dtype)
        values = self.value_head(gathered).squeeze(-1).float()  # [N]
        return values