| """ |
| TD3B Finetuning Loop |
Extends TR2-D2 finetuning with a contrastive loss over directional embeddings, directional rewards, and KL regularization against a frozen reference model.
| """ |
|
|
| import numpy as np |
| import torch |
| import wandb |
| import os |
| from finetune_utils import loss_wdce |
| from .td3b_losses import TD3BTotalLoss, extract_embeddings_from_mdlm |
| from tqdm import tqdm |
| import pandas as pd |
| from plotting import plot_data_with_distribution_seaborn, plot_data |
|
|
|
|
| def td3b_finetune( |
| args, |
| cfg, |
| policy_model, |
| reward_model, |
| mcts=None, |
| pretrained=None, |
| filename=None, |
| prot_name=None, |
| eps=1e-5, |
| contrastive_weight=0.1, |
| contrastive_margin=1.0, |
| contrastive_type='margin', |
| embedding_pool_method='mean', |
| kl_beta=0.1 |
| ): |
| """ |
| TD3B finetuning with combined WDCE + contrastive loss + KL regularization. |
| |
| Args: |
| args: Configuration arguments |
| cfg: Hydra config |
| policy_model: Policy model (MDLM) |
| reward_model: Reward scoring functions (TD3BRewardFunction) |
| mcts: TD3B_MCTS instance |
| pretrained: Pretrained model (for no-MCTS mode) |
| filename: Output filename |
| prot_name: Target protein name |
eps: Small epsilon used as the terminal time of the diffusion schedule (keeps t away from 0)
| contrastive_weight: λ for contrastive loss |
| contrastive_margin: Margin for margin-based contrastive loss |
| contrastive_type: 'margin' or 'infonce' |
| embedding_pool_method: 'mean', 'max', or 'cls' |
| kl_beta: β coefficient for KL divergence regularization |
| Returns: |
| batch_losses: List of training losses |
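
Example (illustrative; the args/cfg objects and model instances come from the
surrounding training scripts, so the names below are assumptions):

    losses = td3b_finetune(
        args, cfg,
        policy_model=policy_model,
        reward_model=reward_model,
        mcts=mcts,
        filename="run0",
        prot_name="target",
        contrastive_weight=0.1,
        kl_beta=0.1,
    )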
| """ |
| base_path = args.base_path |
| dt = (1 - eps) / args.total_num_steps |
|
|
| if args.no_mcts: |
assert pretrained is not None, "a pretrained model is required when args.no_mcts is set"
| else: |
assert mcts is not None, "an MCTS instance is required when MCTS search is enabled (args.no_mcts is False)"
|
|
| |
| |
| |
| print("[TD3B] Creating reference model for KL regularization...") |
|
|
| |
| from diffusion import Diffusion |
|
|
| |
| reference_model = Diffusion( |
| config=policy_model.config, |
| tokenizer=policy_model.tokenizer, |
| mode="eval", |
| device=policy_model.device if hasattr(policy_model, 'device') else args.device |
| ) |
|
|
| |
| device = policy_model.device if hasattr(policy_model, 'device') else args.device |
| if device is None: |
| device = next(policy_model.parameters()).device |
|
|
| |
| |
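# Clone each tensor so the reference model gets independent copies of the weights rather
# than views into the policy parameters (the assertion below re-checks this).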
| state_dict_copy = { |
| key: value.clone().detach() |
| for key, value in policy_model.state_dict().items() |
| } |
| reference_model.load_state_dict(state_dict_copy) |
|
|
| |
| reference_model = reference_model.to(device) |
|
|
| |
| reference_model.eval() |
| for param in reference_model.parameters(): |
| param.requires_grad = False |
|
|
| print(f"[TD3B] Reference model frozen with {sum(p.numel() for p in reference_model.parameters())} parameters") |
| print(f"[TD3B] Reference model on device: {device}") |
|
|
| |
| policy_params = {id(p) for p in policy_model.parameters()} |
| ref_params = {id(p) for p in reference_model.parameters()} |
| assert len(policy_params.intersection(ref_params)) == 0, \ |
| "ERROR: Reference model shares parameters with policy model!" |
| print("[TD3B] ✓ Verified: No parameter sharing between policy and reference model") |
|
|
| |
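# Combined objective (see td3b_losses.TD3BTotalLoss for the exact formulation):
# roughly WDCE + contrastive_weight * contrastive + kl_beta * KL, matching the terms logged below.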
| td3b_loss_fn = TD3BTotalLoss( |
| contrastive_weight=contrastive_weight, |
| contrastive_margin=contrastive_margin, |
| contrastive_type=contrastive_type, |
| kl_beta=kl_beta, |
| reference_model=reference_model |
| ) |
|
|
| |
| policy_model.train() |
| torch.set_grad_enabled(True) |
| optim = torch.optim.AdamW(policy_model.parameters(), lr=args.learning_rate) |
|
|
| |
| batch_losses = [] |
| batch_wdce_losses = [] |
| batch_contrastive_losses = [] |
| batch_kl_losses = [] |
|
|
| |
| x_saved, log_rnd_saved, final_rewards_saved = None, None, None |
| directional_labels_saved, confidences_saved = None, None |
|
|
| |
| valid_fraction_log = [] |
| affinity_log = [] |
| gated_reward_log = [] |
| confidence_log = [] |
| direction_prediction_log = [] |
| consistency_reward_log = [] |
|
|
| |
| pbar = tqdm(range(args.num_epochs)) |
|
|
| for epoch in pbar: |
| rewards = [] |
| losses = [] |
|
|
| policy_model.train() |
|
|
| with torch.no_grad(): |
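# Search / sampling phase (no gradients): fresh trajectories are drawn every
# resample_every_n_step epochs and cached; intermediate epochs reuse the cached batch.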
| if x_saved is None or epoch % args.resample_every_n_step == 0: |
| |
| if args.no_mcts: |
| |
| x_final, log_rnd, final_rewards = policy_model.sample_finetuned_with_rnd( |
| args, reward_model, pretrained |
| ) |
| directional_labels = torch.zeros(x_final.size(0), dtype=torch.float32) |
| confidences = torch.ones(x_final.size(0), dtype=torch.float32) |
| else: |
| |
| |
| if hasattr(args, 'target_direction') and args.target_direction == 'both': |
| print(f"[Dual-direction] Epoch {epoch}: Sampling BOTH agonist and antagonist binders") |
|
|
| |
| reward_model.target_direction = 1.0 |
| if epoch % args.reset_every_n_step == 0: |
| results_agonist = mcts.forward(resetTree=True) |
| else: |
| results_agonist = mcts.forward(resetTree=False) |
|
|
| |
| reward_model.target_direction = -1.0 |
| |
| results_antagonist = mcts.forward(resetTree=False) |
|
|
| |
| if len(results_agonist) == 7 and len(results_antagonist) == 7: |
| x_agonist, log_rnd_agonist, rewards_agonist, _, _, labels_agonist, conf_agonist = results_agonist |
| x_antagonist, log_rnd_antagonist, rewards_antagonist, _, _, labels_antagonist, conf_antagonist = results_antagonist |
|
|
| |
# Force the labels to match the direction each half of the batch was sampled under.
labels_agonist = torch.full((x_agonist.size(0),), 1.0, dtype=torch.float32)
labels_antagonist = torch.full((x_antagonist.size(0),), -1.0, dtype=torch.float32)
|
|
| |
| x_final = torch.cat([x_agonist, x_antagonist], dim=0) |
| log_rnd = torch.cat([log_rnd_agonist, log_rnd_antagonist], dim=0) |
| final_rewards = np.concatenate([rewards_agonist, rewards_antagonist], axis=0) |
| directional_labels = torch.cat([labels_agonist, labels_antagonist], dim=0) |
| confidences = torch.cat([ |
| conf_agonist if isinstance(conf_agonist, torch.Tensor) else torch.tensor(conf_agonist), |
| conf_antagonist if isinstance(conf_antagonist, torch.Tensor) else torch.tensor(conf_antagonist) |
| ], dim=0) |
|
|
| print(f" → Combined batch: {x_agonist.size(0)} agonists + {x_antagonist.size(0)} antagonists = {x_final.size(0)} total") |
| print(f" → Directional labels: {torch.unique(directional_labels).tolist()} (DIVERSITY CONFIRMED!)") |
| else: |
| raise ValueError("Dual-direction mode requires 7-value return from MCTS") |
| else: |
| |
| if epoch % args.reset_every_n_step == 0: |
| results = mcts.forward(resetTree=True) |
| else: |
| results = mcts.forward(resetTree=False) |
|
|
| |
| if len(results) == 7: |
| x_final, log_rnd, final_rewards, score_vectors, sequences, directional_labels, confidences = results |
| |
| if not isinstance(directional_labels, torch.Tensor): |
| directional_labels = torch.tensor(directional_labels, dtype=torch.float32) |
| if not isinstance(confidences, torch.Tensor): |
| confidences = torch.tensor(confidences, dtype=torch.float32) |
| else: |
| |
| x_final, log_rnd, final_rewards, score_vectors, sequences = results |
| directional_labels = torch.zeros(x_final.size(0), dtype=torch.float32) |
| confidences = torch.ones(x_final.size(0), dtype=torch.float32) |
|
|
| |
| x_saved = x_final |
| log_rnd_saved = log_rnd |
| final_rewards_saved = final_rewards |
| directional_labels_saved = directional_labels |
| confidences_saved = confidences |
| else: |
| |
| x_final = x_saved |
| log_rnd = log_rnd_saved |
| final_rewards = final_rewards_saved |
| directional_labels = directional_labels_saved |
| confidences = confidences_saved |
|
|
| |
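# WDCE term from TR2-D2: denoising cross-entropy over the searched sequences,
# re-weighted by the per-trajectory log weights in log_rnd.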
| wdce_loss = loss_wdce( |
| policy_model, |
| log_rnd, |
| x_final, |
| num_replicates=args.wdce_num_replicates, |
| centering=args.centering |
| ) |
|
|
| |
| |
| mask_index = policy_model.mask_index |
| device = x_final.device |
|
|
| |
# Sample a per-sequence masking level lamda and the matching noise scale
# sigma = -log(1 - (1 - eps) * lamda) expected by the KL term.
lamda = torch.rand(x_final.shape[0], device=device)
sigma_kl = -torch.log1p(-(1 - eps) * lamda)

# Re-mask x_final at that level; the KL loss compares policy and frozen
# reference predictions on this perturbed batch.
masked_index = torch.rand(*x_final.shape, device=device) < lamda[..., None]
perturbed_batch = torch.where(masked_index, mask_index, x_final)
attn_mask_kl = torch.ones_like(perturbed_batch)
|
|
| |
| kl_loss = td3b_loss_fn.compute_kl_loss( |
| policy_model, |
| perturbed_batch, |
| attn_mask_kl, |
| sigma_kl |
| ) |
|
|
| |
| |
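# The contrastive term is only meaningful when the batch contains more than one
# directional label; otherwise fall back to WDCE + KL below.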
| if directional_labels is not None and len(torch.unique(directional_labels)) > 1: |
| |
| device = policy_model.backbone.device if hasattr(policy_model.backbone, 'device') else x_final.device |
|
|
| embeddings = extract_embeddings_from_mdlm( |
| policy_model, |
| x_final.to(device), |
| pool_method=embedding_pool_method |
| ) |
|
|
| |
| directional_labels = directional_labels.to(embeddings.device) |
|
|
| |
# Verbose loss diagnostics during the first few epochs, or whenever the contrastive loss appears to have collapsed to ~0.
debug_mode = bool((epoch < 3) or (batch_contrastive_losses and batch_contrastive_losses[-1] < 1e-6))
|
|
| |
| total_loss, loss_dict = td3b_loss_fn.compute_loss( |
| wdce_loss, |
| embeddings, |
| directional_labels, |
| kl_loss=kl_loss, |
| debug=debug_mode |
| ) |
| else: |
| |
| print(f"[WARNING] Epoch {epoch}: No directional diversity! Skipping contrastive loss.") |
| print(f" Labels: {directional_labels.cpu().tolist() if directional_labels is not None else 'None'}") |
| total_loss = wdce_loss + td3b_loss_fn.kl_beta * kl_loss |
| loss_dict = { |
| 'total_loss': total_loss.item(), |
| 'wdce_loss': wdce_loss.item(), |
| 'contrastive_loss': 0.0, |
| 'kl_loss': kl_loss.item() |
| } |
|
|
| |
| total_loss.backward() |
|
|
| |
| if args.grad_clip: |
| torch.nn.utils.clip_grad_norm_(policy_model.parameters(), args.gradnorm_clip) |
|
|
| optim.step() |
| optim.zero_grad() |
|
|
| pbar.set_postfix( |
| total_loss=loss_dict['total_loss'], |
| wdce=loss_dict['wdce_loss'], |
| ctr=loss_dict['contrastive_loss'] |
| ) |
|
|
| |
| x_eval, eval_metrics = policy_model.sample_finetuned_td3b( |
| args, |
| reward_model, |
| batch_size=50, |
| dataframe=False |
| ) |
|
|
| |
| affinity = eval_metrics.get('affinity', [0]) |
| gated_reward = eval_metrics.get('gated_reward', [0]) |
| confidence = eval_metrics.get('confidence', [1]) |
| valid_fraction = eval_metrics.get('valid_fraction', 0) |
|
|
| |
| direction_predictions = eval_metrics.get('direction_predictions', [0.5]) |
|
|
| |
| |
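# Directional consistency reward r = d_star * (f_phi - 0.5): positive when the direction
# oracle's prediction f_phi leans toward the target direction d_star in {+1, -1}.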
| d_star = reward_model.target_direction |
| consistency_rewards = [d_star * (f_phi - 0.5) for f_phi in direction_predictions] |
|
|
| |
| affinity_log.append(affinity) |
| gated_reward_log.append(gated_reward) |
| confidence_log.append(confidence) |
| valid_fraction_log.append(valid_fraction) |
| direction_prediction_log.append(direction_predictions) |
| consistency_reward_log.append(consistency_rewards) |
|
|
| batch_losses.append(loss_dict['total_loss']) |
| batch_wdce_losses.append(loss_dict['wdce_loss']) |
| batch_contrastive_losses.append(loss_dict['contrastive_loss']) |
| batch_kl_losses.append(loss_dict.get('kl_loss', 0.0)) |
|
|
| |
| if args.no_mcts: |
| mean_reward_search = final_rewards.mean().item() |
| min_reward_search = final_rewards.min().item() |
| max_reward_search = final_rewards.max().item() |
| median_reward_search = final_rewards.median().item() |
| else: |
| mean_reward_search = np.mean(final_rewards) |
| min_reward_search = np.min(final_rewards) |
| max_reward_search = np.max(final_rewards) |
| median_reward_search = np.median(final_rewards) |
|
|
| |
| mean_direction = np.mean(direction_predictions) if len(direction_predictions) > 0 else 0.5 |
| std_direction = np.std(direction_predictions) if len(direction_predictions) > 0 else 0.0 |
| mean_consistency = np.mean(consistency_rewards) if len(consistency_rewards) > 0 else 0.0 |
| std_consistency = np.std(consistency_rewards) if len(consistency_rewards) > 0 else 0.0 |
|
|
| print( |
| f"epoch {epoch} | " |
| f"affinity {np.mean(affinity):.4f} | " |
| f"gated_reward {np.mean(gated_reward):.4f} | " |
| f"confidence {np.mean(confidence):.4f} | " |
| f"valid_frac {valid_fraction:.4f} | " |
| f"direction_oracle {mean_direction:.4f}±{std_direction:.4f} | " |
| f"consistency_reward {mean_consistency:.4f}±{std_consistency:.4f} | " |
| f"total_loss {loss_dict['total_loss']:.4f} | " |
| f"wdce_loss {loss_dict['wdce_loss']:.4f} | " |
| f"contrastive_loss {loss_dict['contrastive_loss']:.4f} | " |
| f"kl_loss {loss_dict.get('kl_loss', 0.0):.4f}" |
| ) |
|
|
| |
| wandb.log({ |
| "epoch": epoch, |
| "affinity": np.mean(affinity), |
| "gated_reward": np.mean(gated_reward), |
| "confidence": np.mean(confidence), |
| "valid_fraction": valid_fraction, |
| "direction_oracle/mean": mean_direction, |
| "direction_oracle/std": std_direction, |
| "consistency_reward/mean": mean_consistency, |
| "consistency_reward/std": std_consistency, |
| "total_loss": loss_dict['total_loss'], |
| "wdce_loss": loss_dict['wdce_loss'], |
| "contrastive_loss": loss_dict['contrastive_loss'], |
| "kl_loss": loss_dict.get('kl_loss', 0.0), |
| "mean_reward_search": mean_reward_search, |
| "min_reward_search": min_reward_search, |
| "max_reward_search": max_reward_search, |
| "median_reward_search": median_reward_search |
| }) |
|
|
| |
| if (epoch + 1) % args.save_every_n_epochs == 0: |
| model_path = os.path.join(args.save_path, f'model_{epoch}.ckpt') |
| torch.save(policy_model.state_dict(), model_path) |
| print(f"model saved at epoch {epoch}") |
|
|
| |
|
|
| wandb.finish() |
|
|
| |
plot_path = f'{base_path}/TR2-D2/tr2d2-pep/results/{args.run_name}'
os.makedirs(plot_path, exist_ok=True)
output_log_path = f'{plot_path}/log_{filename}.csv'
| save_td3b_logs_to_file( |
| valid_fraction_log, |
| affinity_log, |
| gated_reward_log, |
| confidence_log, |
| direction_prediction_log, |
| consistency_reward_log, |
| output_log_path |
| ) |
|
|
| plot_data(valid_fraction_log, |
| save_path=f'{base_path}/TR2-D2/tr2d2-pep/results/{args.run_name}/valid_{filename}.png') |
|
|
| plot_data_with_distribution_seaborn( |
| log1=affinity_log, |
| save_path=f'{base_path}/TR2-D2/tr2d2-pep/results/{args.run_name}/affinity_{filename}.png', |
| label1=f"Average Affinity to {prot_name}", |
| title=f"Average Affinity to {prot_name} Over Iterations" |
| ) |
|
|
| plot_data_with_distribution_seaborn( |
| log1=gated_reward_log, |
| save_path=f'{base_path}/TR2-D2/tr2d2-pep/results/{args.run_name}/gated_reward_{filename}.png', |
| label1="Average Gated Reward", |
| title="Average Gated Reward Over Iterations" |
| ) |
|
|
| plot_data_with_distribution_seaborn( |
| log1=confidence_log, |
| save_path=f'{base_path}/TR2-D2/tr2d2-pep/results/{args.run_name}/confidence_{filename}.png', |
| label1="Average Confidence", |
| title="Average Confidence Over Iterations" |
| ) |
|
|
| |
| x_eval, eval_metrics, df = policy_model.sample_finetuned_td3b( |
| args, |
| reward_model, |
| batch_size=200, |
| dataframe=True |
| ) |
| df.to_csv(f'{base_path}/TR2-D2/tr2d2-pep/results/{args.run_name}/{prot_name}_generation_results.csv', index=False) |
|
|
| return batch_losses |
|
|
|
|
| def save_td3b_logs_to_file(valid_fraction_log, affinity_log, gated_reward_log, confidence_log, |
| direction_prediction_log, consistency_reward_log, output_path): |
| """ |
| Saves TD3B-specific logs to a CSV file. |
| |
| Parameters: |
| valid_fraction_log (list): Log of valid fractions over iterations. |
| affinity_log (list): Log of binding affinity over iterations. |
| gated_reward_log (list): Log of gated rewards over iterations. |
| confidence_log (list): Log of confidence scores over iterations. |
| direction_prediction_log (list): Log of direction oracle predictions over iterations. |
| consistency_reward_log (list): Log of consistency rewards over iterations. |
| output_path (str): Path to save the log CSV file. |
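
Note:
Each per-iteration affinity/reward/confidence/direction entry may itself be an
array (one value per sampled sequence), so those CSV cells hold list-like values.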
| """ |
| os.makedirs(os.path.dirname(output_path), exist_ok=True) |
|
|
| |
| log_data = { |
| "Iteration": list(range(1, len(valid_fraction_log) + 1)), |
| "Valid Fraction": valid_fraction_log, |
| "Binding Affinity": affinity_log, |
| "Gated Reward": gated_reward_log, |
| "Confidence": confidence_log, |
| "Direction Oracle": direction_prediction_log, |
| "Consistency Reward": consistency_reward_log |
| } |
|
|
| df = pd.DataFrame(log_data) |
|
|
| |
| df.to_csv(output_path, index=False) |
| print(f"Logs saved to {output_path}") |
|
|
|
|
| |
| def add_td3b_sampling_to_model(model): |
| """ |
| Adds TD3B-specific sampling method to the model. |
| This is a helper function to extend the existing model. |
| """ |
| def sample_finetuned_td3b(self, args, reward_model, batch_size=50, dataframe=False): |
| """ |
| TD3B-specific sampling that returns directional metrics. |
| """ |
| self.backbone.eval() |
| self.noise.eval() |
|
|
| if batch_size is None: |
| batch_size = args.batch_size |
|
|
| eps = getattr(args, "sampling_eps", 1e-5) |
| num_steps = args.total_num_steps |
| x_rollout = self.sample_prior( |
| batch_size, |
| args.seq_length).to(self.device, dtype=torch.long) |
|
|
| timesteps = torch.linspace(1, eps, num_steps + 1, device=self.device) |
| dt = torch.tensor((1 - eps) / num_steps, device=self.device) |
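# Reverse-diffusion rollout: step time from 1 down to eps over num_steps uniform steps,
# denoising the fully masked prior sample.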
|
|
| for i in range(num_steps): |
| t = timesteps[i] * torch.ones(x_rollout.shape[0], 1, device=self.device) |
| log_p, x_next = self.single_reverse_step(x_rollout, t=t, dt=dt) |
| x_rollout = x_next.to(self.device) |
|
|
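# Resolve any tokens that are still masked with an extra noise-removal step.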
| mask_positions = (x_rollout == self.mask_index) |
| if mask_positions.any().item(): |
| log_p, x_next = self.single_noise_removal(x_rollout, t=t, dt=dt) |
| x_rollout = x_next.to(self.device) |
|
|
| |
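# Validity filtering: decode the rollouts and keep only sequences that PeptideAnalyzer
# recognizes as peptides; valid_fraction records the yield of this filter.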
| from utils.app import PeptideAnalyzer |
| analyzer = PeptideAnalyzer() |
| sequences = self.tokenizer.batch_decode(x_rollout) |
| valid_mask = torch.tensor([analyzer.is_peptide(seq) for seq in sequences], device=self.device) |
| valid_sequences = [seq for seq, keep in zip(sequences, valid_mask.tolist()) if keep] |
| valid_x_final = x_rollout[valid_mask] if valid_mask.any().item() else torch.empty(0, device=self.device) |
| valid_fraction = len(valid_sequences) / batch_size |
|
|
| if len(valid_sequences) > 0: |
| result = reward_model(valid_sequences) |
| if isinstance(result, tuple): |
| total_rewards, info = result |
| affinity = np.asarray(info.get('affinities', total_rewards)) |
| confidence = np.asarray(info.get('confidences', np.ones_like(affinity))) |
| direction_predictions = np.asarray(info.get('directions', np.zeros_like(affinity))) |
| else: |
| total_rewards = np.asarray(result) |
| if total_rewards.ndim > 1: |
| affinity = total_rewards[:, 0] |
| else: |
| affinity = total_rewards |
| confidence = np.ones_like(affinity) |
| direction_predictions = np.zeros_like(affinity) |
|
|
| rewards_t = torch.as_tensor(total_rewards, dtype=torch.float32, device=self.device) |
| alpha = max(float(getattr(args, "alpha", 0.1)), 1e-6) |
| weights = torch.softmax(rewards_t / alpha, dim=0) |
| idx = torch.multinomial(weights, num_samples=batch_size, replacement=True) |
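# Reward-weighted resampling: softmax(rewards / alpha) defines the sampling weights and
# batch_size sequences are drawn with replacement, biasing the evaluation batch toward
# high-reward candidates.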
|
|
| idx_np = idx.detach().cpu().numpy() |
| x_resampled = valid_x_final[idx] |
| sequences = [valid_sequences[i] for i in idx_np] |
| total_rewards = total_rewards[idx_np] |
| affinity = affinity[idx_np] |
| confidence = confidence[idx_np] |
| direction_predictions = direction_predictions[idx_np] |
| else: |
| x_resampled = x_rollout |
| total_rewards = np.array([]) |
| affinity = np.array([]) |
| confidence = np.array([]) |
| direction_predictions = np.array([]) |
|
|
| eval_metrics = { |
| 'affinity': affinity, |
| 'gated_reward': total_rewards, |
| 'confidence': confidence, |
| 'direction_predictions': direction_predictions, |
| 'valid_fraction': valid_fraction |
| } |
|
|
| if dataframe: |
| df = pd.DataFrame({ |
| 'sequence': sequences if len(total_rewards) else [], |
| 'affinity': affinity, |
| 'gated_reward': total_rewards, |
| 'confidence': confidence |
| }) |
| return x_resampled, eval_metrics, df |
| else: |
| return x_resampled, eval_metrics |
|
|
| |
| model.sample_finetuned_td3b = sample_finetuned_td3b.__get__(model, type(model)) |
| return model |
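

# Example wiring (illustrative sketch; `policy_model` and `reward_model` are assumed to be
# constructed by the surrounding training script):
#
#   policy_model = add_td3b_sampling_to_model(policy_model)
#   x_eval, eval_metrics = policy_model.sample_finetuned_td3b(args, reward_model, batch_size=50)
#   print(eval_metrics["valid_fraction"])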
|
|