Spaces:

Zyriix
/

prologue-demo

Running on Zero

Bowen Zheng

init

500ee30 3 days ago

39.7 kB

	import copy
	import glob as glob_module
	import math
	import os
	import random
	import shutil
	import subprocess
	import sys
	from pathlib import Path
	import itertools
	from typing import Iterator, Iterable, List, NamedTuple

	import numpy as np
	import torch
	import torch.distributed as dist
	from einops import rearrange
	from accelerate import Accelerator
	from omegaconf import OmegaConf


	def build_ar_logit_mask(vis_pos_mask, sem_pos_mask, vis_cb_size, sem_cb_size):
	    """Combine per-position visual and semantic masks into one AR logit mask.

	    Each mask that is not ``None`` is embedded into a tensor of width
	    ``vis_cb_size + sem_cb_size`` pre-filled with ``-inf``: visual entries
	    occupy the first ``vis_cb_size`` columns, semantic entries the remaining
	    ``sem_cb_size`` columns. Semantic rows come first in the result, followed
	    by the visual rows.

	    Returns:
	        The row-wise concatenation of the padded masks, or ``None`` when both
	        inputs are ``None``.
	    """
	    if vis_pos_mask is None and sem_pos_mask is None:
	        return None

	    total_cols = vis_cb_size + sem_cb_size
	    # (mask, first column, one-past-last column), listed in output row order.
	    layout = (
	        (sem_pos_mask, vis_cb_size, total_cols),
	        (vis_pos_mask, 0, vis_cb_size),
	    )

	    blocks = []
	    for mask, col_lo, col_hi in layout:
	        if mask is None:
	            continue
	        padded = torch.full((mask.shape[0], total_cols), float('-inf'))
	        padded[:, col_lo:col_hi] = mask
	        blocks.append(padded)
	    return torch.cat(blocks, dim=0) if blocks else None


	def load_config():
	    """Build the run configuration from config files plus CLI overrides.

	    The command line is parsed with ``OmegaConf.from_cli()``. The config files
	    are taken from ``--configs`` (preferred) or ``--config``: a comma-separated
	    list that is loaded and merged left-to-right. Every remaining CLI
	    ``key=value`` entry is then applied on top with ``OmegaConf.update``.

	    Returns:
	        The merged OmegaConf configuration.

	    Raises:
	        ValueError: if neither flag is given, or the list contains no path.
	    """
	    # NOTE(security): the "eval" resolver runs arbitrary Python found in
	    # ``${eval:...}`` interpolations -- only load trusted configuration files.
	    OmegaConf.register_new_resolver("eval", eval, replace=True)
	    cli = OmegaConf.from_cli()

	    # Pop BOTH flags unconditionally so that neither of them is later merged
	    # into the configuration as if it were an ordinary override key.
	    configs_arg = cli.pop("--configs", None)
	    config_arg = cli.pop("--config", None)
	    paths_str = configs_arg or config_arg
	    if paths_str is None:
	        raise ValueError("Must provide --config or --configs")

	    paths = [p.strip() for p in str(paths_str).split(",") if p.strip()]
	    if not paths:
	        raise ValueError("Must provide --config or --configs")

	    conf = OmegaConf.merge(*[OmegaConf.load(p) for p in paths])
	    for k, v in cli.items():
	        OmegaConf.update(conf, k, v)
	    return conf


	def print0(*args, **kwargs):
	    """Print only on rank 0; accepts the same arguments as the built-in ``print``.

	    The rank is ``dist.get_rank()`` when ``torch.distributed`` is available
	    and initialised; otherwise it falls back to the ``LOCAL_RANK`` environment
	    variable (default ``0``).
	    """
	    if dist.is_available() and dist.is_initialized():
	        rank = dist.get_rank()
	    else:
	        # NOTE(review): LOCAL_RANK is per-node, so before the process group is
	        # initialised one process per node may print -- confirm this is intended.
	        rank = int(os.environ.get("LOCAL_RANK", 0))
	    if rank == 0:
	        print(*args, **kwargs)


	# ============================================================================
	# Phase / Target Training System
	# ============================================================================

	class Target(NamedTuple):
	    """Set of boolean switches describing what one training step optimises.

	    Every switch defaults to ``False``; the phase parser enables switches by
	    field name, so the field names are part of the phase-string format.
	    """

	    # Derived by ``get_phase``: true when any of L2 / L1 / LPIPS / GAN_G is set.
	    DO_AE: bool = False
	    # Individual objective switches (presumably reconstruction, perceptual and
	    # adversarial losses -- confirm against the training loop).
	    DO_L2: bool = False
	    DO_L1: bool = False
	    DO_LPIPS: bool = False
	    DO_GAN_G: bool = False
	    DO_GAN_D: bool = False
	    # Prior-related switches; either one marks the AR model as trained.
	    DO_PRIOR_AR: bool = False
	    DO_PRIOR_ENC: bool = False

	class Phase(NamedTuple):
	    """One entry of the training schedule, as produced by ``parse_phases``."""

	    # Step count parsed from the first ``:``-separated field of a phase string.
	    num_steps: int
	    # Targets that are cycled through inside the phase.
	    targets: List[Target]
	    # Number of consecutive steps spent on each target (parallel to ``targets``).
	    internal_steps: List[int]

	def parse_phases(phases_str):
	    """Parse a phase schedule string into a list of ``Phase`` tuples.

	    Format: whitespace-separated phases, each written as
	    ``num_steps:targets:internal_steps`` where

	    * ``targets`` is a comma-separated list; each item is one ``Target`` given
	      as hyphen-joined field names (e.g. ``DO_L1-DO_LPIPS``), and
	    * ``internal_steps`` is a comma-separated list of integers.

	    Example: ``"1000:DO_L1-DO_LPIPS,DO_GAN_D:2,1 500:DO_PRIOR_AR:1"``.

	    Raises:
	        ValueError: if a phase does not have exactly three ``:`` fields or a
	            number cannot be parsed.
	        TypeError: if a target names a field that ``Target`` does not have.
	    """
	    phases = []
	    # ``split()`` without an argument tolerates repeated, leading and trailing
	    # whitespace (including newlines), which ``split(' ')`` does not.
	    for phase_str in phases_str.split():
	        num_steps_str, targets_str, internal_steps_str = phase_str.split(':')
	        targets = [
	            Target(**{flag: True for flag in target_str.split('-')})
	            for target_str in targets_str.split(',')
	        ]
	        internal_steps = [int(step) for step in internal_steps_str.split(',')]
	        phases.append(Phase(int(num_steps_str), targets, internal_steps))
	    return phases

	def parse_training_config_from_phases(phases):
	    """Summarise which components any phase of the schedule needs.

	    Returns:
	        ``(train_ae, train_ar, use_lpips_loss, use_gan_loss, train_prior_enc)``,
	        each ``True`` when at least one target in any phase requests it:

	        * ``train_ae``: ``DO_L1``, ``DO_L2``, ``DO_LPIPS`` or ``DO_GAN_G``
	        * ``train_ar``: ``DO_PRIOR_AR`` or ``DO_PRIOR_ENC``
	        * ``use_lpips_loss``: ``DO_LPIPS``
	        * ``use_gan_loss``: ``DO_GAN_G`` or ``DO_GAN_D``
	        * ``train_prior_enc``: ``DO_PRIOR_ENC``
	    """
	    every_target = [t for phase in phases for t in phase.targets]

	    train_ae = any(t.DO_L1 or t.DO_L2 or t.DO_LPIPS or t.DO_GAN_G for t in every_target)
	    train_ar = any(t.DO_PRIOR_AR or t.DO_PRIOR_ENC for t in every_target)
	    use_lpips_loss = any(t.DO_LPIPS for t in every_target)
	    use_gan_loss = any(t.DO_GAN_G or t.DO_GAN_D for t in every_target)
	    train_prior_enc = any(t.DO_PRIOR_ENC for t in every_target)
	    return train_ae, train_ar, use_lpips_loss, use_gan_loss, train_prior_enc

	def get_phase(global_step, phases, phase_step_accum, gan_start=0):
	    """Resolve the active phase and inner target for ``global_step``.

	    Args:
	        global_step: Current training step.
	        phases: Sequence of phases; each exposes ``targets`` and
	            ``internal_steps``.
	        phase_step_accum: Cumulative phase boundaries; the first entry that is
	            ``>= global_step`` selects the phase.
	        gan_start: ``DO_GAN_G`` / ``DO_GAN_D`` are forced to ``False`` while
	            ``global_step < gan_start``.

	    Returns:
	        ``(phase_idx, inner_idx, target, inner_step)`` where ``inner_step`` is
	        the step offset inside the phase, wrapped modulo the sum of the
	        phase's ``internal_steps``. ``target.DO_AE`` is set when any of
	        L2 / L1 / LPIPS / GAN_G is requested (before the ``gan_start`` gating).

	    Raises:
	        ValueError: if ``global_step`` lies beyond the last phase boundary.

	    NOTE(review): the boundary test is inclusive (``<=``), so a step equal to
	    a boundary still belongs to the earlier phase and the next phase starts at
	    inner step 1 -- confirm this is the intended convention.
	    """
	    for phase_idx, phase_end in enumerate(phase_step_accum):
	        if global_step <= phase_end:
	            break
	    else:
	        # Previously this fell through and failed with an AttributeError on None.
	        raise ValueError(
	            f"global_step={global_step} is beyond the last phase boundary "
	            f"({list(phase_step_accum)})"
	        )

	    phase = phases[phase_idx]
	    phase_start = phase_step_accum[phase_idx - 1] if phase_idx > 0 else 0
	    inner_bounds = list(itertools.accumulate(phase.internal_steps))
	    inner_step = (global_step - phase_start) % inner_bounds[-1]

	    # First inner slot whose cumulative bound exceeds the wrapped step.
	    inner_idx = 0
	    for inner_idx, bound in enumerate(inner_bounds):
	        if inner_step < bound:
	            break
	    target = phase.targets[inner_idx]

	    gan_enabled = global_step >= gan_start
	    do_ae = any([target.DO_L2, target.DO_L1, target.DO_LPIPS, target.DO_GAN_G])
	    target = target._replace(
	        DO_AE=do_ae,
	        DO_GAN_G=target.DO_GAN_G and gan_enabled,
	        DO_GAN_D=target.DO_GAN_D and gan_enabled,
	    )
	    return phase_idx, inner_idx, target, inner_step


	# ============================================================================
	# Learning Rate Schedulers
	# ============================================================================

	def get_linear_schedule_with_warmup_peak(
	    optimizer: torch.optim.Optimizer,
	    num_warmup_steps: int,
	    num_peak_steps: int,
	    num_training_steps: int,
	    last_epoch: int = -1,
	    base_lr: float = 1e-4,
	    end_lr: float = 0.0,
	):
	    """Create a ``LambdaLR`` with linear warmup, a flat peak and linear decay.

	    The returned multiplier rises from 0 to 1 over ``num_warmup_steps``, stays
	    at 1 for ``num_peak_steps`` and then falls linearly to ``end_lr / base_lr``
	    at ``num_training_steps``, where it stays afterwards.

	    ``base_lr`` is only used to convert ``end_lr`` into a multiplier.
	    NOTE(review): this presumably expects ``base_lr`` to equal the optimizer's
	    configured learning rate -- confirm at the call site.
	    """
	    peak_end = num_warmup_steps + num_peak_steps
	    decay_steps = num_training_steps - num_warmup_steps - num_peak_steps

	    def lr_lambda(current_step):
	        # Warmup: linear ramp from 0 towards 1.
	        if current_step < num_warmup_steps:
	            return float(current_step) / float(max(1, num_warmup_steps))
	        # Peak: hold the full learning rate.
	        if current_step < peak_end:
	            return 1.0
	        # Decay: interpolate base_lr -> end_lr, clamped once training is over.
	        elapsed = float(current_step - num_warmup_steps - num_peak_steps)
	        progress = min(elapsed / float(max(1, decay_steps)), 1.0)
	        ratio = 1.0 - progress
	        return (end_lr + (base_lr - end_lr) * ratio) / base_lr

	    return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda, last_epoch)

	try:
	import wandb
	except ImportError:
	wandb = None

	try:
	import matplotlib
	matplotlib.use("Agg")
	import matplotlib.pyplot as plt
	except Exception: # pragma: no cover
	plt = None

	# Trie structure for computing data conditional entropy
	from dataclasses import dataclass, field
	from typing import Dict, List, Tuple

	@dataclass
	class TrieNode:
	    """Node of the prefix trie used for data conditional-entropy statistics."""

	    # How many inserted sequences passed through this node (see ``trie_insert``;
	    # the root itself is never incremented there).
	    count: int = 0
	    # Maps a token id to the child node reached by that token.
	    children: Dict[int, "TrieNode"] = field(default_factory=dict)

	def trie_insert(root: TrieNode, seq: List[int], max_depth: int) -> None:
	    """Add one token sequence to the trie, truncated to ``max_depth`` tokens.

	    Missing nodes are created on the way down, and every node on the inserted
	    path (the root excluded) has its ``count`` incremented by one.
	    """
	    cursor = root
	    for token in seq[:max_depth]:
	        if cursor.children.get(token) is None:
	            cursor.children[token] = TrieNode()
	        cursor = cursor.children[token]
	        cursor.count += 1

	def entropy_from_counts(counts: List[int], log_base: float = 2.0) -> float:
	    """Return the entropy of the empirical distribution given by ``counts``.

	    Uses ``H = log(T) - (1/T) * sum(c * log(c))`` with ``T = sum(counts)`` and
	    converts the result to the requested logarithm base. Non-positive counts
	    contribute nothing to the sum.

	    Returns:
	        The entropy, or ``nan`` when the counts sum to zero or less.

	    Raises:
	        ValueError: if ``log_base`` is not positive.
	    """
	    if log_base <= 0:
	        raise ValueError("log_base must be > 0")

	    total = int(sum(counts))
	    if total <= 0:
	        return float("nan")

	    weighted_log_sum = sum((c * math.log(c) for c in counts if c > 0), 0.0)
	    entropy_nats = math.log(total) - (weighted_log_sum / float(total))
	    if log_base == math.e:
	        return entropy_nats / 1.0
	    return entropy_nats / math.log(log_base)

	def trie_conditional_entropy_all_positions(root: TrieNode, max_depth: int, log_base: float = 2.0) -> Tuple[List[float], List[int]]:
	    """Per-position conditional entropy H(X_d | X_<d) estimated from the trie.

	    Position 0 is the entropy of the root's child counts. For each later
	    position ``d`` every trie node at depth ``d`` is a context: the entropy of
	    its child counts is averaged over contexts, weighted by the context's own
	    ``count``. Contexts with a non-positive count or without children are
	    skipped.

	    The trie is walked once, level by level, instead of being re-traversed
	    from the root for every position.

	    Returns:
	        ``(H_cond, num_contexts)``, both of length ``max_depth`` (empty lists
	        when ``max_depth <= 0``). ``H_cond[d]`` is ``nan`` when no context
	        contributed at depth ``d``; ``num_contexts[0]`` is always 1.
	    """
	    if max_depth <= 0:
	        return [], []

	    # Position 0: unconditional entropy H(X_0) over the root's children.
	    root_child_counts = [int(ch.count) for ch in root.children.values()]
	    H_cond: List[float] = [float(entropy_from_counts(root_child_counts, log_base=log_base))]
	    num_contexts: List[int] = [1]

	    # ``level`` always holds every node at the depth currently being processed.
	    level: List[TrieNode] = list(root.children.values())
	    for _ in range(1, max_depth):
	        total_T = 0
	        weighted_sum = 0.0
	        ctx_cnt = 0
	        next_level: List[TrieNode] = []

	        for node in level:
	            children = list(node.children.values())
	            next_level.extend(children)
	            ctx_T = int(node.count)
	            if ctx_T <= 0 or not children:
	                continue
	            Hc = entropy_from_counts([int(ch.count) for ch in children], log_base=log_base)
	            total_T += ctx_T
	            weighted_sum += ctx_T * Hc
	            ctx_cnt += 1

	        H = weighted_sum / float(total_T) if total_T > 0 else float("nan")
	        H_cond.append(float(H))
	        num_contexts.append(int(ctx_cnt))
	        level = next_level

	    return H_cond, num_contexts

	def _entropy_from_logits(logits: torch.Tensor, log_base: float = 2.0) -> torch.Tensor:
	    """Entropy of the categorical distribution defined by ``logits``.

	    The softmax is taken over the last dimension (in float32), and that
	    dimension is reduced away in the result. Entries with zero probability
	    (e.g. ``-inf`` logits from masking) contribute 0 instead of NaN. The
	    result is in units of ``log_base`` (bits for the default of 2).
	    """
	    log_p = torch.log_softmax(logits.float(), dim=-1)
	    p = log_p.exp()
	    # 0 * log(0) evaluates to NaN; by convention it counts as 0.
	    p_log_p = torch.nan_to_num(p * log_p, nan=0.0)
	    entropy_nats = -p_log_p.sum(dim=-1)
	    return entropy_nats if log_base == math.e else entropy_nats / math.log(log_base)


	def plot_data_conditional_entropy(
	    out_path: str,
	    H_cond: list | None,
	    log_base: float = 2.0,
	    codebook_size: int | None = None,
	    title_prefix: str = "Data conditional entropy",
	) -> bool:
	    """Save a bar plot of H(X_d | X_<d) per position to ``out_path``.

	    When ``codebook_size`` is positive, a horizontal line at
	    ``log(codebook_size)`` is added, plus a second line at
	    ``log(codebook_size / N)`` when that ratio is at least 1 (``N`` being the
	    number of positions). Both are expressed in ``log_base`` units.

	    Returns:
	        ``True`` when the figure was written; ``False`` when matplotlib is
	        unavailable or ``H_cond`` is ``None`` / empty.
	    """
	    if plt is None:
	        return False
	    if H_cond is None or len(H_cond) == 0:
	        return False

	    N = len(H_cond)
	    has_codebook = codebook_size is not None and codebook_size > 0

	    fig = plt.figure(figsize=(max(10, N * 0.05), 4))
	    try:
	        plt.bar(list(range(N)), H_cond, width=1.0, color="#E45756", label="H(X_d | X_<d)", edgecolor='none')
	        plt.grid(True, linestyle="--", alpha=0.3, axis='y')

	        # Theoretical reference lines (fixed codebook); remembered for the y-range.
	        reference_levels = []
	        if has_codebook:
	            max_entropy = math.log(codebook_size) / math.log(log_base)
	            reference_levels.append(max_entropy)
	            plt.axhline(y=max_entropy, color='red', linestyle='--', linewidth=2.0,
	                        label=f'Max H (Independent): {max_entropy:.2f}', alpha=0.8, zorder=10)
	            codes_per_pos = codebook_size / N
	            if codes_per_pos >= 1.0:
	                split_entropy = math.log(codes_per_pos) / math.log(log_base)
	                reference_levels.append(split_entropy)
	                plt.axhline(y=split_entropy, color='orange', linestyle='-.', linewidth=2.0,
	                            label=f'Split Codebook ({codebook_size}/{N}={codes_per_pos:.1f}): {split_entropy:.2f}',
	                            alpha=0.8, zorder=10)

	        plt.xlim(-0.5, max(0, N - 0.5))

	        # Roughly 16 x ticks, always including the last position.
	        step = max(1, N // 16)
	        xticks = list(range(0, N, step))
	        if (N - 1) not in xticks:
	            xticks.append(N - 1)
	        plt.xticks(xticks)

	        # y-axis range from the non-NaN data plus the reference lines.
	        valid_vals = [v for v in H_cond if isinstance(v, (int, float)) and not math.isnan(v)]
	        valid_vals.extend(reference_levels)
	        if valid_vals:
	            ymax = max(max(valid_vals), 0.0)
	            y_max_tick = int(math.ceil(ymax * 1.1))  # leave 10% headroom
	            plt.yticks(list(range(0, y_max_tick + 1, max(1, y_max_tick // 5))))
	            plt.ylim(0, y_max_tick)
	        else:
	            plt.ylim(bottom=0)

	        plt.xlabel("position d (0-based)")
	        plt.ylabel(f"conditional entropy (log_base={log_base})")
	        plt.title(f"{title_prefix}: H(X_d | X_<d)")
	        plt.legend()
	        plt.tight_layout()
	        plt.savefig(out_path, dpi=200)
	    finally:
	        # Always release the figure, even when plotting or saving fails.
	        plt.close(fig)
	    return True

	def plot_ar_prefix_conditional_entropy(
	    out_path: str,
	    H: list | None,
	    log_base: float = 2.0,
	    codebook_size: int | None = None,
	    title_prefix: str = "AR predictive conditional entropy",
	) -> bool:
	    """Save a bar plot of the per-position AR predictive entropy to ``out_path``.

	    When ``codebook_size`` is positive, a horizontal reference line at
	    ``log(codebook_size)`` (in ``log_base`` units) is added.

	    Returns:
	        ``True`` when the figure was written; ``False`` when matplotlib is
	        unavailable or ``H`` is ``None`` / empty.
	    """
	    if plt is None:
	        return False
	    if H is None or len(H) <= 0:
	        return False

	    plot_len = len(H)
	    fig = plt.figure(figsize=(max(10, plot_len * 0.05), 4))
	    try:
	        plt.bar(list(range(plot_len)), H, width=1.0, color="#4C78A8", label="H_ar(X_d | X_<d)", edgecolor='none')
	        plt.grid(True, linestyle="--", alpha=0.3, axis='y')

	        # Reference line (fixed codebook); remembered for the y-range below.
	        max_entropy = None
	        if codebook_size is not None and codebook_size > 0:
	            max_entropy = math.log(codebook_size) / math.log(log_base)
	            plt.axhline(y=max_entropy, color='red', linestyle='--', linewidth=2.0,
	                        label=f'Max H (K={codebook_size}): {max_entropy:.2f}', alpha=0.8, zorder=10)

	        plt.xlim(-0.5, max(0, plot_len - 0.5))

	        # Roughly 16 x ticks, always including the last position.
	        step = max(1, plot_len // 16)
	        xticks = list(range(0, plot_len, step))
	        if (plot_len - 1) not in xticks:
	            xticks.append(plot_len - 1)
	        plt.xticks(xticks)

	        valid_vals = [v for v in H if isinstance(v, (int, float)) and not math.isnan(v)]
	        if max_entropy is not None:
	            valid_vals.append(max_entropy)
	        if valid_vals:
	            ymax = max(max(valid_vals), 0.0)
	            y_max_tick = int(math.ceil(ymax * 1.1))  # leave 10% headroom
	            plt.yticks(list(range(0, y_max_tick + 1, max(1, y_max_tick // 5))))
	            plt.ylim(0, y_max_tick)
	        else:
	            plt.ylim(bottom=0)

	        plt.xlabel("position d (0-based)")
	        plt.ylabel(f"conditional entropy (log_base={log_base})")
	        plt.title(f"{title_prefix}, d=0..{plot_len-1} (log_base={log_base})")
	        plt.legend()
	        plt.tight_layout()
	        plt.savefig(out_path, dpi=200)
	    finally:
	        # Always release the figure, even when plotting or saving fails.
	        plt.close(fig)
	    return True


	def compute_posterior_entropy_from_logits(logits: torch.Tensor, log_base: float = 2.0) -> torch.Tensor:
	    """Posterior entropy ``-E_q log q`` from ``[B, L, K]`` logits.

	    The computation is the same softmax entropy over the last dimension as
	    ``_entropy_from_logits`` (zero-probability entries count as 0), so this
	    delegates to it rather than keeping a second copy of the code.

	    Returns:
	        Entropy with the last dimension reduced, in ``log_base`` units.
	    """
	    return _entropy_from_logits(logits, log_base=log_base)


	def compute_aggregated_entropy_from_counts(count_matrix: torch.Tensor, log_base: float = 2.0) -> torch.Tensor:
	    """Aggregated-posterior entropy ``-E_z log q(z)`` from ``[L, K]`` counts.

	    Each row is normalised by its sum (clamped to at least 1 so all-zero rows
	    give zero probabilities rather than NaN). Probabilities are clamped to
	    ``1e-10`` inside the logarithm only. The last dimension is reduced and the
	    result is in ``log_base`` units.
	    """
	    row_totals = count_matrix.sum(dim=-1, keepdim=True).clamp(min=1.0)
	    probs = count_matrix.float() / row_totals
	    safe_log = probs.clamp(min=1e-10).log()
	    entropy_nats = -(probs * safe_log).sum(dim=-1)
	    return entropy_nats if log_base == math.e else entropy_nats / math.log(log_base)


	def plot_posterior_entropy(
	    sample_entropy: list | None,
	    aggregated_entropy: list | None,
	    *,
	    accelerator: Accelerator,
	    save_dir: str,
	    global_step: int,
	    rFID: float = 0.0,
	    gFID: float = 0.0,
	    log_base: float = 2.0,
	    codebook_size: int | None = None,
	    out_name: str = "ae_pos_posterior_entropy.png",
	) -> None:
	    """Save (and, when wandb is installed, log) a per-position entropy bar plot.

	    Runs on the main process only and does nothing when matplotlib is missing
	    or both series are ``None`` / empty. With both series present the bars are
	    drawn side by side; with one series the bars are centred. The figure is
	    written to
	    ``<save_dir>/analysis_ae/Step=<global_step+1>-rFID=...-gFID=.../<out_name>``.

	    When ``codebook_size`` is positive, reference lines are drawn at
	    ``log(codebook_size)`` and, if ``codebook_size / L >= 1``, at
	    ``log(codebook_size / L)`` (``L`` = number of positions).
	    """
	    if plt is None or not accelerator.is_main_process:
	        return
	    if sample_entropy is None and aggregated_entropy is None:
	        return

	    len_sample = len(sample_entropy) if sample_entropy is not None else 0
	    len_agg = len(aggregated_entropy) if aggregated_entropy is not None else 0
	    L = max(len_sample, len_agg)
	    if L <= 0:
	        return

	    out_dir = Path(save_dir) / "analysis_ae" / f"Step={global_step+1}-rFID={rFID:.4f}-gFID={gFID:.4f}"
	    out_dir.mkdir(exist_ok=True, parents=True)
	    fig_path = out_dir / out_name

	    # Side-by-side thin bars when both series exist, centred full bars otherwise.
	    both_present = sample_entropy is not None and aggregated_entropy is not None
	    offset, width = (0.2, 0.4) if both_present else (0.0, 1.0)

	    fig = plt.figure(figsize=(max(10, L * 0.05), 4))
	    try:
	        xs = list(range(L))
	        if len_sample > 0:
	            plt.bar([x - offset for x in xs[:len_sample]], sample_entropy,
	                    width=width, color="#F58518", label="Sample Entropy", edgecolor='none')
	        if len_agg > 0:
	            plt.bar([x + offset for x in xs[:len_agg]], aggregated_entropy,
	                    width=width, color="#4C78A8", label="Aggregated Entropy", edgecolor='none')

	        plt.grid(True, linestyle="--", alpha=0.3, axis='y')

	        # Theoretical reference lines (fixed codebook); remembered for the y-range.
	        reference_levels = []
	        if codebook_size is not None and codebook_size > 0:
	            max_entropy = math.log(codebook_size) / math.log(log_base)
	            reference_levels.append(max_entropy)
	            plt.axhline(y=max_entropy, color='red', linestyle='--', linewidth=2.0,
	                        label=f'Max Entropy (Uniform over {codebook_size}): {max_entropy:.2f}', alpha=0.8, zorder=10)
	            codes_per_pos = codebook_size / L
	            if codes_per_pos >= 1.0:
	                split_entropy = math.log(codes_per_pos) / math.log(log_base)
	                reference_levels.append(split_entropy)
	                plt.axhline(y=split_entropy, color='orange', linestyle='-.', linewidth=2.0,
	                            label=f'Split Codebook ({codebook_size}/{L}={codes_per_pos:.1f}): {split_entropy:.2f}',
	                            alpha=0.8, zorder=10)
	        plt.xlim(-0.5, max(0, L - 0.5))

	        # Thin out x-axis ticks, always including the last position.
	        step = max(1, L // 16)
	        xticks = list(range(0, L, step))
	        if (L - 1) not in xticks:
	            xticks.append(L - 1)
	        plt.xticks(xticks)

	        # y-axis range from the non-NaN data plus the reference lines.
	        valid_vals = []
	        for series in (sample_entropy, aggregated_entropy):
	            if series is not None:
	                valid_vals += [v for v in series if isinstance(v, (int, float)) and not math.isnan(v)]
	        valid_vals.extend(reference_levels)

	        if valid_vals:
	            ymax = max(max(valid_vals), 0.0)
	            y_max_tick = int(math.ceil(ymax * 1.1))  # leave 10% headroom
	            plt.yticks(list(range(0, y_max_tick + 1, max(1, y_max_tick // 5))))
	            plt.ylim(0, y_max_tick)
	        else:
	            plt.ylim(bottom=0)

	        plt.xlabel("position d (0-based)")
	        plt.ylabel(f"entropy (log_base={log_base})")
	        plt.title("Aggregated Posterior Entropy per Position")
	        plt.legend()
	        plt.tight_layout()
	        plt.savefig(str(fig_path), dpi=200)
	    finally:
	        # Always release the figure, even when plotting or saving fails.
	        plt.close(fig)

	    # wandb is an optional import; without it the image is only kept on disk.
	    if wandb is not None:
	        accelerator.log({"analysis/ae_pos_posterior_entropy": wandb.Image(str(fig_path)), "global_step": global_step + 1}, step=global_step+1)


	def plot_codebook_usage(
	    codebook_usage: torch.Tensor | None,
	    *,
	    accelerator: Accelerator,
	    save_dir: str,
	    global_step: int,
	    rFID: float = 0.0,
	    gFID: float = 0.0,
	    out_name: str = "ae_pos_code_usage_rate.png",
	) -> None:
	    """Save (and, when wandb is installed, log) per-position codebook usage.

	    ``codebook_usage`` must be a 2-D ``[L, K]`` count tensor. For each row the
	    fraction of columns with a positive count is plotted as a bar. Runs on the
	    main process only and does nothing when matplotlib is missing or the
	    input is ``None`` / not 2-D / has no rows. The figure is written to
	    ``<save_dir>/analysis_ae/Step=<global_step+1>-rFID=...-gFID=.../<out_name>``.
	    """
	    if plt is None or not accelerator.is_main_process:
	        return
	    if codebook_usage is None or codebook_usage.dim() != 2:
	        return

	    # Per-position codebook utilisation: (# codes seen at least once) / K.
	    num_codes = codebook_usage.shape[1]
	    used_per_pos = (codebook_usage > 0).sum(dim=1).float()  # [L]
	    usage = (used_per_pos / float(num_codes)).detach().cpu().tolist()

	    L = int(len(usage))
	    if L <= 0:
	        return

	    out_dir = Path(save_dir) / "analysis_ae" / f"Step={global_step+1}-rFID={rFID:.4f}-gFID={gFID:.4f}"
	    out_dir.mkdir(exist_ok=True, parents=True)
	    fig_path = out_dir / out_name

	    fig = plt.figure(figsize=(max(10, L * 0.05), 4))
	    try:
	        plt.bar(list(range(L)), usage, color="#54A24B", width=1.0, edgecolor='none')
	        plt.grid(True, linestyle="--", alpha=0.3, axis='y')
	        plt.xlim(-0.5, max(0, L - 0.5))
	        plt.ylim(0.0, 1.05)

	        # Thin out x-axis ticks, always including the last position.
	        step = max(1, L // 16)
	        xticks = list(range(0, L, step))
	        if (L - 1) not in xticks:
	            xticks.append(L - 1)
	        plt.xticks(xticks)
	        plt.xlabel("position d (0-based)")
	        plt.ylabel(f"unique / K (K={codebook_usage.shape[1]})")

	        plt.title("Codebook Usage Rate per Position")
	        plt.tight_layout()
	        plt.savefig(str(fig_path), dpi=200)
	    finally:
	        # Always release the figure, even when plotting or saving fails.
	        plt.close(fig)

	    # wandb is an optional import; without it the image is only kept on disk.
	    if wandb is not None:
	        accelerator.log({"analysis/ae_pos_code_usage_rate": wandb.Image(str(fig_path)), "global_step": global_step + 1}, step=global_step+1)

	def seed_everything(seed):
	    """Seed Python ``random``, NumPy and torch (CPU and all CUDA devices).

	    Also exports ``PYTHONHASHSEED``.
	    NOTE(review): that variable is read at interpreter start-up, so setting it
	    here presumably only affects child processes -- confirm this is intended.
	    """
	    seeders = (
	        random.seed,
	        np.random.seed,
	        torch.manual_seed,
	        torch.cuda.manual_seed_all,
	    )
	    for apply_seed in seeders:
	        apply_seed(seed)
	    os.environ['PYTHONHASHSEED'] = str(seed)

	def worker_init_fn(worker_id):
	    """Seed NumPy and Python ``random`` from torch's initial seed.

	    The seed is ``torch.initial_seed()`` reduced modulo ``2**32``;
	    ``worker_id`` itself is not used.
	    """
	    seed32 = torch.initial_seed() % (2 ** 32)
	    for reseed in (np.random.seed, random.seed):
	        reseed(seed32)

	def make_worker_init_fn(base_seed: int):
	    """Build a worker-init callback with a distinct seed per worker and rank.

	    The returned callable seeds NumPy, Python ``random`` and torch with
	    ``(base_seed + worker_id + 1000 * rank) % 2**32``, where ``rank`` is the
	    ``torch.distributed`` rank (0 when distributed is unavailable or not
	    initialised).
	    """
	    modulus = 2**32
	    base_seed = int(base_seed) % modulus

	    def _init(worker_id: int):
	        rank = 0
	        if dist.is_available() and dist.is_initialized():
	            rank = dist.get_rank()
	        worker_seed = (base_seed + worker_id + 1000 * rank) % modulus
	        for reseed in (np.random.seed, random.seed, torch.manual_seed):
	            reseed(worker_seed)

	    return _init


	def load_accelerate_weights_only(
	    *,
	    accelerator: Accelerator,
	    input_dir: str,
	    strict: bool = True,
	    map_location: str | torch.device | None = "cpu",
	) -> None:
	    """Load model weights only from an ``accelerator.save_state()`` directory.

	    Optimizer, scheduler, RNG and dataloader state are not restored. Models
	    are taken from ``accelerator._models`` in registration order so that model
	    ``i`` is matched with the ``i``-th checkpoint file:

	    * FSDP and DeepSpeed are delegated to their own loaders.
	    * Megatron-LM is not supported.
	    * Otherwise ``<SAFE_MODEL_NAME>[_i].safetensors`` is loaded when present,
	      falling back to ``<MODEL_NAME>[_i].bin``.

	    Raises:
	        ValueError: if ``input_dir`` is not an existing directory.
	        NotImplementedError: for Megatron-LM checkpoints.
	        FileNotFoundError: if neither weight file exists for some model.
	    """
	    input_dir = os.path.expanduser(str(input_dir))
	    if not os.path.isdir(input_dir):
	        raise ValueError(f"Tried to load weights from {input_dir} but folder does not exist")

	    from accelerate.state import DistributedType
	    from accelerate.utils import load as accelerate_load, load_fsdp_model
	    from accelerate.checkpointing import SAFE_MODEL_NAME, MODEL_NAME, load_model

	    device_str = "cpu" if map_location in (None, "cpu") else str(map_location)
	    input_path = Path(input_dir)
	    strict = bool(strict)

	    # Iterate over accelerator._models to preserve save_state ordering.
	    models = getattr(accelerator, "_models", None) or []
	    if not models:
	        print0("[warn] No models registered in accelerator; skip loading weights.")
	        return

	    for i, model in enumerate(models):
	        dist_type = accelerator.distributed_type

	        if dist_type == DistributedType.FSDP:
	            load_fsdp_model(accelerator.state.fsdp_plugin, accelerator, model, input_dir, i)
	            continue

	        if dist_type == DistributedType.DEEPSPEED:
	            ckpt_id = f"{MODEL_NAME}" if i == 0 else f"{MODEL_NAME}_{i}"
	            model.load_checkpoint(
	                input_dir,
	                ckpt_id,
	                load_optimizer_states=False,
	                load_lr_scheduler_states=False,
	                load_module_strict=strict,
	            )
	            continue

	        if dist_type == DistributedType.MEGATRON_LM:
	            raise NotImplementedError(
	                "resume_train=False (weights-only) is not supported for Megatron-LM checkpoints in this script."
	            )

	        # Plain / DDP case: the first model has no suffix, later ones use "_<i>".
	        ending = f"_{i}" if i > 0 else ""
	        safe_file = input_path / f"{SAFE_MODEL_NAME}{ending}.safetensors"
	        bin_file = input_path / f"{MODEL_NAME}{ending}.bin"

	        if safe_file.exists():
	            load_model(model, safe_file, strict=strict, device=device_str)
	        elif bin_file.exists():
	            state_dict = accelerate_load(bin_file, map_location=map_location)
	            model.load_state_dict(state_dict, strict=strict)
	        else:
	            raise FileNotFoundError(
	                f"Could not find model weights for model index {i} under {input_dir}. "
	                f"Tried: {safe_file.name} and {bin_file.name}"
	            )


	@torch.no_grad()
	def draw_data_conditional_entropy(
	    trie_root: TrieNode | None,
	    *,
	    idx: torch.Tensor | None = None,
	    accelerator: Accelerator,
	    save_dir: str,
	    global_step: int,
	    log_base: float = 2.0,
	    finalize: bool = False,
	    rFID: float = 0.0,
	    gFID: float = 0.0,
	    max_depth: int = 0,
	    codebook_size: int | None = None,
	) -> TrieNode | None:
	    """Accumulate token sequences in a trie, then plot data conditional entropy.

	    Accumulation (``finalize=False``): ``idx`` is gathered across processes
	    and, on the main process, every row is inserted into the trie (created on
	    first use). A non-positive ``max_depth`` means "the full row length".

	    Finalisation (``finalize=True``): after a barrier, the main process
	    computes the per-position conditional entropy, saves the plot to
	    ``<save_dir>/analysis_ae/Step=<global_step+1>-rFID=...-gFID=.../``
	    and logs the image (when wandb is installed) and the NaN-ignoring mean.

	    NOTE(review): during finalisation a non-positive ``max_depth`` falls back
	    to 256 rather than the row length used while accumulating -- confirm the
	    two agree for the sequences in use.

	    Returns:
	        The (possibly newly created) trie root; ``None`` when nothing has
	        been accumulated on this process.
	    """
	    if not finalize:
	        if idx is None:
	            return trie_root
	        # Every process must take part in the gather, not just the main one.
	        idx_all = accelerator.gather(idx.detach())  # [B_total, L]
	        if accelerator.is_main_process:
	            if trie_root is None:
	                trie_root = TrieNode()
	            if max_depth <= 0:
	                max_depth = int(idx_all.shape[1])
	            for seq in idx_all.cpu().tolist():
	                trie_insert(trie_root, seq, max_depth=max_depth)
	        return trie_root

	    accelerator.wait_for_everyone()
	    if not accelerator.is_main_process or trie_root is None:
	        return trie_root
	    if max_depth <= 0:
	        max_depth = 256

	    H_cond, num_contexts = trie_conditional_entropy_all_positions(
	        trie_root, max_depth=max_depth, log_base=log_base
	    )
	    if len(H_cond) == 0:
	        return trie_root

	    out_dir = Path(save_dir) / "analysis_ae" / f"Step={global_step+1}-rFID={rFID:.4f}-gFID={gFID:.4f}"
	    out_dir.mkdir(exist_ok=True, parents=True)
	    fig_path = out_dir / "ae_data_conditional_entropy.png"

	    saved = plot_data_conditional_entropy(
	        out_path=str(fig_path),
	        H_cond=H_cond,
	        log_base=log_base,
	        codebook_size=codebook_size,
	        title_prefix="AE Data Conditional Entropy",
	    )

	    if saved and wandb is not None:
	        accelerator.log(
	            {"analysis/ae_data_conditional_entropy": wandb.Image(str(fig_path)), "global_step": global_step + 1},
	            step=global_step+1
	        )

	    # H_cond is known to be non-empty here (see the early return above).
	    mean_H = float(np.nanmean(np.array(H_cond, dtype=np.float64)))
	    accelerator.log(
	        {"analysis/ae_data_cond_entropy_mean": mean_H, "global_step": global_step + 1},
	        step=global_step+1
	    )
	    return trie_root

	@torch.no_grad()
	def draw_conditional_entropy(
	    acc: dict,
	    *,
	    logits: torch.Tensor | None = None,
	    accelerator: Accelerator,
	    save_dir: str,
	    global_step: int,
	    log_base: float = 2.0,
	    finalize: bool = False,
	    rFID: float = 0.0,
	    gFID: float = 0.0,
	    codebook_size: int | None = None,
	) -> None:
	    """Accumulate per-position logit entropy, then reduce, plot and log it.

	    Accumulation (``finalize=False``): the entropy of ``logits`` is summed per
	    position into ``acc["ent_sum"]`` and the batch size is added to
	    ``acc["ent_cnt"]``; both buffers grow when a longer sequence shows up.

	    Finalisation (``finalize=True``): after a barrier the buffers are
	    sum-reduced across processes; the main process then plots the mean
	    entropy per position under
	    ``<save_dir>/analysis_ar/Step=<global_step+1>-rFID=...-gFID=.../`` and logs
	    the image (when wandb is installed) and the NaN-ignoring mean.

	    NOTE(review): the reduce is skipped on a process whose ``acc`` is empty;
	    presumably every process accumulates at least once -- confirm, otherwise
	    the collective would not be entered by all ranks.
	    """
	    device = accelerator.device
	    if not finalize:
	        if logits is None:
	            return

	        ent = _entropy_from_logits(logits, log_base=log_base)  # [B, L]
	        L = int(ent.shape[1])
	        if acc.get("ent_sum") is None:
	            acc["ent_sum"] = torch.zeros(L, dtype=ent.dtype, device=device)
	            acc["ent_cnt"] = torch.zeros(L, dtype=torch.long, device=device)
	        elif int(acc["ent_sum"].shape[0]) < L:
	            # Grow both buffers with zeros to fit the longer sequence.
	            pad = L - int(acc["ent_sum"].shape[0])
	            acc["ent_sum"] = torch.cat(
	                [acc["ent_sum"], torch.zeros(pad, dtype=acc["ent_sum"].dtype, device=device)], dim=0,
	            )
	            acc["ent_cnt"] = torch.cat(
	                [acc["ent_cnt"], torch.zeros(pad, dtype=torch.long, device=device)], dim=0,
	            )
	        acc["ent_sum"][:L] += ent.sum(dim=0)
	        acc["ent_cnt"][:L] += int(ent.shape[0])
	        return

	    # finalize mode
	    accelerator.wait_for_everyone()
	    has_stats = acc.get("ent_sum") is not None and acc.get("ent_cnt") is not None
	    if has_stats:
	        acc["ent_sum"] = accelerator.reduce(acc["ent_sum"], reduction='sum')
	        acc["ent_cnt"] = accelerator.reduce(acc["ent_cnt"], reduction='sum')

	    if not accelerator.is_main_process:
	        return

	    H = None
	    if has_stats:
	        # Clamp so positions that never received data divide by 1, not 0.
	        denom = acc["ent_cnt"].clamp(min=1).to(acc["ent_sum"].dtype)
	        H = (acc["ent_sum"] / denom).detach().cpu().tolist()

	    out_dir = Path(save_dir) / "analysis_ar" / f"Step={global_step+1}-rFID={rFID:.4f}-gFID={gFID:.4f}"
	    out_dir.mkdir(exist_ok=True, parents=True)
	    fig_path = out_dir / "ar_prefix_conditional_entropy.png"
	    saved = plot_ar_prefix_conditional_entropy(
	        out_path=str(fig_path),
	        H=H,
	        log_base=log_base,
	        codebook_size=codebook_size,
	    )
	    # wandb is an optional import; without it the image is only kept on disk.
	    if saved and wandb is not None:
	        accelerator.log({"analysis/ar_prefix_conditional_entropy": wandb.Image(str(fig_path)), "global_step": global_step + 1}, step=global_step+1)
	    if H is not None and len(H) > 0:
	        accelerator.log(
	            {
	                "analysis/ar_entropy_mean_per_pos": float(np.nanmean(np.array(H, dtype=np.float64))),
	                "global_step": global_step + 1,
	            },
	            step=global_step+1,
	        )

	def generate_uniform_labels(
	    *,
	    num_samples: int,
	    num_classes: int,
	    accelerator: Accelerator,
	    exclude_uncond: bool = True,
	) -> torch.Tensor:
	    """Return this process's slice of a round-robin class-label sequence.

	    The global sequence is ``0, 1, ..., V-1, 0, 1, ...`` truncated to
	    ``num_samples`` entries, where ``V`` is ``num_classes - 1`` when
	    ``exclude_uncond`` is set and ``num_classes`` otherwise. It is split into
	    contiguous chunks of ``num_samples // num_processes``; the last process
	    additionally receives the remainder.

	    Returns:
	        A ``torch.long`` tensor on ``accelerator.device``.
	    """
	    if exclude_uncond:
	        num_valid_classes = num_classes - 1
	    else:
	        num_valid_classes = num_classes

	    repeats = num_samples // num_valid_classes + 1
	    label_cycle = (list(range(num_valid_classes)) * repeats)[:num_samples]
	    labels = torch.tensor(label_cycle, dtype=torch.long)

	    world_size = accelerator.num_processes
	    rank = accelerator.process_index
	    per_rank = num_samples // world_size
	    lo = rank * per_rank
	    # The last rank absorbs whatever does not divide evenly.
	    hi = num_samples if rank >= world_size - 1 else lo + per_rank

	    return labels[lo:hi].to(accelerator.device)


	class InfiniteIterator(Iterator):
	    """Iterator that restarts its underlying iterable whenever it runs out.

	    ``total_yielded`` counts every item handed out. When a ``dl_generator`` is
	    supplied, ``_pre_epoch_gen_state`` holds a copy of the generator state
	    captured at construction and again just before each restart.
	    """

	    # Private marker meaning "the current pass is exhausted".
	    _EXHAUSTED = object()

	    def __init__(self, iterable: Iterable, dl_generator: torch.Generator = None):
	        self.iterable = iterable
	        self.dl_generator = dl_generator
	        self._pre_epoch_gen_state = None
	        if dl_generator is not None:
	            self._pre_epoch_gen_state = dl_generator.get_state().clone()
	        self._it = iter(iterable)
	        self.total_yielded = 0

	    def __iter__(self):
	        return self

	    def __next__(self):
	        item = next(self._it, self._EXHAUSTED)
	        if item is self._EXHAUSTED:
	            # Snapshot the generator state before the new pass consumes it.
	            if self.dl_generator is not None:
	                self._pre_epoch_gen_state = self.dl_generator.get_state().clone()
	            self._it = iter(self.iterable)
	            # An empty iterable still raises StopIteration from here.
	            item = next(self._it)
	        self.total_yielded += 1
	        return item

	# ============================================================================
	# File / Checkpoint Utils
	# ============================================================================

	def safe_remove_file(path: str):
	    """Delete ``path`` if it exists; report failures instead of raising.

	    ``None`` and non-existent paths are ignored. Any exception raised while
	    checking or removing is caught and printed as a warning.
	    """
	    try:
	        if path is None or not os.path.exists(path):
	            return
	        os.remove(path)
	    except Exception as e:
	        print(f"Warning: Failed to remove {path}: {e}")


	def save_training_state(accelerator, save_path, extra_state, **updates):
	    """Checkpoint the accelerator state together with ``extra_state``.

	    Keyword ``updates`` are merged into ``extra_state`` in place first. All
	    processes call ``accelerator.save_state``; only the main process writes
	    ``extra_state.pt`` into ``save_path``; everyone then meets at a barrier.
	    """
	    if updates:
	        extra_state.update(updates)

	    accelerator.save_state(save_path)

	    if accelerator.is_main_process:
	        extra_file = os.path.join(save_path, "extra_state.pt")
	        torch.save(extra_state, extra_file)

	    accelerator.wait_for_everyone()


	def remove_old_best_checkpoints(ckpt_dir: str, metric_type: str = "gFID"):
	    """Delete every ``best--{metric_type}=<value>`` directory under ``ckpt_dir``.

	    Only directories are removed; files with a matching name are left alone.
	    Failures are reported as warnings and never raised.
	    """
	    # The trailing "*" is what matches the metric value; without it the
	    # pattern only matched a path literally named "best--<metric>=".
	    # ``escape`` keeps glob metacharacters in the inputs literal.
	    pattern = os.path.join(
	        glob_module.escape(ckpt_dir),
	        f"best--{glob_module.escape(metric_type)}=*",
	    )

	    for old_ckpt in glob_module.glob(pattern):
	        if not os.path.isdir(old_ckpt):
	            continue
	        try:
	            shutil.rmtree(old_ckpt)
	        except OSError as e:
	            print(f"Warning: Failed to remove {old_ckpt}: {e}")
	        else:
	            print(f"Removed old best checkpoint: {os.path.basename(old_ckpt)}")


	# ============================================================================
	# Image Processing Utils
	# ============================================================================

	def patchify(x, patch_size):
	    """Split ``[b, c, H, W]`` images into flattened square patches.

	    Returns a ``[b, (H/p)*(W/p), p*p*c]`` array where ``p`` is ``patch_size``;
	    within each patch the channel axis varies fastest.
	    """
	    to_patches = 'b c (h p1) (w p2) -> b (h w) (p1 p2 c)'
	    return rearrange(x, to_patches, p1=patch_size, p2=patch_size)


	def img_uint8_to_norm(x):
	    """Map pixel values from ``[0, 255]`` to floats in ``[-1, 1]``."""
	    as_float = x.float()
	    return as_float / 127.5 - 1.0


	def unpatchify(x, image_size, patch_size):
	    """Reassemble flattened patches into ``[b, c, H, W]`` square images.

	    The patch grid is ``image_size // patch_size`` patches along each side.
	    """
	    grid = image_size // patch_size
	    from_patches = 'b (h w) (p1 p2 c) -> b c (h p1) (w p2)'
	    return rearrange(x, from_patches, p1=patch_size, p2=patch_size, h=grid, w=grid)


	def img_denormalize(x):
	    """Clamp ``x`` to ``[-1, 1]`` and rescale it linearly to ``[0, 1]``."""
	    clipped = x.clamp(-1, 1)
	    return clipped * 0.5 + 0.5


	def img_norm_to_uint8(x):
	    """Convert values in ``[-1, 1]`` to ``uint8`` pixels in ``[0, 255]``.

	    The offset is 128 rather than 127.5, so the truncating ``uint8`` cast
	    rounds to the nearest integer.
	    """
	    scaled = 127.5 * x + 128.0
	    return torch.clamp(scaled, 0, 255).to(torch.uint8)


	# ============================================================================
	# FID Utils
	# ============================================================================

	def adm_fid_evaluator(sample_cached_path, gt_cache_path, config, accelerator: Accelerator, compute_is=False):
	    """Run ``eval_fid.py`` (next to this file) in a subprocess and parse its result.

	    The child's combined stdout/stderr is echoed line by line through
	    ``print0``; lines starting with ``FID_RESULT:`` / ``IS_RESULT:`` supply the
	    scores. ``accelerator`` is accepted but not used by this function.

	    Args:
	        sample_cached_path: Passed to the script as ``--sample_batch``.
	        gt_cache_path: Passed to the script as ``--ref_batch``.
	        config: Only ``config.eval_batch_size`` is read.
	        compute_is: Also pass ``--compute_is`` and return the IS score.

	    Returns:
	        ``FID``, or ``(FID, IS)`` when ``compute_is`` is set. A score stays
	        ``0.0`` when no result line could be parsed.

	    Raises:
	        FileNotFoundError: if either cache path does not exist.
	    """
	    if not os.path.exists(gt_cache_path):
	        raise FileNotFoundError(f"Ground-truth cache not found: {gt_cache_path}")
	    if not os.path.exists(sample_cached_path):
	        raise FileNotFoundError(f"Sample cache not found: {sample_cached_path}")

	    fid_script = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'eval_fid.py')
	    env = os.environ.copy()
	    cmd = [sys.executable, fid_script, "--ref_batch", gt_cache_path, "--sample_batch", sample_cached_path, "--batch_size", str(config.eval_batch_size)]
	    if compute_is:
	        cmd.append("--compute_is")
	    print0(f"Running FID evaluation via {fid_script}..." + (" (with IS)" if compute_is else ""))

	    FID = 0.0
	    IS = 0.0
	    # The context manager closes the pipe and reaps the child even on errors.
	    with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1, env=env) as process:
	        for raw_line in process.stdout:
	            line = raw_line.rstrip("\n")
	            if not line:
	                continue
	            print0(line, flush=True)
	            if line.startswith("FID_RESULT:"):
	                try:
	                    FID = float(line.split("FID_RESULT:")[1].strip())
	                except ValueError:
	                    print0(f"Warning: could not parse FID from line: {line!r}")
	            elif line.startswith("IS_RESULT:"):
	                try:
	                    IS = float(line.split("IS_RESULT:")[1].strip())
	                except ValueError:
	                    print0(f"Warning: could not parse IS from line: {line!r}")
	        retcode = process.wait()

	    if retcode != 0 and FID == 0.:
	        print0(f"eval_fid.py exited with code {retcode} and no FID_RESULT was parsed.")

	    if compute_is:
	        return FID, IS
	    return FID


	# ============================================================================
	# Training Utils
	# ============================================================================

	@torch.no_grad()
	@torch._dynamo.disable
	def _unwrap(model):
	    """Strip wrapper layers until the innermost object is reached.

	    A layer is recognised by an ``_orig_mod`` attribute (checked first) or a
	    ``module`` attribute. Layers are peeled in whatever order they are nested,
	    so a ``module`` wrapper around an ``_orig_mod`` wrapper is fully unwrapped
	    as well as the reverse nesting.

	    NOTE(review): presumably ``_orig_mod`` comes from ``torch.compile`` and
	    ``module`` from DDP-style wrappers; a raw model that itself has an
	    attribute named ``module`` is descended into too -- confirm none does.
	    """
	    while True:
	        if hasattr(model, '_orig_mod'):
	            model = model._orig_mod
	        elif hasattr(model, 'module'):
	            model = model.module
	        else:
	            return model


	@torch.no_grad()
	@torch._dynamo.disable
	def ema_update(model, ema_model, ema_rate):
	    """Blend ``model``'s parameters into ``ema_model`` in place.

	    Each EMA parameter becomes ``lerp(param, ema_param, ema_rate)``, i.e.
	    ``ema_rate * ema_param + (1 - ema_rate) * param``. Parameters are paired
	    by iteration order; only ``parameters()`` are touched. Does nothing when
	    either model is ``None``.
	    """
	    if model is None or ema_model is None:
	        return
	    pairs = zip(model.parameters(), ema_model.parameters())
	    for live_param, avg_param in pairs:
	        blended = live_param.detach().lerp(avg_param, ema_rate)
	        avg_param.copy_(blended)


	def sync_gradients(model, sub_modules=None):
	    """Average existing gradients across processes with ``all_reduce``.

	    Does nothing unless ``torch.distributed`` is available and initialised.

	    Args:
	        model: Module whose gradients are synchronised.
	        sub_modules: Optional attribute names on ``model``; when given, only
	            the parameters of those sub-modules are reduced.

	    NOTE(review): ``ReduceOp.AVG`` is presumably not supported by every
	    backend -- confirm for the backend in use.
	    """
	    import torch.distributed as dist
	    # ``is_available`` is checked first, matching the other rank helpers here.
	    if not (dist.is_available() and dist.is_initialized()):
	        return

	    if sub_modules is not None:
	        params = [p for name in sub_modules for p in getattr(model, name).parameters()]
	    else:
	        params = list(model.parameters())

	    for p in params:
	        if p.grad is not None:
	            dist.all_reduce(p.grad, op=dist.ReduceOp.AVG)


	def toggle_require_grad(model, grads=True, accelerator=None, sub_modules=None):
	    """Switch ``requires_grad`` on or off for a model or some of its parts.

	    The model is unwrapped with ``accelerator.unwrap_model`` when an
	    accelerator is given, otherwise one ``_orig_mod`` layer is stripped if
	    present. With ``sub_modules`` only those named attributes are changed.
	    Does nothing for ``None``.
	    """
	    if model is None:
	        return

	    if accelerator is not None:
	        target = accelerator.unwrap_model(model)
	    else:
	        target = getattr(model, "_orig_mod", model)

	    if sub_modules is not None:
	        for name in sub_modules:
	            getattr(target, name).requires_grad_(grads)
	        return

	    if hasattr(target, "requires_grad_"):
	        target.requires_grad_(grads)
	        return

	    # Fallback for objects that only expose ``parameters()``.
	    for p in target.parameters():
	        p.requires_grad_(grads)


	def toggle_train_eval(model, train=True, accelerator=None, sub_modules=None):
	    """Put a model, or only some of its sub-modules, into train or eval mode.

	    The model is unwrapped with ``accelerator.unwrap_model`` when an
	    accelerator is given, otherwise one ``_orig_mod`` layer is stripped if
	    present. With ``sub_modules`` only those named attributes are switched.
	    Does nothing for ``None``.
	    """
	    if model is None:
	        return

	    if accelerator is not None:
	        target = accelerator.unwrap_model(model)
	    else:
	        target = getattr(model, "_orig_mod", model)

	    if sub_modules is not None:
	        for name in sub_modules:
	            getattr(target, name).train(mode=train)
	    elif hasattr(target, "train"):
	        target.train(mode=train)


	def zero_nan_gradients(model, accelerator=None):
	    """Sanitise gradients in place: NaN -> 0, +inf -> 1e5, -inf -> -1e5.

	    The model is unwrapped with ``accelerator.unwrap_model`` when an
	    accelerator is given, otherwise one ``_orig_mod`` layer is stripped if
	    present. Parameters without a gradient are skipped.
	    """
	    if model is None:
	        return

	    if accelerator is not None:
	        target = accelerator.unwrap_model(model)
	    else:
	        target = getattr(model, "_orig_mod", model)

	    for _name, param in target.named_parameters():
	        grad = param.grad
	        if grad is None:
	            continue
	        grad.nan_to_num_(nan=0.0, posinf=1e5, neginf=-1e5)


	def calc_grad_norm(
	    named_models: dict,
	    global_step: int,
	    grad_norm_freq: int,
	    accelerator=None,
	) -> dict:
	    """Collect gradient L2 norms as a flat dict suitable for logging.

	    Work is only done when ``grad_norm_freq > 0`` and
	    ``(global_step + 1) % grad_norm_freq == 0``; otherwise ``{}`` is returned.

	    For every non-``None`` model in ``named_models`` (group name -> model),
	    each parameter with a gradient yields
	    ``"Gradient_Norm/<group>/<param name>"``. The group's overall norm is
	    stored under ``"Gradient_Norm/<group>/_total"`` when it is non-zero.
	    """
	    is_logging_step = grad_norm_freq > 0 and (global_step + 1) % grad_norm_freq == 0
	    if not is_logging_step:
	        return {}

	    result = {}
	    for group, model in named_models.items():
	        if model is None:
	            continue

	        # Unwrap the DDP wrapper, then strip torch.compile's OptimizedModule.
	        raw = model if accelerator is None else accelerator.unwrap_model(model)
	        while hasattr(raw, "_orig_mod"):
	            raw = raw._orig_mod

	        prefix = f"Gradient_Norm/{group}/"
	        per_param = {
	            name: param.grad.norm().item()
	            for name, param in raw.named_parameters()
	            if param.grad is not None
	        }
	        for name, pnorm in per_param.items():
	            result[prefix + name] = pnorm

	        sq_sum = sum((pnorm ** 2 for pnorm in per_param.values()), 0.0)
	        if sq_sum > 0.0:
	            result[prefix + "_total"] = sq_sum ** 0.5
	    return result


	def save_tensor_image_png_pdf(tensor, png_path: str, dpi: float = 300.0) -> None:
	    """Save ``[N, C, H, W]`` in [0, 1] to ``png_path`` and a sibling PDF.

	    The PNG is written with ``torchvision.utils.save_image`` and then
	    re-opened to produce ``<png_path without extension>.pdf``. An RGBA image
	    is composited onto a white background first; any other mode is converted
	    to RGB.

	    Args:
	        tensor: Image batch handed to ``torchvision.utils.save_image``.
	        png_path: Destination of the PNG file.
	        dpi: Resolution recorded in the PDF.
	    """
	    import torchvision.utils
	    from PIL import Image

	    torchvision.utils.save_image(tensor, png_path)
	    pdf_path = os.path.splitext(png_path)[0] + ".pdf"

	    # The context manager closes the PNG on every path, including errors.
	    with Image.open(png_path) as src:
	        if src.mode == "RGBA":
	            rgb = Image.new("RGB", src.size, (255, 255, 255))
	            # Use the alpha channel as the paste mask.
	            rgb.paste(src, mask=src.split()[3])
	        else:
	            rgb = src.convert("RGB")

	    try:
	        rgb.save(pdf_path, "PDF", resolution=dpi)
	    finally:
	        rgb.close()