# soilformer/modelling/soilformer.py
# Author: Kuangdai — "Initial release of SoilFormer" (commit 6fb6c07)
# soilformer.py
# -*- coding: utf-8 -*-
import json
import os
from pathlib import Path
from typing import Dict, Optional, Tuple
import torch
import torch.nn as nn
import torch.nn.functional as F # noqa
from decode_categorical import CategoricalDecoder
from decode_numeric import NumericDecoder
from embed_categorical import (
CategoricalEmbedding,
build_cat_vocab_spec_from_meta,
get_categorical_feature_names_from_meta,
save_cat_vocab_json,
)
from embed_numeric import (
NumericEmbedding,
build_numeric_vocab_spec_from_meta,
)
from embed_vision_gemma3n import Gemma3nVisionFeatureExtractor
from layer import TabularImageGQALayer
from utils import load_json, save_json, get_dtype
# ============================================================
# SoilFormer
# ============================================================
class SoilFormer(nn.Module):
    """
    Full model: embeddings -> TabularImageGQALayer stack -> decoders.

    Data flow (see forward()):
      1. Categorical and numeric inputs are embedded into a single
         tabular token stream of width ``tabular_dim``.
      2. ``layer_num_layers`` TabularImageGQALayer blocks process the
         stream, optionally attending to vision features produced by a
         pretrained Gemma3n vision extractor.
      3. Categorical / numeric decoders map the final tokens back to
         per-column logits and per-variable values, each paired with a
         log-variance head.

    Checkpointing: save_weights()/load_weights() deliberately exclude the
    pretrained frozen vision weights (vision_tower / embed_vision); those
    are always reloaded from ``vision_model_dir`` at construction time.
    """
    def __init__(self, config: Dict, device: Optional[str] = None):
        """
        Build all submodules from a flat config dict, then move the whole
        model to ``device`` with the configured dtype (default bfloat16).

        Args:
            config: hyper-parameter dict; required keys include the
                "cat_*", "numeric_*", "vision_*" and "layer_*" entries
                read below. Keys accessed with ``[]`` are mandatory
                (KeyError if absent); keys accessed with ``.get`` have
                defaults.
            device: optional device string; defaults to CUDA when
                available, else CPU.

        Raises:
            ValueError: if cat_hidden_size != numeric_hidden_size — both
                embedding families must share one tabular stream width.
        """
        super().__init__()
        # Defensive copy so later mutation by the caller cannot affect the model.
        self.config = dict(config)
        dtype = get_dtype(self.config.get("dtype", "bfloat16"))
        dev = torch.device(device or ("cuda" if torch.cuda.is_available() else "cpu"))
        # ---- Tabular dims
        # Categorical and numeric tokens are concatenated into one stream
        # in forward(), so their hidden sizes must match.
        cat_hidden = int(self.config["cat_hidden_size"])
        num_hidden = int(self.config["numeric_hidden_size"])
        if cat_hidden != num_hidden:
            raise ValueError("Expect cat_hidden_size == numeric_hidden_size for one tabular stream.")
        self.tabular_dim = cat_hidden
        # ---- Embeddings
        self.embed_cat = CategoricalEmbedding(
            hidden_size=cat_hidden,
            cat_vocab_json=self.config["cat_vocab_json"],
        )
        self.embed_num = NumericEmbedding(
            hidden_size=num_hidden,
            numeric_vocab_json=self.config["numeric_vocab_json"],
            middle_size=self.config.get("numeric_encode_middle_size", None),
        )
        # ---- Decoders
        # Decoders share the vocab JSONs with the embeddings so column
        # ordering stays consistent end to end.
        self.decode_cat = CategoricalDecoder(
            hidden_size=cat_hidden,
            cat_vocab_json=self.config["cat_vocab_json"],
            middle_size=self.config.get("cat_decode_middle_size", None),
            homoscedastic=self.config.get("cat_homoscedastic", True),
        )
        self.decode_num = NumericDecoder(
            hidden_size=num_hidden,
            numeric_vocab_json=self.config["numeric_vocab_json"],
            middle_size=self.config.get("numeric_decode_middle_size", None),
            homoscedastic=self.config.get("num_homoscedastic", True),
        )
        # ---- Vision
        # Pretrained tower is loaded on CPU first; the single self.to(...)
        # at the end moves it with everything else.
        self.vision_extractor = Gemma3nVisionFeatureExtractor.from_pretrained_vision_only_dir(
            model_dir=self.config["vision_model_dir"],
            map_location="cpu",
            num_output_tokens_reduced=self.config["vision_num_output_tokens_reduced"],
            num_heads_for_token_reduction=self.config["vision_num_heads_for_token_reduction"],
            reducer_bottleneck_dim=self.config["vision_reducer_bottleneck_dim"],
            reducer_project_back=self.config["vision_reducer_project_back"],
        )
        # ---- Layers
        L = int(self.config["layer_num_layers"])
        self.layers = nn.ModuleList([
            TabularImageGQALayer(
                tabular_dim=self.tabular_dim,
                vision_dim=self.vision_extractor.get_actual_hidden_dim(),
                num_query_heads=int(self.config["layer_num_query_heads"]),
                num_kv_heads=int(self.config["layer_num_kv_heads"]),
                head_dim=int(self.config["layer_head_dim"]),
                mlp_ratio=float(self.config["layer_mlp_ratio"]),
                dropout=float(self.config["layer_dropout"]),
            )
            for _ in range(L)
        ])
        # ---- Move
        # One move at the end so every submodule (incl. the vision tower)
        # ends up on the same device/dtype.
        self.to(device=dev, dtype=dtype)
    def init_weights(self, std: float = 0.02):
        """
        Re-initialize trainable submodules with the given std.

        NOTE(review): vision_extractor.init_weights presumably leaves the
        pretrained frozen tower untouched and only re-inits the reducer —
        confirm in embed_vision_gemma3n.
        """
        self.embed_cat.init_weights(std=std)
        self.embed_num.init_weights(std=std)
        self.decode_cat.init_weights(std=std)
        self.decode_num.init_weights(std=std)
        self.vision_extractor.init_weights(std=std)
        for blk in self.layers:
            blk.init_weights(std=std)
    def forward(
        self,
        cat_local_ids: torch.LongTensor,  # [B, M_cat]
        numeric_values_by_nin: Dict[int, torch.Tensor],  # {n_in: [B, V, n_in]}
        cat_valid_positions: Optional[torch.Tensor] = None,  # [B, M_cat] bool
        numeric_valid_positions_by_nin: Optional[Dict[int, torch.Tensor]] = None,  # {n_in: [B,V] bool}
        pixel_values: Optional[torch.Tensor] = None,  # [B, 3, H, W]
        vision_valid_positions: Optional[torch.Tensor] = None,  # [B] bool OR indices [K]
    ):
        """
        Run the full pipeline: embed -> layer stack -> decode.

        Vision is skipped entirely when ``pixel_values`` is None.

        Returns a 6-tuple:
            cat_logits_padded, cat_s, valid_class_mask  — from decode_cat
            value_by_nin, s_by_nin                      — from decode_num
            x_tab                                       — final tabular tokens [B, T_tab, H]
        """
        # ----------------------------
        # Embeddings (tabular)
        # ----------------------------
        x_cat, cat_mask = self.embed_cat(
            local_ids=cat_local_ids,
            valid_positions=cat_valid_positions,
        )
        x_num, num_mask = self.embed_num(
            values_by_nin=numeric_values_by_nin,
            valid_positions_by_nin=numeric_valid_positions_by_nin,
        )
        # Categorical tokens first, then numeric — the slicing after the
        # layer stack relies on this ordering.
        x_tab = torch.cat([x_cat, x_num], dim=1)  # [B, T_tab, H]
        B, T_tab, _ = x_tab.shape
        M_cat = x_cat.size(1)
        T_num = x_num.size(1)
        # ----------------------------
        # Tabular attention mask
        # ----------------------------
        cat_mask = cat_mask.to(device=x_tab.device, dtype=torch.long)
        num_mask = num_mask.to(device=x_tab.device, dtype=torch.long)
        # "disable_tabular_attention_mask" is a mandatory config key
        # (direct [] access); when truthy, all tokens attend everywhere.
        if self.config["disable_tabular_attention_mask"]:
            attention_mask_tab = torch.ones(B, T_tab, device=x_tab.device, dtype=torch.long)
        else:
            attention_mask_tab = torch.cat([cat_mask, num_mask], dim=1)
        if attention_mask_tab.shape != (B, T_tab):
            raise RuntimeError("Internal attention_mask_tab shape mismatch")
        # ----------------------------
        # Vision features
        # ----------------------------
        if pixel_values is None:
            vision_features = None
            vision_mask = None
        else:
            vision_features, vision_mask = self.vision_extractor(
                pixel_values=pixel_values,
                valid_positions=vision_valid_positions,
            )
            # Sanity-check extractor output against the tabular batch.
            if vision_features.shape[0] != B:
                raise ValueError("vision_features batch mismatch with tabular batch")
            if vision_mask.shape[0] != B or vision_mask.shape[1] != vision_features.shape[1]:
                raise ValueError("vision_mask shape mismatch with vision_features")
            vision_mask = vision_mask.to(
                device=attention_mask_tab.device,
                dtype=attention_mask_tab.dtype,
            )
        # ----------------------------
        # Transformer blocks
        # ----------------------------
        for blk in self.layers:  # type: TabularImageGQALayer
            x_tab = blk(
                x_tab=x_tab,
                attention_mask=attention_mask_tab,
                vision_features=vision_features,
                vision_mask=vision_mask
            )
        # ----------------------------
        # Slice outputs
        # ----------------------------
        # Undo the cat||num concatenation done above.
        x_cat_out = x_tab[:, :M_cat, :]
        x_num_out = x_tab[:, M_cat:M_cat + T_num, :]
        # ----------------------------
        # Decode
        # ----------------------------
        cat_logits_padded, cat_s, valid_class_mask = self.decode_cat(
            x_cat_out,
            return_padded=True,
        )
        value_by_nin, s_by_nin = self.decode_num(
            x_num_out
        )
        return cat_logits_padded, cat_s, valid_class_mask, value_by_nin, s_by_nin, x_tab
    def _checkpoint_state_dict(self) -> Dict[str, torch.Tensor]:
        """
        State dict used for save/load.
        Excludes pretrained frozen vision weights:
          - vision_extractor.vision_tower.*
          - vision_extractor.embed_vision.*
        Keeps reducer weights if reducer exists.
        """
        full_sd = self.state_dict()
        out = {}
        for k, v in full_sd.items():
            # Prefix filter: drop only the two frozen vision sub-trees.
            if k.startswith("vision_extractor.vision_tower."):
                continue
            if k.startswith("vision_extractor.embed_vision."):
                continue
            out[k] = v
        return out
    def save_weights(self, path: str) -> None:
        """
        Save model weights needed for SoilFormer training/inference,
        excluding pretrained frozen vision weights.

        The config dict is saved alongside the weights so the checkpoint
        is self-describing.
        """
        payload = {
            "model_state_dict": self._checkpoint_state_dict(),
            "config": self.config,
        }
        torch.save(payload, path)
    def load_weights(self, path: str, map_location: str = "cpu", strict: bool = True) -> Dict[str, list]:
        """
        Load weights saved by save_weights().
        Only the checkpoint-managed subset is loaded:
          - embeddings / decoders / layers
          - vision_extractor.reducer.* (if present)
        Pretrained frozen vision weights are ignored here and are expected
        to come from vision_model_dir during model construction.

        Args:
            path: checkpoint file; either the save_weights() payload or a
                bare state dict.
            map_location: forwarded to torch.load.
            strict: raise on any missing/unexpected checkpoint-managed key.

        Returns:
            {"missing_keys": [...], "unexpected_keys": [...]}

        NOTE(review): torch.load without weights_only unpickles arbitrary
        objects — only load trusted checkpoints.
        """
        ckpt = torch.load(path, map_location=map_location)
        if isinstance(ckpt, dict) and "model_state_dict" in ckpt:
            sd = ckpt["model_state_dict"]
        elif isinstance(ckpt, dict):
            # Bare state dict (no wrapper payload).
            sd = ckpt
        else:
            raise ValueError(f"Unsupported checkpoint format: {path}")
        expected_sd = self._checkpoint_state_dict()
        # Only keep keys that belong to the checkpoint-managed subset
        loadable_sd = {k: v for k, v in sd.items() if k in expected_sd}
        missing = sorted(set(expected_sd.keys()) - set(loadable_sd.keys()))
        unexpected = sorted(set(sd.keys()) - set(expected_sd.keys()))
        # Actually load
        # strict=False here because the full model legitimately has keys
        # (frozen vision tower) that the checkpoint never contains.
        load_info = self.load_state_dict(loadable_sd, strict=False)
        # PyTorch may still report missing keys from the full model state_dict;
        # keep only checkpoint-managed ones.
        missing_after_load = [
            k for k in load_info.missing_keys
            if k in expected_sd
        ]
        unexpected_after_load = [
            k for k in load_info.unexpected_keys
            if k in expected_sd
        ]
        # Merge both sources of mismatch info
        missing_final = sorted(set(missing) | set(missing_after_load))
        unexpected_final = sorted(set(unexpected) | set(unexpected_after_load))
        if strict and (missing_final or unexpected_final):
            raise RuntimeError(
                "Checkpoint load mismatch.\n"
                f"Missing keys: {missing_final}\n"
                f"Unexpected keys: {unexpected_final}"
            )
        return {
            "missing_keys": missing_final,
            "unexpected_keys": unexpected_final,
        }
def loss_function(
    x_cat: torch.Tensor,  # [B,M,Cmax] padded logits
    s_cat: torch.Tensor,  # [B,M] log-variance
    y_cat: torch.Tensor,  # [B,M] class index
    loss_mask_cat: torch.Tensor,  # [B,M] 0/1
    valid_class_mask: torch.Tensor,  # [M,Cmax] bool
    x_num: Dict[int, torch.Tensor],  # {n_in: [B,V,n_in]}
    s_num: Dict[int, torch.Tensor],  # {n_in: [B,V]}
    y_num: Dict[int, torch.Tensor],  # {n_in: [B,V,n_in]}
    loss_mask_num: Dict[int, torch.Tensor],  # {n_in: [B,V]} 0/1
    cat_temperature: float = 1.0,
    reduction: str = "mean",  # "mean" or "sum"
    eps: float = 1e-12,
    cat_s_bound: Optional[float] = None,
    num_s_bound: Optional[float] = None,
) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
    """
    Strict loss for SoilFormer.

    Categorical:
        - Per-column cross-entropy over the valid class range only;
          padded logit positions are never read.
        - s_cat[b,m] = log sigma^2 for categorical column m.
    Numeric:
        - Per-variable MSE averaged over the n_in dimensions.
        - s_num[n_in][b,v] = log sigma^2 for numeric variable v.
    Heteroscedastic weighting (both streams):
        L = exp(-s) * base + s
    Optional soft bound:
        If cat_s_bound or num_s_bound is not None, apply
        s <- bound * tanh(s / bound) before weighting; a non-positive
        bound disables the weighting entirely (s forced to 0).

    Returns:
        total_loss: scalar float32 tensor (cat_loss + num_loss)
        stats: dict of detached scalars: total, cat_loss, num_loss,
            cat_base, num_base, cat_count, num_count, cat_acc

    Raises:
        ValueError: on shape mismatches, invalid class ids, a column with
            no valid classes, a non-prefix valid_class_mask, or an
            unsupported reduction.
        KeyError: when a numeric n_in key is missing from y/s/mask dicts.
    """
    # Fail fast on a bad reduction instead of after the categorical pass
    # (same exception type and message as before, just earlier).
    if reduction not in ("mean", "sum"):
        raise ValueError(f"Unsupported reduction: {reduction}")

    def _soft_bound_logvar(s_: torch.Tensor, bound: Optional[float]) -> torch.Tensor:
        # Smoothly confine the log-variance to (-bound, bound).
        if bound is None:
            return s_
        b = float(bound)
        if b <= 0:
            # Turn off weighting by signalling a non-positive bound
            return torch.zeros_like(s_)
        return b * torch.tanh(s_ / b)

    # ---------------------------------------------------
    # 1) Categorical loss (strict per-column CE)
    # ---------------------------------------------------
    if x_cat.dim() != 3:
        raise ValueError(f"x_cat must be [B,M,Cmax], got {tuple(x_cat.shape)}")
    B, M, Cmax = x_cat.shape
    if s_cat.shape != (B, M):
        raise ValueError(f"s_cat must be [B,M]=({B},{M}), got {tuple(s_cat.shape)}")
    if y_cat.shape != (B, M):
        raise ValueError(f"y_cat must be [B,M]=({B},{M}), got {tuple(y_cat.shape)}")
    if loss_mask_cat.shape != (B, M):
        raise ValueError(f"loss_mask_cat must be [B,M]=({B},{M}), got {tuple(loss_mask_cat.shape)}")
    if valid_class_mask.shape != (M, Cmax):
        raise ValueError(
            f"valid_class_mask must be [M,Cmax]=({M},{Cmax}), got {tuple(valid_class_mask.shape)}"
        )
    x_cat_f = x_cat.float()
    s_cat_f = _soft_bound_logvar(s_cat.float(), cat_s_bound)
    y_cat_l = y_cat.long()
    mcat = loss_mask_cat.float()
    valid_class_mask = valid_class_mask.to(device=x_cat.device, dtype=torch.bool)
    if cat_temperature != 1.0:
        x_cat_f = x_cat_f / float(cat_temperature)

    # Per-column class counts, hoisted out of the loop: one device sync
    # (.tolist()) instead of M separate .item() calls. Also verify that
    # each column's valid classes form a contiguous prefix [0, C_m) —
    # the logits slicing below silently assumes this.
    counts = valid_class_mask.sum(dim=1)  # [M]
    prefix_mask = (
        torch.arange(Cmax, device=valid_class_mask.device).unsqueeze(0)
        < counts.unsqueeze(1)
    )
    if not torch.equal(prefix_mask, valid_class_mask):
        raise ValueError("valid_class_mask must mark a contiguous prefix of classes per column")
    class_counts = [int(c) for c in counts.tolist()]

    cat_loss_acc = torch.zeros((), device=x_cat.device, dtype=torch.float32)
    cat_base_acc = torch.zeros((), device=x_cat.device, dtype=torch.float32)
    cat_correct_acc = torch.zeros((), device=x_cat.device, dtype=torch.float32)
    # denominator = number of actively supervised categorical cells
    cat_denom = mcat.sum().clamp_min(float(eps))
    for m in range(M):
        cm = class_counts[m]  # real class count for column m
        if cm <= 0:
            raise ValueError(f"Column {m} has no valid classes")
        logits_m = x_cat_f[:, m, :cm]  # [B, C_m]
        target_m = y_cat_l[:, m]  # [B]
        s_m = s_cat_f[:, m]  # [B]
        mask_m = mcat[:, m]  # [B]
        active = mask_m > 0
        if active.any():
            tgt_active = target_m[active]
            if (tgt_active < 0).any() or (tgt_active >= cm).any():
                raise ValueError(f"y_cat contains invalid class id for categorical column {m}")
            # Clamp inactive targets to a legal id so cross_entropy never
            # indexes out of range; their contribution is masked out below.
            target_m_safe = target_m.clone()
            target_m_safe[~active] = 0
            ce_m = F.cross_entropy(
                logits_m,
                target_m_safe,
                reduction="none",
            )  # [B], float32
            # ---------------------------------------------------
            # accuracy (only count active positions)
            # ---------------------------------------------------
            pred_m = logits_m.argmax(dim=-1)  # [B]
            correct_m = (pred_m == target_m_safe) & active  # [B]
            cat_correct_acc = cat_correct_acc + correct_m.float().sum()
            # heteroscedastic weighting: exp(-s) * CE + s
            L_m = torch.exp(-s_m) * ce_m + s_m  # [B]
            cat_loss_acc = cat_loss_acc + (L_m * mask_m).sum()
            cat_base_acc = cat_base_acc + (ce_m * mask_m).sum()
    if reduction == "mean":
        cat_loss = cat_loss_acc / cat_denom
        cat_base = cat_base_acc / cat_denom
    else:  # "sum" (validated above)
        cat_loss = cat_loss_acc
        cat_base = cat_base_acc
    # Accuracy is always normalized by the active-cell count, regardless
    # of the reduction chosen for the losses.
    cat_acc = cat_correct_acc / cat_denom
    # ---------------------------------------------------
    # 2) Numeric loss (per-variable heteroscedastic MSE)
    # ---------------------------------------------------
    num_loss_acc = torch.zeros((), device=x_cat.device, dtype=torch.float32)
    num_base_acc = torch.zeros((), device=x_cat.device, dtype=torch.float32)
    num_denom_acc = torch.zeros((), device=x_cat.device, dtype=torch.float32)
    for n_in, x in x_num.items():
        if n_in not in y_num or n_in not in s_num or n_in not in loss_mask_num:
            raise KeyError(f"Missing key n_in={n_in} in y_num/s_num/loss_mask_num")
        y = y_num[n_in]
        s = s_num[n_in]
        m = loss_mask_num[n_in]
        if x.shape != y.shape:
            raise ValueError(
                f"x_num[{n_in}] and y_num[{n_in}] shape mismatch: "
                f"{tuple(x.shape)} vs {tuple(y.shape)}"
            )
        if x.dim() != 3:
            raise ValueError(f"x_num[{n_in}] must be [B,V,n_in], got {tuple(x.shape)}")
        Bb, V, Nin = x.shape
        if Nin != n_in:
            raise ValueError(f"x_num[{n_in}] last dim mismatch: got {Nin}, expected {n_in}")
        if s.shape != (Bb, V):
            raise ValueError(f"s_num[{n_in}] must be [B,V], got {tuple(s.shape)}")
        if m.shape != (Bb, V):
            raise ValueError(f"loss_mask_num[{n_in}] must be [B,V], got {tuple(m.shape)}")
        x_f = x.float()
        y_f = y.float()
        s_f = _soft_bound_logvar(s.float(), num_s_bound)
        m_f = m.float()
        # base numeric loss per variable: mean over n_in dims
        mse = (x_f - y_f).pow(2).mean(dim=-1)  # [B,V]
        # heteroscedastic weighting: exp(-s) * mse + s
        L = torch.exp(-s_f) * mse + s_f  # [B,V]
        num_loss_acc = num_loss_acc + (L * m_f).sum()
        num_base_acc = num_base_acc + (mse * m_f).sum()
        num_denom_acc = num_denom_acc + m_f.sum()
    num_denom = num_denom_acc.clamp_min(float(eps))
    if reduction == "mean":
        num_loss = num_loss_acc / num_denom
        num_base = num_base_acc / num_denom
    else:  # "sum" (validated above)
        num_loss = num_loss_acc
        num_base = num_base_acc
    # ---------------------------------------------------
    # 3) Total
    # ---------------------------------------------------
    total = cat_loss + num_loss
    stats = {
        "total": total.detach(),
        "cat_loss": cat_loss.detach(),
        "num_loss": num_loss.detach(),
        "cat_base": cat_base.detach(),
        "num_base": num_base.detach(),
        "cat_count": cat_denom.detach(),
        "num_count": num_denom.detach(),
        "cat_acc": cat_acc.detach(),
    }
    return total, stats
# ============================================================
# DEMO
# ============================================================
def _demo_main():
    """
    Smoke test: build a SoilFormer from a config JSON, run one dummy
    forward pass (optionally with dummy vision input), then evaluate
    loss_function on random targets, printing shapes and stats.

    Side effects: builds the categorical / numeric vocab JSON files if
    they do not already exist at the paths named in the config.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--config_json", type=str, default="config/config_model.json")
    parser.add_argument("--batch_size", type=int, default=2)
    parser.add_argument("--with_vision", action="store_true")
    args = parser.parse_args()
    cfg = load_json(args.config_json)
    print("===== Loaded config =====")
    print(json.dumps(cfg, ensure_ascii=False, indent=2))
    # --------------------------------------------------
    # Ensure vocab files exist
    # --------------------------------------------------
    # Vocab specs are derived from the tabular metadata on first run and
    # cached as JSON for subsequent runs.
    tabular_meta = load_json(cfg["tabular_meta"])
    if not os.path.isfile(cfg["cat_vocab_json"]):
        cat_names = get_categorical_feature_names_from_meta(tabular_meta)
        vocab = build_cat_vocab_spec_from_meta(tabular_meta, cat_names)
        Path(cfg["cat_vocab_json"]).parent.mkdir(parents=True, exist_ok=True)
        save_cat_vocab_json(vocab, cfg["cat_vocab_json"])
        print(f"[demo] Built cat_vocab_json at {cfg['cat_vocab_json']}")
    if not os.path.isfile(cfg["numeric_vocab_json"]):
        spec = build_numeric_vocab_spec_from_meta(tabular_meta)
        Path(cfg["numeric_vocab_json"]).parent.mkdir(parents=True, exist_ok=True)
        save_json(spec, cfg["numeric_vocab_json"])
        print(f"[demo] Built numeric_vocab_json at {cfg['numeric_vocab_json']}")
    # --------------------------------------------------
    # Build model
    # --------------------------------------------------
    model = SoilFormer(cfg)
    model.init_weights()
    model.eval()
    # Read device/dtype back from the model rather than the config.
    device = next(model.parameters()).device
    dtype = next(model.parameters()).dtype
    B = args.batch_size
    # --------------------------------------------------
    # Build dummy categorical inputs
    # --------------------------------------------------
    cat_spec = load_json(cfg["cat_vocab_json"])
    # Order columns by col_id so positions match the model's layout.
    cat_items = sorted(cat_spec.items(), key=lambda x: x[1]["col_id"])
    M_cat = len(cat_items)
    cat_local_ids = torch.zeros(B, M_cat, dtype=torch.long, device=device)
    cat_valid_positions = torch.ones(B, M_cat, dtype=torch.bool, device=device)
    # --------------------------------------------------
    # Build dummy numeric inputs
    # --------------------------------------------------
    num_spec = load_json(cfg["numeric_vocab_json"])
    numeric_values_by_nin: Dict[int, torch.Tensor] = {}
    numeric_valid_positions_by_nin: Dict[int, torch.Tensor] = {}
    for g in num_spec["groups"]:
        n_in = int(g["n_in"])
        V = len(g["feature_names"])
        numeric_values_by_nin[n_in] = torch.randn(B, V, n_in, device=device, dtype=dtype)
        numeric_valid_positions_by_nin[n_in] = torch.ones(B, V, dtype=torch.bool, device=device)
    # --------------------------------------------------
    # Build dummy vision inputs
    # --------------------------------------------------
    if args.with_vision:
        pixel_values = torch.randn(B, 3, 224, 224, device=device, dtype=dtype)
        vision_valid_positions = torch.ones(B, dtype=torch.bool, device=device)
    else:
        pixel_values = None
        vision_valid_positions = None
    # --------------------------------------------------
    # Vision debug
    # --------------------------------------------------
    print("\n===== Vision debug =====")
    if pixel_values is None:
        print("pixel_values: None")
        print("vision_features: None")
        print("vision_mask: None")
    else:
        print("pixel_values:", tuple(pixel_values.shape), pixel_values.dtype, pixel_values.device)
        # Extra standalone extractor call, just to print the shapes.
        with torch.no_grad():
            vision_features, vision_mask = model.vision_extractor.forward(
                pixel_values=pixel_values,
                valid_positions=vision_valid_positions,
            )
        print("vision_features:", tuple(vision_features.shape), vision_features.dtype, vision_features.device)
        print("vision_mask:", tuple(vision_mask.shape), vision_mask.dtype, vision_mask.device)
    # --------------------------------------------------
    # Forward
    # --------------------------------------------------
    with torch.no_grad():
        cat_logits_padded, cat_s, valid_class_mask, value_by_nin, s_by_nin, x_tab = model.forward(
            cat_local_ids=cat_local_ids,  # noqa
            numeric_values_by_nin=numeric_values_by_nin,
            cat_valid_positions=cat_valid_positions,
            numeric_valid_positions_by_nin=numeric_valid_positions_by_nin,
            pixel_values=pixel_values,
            vision_valid_positions=vision_valid_positions,
        )
    print("\n===== SoilFormer demo =====")
    print("cat_local_ids:", tuple(cat_local_ids.shape))
    print("cat_valid_positions:", tuple(cat_valid_positions.shape))
    print("numeric_values_by_nin:", {k: tuple(v.shape) for k, v in numeric_values_by_nin.items()})
    print("numeric_valid_positions_by_nin:", {k: tuple(v.shape) for k, v in numeric_valid_positions_by_nin.items()})
    print("x_tab_final:", tuple(x_tab.shape), x_tab.dtype, x_tab.device)
    print("Categorical outputs:")
    print("cat_logits_padded:", tuple(cat_logits_padded.shape), cat_logits_padded.dtype, cat_logits_padded.device)
    print("cat_s:", tuple(cat_s.shape), cat_s.dtype, cat_s.device)
    print("Numeric decoded values:", {k: tuple(v.shape) for k, v in value_by_nin.items()})
    print("Numeric decoded s:", {k: tuple(s.shape) for k, s in s_by_nin.items()})
    # --------------------------------------------------
    # Loss debug
    # --------------------------------------------------
    print("\n===== Loss debug =====")
    if cat_logits_padded.dim() != 3:
        raise RuntimeError(f"cat_logits_padded must be [B,M,Cmax], got {tuple(cat_logits_padded.shape)}")
    B_logits, M_cat2, Cmax2 = cat_logits_padded.shape
    if cat_s.shape != (B_logits, M_cat2):
        raise RuntimeError(f"cat_s shape mismatch: got {tuple(cat_s.shape)} expected {(B_logits, M_cat2)}")
    # Build dummy categorical targets within valid class ranges
    num_classes = [int(s["num_classes"]) for _, s in cat_items]
    if len(num_classes) != M_cat2:
        raise RuntimeError("M_cat mismatch between vocab and model output")
    y_cat = torch.zeros(B_logits, M_cat2, dtype=torch.long, device=device)
    for m, cm in enumerate(num_classes):
        # Random target per column, guaranteed inside [0, cm).
        y_cat[:, m] = torch.randint(low=0, high=cm, size=(B_logits,), device=device)
    mask_cat = torch.ones(B_logits, M_cat2, dtype=torch.long, device=device)
    # Build dummy numeric targets and masks
    y_num = {
        n_in: torch.randn_like(x_pred)
        for n_in, x_pred in value_by_nin.items()
    }
    mask_num = {
        n_in: torch.ones(x_pred.size(0), x_pred.size(1), dtype=torch.long, device=x_pred.device)
        for n_in, x_pred in value_by_nin.items()
    }
    total_loss, stats = loss_function(
        x_cat=cat_logits_padded,
        s_cat=cat_s,
        y_cat=y_cat,
        loss_mask_cat=mask_cat,
        x_num=value_by_nin,
        s_num=s_by_nin,
        y_num=y_num,
        loss_mask_num=mask_num,
        reduction="mean",
        valid_class_mask=valid_class_mask
    )
    print("total_loss:", float(total_loss))
    print("stats:", {k: float(v) for k, v in stats.items()})
    if not torch.isfinite(total_loss):
        raise RuntimeError("Loss is not finite!")
# Run the smoke-test demo only when executed as a script, not on import.
if __name__ == "__main__":
    _demo_main()