Kernels
wyldecat Claude Opus 4.6 committed on
Commit
67f7e11
·
1 Parent(s): a4d1f34

Replace toy PP tests with real-model-based pipeline tests [skip-build]

Browse files

Use Motif-2.6B (dense) and torchtitan Llama4 MoE (MoE) models with
realistic PP model splitting (deep copy → delete non-stage layers →
per-stage FSDP) matching actual torchtitan training pipeline. MoE test
uses torchtitan's parallelize_llama() directly. Both tests verify
correctness against sequential baseline with atol=0, rtol=0.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (2) hide show
  1. test/test_muon.py +95 -102
  2. test/test_muon_moe.py +91 -135
test/test_muon.py CHANGED
@@ -12,8 +12,8 @@ from torch.distributed.tensor import (DTensor, Replicate, Shard,
12
  distribute_tensor)
13
  from torch.profiler import ProfilerActivity, profile
14
 
15
- from .utils import (ParallelDims, assert_params_equal, parallelize_motif,
16
- parallelize_qk_logits)
17
 
18
  logger = logging.getLogger(__name__)
19
  logging.basicConfig(level=logging.INFO)
@@ -393,126 +393,119 @@ def test_parallel_muon_uneven_shard(init_dist, uneven_dim):
393
  uneven_dim, rank)
394
 
395
 
396
- def test_pp_dp_replicate_no_deadlock(init_dist):
397
- """Regression: PP-like setup where different rank subsets call
398
- construct_shard_mesh for different parameters must not deadlock.
399
- Also verifies correctness (atol=0, rtol=0) against sequential baseline.
400
 
401
- Simulates PP=2 with dp_replicate=2, dp_shard=2. Each PP stage has
402
- 4 ranks with a (2,2) mesh and [Replicate, Shard(0)] placements
403
- (created via fully_shard, matching the real HSDP pattern).
404
- Stages create different numbers of layers, forcing
405
- construct_shard_mesh to be called independently per stage.
406
- Without use_local_synchronization=True in dist.new_group(),
407
- this would deadlock.
 
 
408
  """
 
 
 
409
  from optimizer.distributed.utils import _ranks_to_dist_cache
410
- from optimizer.newton_schulz import set_ns_compile
411
- from torch.distributed.fsdp import fully_shard
412
 
413
  rank = dist.get_rank()
414
- world_size = dist.get_world_size()
415
- assert world_size == 8
416
 
417
  set_ns_compile(False)
418
-
419
- # Clear cache to ensure dist.new_group is actually called
420
  _ranks_to_dist_cache.clear()
421
 
422
- # Create full mesh: PP=2, dp_replicate=2, dp_shard=2
 
 
 
 
 
 
 
 
423
  full_mesh = dist.init_device_mesh(
424
  "cuda",
425
  (2, 2, 2),
426
  mesh_dim_names=("pp", "dp_replicate", "dp_shard"),
427
  )
428
-
429
- stage_mesh = full_mesh["dp_replicate", "dp_shard"]
430
  pp_rank = full_mesh.get_local_rank("pp")
431
 
432
- # Asymmetric layer counts per stage (mimics PP)
433
- num_layers = 3 if pp_rank == 0 else 5
434
- hidden = 64
435
-
436
- # Same seed per stage so all ranks in a stage get identical init weights
437
- torch.manual_seed(42 + pp_rank)
438
-
439
- # Create model and save initial state for sequential baseline
440
- model = torch.nn.Sequential(*[
441
- torch.nn.Linear(hidden, hidden, bias=False) for _ in range(num_layers)
442
- ]).cuda()
443
-
444
- init_state = {n: p.data.clone() for n, p in model.named_parameters()}
445
- grads = {n: torch.randn_like(p) for n, p in model.named_parameters()}
446
-
447
- # Apply FSDP (creates proper DTensors with [Replicate, Shard(0)])
448
- for layer in model:
449
- fully_shard(layer, mesh=stage_mesh)
450
- fully_shard(model, mesh=stage_mesh)
451
- model.reshard()
452
-
453
- # Apply grads with proper DTensor redistribution
454
- for n, p in model.named_parameters():
455
- g = grads[n]
456
- if isinstance(p.data, DTensor):
457
- ug = DTensor.from_local(
458
- g,
459
- device_mesh=p.data.device_mesh,
460
- placements=[Replicate()] * p.data.device_mesh.ndim,
461
- )
462
- p.grad = ug.redistribute(device_mesh=p.data.device_mesh,
463
- placements=p.data.placements)
464
  else:
465
- p.grad = g
466
-
467
- # Parallel Muon step — must not deadlock
468
- muon_names = [n for n, _ in model.named_parameters()]
469
- muon_params = [p for _, p in model.named_parameters()]
470
- param_groups = [{
471
- "params": muon_params,
472
- "names": muon_names,
473
- "use_muon": True,
474
- "lr": 0.02,
475
- "weight_decay": 0.01,
476
- "momentum": 0.95,
477
- "nesterov": True,
478
- "ns_steps": 5,
479
- "none_grad": False,
480
- }]
481
- optim = Muon(params=param_groups, chunk_size=1, warmup_step=0)
482
- optim.step()
483
-
484
- # Sequential baseline (base path, no sharding)
485
- torch.manual_seed(42 + pp_rank)
486
- model_seq = torch.nn.Sequential(*[
487
- torch.nn.Linear(hidden, hidden, bias=False) for _ in range(num_layers)
488
- ]).cuda()
489
-
490
- for n, p in model_seq.named_parameters():
491
- p.grad = grads[n].clone()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
492
 
493
- seq_names = [n for n, _ in model_seq.named_parameters()]
494
- seq_params = [p for _, p in model_seq.named_parameters()]
495
- param_groups_seq = [{
496
- "params": seq_params,
497
- "names": seq_names,
498
- "use_muon": True,
499
- "lr": 0.02,
500
- "weight_decay": 0.01,
501
- "momentum": 0.95,
502
- "nesterov": True,
503
- "ns_steps": 5,
504
- "none_grad": False,
505
- }]
506
- optim_seq = Muon(params=param_groups_seq)
507
- optim_seq.step()
508
 
509
  # Correctness: parallel must match sequential exactly
510
- for (n_par, p_par), (n_seq, p_seq) in zip(model.named_parameters(),
511
- model_seq.named_parameters()):
512
- par_data = p_par.data
513
- if isinstance(par_data, DTensor):
514
- par_data = par_data.full_tensor()
515
- torch.testing.assert_close(par_data, p_seq.data, atol=0, rtol=0)
516
 
517
  set_ns_compile(True)
518
  logger.info(
 
12
  distribute_tensor)
13
  from torch.profiler import ProfilerActivity, profile
14
 
15
+ from .utils import (ParallelDims, _apply_fsdp, assert_params_equal,
16
+ parallelize_motif, parallelize_qk_logits)
17
 
18
  logger = logging.getLogger(__name__)
19
  logging.basicConfig(level=logging.INFO)
 
393
  uneven_dim, rank)
394
 
395
 
396
+ def test_pp_dp_replicate_no_deadlock(init_dist, inputs):
397
+ """PP regression test using real Motif model.
 
 
398
 
399
+ PP=2, dp_replicate=2, dp_shard=2 on 8 GPUs. Splits the
400
+ Motif-2.6B-4layer model across 2 pipeline stages following the
401
+ torchtitan pattern (deep copy → delete non-stage layers → per-stage
402
+ FSDP). Each stage independently runs Muon optimizer and the result
403
+ is verified against a sequential baseline (atol=0, rtol=0).
404
+
405
+ Without use_local_synchronization=True in construct_shard_mesh(),
406
+ different stages would deadlock on dist.new_group() because they
407
+ call it for different parameters.
408
  """
409
+ import re
410
+
411
+ import torch.nn as nn
412
  from optimizer.distributed.utils import _ranks_to_dist_cache
 
 
413
 
414
  rank = dist.get_rank()
415
+ assert dist.get_world_size() == 8
 
416
 
417
  set_ns_compile(False)
 
 
418
  _ranks_to_dist_cache.clear()
419
 
420
+ model_orig, grads_orig, _ = inputs
421
+
422
+ # Build name→grad mapping from original model
423
+ grad_dict = {
424
+ name: grad
425
+ for (name, _), grad in zip(model_orig.named_parameters(), grads_orig)
426
+ }
427
+
428
+ # Full mesh: PP=2, dp_replicate=2, dp_shard=2
429
  full_mesh = dist.init_device_mesh(
430
  "cuda",
431
  (2, 2, 2),
432
  mesh_dim_names=("pp", "dp_replicate", "dp_shard"),
433
  )
434
+ dp_mesh = full_mesh["dp_replicate", "dp_shard"]
 
435
  pp_rank = full_mesh.get_local_rank("pp")
436
 
437
+ # -- Helpers ----------------------------------------------------------
438
+ def _split_motif(model):
439
+ """Split Motif model per PP stage (torchtitan pattern).
440
+
441
+ Stage 0: embed_tokens + layers[0:2]
442
+ Stage 1: layers[2:4] + norm + output
443
+ Non-stage components replaced with nn.Identity (no params).
444
+ """
445
+ all_layers = list(model.model.layers)
446
+ if pp_rank == 0:
447
+ model.model.layers = nn.ModuleList(all_layers[:2])
448
+ model.model.norm = nn.Identity()
449
+ if hasattr(model, "output"):
450
+ model.output = nn.Identity()
451
+ if hasattr(model, "lm_head"):
452
+ model.lm_head = nn.Identity()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
453
  else:
454
+ model.model.layers = nn.ModuleList(all_layers[2:])
455
+ model.model.embed_tokens = nn.Identity()
456
+ return model
457
+
458
+ layer_offset = 0 if pp_rank == 0 else 2
459
+
460
+ def _remap(name):
461
+ """Map stage param name → original param name (layer index offset).
462
+
463
+ Also handles weight tying: Motif ties lm_head.weight to
464
+ model.embed_tokens.weight, so named_parameters() only lists the
465
+ latter. After stage-split, stage 1 loses embed_tokens but keeps
466
+ lm_head, so we remap it back.
467
+ """
468
+ # Weight tying: lm_head.weight ↔ model.embed_tokens.weight
469
+ if name == "lm_head.weight":
470
+ return "model.embed_tokens.weight"
471
+
472
+ if layer_offset == 0:
473
+ return name
474
+
475
+ def _replace(m):
476
+ return f"layers.{int(m.group(1)) + layer_offset}."
477
+
478
+ return re.sub(r"layers\.(\d+)\.", _replace, name)
479
+
480
+ def _stage_grads(model):
481
+ """Build grads list aligned with stage model parameters."""
482
+ return [grad_dict[_remap(n)] for n, _ in model.named_parameters()]
483
+
484
+ # -- Parallel path: split → FSDP → Muon step -------------------------
485
+ par_model = _split_motif(copy.deepcopy(model_orig).cuda())
486
+ _apply_fsdp(par_model, dp_mesh)
487
+ par_model, _ = apply_muon_step(
488
+ model=par_model,
489
+ parallel_dims=None,
490
+ grads=_stage_grads(par_model),
491
+ warmup_step=5,
492
+ chunk_size=2,
493
+ qk_logits=None,
494
+ )
495
 
496
+ # -- Sequential baseline: split → no FSDP → base Muon ----------------
497
+ seq_model = _split_motif(copy.deepcopy(model_orig).cuda())
498
+ seq_model, _ = apply_muon_step(
499
+ model=seq_model,
500
+ parallel_dims=None,
501
+ grads=_stage_grads(seq_model),
502
+ warmup_step=-1,
503
+ chunk_size=-1,
504
+ qk_logits=None,
505
+ )
 
 
 
 
 
506
 
507
  # Correctness: parallel must match sequential exactly
508
+ assert_params_equal(par_model, seq_model, atol=0, rtol=0)
 
 
 
 
 
509
 
510
  set_ns_compile(True)
511
  logger.info(
test/test_muon_moe.py CHANGED
@@ -404,157 +404,113 @@ def test_parallel_muon_moe_uneven_shard(init_dist, uneven_dim):
404
  uneven_dim, rank)
405
 
406
 
407
- def test_pp_dp_replicate_moe_no_deadlock(init_dist):
408
- """Regression: PP-like MoE setup where different stages have different
409
- parameter types must not deadlock in construct_shard_mesh.
410
- Also verifies correctness (atol=0, rtol=0) against sequential baseline.
411
-
412
- Simulates PP=2 with dp_replicate=2, dp_shard=2. Stage 0 has only
413
- non-expert 2D FSDP-sharded params; stage 1 has 2D FSDP-sharded params
414
- plus 3D expert plain-tensor params. This mirrors real PP+MoE where
415
- expert layers exist only in certain stages.
 
 
 
 
416
  """
417
  from optimizer.distributed.utils import _ranks_to_dist_cache
418
  from optimizer.newton_schulz import set_ns_compile
419
- from torch.distributed.fsdp import fully_shard
 
 
420
 
421
  rank = dist.get_rank()
422
- world_size = dist.get_world_size()
423
- assert world_size == 8
424
 
425
  set_ns_compile(False)
426
-
427
- # Clear cache to ensure dist.new_group is actually called
428
  _ranks_to_dist_cache.clear()
429
 
430
- # Create full mesh: PP=2, dp_replicate=2, dp_shard=2
431
- full_mesh = dist.init_device_mesh(
432
- "cuda",
433
- (2, 2, 2),
434
- mesh_dim_names=("pp", "dp_replicate", "dp_shard"),
 
 
 
 
 
 
 
 
 
 
 
 
 
435
  )
436
 
437
- stage_mesh = full_mesh["dp_replicate", "dp_shard"]
438
- pp_rank = full_mesh.get_local_rank("pp")
439
-
440
- num_dense = 2 if pp_rank == 0 else 3
441
- num_experts = 4
442
- hidden = 64
443
-
444
- torch.manual_seed(42 + pp_rank)
445
-
446
- # Create model with dense layers (+ expert param for stage 1)
447
- model = torch.nn.Sequential(*[
448
- torch.nn.Linear(hidden, hidden, bias=False) for _ in range(num_dense)
449
- ]).cuda()
450
-
451
- # Save init state and grads for sequential baseline
452
- init_state = {n: p.data.clone() for n, p in model.named_parameters()}
453
- dense_grads = {n: torch.randn_like(p) for n, p in model.named_parameters()}
454
-
455
- # Expert param (stage 1 only, plain tensor — not FSDP-sharded)
456
- expert_data = None
457
- expert_grad = None
458
- if pp_rank == 1:
459
- expert_data = torch.randn(num_experts, hidden, hidden, device="cuda")
460
- expert_grad = torch.randn(num_experts, hidden, hidden, device="cuda")
461
-
462
- # Apply FSDP to dense layers
463
- for layer in model:
464
- fully_shard(layer, mesh=stage_mesh)
465
- fully_shard(model, mesh=stage_mesh)
466
- model.reshard()
467
-
468
- # Apply dense grads with DTensor redistribution
469
- for n, p in model.named_parameters():
470
- g = dense_grads[n]
471
- if isinstance(p.data, DTensor):
472
- ug = DTensor.from_local(
473
- g,
474
- device_mesh=p.data.device_mesh,
475
- placements=[Replicate()] * p.data.device_mesh.ndim,
476
- )
477
- p.grad = ug.redistribute(device_mesh=p.data.device_mesh,
478
- placements=p.data.placements)
479
  else:
480
- p.grad = g
481
-
482
- # Build param groups: dense (FSDP DTensors) + expert (plain tensor)
483
- muon_names = [n for n, _ in model.named_parameters()]
484
- muon_params = list(model.parameters())
485
-
486
- if pp_rank == 1:
487
- expert_p = torch.nn.Parameter(expert_data.clone())
488
- expert_p.grad = expert_grad.clone()
489
- muon_params.append(expert_p)
490
- muon_names.append("experts.w1.weight")
491
-
492
- param_groups = [{
493
- "params": muon_params,
494
- "names": muon_names,
495
- "use_muon": True,
496
- "lr": 0.02,
497
- "weight_decay": 0.01,
498
- "momentum": 0.95,
499
- "nesterov": True,
500
- "ns_steps": 5,
501
- "none_grad": False,
502
- }]
503
-
504
- # Must not deadlock
505
- optim = Muon(params=param_groups,
506
- chunk_size=1,
507
- warmup_step=0,
508
- expert_keys=["experts"])
509
- optim.step()
510
-
511
- # Sequential baseline
512
- torch.manual_seed(42 + pp_rank)
513
- model_seq = torch.nn.Sequential(*[
514
- torch.nn.Linear(hidden, hidden, bias=False) for _ in range(num_dense)
515
- ]).cuda()
516
-
517
- seq_names = [n for n, _ in model_seq.named_parameters()]
518
- seq_params = list(model_seq.parameters())
519
-
520
- for n, p in model_seq.named_parameters():
521
- p.grad = dense_grads[n].clone()
522
 
523
- if pp_rank == 1:
524
- expert_p_seq = torch.nn.Parameter(expert_data.clone())
525
- expert_p_seq.grad = expert_grad.clone()
526
- seq_params.append(expert_p_seq)
527
- seq_names.append("experts.w1.weight")
528
 
529
- param_groups_seq = [{
530
- "params": seq_params,
531
- "names": seq_names,
532
- "use_muon": True,
533
- "lr": 0.02,
534
- "weight_decay": 0.01,
535
- "momentum": 0.95,
536
- "nesterov": True,
537
- "ns_steps": 5,
538
- "none_grad": False,
539
- }]
540
- optim_seq = Muon(params=param_groups_seq, expert_keys=["experts"])
541
- optim_seq.step()
542
 
543
  # Correctness: parallel must match sequential exactly
544
- # Dense params
545
- for (n_par, p_par), (n_seq, p_seq) in zip(model.named_parameters(),
546
- model_seq.named_parameters()):
547
- par_data = p_par.data
548
- if isinstance(par_data, DTensor):
549
- par_data = par_data.full_tensor()
550
- torch.testing.assert_close(par_data, p_seq.data, atol=0, rtol=0)
551
-
552
- # Expert params (stage 1 only)
553
- if pp_rank == 1:
554
- torch.testing.assert_close(muon_params[-1].data,
555
- seq_params[-1].data,
556
- atol=0,
557
- rtol=0)
558
 
559
  set_ns_compile(True)
560
  logger.info(
 
404
  uneven_dim, rank)
405
 
406
 
407
+ def test_pp_dp_replicate_moe_no_deadlock(init_dist, moe_inputs):
408
+ """PP regression test using real torchtitan Llama4 MoE model.
409
+
410
+ PP=2, dp_replicate=2, dp_shard=2 on 8 GPUs. Splits the Llama4 MoE
411
+ model (4 layers, 8 experts) across 2 pipeline stages following the
412
+ torchtitan pattern. Uses torchtitan's ``parallelize_llama`` for
413
+ realistic FSDP application (same function as real training).
414
+
415
+ Each stage independently runs Muon optimizer with expert_keys and
416
+ the result is verified against a sequential baseline (atol=0, rtol=0).
417
+
418
+ Without use_local_synchronization=True in construct_shard_mesh(),
419
+ different stages would deadlock on dist.new_group().
420
  """
421
  from optimizer.distributed.utils import _ranks_to_dist_cache
422
  from optimizer.newton_schulz import set_ns_compile
423
+ from torchtitan.config import JobConfig
424
+ from torchtitan.distributed import ParallelDims as TTParallelDims
425
+ from torchtitan.models.llama4.infra.parallelize import parallelize_llama
426
 
427
  rank = dist.get_rank()
428
+ assert dist.get_world_size() == 8
 
429
 
430
  set_ns_compile(False)
 
 
431
  _ranks_to_dist_cache.clear()
432
 
433
+ model_orig, grads_orig = moe_inputs
434
+
435
+ # Build name→grad mapping from original model
436
+ grad_dict = {
437
+ name: grad
438
+ for (name, _), grad in zip(model_orig.named_parameters(), grads_orig)
439
+ }
440
+
441
+ # torchtitan ParallelDims with PP=2 (same as real training config)
442
+ tt_dims = TTParallelDims(
443
+ dp_replicate=2,
444
+ dp_shard=2,
445
+ cp=1,
446
+ tp=1,
447
+ pp=2,
448
+ ep=1,
449
+ etp=1,
450
+ world_size=8,
451
  )
452
 
453
+ # Accessing world_mesh triggers build_mesh() (lazy init).
454
+ # All ranks participate in init_device_mesh (collective).
455
+ pp_rank = tt_dims.world_mesh.get_local_rank("pp")
456
+
457
+ job_config = JobConfig()
458
+ job_config.training.mixed_precision_param = "float32"
459
+ job_config.activation_checkpoint.mode = "none"
460
+ job_config.compile.enable = False
461
+ job_config.parallelism.disable_loss_parallel = True
462
+
463
+ # -- Helpers ----------------------------------------------------------
464
+ def _split_llama4(model):
465
+ """Split Llama4 MoE model per PP stage (torchtitan pattern).
466
+
467
+ Stage 0: tok_embeddings + layers["0"], ["1"]
468
+ Stage 1: layers["2"], ["3"] + norm + output
469
+ ModuleDict preserves keys → param names unchanged.
470
+ torchtitan model natively supports None modules in forward().
471
+ """
472
+ if pp_rank == 0:
473
+ for key in ["2", "3"]:
474
+ if key in model.layers:
475
+ del model.layers[key]
476
+ model.norm = None
477
+ model.output = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
478
  else:
479
+ for key in ["0", "1"]:
480
+ if key in model.layers:
481
+ del model.layers[key]
482
+ model.tok_embeddings = None
483
+ return model
484
+
485
+ def _stage_grads(model):
486
+ """Build grads list aligned with stage model parameters."""
487
+ return [grad_dict[n] for n, _ in model.named_parameters()]
488
+
489
+ # -- Parallel path: split → parallelize_llama → Muon step -------------
490
+ par_model = _split_llama4(copy.deepcopy(model_orig).cuda())
491
+ parallelize_llama(par_model, tt_dims, job_config)
492
+
493
+ par_model, _ = apply_muon_step_moe(
494
+ model=par_model,
495
+ parallel_dims=None,
496
+ grads=_stage_grads(par_model),
497
+ warmup_step=5,
498
+ chunk_size=2,
499
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
500
 
501
+ # -- Sequential baseline: split → no parallelization → base Muon ------
502
+ seq_model = _split_llama4(copy.deepcopy(model_orig).cuda())
 
 
 
503
 
504
+ seq_model, _ = apply_muon_step_moe(
505
+ model=seq_model,
506
+ parallel_dims=None,
507
+ grads=_stage_grads(seq_model),
508
+ warmup_step=-1,
509
+ chunk_size=-1,
510
+ )
 
 
 
 
 
 
511
 
512
  # Correctness: parallel must match sequential exactly
513
+ assert_params_equal(par_model, seq_model, atol=0, rtol=0)
 
 
 
 
 
 
 
 
 
 
 
 
 
514
 
515
  set_ns_compile(True)
516
  logger.info(