AshenNav committed · verified
Commit 6cf8dee · 1 parent: 933c21c

Upload twill/gaus_solver.py with huggingface_hub

Files changed (1):
  1. twill/gaus_solver.py +888 -0
twill/gaus_solver.py ADDED
@@ -0,0 +1,888 @@
"""
GauS: Differentiable Scheduling Optimization via Gaussian Reparameterization

Implementation of the paper by Yaohui Cai et al. (arXiv:2602.20427)

GauS models operator scheduling as a stochastic relaxation using Gaussian
distributions, optimized via gradient descent with an Augmented Lagrangian
Method (ALM). It supports:
- Formulation A: Latency-constrained resource + communication optimization
- Formulation B: Latency-constrained memory footprint optimization
- Formulation C: Modulo scheduling (pipelined) — directly comparable to Twill

Key advantages over ILP/SMT (Twill Phase 1+2):
- O(|V|) parameters vs O(D·|V|) for categorical approaches
- Scales to 10K+ operator graphs where ILP/SMT time out
- Exploits GPU parallelism for gradient computation

This module integrates with Twill's DependenceGraph as an alternative solver.
"""

import math
import time
from collections import deque
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
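

# Quick-start sketch (a minimal example, not from the paper):
#
#     graph = GausGraph(num_nodes=3, edges=[(0, 1), (1, 2)])
#     solver = GauSSolver(graph, D=6)
#     result = solver.solve_regular(verbose=False)
#     print(result.named_schedule())  # e.g. {'v0': 0, 'v1': 1, 'v2': 2}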


# ============================================================
# Gaussian CDF helper
# ============================================================

def gaussian_cdf(x: torch.Tensor) -> torch.Tensor:
    """Standard Gaussian CDF: Φ(x) = 0.5 * (1 + erf(x / sqrt(2)))"""
    return 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


# ============================================================
# GauS Graph representation (standalone, bridges to Twill)
# ============================================================

@dataclass
class GausGraph:
    """Graph representation for GauS solver.

    Attributes:
        num_nodes: Number of operators |V|
        edges: List of (src, dst) forward dependency edges
        back_edges: List of (src, dst, k) loop-carried back-edges with iteration distance k
        resource_weights: Per-node resource demand w_i, shape [|V|]
        memory_weights: Per-node storage bitwidth b_i, shape [|V|]
        successors: Dict mapping node -> list of successor nodes
        predecessors: Dict mapping node -> list of predecessor nodes
        node_names: Optional names for nodes
    """
    num_nodes: int
    edges: List[Tuple[int, int]]  # (src, dst) forward edges
    back_edges: List[Tuple[int, int, int]] = field(default_factory=list)  # (src, dst, k)
    resource_weights: Optional[np.ndarray] = None  # [|V|]
    memory_weights: Optional[np.ndarray] = None  # [|V|]
    node_names: Optional[List[str]] = None

    def __post_init__(self):
        # Build adjacency
        self.successors: Dict[int, List[int]] = {i: [] for i in range(self.num_nodes)}
        self.predecessors: Dict[int, List[int]] = {i: [] for i in range(self.num_nodes)}
        for (u, v) in self.edges:
            self.successors[u].append(v)
            self.predecessors[v].append(u)
        # Defaults
        if self.resource_weights is None:
            self.resource_weights = np.ones(self.num_nodes)
        if self.memory_weights is None:
            self.memory_weights = np.ones(self.num_nodes)
        if self.node_names is None:
            self.node_names = [f"v{i}" for i in range(self.num_nodes)]

    def topological_sort(self) -> List[int]:
        """Kahn's algorithm for topological sort."""
        in_degree = [0] * self.num_nodes
        for (u, v) in self.edges:
            in_degree[v] += 1
        queue = deque(i for i in range(self.num_nodes) if in_degree[i] == 0)
        order = []
        while queue:
            node = queue.popleft()  # deque gives O(1) pops vs O(N) for list.pop(0)
            order.append(node)
            for succ in self.successors[node]:
                in_degree[succ] -= 1
                if in_degree[succ] == 0:
                    queue.append(succ)
        return order


# ============================================================
# ASAP / ALAP computation
# ============================================================

def compute_asap(graph: GausGraph) -> np.ndarray:
    """Compute As-Soon-As-Possible schedule (longest path from sources)."""
    asap = np.zeros(graph.num_nodes, dtype=np.float64)
    for v in graph.topological_sort():
        for pred in graph.predecessors[v]:
            asap[v] = max(asap[v], asap[pred] + 1)
    return asap


def compute_alap(graph: GausGraph, D: int) -> np.ndarray:
    """Compute As-Late-As-Possible schedule (latest feasible time given depth D)."""
    alap = np.full(graph.num_nodes, D - 1, dtype=np.float64)
    for v in reversed(graph.topological_sort()):
        for succ in graph.successors[v]:
            alap[v] = min(alap[v], alap[succ] - 1)
    return alap
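

# Worked example (a sketch): for the chain v0 -> v1 -> v2 with D = 5,
# compute_asap gives [0, 1, 2] and compute_alap gives [2, 3, 4], so every
# node has scheduling freedom ALAP - ASAP = 2, which seeds σ below.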


# ============================================================
# GauS Solver Result
# ============================================================

@dataclass
class GausResult:
    """Result from GauS solver.

    Attributes:
        schedule: Dict mapping node index -> scheduled time step
        initiation_interval: II (for modulo scheduling, else None)
        objective_value: Final objective value
        num_violations: Number of constraint violations in final schedule
        solve_time_seconds: Wall-clock solve time
        iterations: Number of optimization iterations
        loss_history: List of total loss values per iteration
        node_names: Optional node names for display
    """
    schedule: Dict[int, int]
    initiation_interval: Optional[int] = None
    objective_value: float = 0.0
    num_violations: int = 0
    solve_time_seconds: float = 0.0
    iterations: int = 0
    loss_history: List[float] = field(default_factory=list)
    node_names: Optional[List[str]] = None

    @property
    def is_feasible(self) -> bool:
        return self.num_violations == 0

    def named_schedule(self) -> Dict[str, int]:
        if self.node_names:
            return {self.node_names[i]: t for i, t in self.schedule.items()}
        return {f"v{i}": t for i, t in self.schedule.items()}

    def __repr__(self):
        sched_str = self.named_schedule()
        return (
            f"GausResult(\n"
            f"  schedule={sched_str}\n"
            f"  II={self.initiation_interval}\n"
            f"  objective={self.objective_value:.4f}\n"
            f"  violations={self.num_violations}\n"
            f"  feasible={self.is_feasible}\n"
            f"  solve_time={self.solve_time_seconds:.2f}s\n"
            f"  iterations={self.iterations}\n"
            f")"
        )


# ============================================================
# Core GauS Solver
# ============================================================

class GauSSolver:
    """Differentiable scheduling solver using Gaussian reparameterization.

    Implements Algorithm 1 from the paper with all three formulations.

    Usage:
        solver = GauSSolver(graph, D=10)
        result = solver.solve_regular()       # Formulation A
        result = solver.solve_modulo(II=3)    # Formulation C (like Twill)
    """

    def __init__(
        self,
        graph: GausGraph,
        D: int,
        kappa: float = 1.0 / 6.0,
        rho: float = 1e-4,
        tau: float = 1e-2,
        lr: float = 1e-2,
        device: str = "cpu",
    ):
        """
        Args:
            graph: The scheduling graph
            D: Maximum schedule depth (latency bound)
            kappa: Std init scale factor (σ = κ · (ALAP - ASAP))
            rho: ALM penalty coefficient
            tau: LogSumExp temperature
            lr: Adam learning rate
            device: torch device
        """
        self.graph = graph
        self.D = D
        self.kappa = kappa
        self.rho = rho
        self.tau = tau
        self.lr = lr
        self.device = device
        self.N = graph.num_nodes

        # Precompute ASAP/ALAP
        self.s_asap = compute_asap(graph)
        self.s_alap = compute_alap(graph, D)

        # Precompute edge tensors for vectorized loss computation
        if graph.edges:
            self.edge_src = torch.tensor([e[0] for e in graph.edges], dtype=torch.long, device=device)
            self.edge_dst = torch.tensor([e[1] for e in graph.edges], dtype=torch.long, device=device)
        else:
            self.edge_src = torch.tensor([], dtype=torch.long, device=device)
            self.edge_dst = torch.tensor([], dtype=torch.long, device=device)

        # Resource and memory weights
        self.w = torch.tensor(graph.resource_weights, dtype=torch.float32, device=device)
        self.b = torch.tensor(graph.memory_weights, dtype=torch.float32, device=device)

    def _init_params(self, mu_init: Optional[np.ndarray] = None) -> Tuple[nn.Parameter, nn.Parameter]:
        """Initialize μ and σ parameters (Section 3.1)."""
        if mu_init is not None:
            mu = torch.tensor(mu_init, dtype=torch.float32, device=self.device)
        else:
            # μ₀ = (ASAP + ALAP) / 2
            mu = torch.tensor(
                (self.s_asap + self.s_alap) / 2.0,
                dtype=torch.float32, device=self.device,
            )

        # σ = κ · (ALAP - ASAP), floored at 0.5 so nodes with zero scheduling
        # freedom still get a nonzero exploration width
        freedom = self.s_alap - self.s_asap
        sigma_init = self.kappa * np.maximum(freedom, 0.5)
        sigma = torch.tensor(sigma_init, dtype=torch.float32, device=self.device)

        return nn.Parameter(mu), nn.Parameter(sigma)

    def _compute_P(self, mu: torch.Tensor, sigma: torch.Tensor) -> torch.Tensor:
        """Compute P_i^d for all (i, d). Equation 7b.

        P_i^d = Φ((d+0.5-μ_i)/σ_i) - Φ((d-0.5-μ_i)/σ_i)

        Returns: shape [N, D]
        """
        sigma_safe = sigma.abs() + 1e-6  # [N]
        d = torch.arange(self.D, dtype=torch.float32, device=self.device)  # [D]

        # Broadcasting: mu [N,1], sigma [N,1], d [1,D]
        mu_exp = mu.unsqueeze(1)           # [N, 1]
        sig_exp = sigma_safe.unsqueeze(1)  # [N, 1]
        d_exp = d.unsqueeze(0)             # [1, D]

        upper = (d_exp + 0.5 - mu_exp) / sig_exp  # [N, D]
        lower = (d_exp - 0.5 - mu_exp) / sig_exp  # [N, D]

        # Boundary handling (footnote 2 in paper): the first slot integrates
        # from -inf and the last slot to +inf, so probability mass outside
        # [0, D-1] is folded into the boundary slots and each row sums to 1.
        lower[:, 0] = float("-inf")
        upper[:, -1] = float("inf")

        P = gaussian_cdf(upper) - gaussian_cdf(lower)  # [N, D]

        # Clamp to avoid numerical issues
        P = P.clamp(min=1e-10, max=1.0)

        return P
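
    # Sanity check (a sketch): with the boundary handling above, each row of
    # P is a proper distribution over the D slots:
    #
    #     solver = GauSSolver(GausGraph(2, [(0, 1)]), D=4)
    #     mu, sigma = solver._init_params()
    #     P = solver._compute_P(mu, sigma)
    #     assert torch.allclose(P.sum(dim=1), torch.ones(2))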

    # ============================================================
    # Loss functions (Section 3.2 + Appendix B)
    # ============================================================

    def _loss_dependency(self, P: torch.Tensor) -> torch.Tensor:
        """Expected dependency violations (Equation 8).

        V_dep = Σ_{(i,j)∈E} Σ_{d_i=0}^{D-1} Σ_{d_j=0}^{d_i} P_i^{d_i} · P_j^{d_j}

        The inner sum runs up to d_j = d_i: a tie (consumer scheduled in the
        producer's slot) counts as a violation, matching the unit-latency
        convention used by _count_violations and the legalizers.

        Vectorized: for each edge (i,j), V = Σ_d P_i^d · CDF_j(d).
        """
        if len(self.edge_src) == 0:
            return torch.tensor(0.0, device=self.device)

        P_src = P[self.edge_src]  # [|E|, D]
        P_dst = P[self.edge_dst]  # [|E|, D]

        # CDF_j(d) = P(X_j ≤ d), via cumulative sum over slots
        cdf_dst = torch.cumsum(P_dst, dim=1)  # [|E|, D]

        # V_dep per edge = Σ_{d_i} P_i^{d_i} · CDF_j(d_i)
        V_dep = torch.sum(P_src * cdf_dst, dim=1)  # [|E|]

        return V_dep.sum()

    def _loss_resource(self, P: torch.Tensor, R_cap: Optional[float] = None) -> Tuple[torch.Tensor, torch.Tensor]:
        """Expected resource usage + violations (Equations 18-20).

        Res(d) = Σ_i w_i · P_i^d
        L_res  = τ · log(Σ_d exp(Res(d)/τ))   (LogSumExp smooth-max)
        V_res  = Σ_d ReLU(Res(d) - R)         (violations)

        Returns: (L_res, V_res)
        """
        # Res(d), shape [D]
        res_d = torch.matmul(self.w, P)  # [D]

        # LogSumExp smooth-max
        L_res = self.tau * torch.logsumexp(res_d / self.tau, dim=0)

        # Violations
        if R_cap is not None:
            V_res = torch.sum(torch.relu(res_d - R_cap))
        else:
            V_res = torch.tensor(0.0, device=self.device)

        return L_res, V_res
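
    # Note: τ · logsumexp(x/τ) is a smooth upper bound on max(x) that tightens
    # as τ → 0; e.g. for x = [1.0, 2.0], τ = 1.0 gives ≈2.31 while τ = 0.01
    # gives ≈2.0000, at the cost of sharper (harder to optimize) gradients.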

    def _loss_communication(self, P: torch.Tensor) -> torch.Tensor:
        """Expected communication overhead (Equation 17).

        L_com = Σ_{(i,j)∈E} Σ_{d_i} Σ_{d_j≥d_i} P_i^{d_i} · P_j^{d_j} · (d_j - d_i)

        Approximated here by the difference of expectations,
        L_com ≈ Σ_{(i,j)∈E} (E[d_j] - E[d_i]),
        which matches the full double sum exactly once the dependency
        constraint holds (d_j ≥ d_i almost surely) and avoids an O(D²)
        term per edge.
        """
        if len(self.edge_src) == 0:
            return torch.tensor(0.0, device=self.device)

        P_src = P[self.edge_src]  # [|E|, D]
        P_dst = P[self.edge_dst]  # [|E|, D]

        d = torch.arange(self.D, dtype=torch.float32, device=self.device)  # [D]

        # E[d_i] = Σ_d d · P_i^d, and likewise for E[d_j]
        E_src = torch.sum(P_src * d.unsqueeze(0), dim=1)  # [|E|]
        E_dst = torch.sum(P_dst * d.unsqueeze(0), dim=1)  # [|E|]

        L_com = torch.sum(E_dst - E_src)
        return L_com

    def _loss_modulo_resource(
        self, P: torch.Tensor, II: int, R_cap: float
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Modulo resource usage + violations (Equations 21-22, Appendix B.3).

        P_mod[i,t] = Σ_k P_i^{t + k·II}   (wrap probabilities into II slots)
        MRes(t)    = Σ_i w_i · P_mod[i,t]
        V_mres     = Σ_t ReLU(MRes(t) - R_cap)
        """
        # Wrap probabilities into modulo II slots: pad D up to a multiple of
        # II, then fold the wrapped copies onto the II residue classes.
        num_wraps = (self.D + II - 1) // II
        pad = num_wraps * II - self.D
        P_pad = torch.nn.functional.pad(P, (0, pad))       # [N, num_wraps·II]
        P_mod = P_pad.view(self.N, num_wraps, II).sum(dim=1)  # [N, II]

        # MRes(t) = Σ_i w_i · P_mod[i,t], shape [II]
        mres_t = torch.matmul(self.w, P_mod)

        # LogSumExp objective
        L_mres = self.tau * torch.logsumexp(mres_t / self.tau, dim=0)

        # Violations
        V_mres = torch.sum(torch.relu(mres_t - R_cap))

        return L_mres, V_mres
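
    # Example: with D = 6 and II = 2, slots {0, 2, 4} fold into residue t = 0
    # and slots {1, 3, 5} into t = 1, so MRes bounds the steady-state usage of
    # each modulo reservation slot.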

    def _loss_recurrence(self, P: torch.Tensor, II: int) -> torch.Tensor:
        """Expected recurrence violations for loop-carried deps (Equation 24).

        For a back-edge (v_i, v_j, k), the consumer v_j of iteration n+k must
        start after the iteration-n producer v_i finishes (unit latency):
        s_j + k·II ≥ s_i + 1, the same convention as _legalize_modulo.
        The expected violation mass is therefore

        V_rec = Σ_{(v_i,v_j,k)∈E_B} Σ_{d_i} Σ_{d_j ≤ d_i - k·II} P_i^{d_i} · P_j^{d_j}
        """
        if not self.graph.back_edges:
            return torch.tensor(0.0, device=self.device)

        V_rec = torch.tensor(0.0, device=self.device)

        for (vi, vj, k) in self.graph.back_edges:
            # Self-loops: constraint is II ≥ Lat(v), always satisfied if II was
            # chosen correctly. Skip to avoid confusing the optimizer.
            if vi == vj:
                continue

            Pi = P[vi]  # [D]
            Pj = P[vj]  # [D]

            # CDF_j(d) = Σ_{d_j ≤ d} P_j^{d_j}
            cdf_j = torch.cumsum(Pj, dim=0)

            # Violation mass pairs Pi[d_i] with CDF_j(d_i - k·II); vectorized
            # as one dot product over the d_i ≥ k·II tail.
            shift = k * II
            if shift < self.D:
                V_rec = V_rec + torch.sum(Pi[shift:] * cdf_j[: self.D - shift])

        return V_rec

    def _loss_memory(self, P: torch.Tensor) -> torch.Tensor:
        """Expected peak memory footprint (Equations 9-11).

        Active(v_i, d) = P(X_i ≤ d) · P(max_{j∈succ(i)} X_j > d)
        Mem(d) = Σ_i b_i · Active(v_i, d)
        L_mem = τ · log(Σ_d exp(Mem(d)/τ))
        """
        # CDF_i(d) = Σ_{d'≤d} P_i^{d'}
        cdf = torch.cumsum(P, dim=1)  # [N, D]

        # For each node, compute P(all successors finished by d)
        # = Π_{j∈succ(i)} CDF_j(d)  (the X_j are independent by construction)
        all_succ_done = torch.ones(self.N, self.D, device=self.device)
        for i in range(self.N):
            for j in self.graph.successors[i]:
                all_succ_done[i] *= cdf[j]

        # Active(i, d) = CDF_i(d) · (1 - Π_{j∈succ(i)} CDF_j(d))
        # (started and at least one successor hasn't finished)
        active = cdf * (1.0 - all_succ_done)  # [N, D]

        # Mem(d) = Σ_i b_i · Active(i, d)
        mem_d = torch.matmul(self.b, active)  # [D]

        L_mem = self.tau * torch.logsumexp(mem_d / self.tau, dim=0)
        return L_mem

    # ============================================================
    # Legalization (Appendix A)
    # ============================================================

    def _legalize_regular(self, s: np.ndarray) -> np.ndarray:
        """Algorithm 2: Regular schedule legalization via topological pass."""
        s_new = np.clip(s, self.s_asap, self.s_alap).astype(int)

        for v in self.graph.topological_sort():
            preds = self.graph.predecessors[v]
            if preds:
                t_req = max(s_new[p] for p in preds) + 1
                s_new[v] = max(s_new[v], t_req)

        return s_new
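
    # Worked example (a sketch): for the chain v0 -> v1 -> v2 with D = 5 and
    # rounded μ = [2, 1, 1], clipping to [ASAP, ALAP] gives [2, 1, 2] and the
    # topological pass then repairs the order to [2, 3, 4].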

    def _legalize_modulo(self, s: np.ndarray, II: int) -> np.ndarray:
        """Algorithm 3: Modulo schedule legalization via fixed-point iteration."""
        s_new = s.copy().astype(int)
        topo = self.graph.topological_sort()

        for _ in range(self.N):
            changed = False
            for v in topo:
                # Forward dependency requirement
                preds = self.graph.predecessors[v]
                t_min = max((s_new[u] for u in preds), default=-1) + 1

                # Back-edge requirement: s_j + k·II ≥ s_i + Lat(v_i), so for
                # the current node v as consumer (v_j) with unit latency:
                #     s_v ≥ s_producer - k·II + 1
                # Skip self-loops (automatically satisfied by modulo structure)
                t_back = 0
                for (vi, vj, k) in self.graph.back_edges:
                    if vj == v and vi != v:  # v is the consumer, skip self-loops
                        t_back = max(t_back, s_new[vi] - k * II + 1)

                t_req = max(t_min, t_back)
                if t_req > s_new[v]:
                    s_new[v] = t_req
                    changed = True

            if not changed:
                break

        return s_new

    def _count_violations(self, s: np.ndarray, II: Optional[int] = None) -> int:
        """Count constraint violations in a discrete schedule."""
        count = 0
        # Forward dependency violations (unit latency: consumer strictly later)
        for (u, v) in self.graph.edges:
            if s[v] <= s[u]:
                count += 1
        # Recurrence violations: a back-edge (v_i, v_j, k) requires
        # s_j + k·II ≥ s_i + 1 (same convention as _legalize_modulo)
        if II is not None:
            for (vi, vj, k) in self.graph.back_edges:
                if s[vj] + k * II <= s[vi]:
                    count += 1
        return count

    # ============================================================
    # Main solve methods
    # ============================================================

    def solve_regular(
        self,
        max_iters: int = 2000,
        legalize_every: int = 200,
        alpha_com: float = 0.1,
        R_cap: Optional[float] = None,
        verbose: bool = True,
    ) -> GausResult:
        """Solve Formulation A: latency-constrained resource + communication optimization.

        Args:
            max_iters: Maximum optimization iterations
            legalize_every: Legalize and warm-restart every N iterations
            alpha_com: Weight for communication objective
            R_cap: Resource capacity limit (None = no hard cap)
            verbose: Print progress
        """
        return self._solve(
            formulation="A",
            max_iters=max_iters,
            legalize_every=legalize_every,
            alpha_com=alpha_com,
            R_cap=R_cap,
            II=None,
            verbose=verbose,
        )

    def solve_modulo(
        self,
        II: int,
        R_cap: float = 1.0,
        max_iters: int = 3000,
        legalize_every: int = 300,
        verbose: bool = True,
    ) -> GausResult:
        """Solve Formulation C: modulo scheduling (pipelined).

        This is the formulation directly comparable to Twill's domain.

        Args:
            II: Initiation interval (target pipeline rate)
            R_cap: Per-slot resource capacity in modulo reservation table
            max_iters: Maximum optimization iterations
            legalize_every: Legalize and warm-restart every N iterations
            verbose: Print progress
        """
        return self._solve(
            formulation="C",
            max_iters=max_iters,
            legalize_every=legalize_every,
            R_cap=R_cap,
            II=II,
            verbose=verbose,
        )
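
    # Usage sketch (hypothetical 4-op loop with one loop-carried dependence):
    #
    #     g = GausGraph(4, edges=[(0, 1), (1, 2), (2, 3)],
    #                   back_edges=[(3, 0, 1)])
    #     result = GauSSolver(g, D=12).solve_modulo(II=4, verbose=False)
    #     print(result.named_schedule(), result.is_feasible)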

    def _solve(
        self,
        formulation: str,
        max_iters: int,
        legalize_every: int,
        alpha_com: float = 0.1,
        R_cap: Optional[float] = None,
        II: Optional[int] = None,
        verbose: bool = True,
    ) -> GausResult:
        """Core optimization loop implementing Algorithm 1."""
        start_time = time.time()

        if verbose:
            print(f"GauS Solver — Formulation {formulation}")
            print(f"  |V|={self.N}, |E|={len(self.graph.edges)}, D={self.D}")
            if II is not None:
                print(f"  II={II}, R_cap={R_cap}")
            print(f"  ASAP: {self.s_asap}")
            print(f"  ALAP: {self.s_alap}")

        # Initialize parameters
        mu, sigma = self._init_params()
        optimizer = optim.Adam([mu, sigma], lr=self.lr)

        # Lagrange multipliers (ALM)
        lambda_dep = torch.tensor(1e-6, device=self.device)
        lambda_res = torch.tensor(1e-6, device=self.device)
        lambda_mres = torch.tensor(1e-6, device=self.device)
        lambda_rec = torch.tensor(1e-6, device=self.device)

        loss_history = []
        best_schedule = None
        best_objective = float("inf")
        best_violations = float("inf")

        for it in range(max_iters):
            optimizer.zero_grad()

            # Compute P_i^d
            P = self._compute_P(mu, sigma)

            # Primary objectives
            if formulation == "A":
                L_res, V_res = self._loss_resource(P, R_cap)
                L_com = self._loss_communication(P)
                L_primary = L_res + alpha_com * L_com
            elif formulation == "B":
                L_primary = self._loss_memory(P)
                V_res = torch.tensor(0.0, device=self.device)
            elif formulation == "C":
                L_mres, V_mres = self._loss_modulo_resource(P, II, R_cap)
                # Compactness term: encourage short schedules
                L_com = self._loss_communication(P)
                L_primary = L_mres + alpha_com * L_com
                V_res = torch.tensor(0.0, device=self.device)
            else:
                raise ValueError(f"Unknown formulation: {formulation}")

            # Constraint violations
            V_dep = self._loss_dependency(P)

            if formulation == "C":
                V_rec = self._loss_recurrence(P, II)
            else:
                V_rec = torch.tensor(0.0, device=self.device)
                V_mres = torch.tensor(0.0, device=self.device)

            # Augmented Lagrangian
            L_total = L_primary
            L_total = L_total + lambda_dep * V_dep + (self.rho / 2) * V_dep ** 2
            L_total = L_total + lambda_res * V_res + (self.rho / 2) * V_res ** 2
            if formulation == "C":
                L_total = L_total + lambda_mres * V_mres + (self.rho / 2) * V_mres ** 2
                L_total = L_total + lambda_rec * V_rec + (self.rho / 2) * V_rec ** 2

            # Backward + optimize
            L_total.backward()
            optimizer.step()

            # Update Lagrange multipliers: λ ← λ + ρ · violation
            with torch.no_grad():
                lambda_dep = lambda_dep + self.rho * V_dep.detach()
                lambda_res = lambda_res + self.rho * V_res.detach()
                if formulation == "C":
                    lambda_mres = lambda_mres + self.rho * V_mres.detach()
                    lambda_rec = lambda_rec + self.rho * V_rec.detach()

            loss_history.append(L_total.item())

            # Periodic legalization + warm restart
            if (it + 1) % legalize_every == 0 or it == max_iters - 1:
                with torch.no_grad():
                    s_rounded = torch.round(mu).cpu().numpy().astype(int)

                    if formulation == "C" and II is not None:
                        s_legal = self._legalize_modulo(s_rounded, II)
                    else:
                        s_legal = self._legalize_regular(s_rounded)

                    violations = self._count_violations(s_legal, II)

                    # Track best
                    obj_val = L_primary.item()
                    if violations < best_violations or (
                        violations == best_violations and obj_val < best_objective
                    ):
                        best_violations = violations
                        best_objective = obj_val
                        best_schedule = s_legal.copy()

                    if verbose and (it + 1) % legalize_every == 0:
                        sigma_mean = sigma.abs().mean().item()
                        print(f"  iter {it+1:5d}: L={L_total.item():.4f}, "
                              f"V_dep={V_dep.item():.4f}, "
                              f"σ_mean={sigma_mean:.4f}, "
                              f"violations={violations}")

                    # Warm restart: re-initialize μ from the legalized schedule
                    if violations > 0:
                        mu.data = torch.tensor(
                            s_legal, dtype=torch.float32, device=self.device
                        )

        # Final extraction
        with torch.no_grad():
            s_final = torch.round(mu).cpu().numpy().astype(int)
            if formulation == "C" and II is not None:
                s_final = self._legalize_modulo(s_final, II)
            else:
                s_final = self._legalize_regular(s_final)

        final_violations = self._count_violations(s_final, II)

        # Fall back to the best intermediate schedule if the final one is worse
        if best_schedule is not None and final_violations > best_violations:
            s_final = best_schedule
            final_violations = best_violations

        solve_time = time.time() - start_time

        schedule = {i: int(s_final[i]) for i in range(self.N)}

        if verbose:
            print(f"\n  DONE in {solve_time:.2f}s, {max_iters} iterations")
            print(f"  Final schedule: {self._format_schedule(schedule)}")
            print(f"  Violations: {final_violations}")
            if II is not None:
                print(f"  II: {II}")

        return GausResult(
            schedule=schedule,
            initiation_interval=II,
            objective_value=best_objective,
            num_violations=final_violations,
            solve_time_seconds=solve_time,
            iterations=max_iters,
            loss_history=loss_history,
            node_names=self.graph.node_names,
        )

    def _format_schedule(self, schedule: Dict[int, int]) -> str:
        names = self.graph.node_names
        return ", ".join(
            f"{names[i]}@{t}" for i, t in sorted(schedule.items(), key=lambda x: x[1])
        )


# ============================================================
# Integration with Twill's DependenceGraph
# ============================================================

def twill_graph_to_gaus(
    twill_graph,  # twill.graph.DependenceGraph
) -> Tuple[GausGraph, Dict[str, int]]:
    """Convert a Twill DependenceGraph to GauS format.

    Args:
        twill_graph: Twill DependenceGraph object

    Returns:
        (GausGraph, name_to_index mapping)
    """
    V = twill_graph.V
    E = twill_graph.E

    # Build name -> index mapping
    name_to_idx = {v.name: i for i, v in enumerate(V)}

    # Separate forward edges (δ=0) from back-edges (δ>0)
    forward_edges = []
    back_edges = []

    for e in E:
        src_idx = name_to_idx[e.src]
        dst_idx = name_to_idx[e.dst]
        if e.iteration_delay == 0:
            forward_edges.append((src_idx, dst_idx))
        else:
            # Back-edge: (src, dst, iteration_delay)
            back_edges.append((src_idx, dst_idx, e.iteration_delay))

    # Resource weights: sum of RRT usage across all FUs for each instruction
    resource_weights = np.array([v.rrt.sum() for v in V], dtype=np.float64)

    # Memory weights: total memory footprint per instruction
    memory_weights = np.array([
        sum(v.memory_footprint.values()) if v.memory_footprint else 0
        for v in V
    ], dtype=np.float64)

    node_names = [v.name for v in V]

    graph = GausGraph(
        num_nodes=len(V),
        edges=forward_edges,
        back_edges=back_edges,
        resource_weights=resource_weights,
        memory_weights=memory_weights,
        node_names=node_names,
    )

    return graph, name_to_idx


def gaus_solve_twill_graph(
    twill_graph,  # twill.graph.DependenceGraph
    target_II: Optional[int] = None,
    D: Optional[int] = None,
    max_iters: int = 3000,
    verbose: bool = True,
) -> GausResult:
    """Convenience: solve a Twill DependenceGraph using GauS.

    If target_II is given, uses modulo scheduling (Formulation C).
    Otherwise, uses regular scheduling (Formulation A).

    Args:
        twill_graph: Twill DependenceGraph
        target_II: Initiation interval for modulo scheduling
        D: Max depth (auto-computed from total latency if None)
        max_iters: Max optimization iterations
        verbose: Print progress

    Returns:
        GausResult with named schedule
    """
    gaus_graph, name_to_idx = twill_graph_to_gaus(twill_graph)

    if D is None:
        # Conservative depth bound: total latency plus one slot per node
        D = sum(v.cycles for v in twill_graph.V) + len(twill_graph.V)

    solver = GauSSolver(gaus_graph, D=D)

    if target_II is not None:
        R_cap = 1.0  # Default: 1 resource per modulo slot (matches Twill's capacity=1)
        result = solver.solve_modulo(
            II=target_II,
            R_cap=R_cap,
            max_iters=max_iters,
            verbose=verbose,
        )
    else:
        result = solver.solve_regular(
            max_iters=max_iters,
            verbose=verbose,
        )

    return result
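

# Usage sketch (assuming a Twill DependenceGraph instance `dg`):
#
#     result = gaus_solve_twill_graph(dg, target_II=3, verbose=False)
#     print(result.named_schedule())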


# ============================================================
# Synthetic graph generation for scalability testing
# ============================================================

def generate_random_dag(
    num_nodes: int,
    edge_density: float = 0.3,
    max_weight: int = 3,
    num_back_edges: int = 0,
    seed: int = 42,
) -> GausGraph:
    """Generate a random DAG for benchmarking.

    Args:
        num_nodes: Number of operators
        edge_density: Probability of edge between valid pairs
        max_weight: Max resource weight per node
        num_back_edges: Number of loop-carried back-edges to add
        seed: Random seed

    Returns:
        GausGraph
    """
    rng = np.random.RandomState(seed)

    edges = []
    for i in range(num_nodes):
        for j in range(i + 1, num_nodes):
            if rng.random() < edge_density:
                edges.append((i, j))

    # Ensure connectivity: fall back to a simple chain if no edges were drawn
    if not edges:
        for i in range(num_nodes - 1):
            edges.append((i, i + 1))

    resource_weights = rng.randint(1, max_weight + 1, size=num_nodes).astype(np.float64)
    memory_weights = rng.randint(1, max_weight + 1, size=num_nodes).astype(np.float64)

    back_edges = []
    if num_back_edges > 0:
        # Add back-edges from later nodes to earlier nodes
        for _ in range(num_back_edges):
            src = rng.randint(num_nodes // 2, num_nodes)
            dst = rng.randint(0, num_nodes // 2)
            k = rng.randint(1, 3)
            back_edges.append((src, dst, k))

    return GausGraph(
        num_nodes=num_nodes,
        edges=edges,
        back_edges=back_edges,
        resource_weights=resource_weights,
        memory_weights=memory_weights,
    )
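

if __name__ == "__main__":
    # Minimal smoke test (a sketch, not from the paper): build a small random
    # DAG with one loop-carried back-edge and run both formulations briefly.
    g = generate_random_dag(num_nodes=12, edge_density=0.2, num_back_edges=1, seed=0)
    solver = GauSSolver(g, D=24)

    reg = solver.solve_regular(max_iters=400, legalize_every=100, verbose=False)
    print("Regular    :", reg.named_schedule(), "feasible =", reg.is_feasible)

    mod = solver.solve_modulo(II=4, R_cap=8.0, max_iters=400, legalize_every=100, verbose=False)
    print("Modulo II=4:", mod.named_schedule(), "feasible =", mod.is_feasible)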