""" Core data structures: Dependence Graph, Instructions, RRTs, Machine Description. Based on Section 3.1 of the paper: - G = (V, E) where V = instructions, E = dependence edges - Each instruction v has an RRT (Resource Reservation Table) - Each edge (u, v, d, δ) has clock delay d and iteration delay δ - Machine description D defines functional unit capacities and memory capacities """ from dataclasses import dataclass, field from typing import Dict, List, Optional, Tuple, Set import numpy as np @dataclass class Instruction: """A tile-level instruction in the loop body. Attributes: name: Unique identifier for this instruction rrt: Resource Reservation Table - 2D array [cycle, functional_unit] -> usage count Each row = a clock cycle of execution, each column = a functional unit type variable_latency: Whether this instruction has variable latency (e.g., TMA loads) memory_footprint: Dict mapping memory_space -> bytes used by this instruction's output streaming: Whether this is a streaming variable-latency op (no incoming data deps) """ name: str rrt: np.ndarray # shape: (cycles, num_functional_units) variable_latency: bool = False memory_footprint: Dict[str, int] = field(default_factory=dict) streaming: bool = False @property def cycles(self) -> int: """Number of clock cycles this instruction takes.""" return self.rrt.shape[0] @property def functional_units_used(self) -> Set[int]: """Set of functional unit indices used by this instruction.""" return set(np.where(self.rrt.sum(axis=0) > 0)[0]) def __repr__(self): return f"Instruction({self.name}, cycles={self.cycles}, var_lat={self.variable_latency})" @dataclass class DependenceEdge: """A data dependence edge in the loop dependence graph. From Section 3.1: (u, v, d, δ): v must be issued at least d cycles after u, where v is from iteration i and u is from iteration i - δ. Attributes: src: Source instruction name dst: Destination instruction name delay: Minimum clock cycle delay d (v must start >= d cycles after u starts) iteration_delay: δ - the iteration distance (0 = same iteration, 1 = loop-carried) """ src: str dst: str delay: int # d: minimum clock cycles between src issue and dst issue iteration_delay: int = 0 # δ: iteration distance def __repr__(self): return f"Edge({self.src} -> {self.dst}, d={self.delay}, δ={self.iteration_delay})" @dataclass class MachineDescription: """Description of the target GPU architecture. Attributes: name: Architecture name (e.g., "Hopper", "Blackwell") functional_units: List of functional unit names (e.g., ["TC", "EXP", "TMA"]) capacities: Dict mapping functional_unit_name -> capacity (max simultaneous usage) memory_spaces: Dict mapping memory_space_name -> capacity in bytes num_warps: Number of available warps for WS variable_latency_warp: Index of the warp designated for variable-latency ops """ name: str functional_units: List[str] capacities: Dict[str, int] memory_spaces: Dict[str, int] = field(default_factory=dict) num_warps: int = 4 variable_latency_warp: int = 0 # W_vl def capacity(self, fu_name: str) -> int: """Get capacity of a functional unit by name.""" return self.capacities.get(fu_name, 0) def fu_index(self, fu_name: str) -> int: """Get index of a functional unit by name.""" return self.functional_units.index(fu_name) @property def num_functional_units(self) -> int: return len(self.functional_units) @property def capacity_vector(self) -> np.ndarray: """Array of capacities indexed by functional unit index.""" return np.array([self.capacities[fu] for fu in self.functional_units]) class DependenceGraph: """Loop dependence graph G = (V, E). This is the primary input to Twill's optimization pipeline. Usage: graph = DependenceGraph(machine) graph.add_instruction(Instruction("S", rrt_s)) graph.add_instruction(Instruction("P", rrt_p)) graph.add_edge(DependenceEdge("S", "P", delay=1)) ... """ def __init__(self, machine: MachineDescription): self.machine = machine self.instructions: Dict[str, Instruction] = {} self.edges: List[DependenceEdge] = [] self._instruction_order: List[str] = [] # maintain insertion order def add_instruction(self, instr: Instruction): """Add an instruction to the graph.""" assert instr.name not in self.instructions, f"Duplicate instruction: {instr.name}" assert instr.rrt.shape[1] == self.machine.num_functional_units, \ f"RRT width {instr.rrt.shape[1]} != num_functional_units {self.machine.num_functional_units}" self.instructions[instr.name] = instr self._instruction_order.append(instr.name) def add_edge(self, edge: DependenceEdge): """Add a dependence edge to the graph.""" assert edge.src in self.instructions, f"Unknown source: {edge.src}" assert edge.dst in self.instructions, f"Unknown destination: {edge.dst}" self.edges.append(edge) @property def V(self) -> List[Instruction]: """List of instructions in insertion order.""" return [self.instructions[name] for name in self._instruction_order] @property def E(self) -> List[DependenceEdge]: """List of dependence edges.""" return self.edges @property def num_instructions(self) -> int: return len(self.instructions) def get_instruction(self, name: str) -> Instruction: return self.instructions[name] def outgoing_edges(self, name: str) -> List[DependenceEdge]: """Get all edges where name is the source.""" return [e for e in self.edges if e.src == name] def incoming_edges(self, name: str) -> List[DependenceEdge]: """Get all edges where name is the destination.""" return [e for e in self.edges if e.dst == name] def has_loop_carried_output(self, name: str) -> bool: """Check if instruction has any outgoing loop-carried edge (δ > 0).""" return any(e.iteration_delay > 0 for e in self.outgoing_edges(name)) def get_cycle_counts(self) -> List[int]: """Get list of all edge delays (for cost normalization).""" delays = set() for instr in self.V: delays.add(instr.cycles) for edge in self.edges: delays.add(edge.delay) return sorted(delays) def compute_min_initiation_interval(self) -> int: """Compute the resource-constrained lower bound on I. For each functional unit f: I >= ceil(sum of RRT usage across all instructions / capacity(f)) """ min_I = 1 cap_vec = self.machine.capacity_vector for fu_idx in range(self.machine.num_functional_units): total_usage = sum(instr.rrt[:, fu_idx].sum() for instr in self.V) if cap_vec[fu_idx] > 0: resource_bound = int(np.ceil(total_usage / cap_vec[fu_idx])) min_I = max(min_I, resource_bound) # Recurrence-constrained lower bound for edge in self.edges: if edge.iteration_delay > 0: rec_bound = int(np.ceil(edge.delay / edge.iteration_delay)) min_I = max(min_I, rec_bound) return min_I def __repr__(self): return (f"DependenceGraph(|V|={self.num_instructions}, |E|={len(self.edges)}, " f"machine={self.machine.name})") # ============================================================ # Pre-defined machine descriptions # ============================================================ def hopper_machine( tc_capacity: int = 1, exp_capacity: int = 1, tma_capacity: int = 1, ) -> MachineDescription: """NVIDIA Hopper (H100) machine description.""" return MachineDescription( name="Hopper", functional_units=["TC", "EXP", "TMA"], capacities={"TC": tc_capacity, "EXP": exp_capacity, "TMA": tma_capacity}, memory_spaces={"SMEM": 228 * 1024, "REGS": 256 * 1024}, num_warps=4, variable_latency_warp=0, ) def blackwell_machine( tc_capacity: int = 1, exp_capacity: int = 1, tma_capacity: int = 1, tmem_capacity: int = 1, ) -> MachineDescription: """NVIDIA Blackwell (B200) machine description.""" return MachineDescription( name="Blackwell", functional_units=["TC", "EXP", "TMA", "TMEM"], capacities={"TC": tc_capacity, "EXP": exp_capacity, "TMA": tma_capacity, "TMEM": tmem_capacity}, memory_spaces={"SMEM": 228 * 1024, "REGS": 256 * 1024, "TMEM": 128 * 1024}, num_warps=4, variable_latency_warp=0, ) def make_rrt(cycles: int, fu_usage: Dict[int, List[int]], num_fus: int) -> np.ndarray: """Create an RRT array.""" rrt = np.zeros((cycles, num_fus), dtype=int) for fu_idx, usage_per_cycle in fu_usage.items(): for c, usage in enumerate(usage_per_cycle): if c < cycles: rrt[c, fu_idx] = usage return rrt