AshenNav
/

twill-swp-ws

Model card Files Files and versions

xet

Community

AshenNav commited on 14 days ago

Commit

a3e198c

verified ·

1 Parent(s): 2dc86df

Upload twill/graph.py

Browse files

Files changed (1) hide show

twill/graph.py +250 -0

twill/graph.py ADDED Viewed

	@@ -0,0 +1,250 @@

+"""
+Core data structures: Dependence Graph, Instructions, RRTs, Machine Description.
+Based on Section 3.1 of the paper:
+- G = (V, E) where V = instructions, E = dependence edges
+- Each instruction v has an RRT (Resource Reservation Table)
+- Each edge (u, v, d, δ) has clock delay d and iteration delay δ
+- Machine description D defines functional unit capacities and memory capacities
+"""
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional, Tuple, Set
+import numpy as np
+@dataclass
+class Instruction:
+    """A tile-level instruction in the loop body.
+    Attributes:
+        name: Unique identifier for this instruction
+        rrt: Resource Reservation Table - 2D array [cycle, functional_unit] -> usage count
+             Each row = a clock cycle of execution, each column = a functional unit type
+        variable_latency: Whether this instruction has variable latency (e.g., TMA loads)
+        memory_footprint: Dict mapping memory_space -> bytes used by this instruction's output
+        streaming: Whether this is a streaming variable-latency op (no incoming data deps)
+    """
+    name: str
+    rrt: np.ndarray  # shape: (cycles, num_functional_units)
+    variable_latency: bool = False
+    memory_footprint: Dict[str, int] = field(default_factory=dict)
+    streaming: bool = False
+    @property
+    def cycles(self) -> int:
+        """Number of clock cycles this instruction takes."""
+        return self.rrt.shape[0]
+    @property
+    def functional_units_used(self) -> Set[int]:
+        """Set of functional unit indices used by this instruction."""
+        return set(np.where(self.rrt.sum(axis=0) > 0)[0])
+    def __repr__(self):
+        return f"Instruction({self.name}, cycles={self.cycles}, var_lat={self.variable_latency})"
+@dataclass
+class DependenceEdge:
+    """A data dependence edge in the loop dependence graph.
+    From Section 3.1:
+        (u, v, d, δ): v must be issued at least d cycles after u,
+        where v is from iteration i and u is from iteration i - δ.
+    Attributes:
+        src: Source instruction name
+        dst: Destination instruction name
+        delay: Minimum clock cycle delay d (v must start >= d cycles after u starts)
+        iteration_delay: δ - the iteration distance (0 = same iteration, 1 = loop-carried)
+    """
+    src: str
+    dst: str
+    delay: int  # d: minimum clock cycles between src issue and dst issue
+    iteration_delay: int = 0  # δ: iteration distance
+    def __repr__(self):
+        return f"Edge({self.src} -> {self.dst}, d={self.delay}, δ={self.iteration_delay})"
+@dataclass
+class MachineDescription:
+    """Description of the target GPU architecture.
+    Attributes:
+        name: Architecture name (e.g., "Hopper", "Blackwell")
+        functional_units: List of functional unit names (e.g., ["TC", "EXP", "TMA"])
+        capacities: Dict mapping functional_unit_name -> capacity (max simultaneous usage)
+        memory_spaces: Dict mapping memory_space_name -> capacity in bytes
+        num_warps: Number of available warps for WS
+        variable_latency_warp: Index of the warp designated for variable-latency ops
+    """
+    name: str
+    functional_units: List[str]
+    capacities: Dict[str, int]
+    memory_spaces: Dict[str, int] = field(default_factory=dict)
+    num_warps: int = 4
+    variable_latency_warp: int = 0  # W_vl
+    def capacity(self, fu_name: str) -> int:
+        """Get capacity of a functional unit by name."""
+        return self.capacities.get(fu_name, 0)
+    def fu_index(self, fu_name: str) -> int:
+        """Get index of a functional unit by name."""
+        return self.functional_units.index(fu_name)
+    @property
+    def num_functional_units(self) -> int:
+        return len(self.functional_units)
+    @property
+    def capacity_vector(self) -> np.ndarray:
+        """Array of capacities indexed by functional unit index."""
+        return np.array([self.capacities[fu] for fu in self.functional_units])
+class DependenceGraph:
+    """Loop dependence graph G = (V, E).
+    This is the primary input to Twill's optimization pipeline.
+    Usage:
+        graph = DependenceGraph(machine)
+        graph.add_instruction(Instruction("S", rrt_s))
+        graph.add_instruction(Instruction("P", rrt_p))
+        graph.add_edge(DependenceEdge("S", "P", delay=1))
+        ...
+    """
+    def __init__(self, machine: MachineDescription):
+        self.machine = machine
+        self.instructions: Dict[str, Instruction] = {}
+        self.edges: List[DependenceEdge] = []
+        self._instruction_order: List[str] = []  # maintain insertion order
+    def add_instruction(self, instr: Instruction):
+        """Add an instruction to the graph."""
+        assert instr.name not in self.instructions, f"Duplicate instruction: {instr.name}"
+        assert instr.rrt.shape[1] == self.machine.num_functional_units, \
+            f"RRT width {instr.rrt.shape[1]} != num_functional_units {self.machine.num_functional_units}"
+        self.instructions[instr.name] = instr
+        self._instruction_order.append(instr.name)
+    def add_edge(self, edge: DependenceEdge):
+        """Add a dependence edge to the graph."""
+        assert edge.src in self.instructions, f"Unknown source: {edge.src}"
+        assert edge.dst in self.instructions, f"Unknown destination: {edge.dst}"
+        self.edges.append(edge)
+    @property
+    def V(self) -> List[Instruction]:
+        """List of instructions in insertion order."""
+        return [self.instructions[name] for name in self._instruction_order]
+    @property
+    def E(self) -> List[DependenceEdge]:
+        """List of dependence edges."""
+        return self.edges
+    @property
+    def num_instructions(self) -> int:
+        return len(self.instructions)
+    def get_instruction(self, name: str) -> Instruction:
+        return self.instructions[name]
+    def outgoing_edges(self, name: str) -> List[DependenceEdge]:
+        """Get all edges where name is the source."""
+        return [e for e in self.edges if e.src == name]
+    def incoming_edges(self, name: str) -> List[DependenceEdge]:
+        """Get all edges where name is the destination."""
+        return [e for e in self.edges if e.dst == name]
+    def has_loop_carried_output(self, name: str) -> bool:
+        """Check if instruction has any outgoing loop-carried edge (δ > 0)."""
+        return any(e.iteration_delay > 0 for e in self.outgoing_edges(name))
+    def get_cycle_counts(self) -> List[int]:
+        """Get list of all edge delays (for cost normalization)."""
+        delays = set()
+        for instr in self.V:
+            delays.add(instr.cycles)
+        for edge in self.edges:
+            delays.add(edge.delay)
+        return sorted(delays)
+    def compute_min_initiation_interval(self) -> int:
+        """Compute the resource-constrained lower bound on I.
+        For each functional unit f:
+            I >= ceil(sum of RRT usage across all instructions / capacity(f))
+        """
+        min_I = 1
+        cap_vec = self.machine.capacity_vector
+        for fu_idx in range(self.machine.num_functional_units):
+            total_usage = sum(instr.rrt[:, fu_idx].sum() for instr in self.V)
+            if cap_vec[fu_idx] > 0:
+                resource_bound = int(np.ceil(total_usage / cap_vec[fu_idx]))
+                min_I = max(min_I, resource_bound)
+        # Recurrence-constrained lower bound
+        for edge in self.edges:
+            if edge.iteration_delay > 0:
+                rec_bound = int(np.ceil(edge.delay / edge.iteration_delay))
+                min_I = max(min_I, rec_bound)
+        return min_I
+    def __repr__(self):
+        return (f"DependenceGraph(|V|={self.num_instructions}, |E|={len(self.edges)}, "
+                f"machine={self.machine.name})")
+# ============================================================
+# Pre-defined machine descriptions
+# ============================================================
+def hopper_machine(
+    tc_capacity: int = 1,
+    exp_capacity: int = 1,
+    tma_capacity: int = 1,
+) -> MachineDescription:
+    """NVIDIA Hopper (H100) machine description."""
+    return MachineDescription(
+        name="Hopper",
+        functional_units=["TC", "EXP", "TMA"],
+        capacities={"TC": tc_capacity, "EXP": exp_capacity, "TMA": tma_capacity},
+        memory_spaces={"SMEM": 228 * 1024, "REGS": 256 * 1024},
+        num_warps=4,
+        variable_latency_warp=0,
+    )
+def blackwell_machine(
+    tc_capacity: int = 1,
+    exp_capacity: int = 1,
+    tma_capacity: int = 1,
+    tmem_capacity: int = 1,
+) -> MachineDescription:
+    """NVIDIA Blackwell (B200) machine description."""
+    return MachineDescription(
+        name="Blackwell",
+        functional_units=["TC", "EXP", "TMA", "TMEM"],
+        capacities={"TC": tc_capacity, "EXP": exp_capacity, "TMA": tma_capacity, "TMEM": tmem_capacity},
+        memory_spaces={"SMEM": 228 * 1024, "REGS": 256 * 1024, "TMEM": 128 * 1024},
+        num_warps=4,
+        variable_latency_warp=0,
+    )
+def make_rrt(cycles: int, fu_usage: Dict[int, List[int]], num_fus: int) -> np.ndarray:
+    """Create an RRT array."""
+    rrt = np.zeros((cycles, num_fus), dtype=int)
+    for fu_idx, usage_per_cycle in fu_usage.items():
+        for c, usage in enumerate(usage_per_cycle):
+            if c < cycles:
+                rrt[c, fu_idx] = usage
+    return rrt