Enable CUDA graph AR decode for discrete actions
Generalizes the existing single-token decode CUDA graph helper and uses it for no-depth discrete action autoregressive decoding when enable_cuda_graph=True. The original KV-cache path is preserved when enable_cuda_graph=False.
- modeling_molmoact2.py +59 -10
modeling_molmoact2.py
CHANGED
|
@@ -4127,14 +4127,14 @@ class MolmoAct2ForConditionalGeneration(MolmoAct2PreTrainedModel, GenerationMixi
|
|
| 4127 |
|
| 4128 |
def _embed_base_tokens(self, input_ids: torch.Tensor) -> torch.Tensor:
|
| 4129 |
# Skips MolmoAct2Embedding's per-call cat([base, new]); safe only for IDs
|
| 4130 |
-
# below text_config.vocab_size
|
| 4131 |
wte = self.model.transformer.wte
|
| 4132 |
base_embedding = getattr(wte, "embedding", None)
|
| 4133 |
if base_embedding is None:
|
| 4134 |
return wte(input_ids)
|
| 4135 |
return F.embedding(input_ids, base_embedding)
|
| 4136 |
|
| 4137 |
-
def _run_depth_decode_step(
|
| 4138 |
self,
|
| 4139 |
token_ids: torch.Tensor,
|
| 4140 |
*,
|
|
@@ -4178,6 +4178,19 @@ class MolmoAct2ForConditionalGeneration(MolmoAct2PreTrainedModel, GenerationMixi
|
|
| 4178 |
)
|
| 4179 |
return outputs.last_hidden_state[:, -1:, :], outputs.past_key_values
|
| 4180 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4181 |
def _project_depth_logits(self, last_hidden: torch.Tensor) -> torch.Tensor:
|
| 4182 |
start = int(self.config.depth_token_start_id)
|
| 4183 |
end_id = start + int(self.config.num_depth_tokens)
|
|
@@ -4190,6 +4203,12 @@ class MolmoAct2ForConditionalGeneration(MolmoAct2PreTrainedModel, GenerationMixi
|
|
| 4190 |
1,
|
| 4191 |
)
|
| 4192 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4193 |
def _make_depth_static_cache(self, inputs: Mapping[str, Any]) -> Cache:
|
| 4194 |
prompt_len = inputs["input_ids"].shape[1]
|
| 4195 |
action_horizon = int(self.config.action_horizon or 1)
|
|
@@ -4210,6 +4229,7 @@ class MolmoAct2ForConditionalGeneration(MolmoAct2PreTrainedModel, GenerationMixi
|
|
| 4210 |
attention_mask: Optional[torch.Tensor],
|
| 4211 |
end_token_id: int,
|
| 4212 |
max_steps: int,
|
|
|
|
| 4213 |
) -> torch.Tensor:
|
| 4214 |
generated_tokens: List[torch.Tensor] = []
|
| 4215 |
current_output = initial_output
|
|
@@ -4222,12 +4242,23 @@ class MolmoAct2ForConditionalGeneration(MolmoAct2PreTrainedModel, GenerationMixi
|
|
| 4222 |
if bool((next_token == int(end_token_id)).all()):
|
| 4223 |
hit_end = True
|
| 4224 |
break
|
| 4225 |
-
|
| 4226 |
-
|
| 4227 |
-
|
| 4228 |
-
|
| 4229 |
-
|
| 4230 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4231 |
if not generated_tokens:
|
| 4232 |
raise RuntimeError("Discrete continuation generated no tokens.")
|
| 4233 |
if not hit_end:
|
|
@@ -4705,13 +4736,31 @@ class MolmoAct2ForConditionalGeneration(MolmoAct2PreTrainedModel, GenerationMixi
|
|
| 4705 |
.astype(np.int64),
|
| 4706 |
}
|
| 4707 |
else:
|
| 4708 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4709 |
action_token_ids = self._continue_discrete_generation_from_output(
|
| 4710 |
prefill_output,
|
| 4711 |
past_key_values=prefill_output.past_key_values,
|
| 4712 |
attention_mask=inputs.get("attention_mask"),
|
| 4713 |
end_token_id=self._require_eos_token_id(),
|
| 4714 |
-
max_steps=max(1, int(self.config.action_horizon * 16)),
|
|
|
|
| 4715 |
)
|
| 4716 |
generated_token_ids = action_token_ids
|
| 4717 |
actions = self._decode_discrete_action_chunk(
|
|
|
|
| 4127 |
|
| 4128 |
def _embed_base_tokens(self, input_ids: torch.Tensor) -> torch.Tensor:
|
| 4129 |
# Skips MolmoAct2Embedding's per-call cat([base, new]); safe only for IDs
|
| 4130 |
+
# below text_config.vocab_size. This includes released depth/action tokens.
|
| 4131 |
wte = self.model.transformer.wte
|
| 4132 |
base_embedding = getattr(wte, "embedding", None)
|
| 4133 |
if base_embedding is None:
|
| 4134 |
return wte(input_ids)
|
| 4135 |
return F.embedding(input_ids, base_embedding)
|
| 4136 |
|
| 4137 |
+
def _run_ar_decode_step(
|
| 4138 |
self,
|
| 4139 |
token_ids: torch.Tensor,
|
| 4140 |
*,
|
|
|
|
| 4178 |
)
|
| 4179 |
return outputs.last_hidden_state[:, -1:, :], outputs.past_key_values
|
| 4180 |
|
| 4181 |
+
def _run_depth_decode_step(
|
| 4182 |
+
self,
|
| 4183 |
+
token_ids: torch.Tensor,
|
| 4184 |
+
*,
|
| 4185 |
+
past_key_values: Cache,
|
| 4186 |
+
attention_bias: torch.Tensor,
|
| 4187 |
+
) -> Tuple[torch.Tensor, Cache]:
|
| 4188 |
+
return self._run_ar_decode_step(
|
| 4189 |
+
token_ids,
|
| 4190 |
+
past_key_values=past_key_values,
|
| 4191 |
+
attention_bias=attention_bias,
|
| 4192 |
+
)
|
| 4193 |
+
|
| 4194 |
def _project_depth_logits(self, last_hidden: torch.Tensor) -> torch.Tensor:
|
| 4195 |
start = int(self.config.depth_token_start_id)
|
| 4196 |
end_id = start + int(self.config.num_depth_tokens)
|
|
|
|
| 4203 |
1,
|
| 4204 |
)
|
| 4205 |
|
| 4206 |
+
def _make_ar_decode_static_cache(self, inputs: Mapping[str, Any], max_steps: int) -> Cache:
|
| 4207 |
+
prompt_len = inputs["input_ids"].shape[1]
|
| 4208 |
+
return self.depth_decode_cuda_graph_manager.make_static_cache(
|
| 4209 |
+
max_cache_len=prompt_len + max(1, int(max_steps)),
|
| 4210 |
+
)
|
| 4211 |
+
|
| 4212 |
def _make_depth_static_cache(self, inputs: Mapping[str, Any]) -> Cache:
|
| 4213 |
prompt_len = inputs["input_ids"].shape[1]
|
| 4214 |
action_horizon = int(self.config.action_horizon or 1)
|
|
|
|
| 4229 |
attention_mask: Optional[torch.Tensor],
|
| 4230 |
end_token_id: int,
|
| 4231 |
max_steps: int,
|
| 4232 |
+
attention_bias: Optional[torch.Tensor] = None,
|
| 4233 |
) -> torch.Tensor:
|
| 4234 |
generated_tokens: List[torch.Tensor] = []
|
| 4235 |
current_output = initial_output
|
|
|
|
| 4242 |
if bool((next_token == int(end_token_id)).all()):
|
| 4243 |
hit_end = True
|
| 4244 |
break
|
| 4245 |
+
if attention_bias is None:
|
| 4246 |
+
current_output, current_attention_mask = self._consume_generation_tokens(
|
| 4247 |
+
next_token,
|
| 4248 |
+
past_key_values=current_past_key_values,
|
| 4249 |
+
attention_mask=current_attention_mask,
|
| 4250 |
+
)
|
| 4251 |
+
current_past_key_values = current_output.past_key_values
|
| 4252 |
+
else:
|
| 4253 |
+
last_hidden, current_past_key_values = self._run_ar_decode_step(
|
| 4254 |
+
next_token,
|
| 4255 |
+
past_key_values=current_past_key_values,
|
| 4256 |
+
attention_bias=attention_bias,
|
| 4257 |
+
)
|
| 4258 |
+
current_output = MolmoAct2CausalLMOutputWithPast(
|
| 4259 |
+
logits=self.lm_head(last_hidden),
|
| 4260 |
+
past_key_values=current_past_key_values,
|
| 4261 |
+
)
|
| 4262 |
if not generated_tokens:
|
| 4263 |
raise RuntimeError("Discrete continuation generated no tokens.")
|
| 4264 |
if not hit_end:
|
|
|
|
| 4736 |
.astype(np.int64),
|
| 4737 |
}
|
| 4738 |
else:
|
| 4739 |
+
max_action_decode_steps = max(1, int(self.config.action_horizon * 16))
|
| 4740 |
+
action_attention_bias = None
|
| 4741 |
+
if enable_cuda_graph:
|
| 4742 |
+
action_static_cache = self._make_ar_decode_static_cache(
|
| 4743 |
+
inputs,
|
| 4744 |
+
max_steps=max_action_decode_steps,
|
| 4745 |
+
)
|
| 4746 |
+
action_attention_bias = self._make_depth_decode_attention_bias(
|
| 4747 |
+
inputs,
|
| 4748 |
+
action_static_cache,
|
| 4749 |
+
)
|
| 4750 |
+
prefill_output = self(
|
| 4751 |
+
**inputs,
|
| 4752 |
+
use_cache=True,
|
| 4753 |
+
past_key_values=action_static_cache,
|
| 4754 |
+
)
|
| 4755 |
+
else:
|
| 4756 |
+
prefill_output = self(**inputs, use_cache=True)
|
| 4757 |
action_token_ids = self._continue_discrete_generation_from_output(
|
| 4758 |
prefill_output,
|
| 4759 |
past_key_values=prefill_output.past_key_values,
|
| 4760 |
attention_mask=inputs.get("attention_mask"),
|
| 4761 |
end_token_id=self._require_eos_token_id(),
|
| 4762 |
+
max_steps=max_action_decode_steps,
|
| 4763 |
+
attention_bias=action_attention_bias,
|
| 4764 |
)
|
| 4765 |
generated_token_ids = action_token_ids
|
| 4766 |
actions = self._decode_discrete_action_chunk(
|