emelryan committed on
Commit
352ae81
·
1 Parent(s): dbbb1b6

Chunk relational model for consistent CUDA performance

Browse files

Run rect_proj, combined_proj (BatchNorm), and the transformer encoder
in fixed chunk_size batches so every kernel invocation sees the same
tensor shape. This prevents cuDNN from re-benchmarking on every new
batch size, eliminating ~460ms latency spikes.

Also compute cdist per-image on actual regions instead of padding all
images to k_max, reducing wasted compute on the quadratic distance
matrix.

Made-with: Cursor

nemotron-ocr/src/nemotron_ocr/inference/models/relational.py CHANGED
@@ -111,6 +111,33 @@ class GlobalRelationalModel(nn.Module):
111
  nn.Linear(dim, 3),
112
  )
113
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  def get_target_rects(
115
  self,
116
  quads: torch.Tensor,
@@ -254,14 +281,15 @@ class GlobalRelationalModel(nn.Module):
254
  original_quads = original_quads / self.quad_downscale
255
  mid_pts = original_quads.detach().mean(dim=1, dtype=torch.float32)
256
 
257
- rectified_quads = self.rect_proj(rectified_quads)
258
- avg_rects = rectified_quads.flatten(2).sum(
259
- dim=2, dtype=torch.float32
260
- ) / num_valid_pos.unsqueeze(1)
261
-
262
- recog_encoding = self.recog_tx(recog_features.detach()).mean(dim=1, dtype=torch.float32)
263
 
264
- semantic_encoding = self.combined_proj(torch.cat((avg_rects, recog_encoding), dim=1))
 
 
265
 
266
  h1 = original_quads[:, 3] - original_quads[:, 0]
267
  h2 = original_quads[:, 2] - original_quads[:, 1]
@@ -320,11 +348,13 @@ class GlobalRelationalModel(nn.Module):
320
 
321
  counts_list = region_counts.tolist() if region_counts.dim() > 0 else [int(region_counts.item())]
322
  batch_size = len(counts_list)
323
- k_max_raw = max(counts_list) if counts_list else 0
 
 
 
 
324
 
325
- if k_max_raw == 0:
326
- device = proj_rects.device
327
- dtype = proj_rects.dtype
328
  return {
329
  "words": [torch.empty(0, 1, dtype=dtype, device=device) for _ in range(batch_size)],
330
  "lines": [torch.empty(0, 1, dtype=dtype, device=device) for _ in range(batch_size)],
@@ -333,111 +363,67 @@ class GlobalRelationalModel(nn.Module):
333
  ],
334
  }
335
 
336
- chunk_size = max(1, int(getattr(self, "chunk_size", DEFAULT_CHUNK)))
337
- k_max = ((k_max_raw + chunk_size - 1) // chunk_size) * chunk_size
338
- device = proj_rects.device
339
- counts_gpu = region_counts.to(device=device, dtype=torch.long)
340
-
341
- rects_b = _pad_flat_to_batched(proj_rects, region_counts, k_max, pad_value=0.0)
342
- centers_b = _pad_flat_to_batched(mid_pts, region_counts, k_max, pad_value=0.0)
343
- quads_b = _pad_flat_to_batched(quads, region_counts, k_max, pad_value=0.0)
344
-
345
- arange_k = torch.arange(k_max, device=device)
346
- valid_rows = arange_k.unsqueeze(0) < counts_gpu.unsqueeze(1)
347
- k_per_image = (counts_gpu - 1).clamp_min(0).clamp_max(self.k - 1)
348
- z_max = int(k_per_image.max().item())
349
-
350
- if z_max == 0:
351
- to_rects = torch.zeros(batch_size, k_max, 1, proj_rects.shape[1] + 2, **options(proj_rects))
352
- closest_other_idxs = torch.zeros(batch_size, k_max, 1, dtype=torch.long, device=device)
353
- key_padding_mask = torch.zeros(batch_size, k_max, 1, dtype=torch.bool, device=device)
354
- else:
355
- all_dists = get_cdist_batched(quads_b, counts_gpu)
356
- invalid_rows = ~valid_rows
357
- all_dists.masked_fill_(invalid_rows.unsqueeze(1), float("inf"))
358
- all_dists.masked_fill_(invalid_rows.unsqueeze(2), float("inf"))
359
- all_dists.diagonal(dim1=-2, dim2=-1).fill_(float("inf"))
360
-
361
- topk_dists, topk_idxs = torch.topk(
362
- all_dists, k=z_max, dim=2, largest=False, sorted=False
363
- )
364
-
365
- gather_idx = topk_idxs.unsqueeze(-1).expand(-1, -1, -1, rects_b.shape[-1])
366
- neighbor_rects = torch.gather(
367
- rects_b.unsqueeze(1).expand(-1, k_max, -1, -1),
368
- dim=2,
369
- index=gather_idx,
370
- )
371
-
372
- gather_idx_2 = topk_idxs.unsqueeze(-1).expand(-1, -1, -1, 2)
373
- neighbor_centers = torch.gather(
374
- centers_b.unsqueeze(1).expand(-1, k_max, -1, -1),
375
- dim=2,
376
- index=gather_idx_2,
377
- )
378
-
379
- directions = _batched_directions(quads_b, neighbor_centers)
380
-
381
- neighbor_valid = valid_rows.unsqueeze(2) & (
382
- torch.arange(z_max, device=device).view(1, 1, -1) < k_per_image.view(batch_size, 1, 1)
383
- )
384
-
385
- neighbor_rects = neighbor_rects.masked_fill(~neighbor_valid.unsqueeze(-1), 0)
386
- topk_dists = topk_dists.masked_fill(~neighbor_valid, 0)
387
- directions = directions.masked_fill(~neighbor_valid, 0)
388
-
389
- null_rects = torch.zeros(batch_size, k_max, 1, rects_b.shape[-1], **options(rects_b))
390
- null_dists = torch.full((batch_size, k_max, 1), -1, **options(rects_b))
391
- null_dirs = torch.full((batch_size, k_max, 1), -2, **options(rects_b))
392
-
393
- to_rects = torch.cat(
394
- (
395
- torch.cat((null_rects, neighbor_rects), dim=2),
396
- torch.cat((null_dists, topk_dists), dim=2).unsqueeze(-1),
397
- torch.cat((null_dirs, directions), dim=2).unsqueeze(-1),
398
- ),
399
- dim=-1,
400
- )
401
-
402
- key_padding_mask = torch.ones(batch_size, k_max, z_max + 1, dtype=torch.bool, device=device)
403
- key_padding_mask[:, :, 0] = False
404
- key_padding_mask[:, :, 1:] = ~neighbor_valid
405
-
406
- invalid_target_idx = torch.full_like(topk_idxs, k_max)
407
- target_idxs = torch.where(neighbor_valid, topk_idxs + 1, invalid_target_idx)
408
- closest_other_idxs = torch.cat(
409
- (torch.zeros(batch_size, k_max, 1, dtype=torch.long, device=device), target_idxs),
410
- dim=2,
411
- )
412
-
413
- from_rects = rects_b.unsqueeze(2).expand(-1, -1, to_rects.shape[2], -1)
414
- enc_input = torch.cat((from_rects, to_rects), dim=3)
415
- enc_flat = enc_input.reshape(batch_size * k_max, enc_input.shape[2], enc_input.shape[3])
416
- mask_flat = key_padding_mask.reshape(batch_size * k_max, key_padding_mask.shape[2])
417
-
418
- if enc_flat.shape[0]:
419
- dots = self.encoder[0](enc_flat, src_key_padding_mask=mask_flat)
420
- dots = self.encoder[1](dots)
421
- else:
422
- dots = torch.empty(0, 1, 3, dtype=enc_input.dtype, device=device)
423
-
424
- dots = dots.reshape(batch_size, k_max, enc_input.shape[2], 3).permute(0, 3, 1, 2)
425
- dots = self.prohibit_self_connection(dots, closest_other_idxs)
426
-
427
  all_dots = dict(words=[], lines=[], line_log_var_unc=[])
428
- for i, region_count in enumerate(counts_list):
429
- if region_count == 0:
430
- word_pred = torch.empty(0, 1, dtype=dots.dtype, device=device)
431
- line_pred = torch.empty(0, 1, dtype=dots.dtype, device=device)
432
- line_log_var_pred = torch.empty(0, 1, dtype=dots.dtype, device=device)
433
- else:
434
- word_pred = dots[i, 0, :region_count, : region_count + 1]
435
- line_pred = dots[i, 1, :region_count, : region_count + 1]
436
- line_log_var_pred = dots[i, 2, :region_count, : region_count + 1]
437
-
438
- all_dots["words"].append(word_pred)
439
- all_dots["lines"].append(line_pred)
440
- all_dots["line_log_var_unc"].append(line_log_var_pred)
441
 
442
  return {
443
  "words": all_dots["words"],
 
111
  nn.Linear(dim, 3),
112
  )
113
 
114
+ def _chunked_forward(self, fn, x, *extra, pad_extra_ones=False):
115
+ """Run *fn* in fixed ``chunk_size`` batches along dim-0.
116
+
117
+ Pads the last chunk so every call sees the same tensor shape,
118
+ preventing cuDNN autotuning on varying batch sizes.
119
+ ``extra`` tensors are sliced/padded in sync with ``x``.
120
+ """
121
+ n = x.shape[0]
122
+ cs = max(1, self.chunk_size)
123
+ if n == 0:
124
+ return fn(x, *extra)
125
+ parts = []
126
+ for start in range(0, n, cs):
127
+ end = min(start + cs, n)
128
+ real_n = end - start
129
+ xc = x[start:end]
130
+ ec = [e[start:end] for e in extra]
131
+ if real_n < cs:
132
+ xc = torch.cat((xc, xc[:1].expand(cs - real_n, *[-1] * (xc.ndim - 1))), dim=0)
133
+ for i, e in enumerate(ec):
134
+ if pad_extra_ones and e.dtype == torch.bool:
135
+ ec[i] = torch.cat((e, torch.ones(cs - real_n, *e.shape[1:], dtype=torch.bool, device=e.device)), dim=0)
136
+ else:
137
+ ec[i] = torch.cat((e, e[:1].expand(cs - real_n, *[-1] * (e.ndim - 1))), dim=0)
138
+ parts.append(fn(xc, *ec)[:real_n])
139
+ return torch.cat(parts, dim=0)
140
+
141
  def get_target_rects(
142
  self,
143
  quads: torch.Tensor,
 
281
  original_quads = original_quads / self.quad_downscale
282
  mid_pts = original_quads.detach().mean(dim=1, dtype=torch.float32)
283
 
284
+ def _input_enc_nn(rq, rf, nvp):
285
+ rq = self.rect_proj(rq)
286
+ avg = rq.flatten(2).sum(dim=2, dtype=torch.float32) / nvp.unsqueeze(1)
287
+ rec = self.recog_tx(rf.detach()).mean(dim=1, dtype=torch.float32)
288
+ return self.combined_proj(torch.cat((avg, rec), dim=1))
 
289
 
290
+ semantic_encoding = self._chunked_forward(
291
+ _input_enc_nn, rectified_quads, recog_features, num_valid_pos,
292
+ )
293
 
294
  h1 = original_quads[:, 3] - original_quads[:, 0]
295
  h2 = original_quads[:, 2] - original_quads[:, 1]
 
348
 
349
  counts_list = region_counts.tolist() if region_counts.dim() > 0 else [int(region_counts.item())]
350
  batch_size = len(counts_list)
351
+ device = proj_rects.device
352
+ dtype = proj_rects.dtype
353
+ feat_dim = proj_rects.shape[1]
354
+ z = self.k - 1
355
+ seq_len = z + 1
356
 
357
+ if max(counts_list, default=0) == 0:
 
 
358
  return {
359
  "words": [torch.empty(0, 1, dtype=dtype, device=device) for _ in range(batch_size)],
360
  "lines": [torch.empty(0, 1, dtype=dtype, device=device) for _ in range(batch_size)],
 
363
  ],
364
  }
365
 
366
+ # Per-image cdist + topk, then concatenate into flat [N_total, seq_len, ...]
367
+ offsets = [0]
368
+ for c in counts_list:
369
+ offsets.append(offsets[-1] + c)
370
+ n_total = offsets[-1]
371
+
372
+ enc_input_flat = torch.zeros(n_total, seq_len, 2 * feat_dim + 2, dtype=dtype, device=device)
373
+ mask_flat = torch.ones(n_total, seq_len, dtype=torch.bool, device=device)
374
+ closest_flat = torch.zeros(n_total, seq_len, dtype=torch.long, device=device)
375
+
376
+ for i, n_i in enumerate(counts_list):
377
+ if n_i == 0:
378
+ continue
379
+ s, e = offsets[i], offsets[i + 1]
380
+ rects_i = proj_rects[s:e]
381
+ centers_i = mid_pts[s:e]
382
+ quads_i = quads[s:e]
383
+ z_i = min(n_i - 1, z)
384
+
385
+ from_r = rects_i.unsqueeze(1).expand(-1, seq_len, -1)
386
+ enc_input_flat[s:e, 0, :feat_dim] = rects_i
387
+ enc_input_flat[s:e, 0, 2 * feat_dim] = -1
388
+ enc_input_flat[s:e, 0, 2 * feat_dim + 1] = -2
389
+ mask_flat[s:e, 0] = False
390
+
391
+ if z_i > 0:
392
+ dists_i = get_cdist(quads_i, centers_i)
393
+ topk_d, topk_idx = torch.topk(dists_i, k=z_i, dim=1, largest=False, sorted=False)
394
+ nb_r = torch.gather(rects_i.unsqueeze(0).expand(n_i, -1, -1), 1, topk_idx.unsqueeze(2).expand(-1, -1, feat_dim))
395
+ nb_c = torch.gather(centers_i.unsqueeze(0).expand(n_i, -1, -1), 1, topk_idx.unsqueeze(2).expand(-1, -1, 2))
396
+ dirs_i = get_directions(quads_i, nb_c)
397
+
398
+ enc_input_flat[s:e, 1:z_i + 1, :feat_dim] = from_r[:, 1:z_i + 1]
399
+ enc_input_flat[s:e, 1:z_i + 1, feat_dim:2 * feat_dim] = nb_r
400
+ enc_input_flat[s:e, 1:z_i + 1, 2 * feat_dim] = topk_d
401
+ enc_input_flat[s:e, 1:z_i + 1, 2 * feat_dim + 1] = dirs_i
402
+ mask_flat[s:e, 1:z_i + 1] = False
403
+ closest_flat[s:e, 1:z_i + 1] = topk_idx + 1
404
+
405
+ # Chunked encoder on flat regions — always sees [chunk_size, seq_len, dim]
406
+ def _run_encoder(enc, mask):
407
+ out = self.encoder[0](enc, src_key_padding_mask=mask)
408
+ return self.encoder[1](out)
409
+
410
+ dots_flat = self._chunked_forward(_run_encoder, enc_input_flat, mask_flat, pad_extra_ones=True)
411
+
412
+ # Per-image: scatter encoder output into full relation matrices
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
413
  all_dots = dict(words=[], lines=[], line_log_var_unc=[])
414
+ for i, n_i in enumerate(counts_list):
415
+ if n_i == 0:
416
+ all_dots["words"].append(torch.empty(0, 1, dtype=torch.float32, device=device))
417
+ all_dots["lines"].append(torch.empty(0, 1, dtype=torch.float32, device=device))
418
+ all_dots["line_log_var_unc"].append(torch.empty(0, 1, dtype=torch.float32, device=device))
419
+ continue
420
+ s, e = offsets[i], offsets[i + 1]
421
+ dots_i = dots_flat[s:e].unsqueeze(0).permute(0, 3, 1, 2)
422
+ cidx_i = closest_flat[s:e].unsqueeze(0)
423
+ dots_i = self.prohibit_self_connection(dots_i, cidx_i)
424
+ all_dots["words"].append(dots_i[0, 0, :n_i, :n_i + 1])
425
+ all_dots["lines"].append(dots_i[0, 1, :n_i, :n_i + 1])
426
+ all_dots["line_log_var_unc"].append(dots_i[0, 2, :n_i, :n_i + 1])
427
 
428
  return {
429
  "words": all_dots["words"],