diff --git "a/causal_conv1d/impls/hf_kernels_causal_conv1d.html" "b/causal_conv1d/impls/hf_kernels_causal_conv1d.html" --- "a/causal_conv1d/impls/hf_kernels_causal_conv1d.html" +++ "b/causal_conv1d/impls/hf_kernels_causal_conv1d.html" @@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: nv | 0.23s +Cell: nv | 0.25s | Raw @@ -3887,7 +3887,7 @@ Cell: nv | 0.23s
-
Wed Oct 29 00:36:08 2025       
+
Wed Oct 29 04:14:16 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3896,7 @@ Cell: nv | 0.23s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   30C    P0             87W /  350W |       0MiB /  46068MiB |     18%      Default |
+| N/A   35C    P0            121W /  350W |       0MiB /  46068MiB |    100%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3920,7 +3920,7 @@ Cell: nv | 0.23s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 6.07s
+Cell: benchmark | 5.98s
  | 
 
 Raw
@@ -3973,19 +3973,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     144.543us      3556.67%     144.543us     144.543us             1  
-                               hf_kernels_causal_conv1d         8.86%     163.685us        99.64%       1.842ms       1.842ms       0.000us         0.00%       5.504us       5.504us             1  
-                                         CausalConv1dFn         5.76%     106.513us        90.78%       1.678ms     559.289us       0.000us         0.00%       5.504us       1.835us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         2.24%      41.381us        81.42%       1.505ms     501.611us       4.064us       100.00%       5.504us       1.835us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.064us       100.00%       4.064us       1.355us             3  
-                                Activity Buffer Request        76.78%       1.419ms        76.78%       1.419ms       1.419ms       1.440us        35.43%       1.440us       1.440us             1  
-                                       aten::empty_like         0.97%      17.931us         3.60%      66.522us      22.174us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         2.63%      48.591us         2.63%      48.591us      16.197us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         2.40%      44.403us         2.40%      44.403us      14.801us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.36%       6.650us         0.36%       6.650us       6.650us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     148.159us      3616.28%     148.159us     148.159us             1  
+                               hf_kernels_causal_conv1d         9.16%     170.084us        99.61%       1.850ms       1.850ms       0.000us         0.00%       5.537us       5.537us             1  
+                                         CausalConv1dFn         5.86%     108.753us        90.45%       1.680ms     559.949us       0.000us         0.00%       5.537us       1.846us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.32%      24.540us        81.01%       1.505ms     501.517us       4.097us       100.00%       5.537us       1.846us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.097us       100.00%       4.097us       1.366us             3  
+                                Activity Buffer Request        77.15%       1.433ms        77.15%       1.433ms       1.433ms       1.440us        35.15%       1.440us       1.440us             1  
+                                       aten::empty_like         1.01%      18.711us         3.58%      66.542us      22.181us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         2.58%      47.831us         2.58%      47.831us      15.944us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         2.54%      47.160us         2.54%      47.160us      15.720us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.39%       7.281us         0.39%       7.281us       7.281us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.848ms
-Self CUDA time total: 4.064us
+Self CPU time total: 1.857ms
+Self CUDA time total: 4.097us
 
 
 
@@ -3995,19 +3995,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     128.574us      3378.19%     128.574us     128.574us             1  
-                               hf_kernels_causal_conv1d         6.39%     108.804us        99.67%       1.696ms       1.696ms       0.000us         0.00%       5.085us       5.085us             1  
-                                         CausalConv1dFn         4.62%      78.561us        93.27%       1.588ms     529.188us       0.000us         0.00%       5.085us       1.695us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.51%      25.693us        86.85%       1.478ms     492.734us       3.806us       100.00%       5.085us       1.695us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.806us       100.00%       3.806us       1.269us             3  
-                                Activity Buffer Request        83.54%       1.422ms        83.54%       1.422ms       1.422ms       1.279us        33.60%       1.279us       1.279us             1  
-                                       aten::empty_like         0.47%       8.001us         1.81%      30.802us      10.267us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.34%      22.801us         1.34%      22.801us       7.600us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.80%      30.601us         1.80%      30.601us      10.200us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.33%       5.681us         0.33%       5.681us       5.681us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     129.854us      3438.93%     129.854us     129.854us             1  
+                               hf_kernels_causal_conv1d         5.93%     101.743us        99.70%       1.711ms       1.711ms       0.000us         0.00%       5.056us       5.056us             1  
+                                         CausalConv1dFn         4.51%      77.441us        93.77%       1.609ms     536.404us       0.000us         0.00%       5.056us       1.685us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.49%      25.500us        87.40%       1.500ms     499.951us       3.776us       100.00%       5.056us       1.685us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.776us       100.00%       3.776us       1.259us             3  
+                                Activity Buffer Request        84.02%       1.442ms        84.02%       1.442ms       1.442ms       1.280us        33.90%       1.280us       1.280us             1  
+                                       aten::empty_like         0.47%       8.120us         1.86%      31.920us      10.640us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.39%      23.800us         1.39%      23.800us       7.933us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.89%      32.481us         1.89%      32.481us      10.827us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.30%       5.091us         0.30%       5.091us       5.091us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.702ms
-Self CUDA time total: 3.806us
+Self CPU time total: 1.716ms
+Self CUDA time total: 3.776us
 
 
 
@@ -4017,19 +4017,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     126.111us      3339.80%     126.111us     126.111us             1  
-                               hf_kernels_causal_conv1d         5.63%      95.933us        99.70%       1.698ms       1.698ms       0.000us         0.00%       5.056us       5.056us             1  
-                                         CausalConv1dFn         4.46%      75.892us        94.07%       1.602ms     534.022us       0.000us         0.00%       5.056us       1.685us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.40%      23.785us        87.77%       1.495ms     498.271us       3.776us       100.00%       5.056us       1.685us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.776us       100.00%       3.776us       1.259us             3  
-                                Activity Buffer Request        84.61%       1.441ms        84.61%       1.441ms       1.441ms       1.280us        33.90%       1.280us       1.280us             1  
-                                       aten::empty_like         0.49%       8.320us         1.84%      31.360us      10.453us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.35%      23.040us         1.35%      23.040us       7.680us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.77%      30.070us         1.77%      30.070us      10.023us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.30%       5.061us         0.30%       5.061us       5.061us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     123.646us      3302.51%     123.646us     123.646us             1  
+                               hf_kernels_causal_conv1d         7.17%     123.413us        99.66%       1.716ms       1.716ms       0.000us         0.00%       4.992us       4.992us             1  
+                                         CausalConv1dFn         4.50%      77.551us        92.49%       1.592ms     530.774us       0.000us         0.00%       4.992us       1.664us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.44%      24.709us        86.24%       1.485ms     494.900us       3.744us       100.00%       4.992us       1.664us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.744us       100.00%       3.744us       1.248us             3  
+                                Activity Buffer Request        82.96%       1.428ms        82.96%       1.428ms       1.428ms       1.248us        33.33%       1.248us       1.248us             1  
+                                       aten::empty_like         0.44%       7.490us         1.75%      30.072us      10.024us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.31%      22.582us         1.31%      22.582us       7.527us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.85%      31.791us         1.85%      31.791us      10.597us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.34%       5.860us         0.34%       5.860us       5.860us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.703ms
-Self CUDA time total: 3.776us
+Self CPU time total: 1.722ms
+Self CUDA time total: 3.744us
 
 
 
@@ -4039,19 +4039,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     128.703us      3352.51%     128.703us     128.703us             1  
-                               hf_kernels_causal_conv1d         5.02%      93.431us        99.72%       1.856ms       1.856ms       0.000us         0.00%       5.119us       5.119us             1  
-                                         CausalConv1dFn         4.18%      77.825us        94.70%       1.762ms     587.414us       0.000us         0.00%       5.119us       1.706us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.36%      25.311us        88.83%       1.653ms     551.005us       3.839us       100.00%       5.119us       1.706us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.839us       100.00%       3.839us       1.280us             3  
-                                Activity Buffer Request        76.83%       1.430ms        76.83%       1.430ms       1.430ms       1.280us        33.34%       1.280us       1.280us             1  
-                                       aten::empty_like         0.45%       8.401us         1.69%      31.401us      10.467us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.24%      23.000us         1.24%      23.000us       7.667us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        10.65%     198.147us        10.65%     198.147us      66.049us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.28%       5.120us         0.28%       5.120us       5.120us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     127.647us      3380.48%     127.647us     127.647us             1  
+                               hf_kernels_causal_conv1d         5.10%      96.061us        99.74%       1.878ms       1.878ms       0.000us         0.00%       5.056us       5.056us             1  
+                                         CausalConv1dFn         4.15%      78.092us        94.64%       1.782ms     593.849us       0.000us         0.00%       5.056us       1.685us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.38%      26.001us        88.88%       1.673ms     557.705us       3.776us       100.00%       5.056us       1.685us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.776us       100.00%       3.776us       1.259us             3  
+                                Activity Buffer Request        75.38%       1.419ms        75.38%       1.419ms       1.419ms       1.280us        33.90%       1.280us       1.280us             1  
+                                       aten::empty_like         0.40%       7.440us         1.61%      30.340us      10.113us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.22%      22.900us         1.22%      22.900us       7.633us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        12.12%     228.205us        12.12%     228.205us      76.068us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.26%       4.821us         0.26%       4.821us       4.821us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.861ms
-Self CUDA time total: 3.839us
+Self CPU time total: 1.882ms
+Self CUDA time total: 3.776us
 
 
 
@@ -4061,18 +4061,18 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     125.247us      2609.31%     125.247us     125.247us             1  
-                               hf_kernels_causal_conv1d         5.46%      99.082us        99.73%       1.809ms       1.809ms       0.000us         0.00%       6.432us       6.432us             1  
-                                         CausalConv1dFn         4.18%      75.835us        94.27%       1.709ms     569.830us       0.000us         0.00%       6.432us       2.144us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.40%      25.379us        88.34%       1.602ms     533.975us       4.800us       100.00%       6.432us       2.144us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     129.568us      2699.33%     129.568us     129.568us             1  
+                               hf_kernels_causal_conv1d         5.60%     103.653us        99.74%       1.847ms       1.847ms       0.000us         0.00%       6.400us       6.400us             1  
+                                         CausalConv1dFn         4.14%      76.750us        94.15%       1.744ms     581.192us       0.000us         0.00%       6.400us       2.133us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.49%      27.581us        88.36%       1.636ms     545.462us       4.800us       100.00%       6.400us       2.133us             3  
 void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.800us       100.00%       4.800us       1.600us             3  
-                                Activity Buffer Request        77.97%       1.414ms        77.97%       1.414ms       1.414ms       1.632us        34.00%       1.632us       1.632us             1  
-                                       aten::empty_like         0.46%       8.420us         1.75%      31.730us      10.577us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.29%      23.310us         1.29%      23.310us       7.770us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         8.97%     162.627us         8.97%     162.627us      54.209us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.27%       4.860us         0.27%       4.860us       4.860us       0.000us         0.00%       0.000us       0.000us             1  
+                                Activity Buffer Request        77.01%       1.426ms        77.01%       1.426ms       1.426ms       1.600us        33.33%       1.600us       1.600us             1  
+                                       aten::empty_like         0.43%       7.910us         1.64%      30.441us      10.147us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.22%      22.531us         1.22%      22.531us       7.510us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         9.86%     182.544us         9.86%     182.544us      60.848us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.26%       4.781us         0.26%       4.781us       4.781us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.813ms
+Self CPU time total: 1.852ms
 Self CUDA time total: 4.800us
 
 
@@ -4083,19 +4083,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     120.190us      2487.38%     120.190us     120.190us             1  
-                               hf_kernels_causal_conv1d        14.45%      80.914us        99.14%     554.970us     554.970us       0.000us         0.00%       6.464us       6.464us             1  
-                                         CausalConv1dFn        12.94%      72.432us        84.69%     474.056us     158.019us       0.000us         0.00%       6.464us       2.155us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.57%      25.572us        66.53%     372.404us     124.135us       4.832us       100.00%       6.464us       2.155us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.832us       100.00%       4.832us       1.611us             3  
-                                Activity Buffer Request        34.22%     191.566us        34.22%     191.566us     191.566us       1.632us        33.77%       1.632us       1.632us             1  
-                                       aten::empty_like         1.40%       7.860us         5.22%      29.220us       9.740us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.82%      21.360us         3.82%      21.360us       7.120us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        27.74%     155.266us        27.74%     155.266us      51.755us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.86%       4.800us         0.86%       4.800us       4.800us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     122.941us      2544.84%     122.941us     122.941us             1  
+                               hf_kernels_causal_conv1d        13.71%      78.022us        99.17%     564.232us     564.232us       0.000us         0.00%       6.463us       6.463us             1  
+                                         CausalConv1dFn        13.23%      75.251us        85.45%     486.210us     162.070us       0.000us         0.00%       6.463us       2.154us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.60%      26.171us        67.28%     382.788us     127.596us       4.831us       100.00%       6.463us       2.154us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.831us       100.00%       4.831us       1.610us             3  
+                                Activity Buffer Request        33.10%     188.324us        33.10%     188.324us     188.324us       1.632us        33.78%       1.632us       1.632us             1  
+                                       aten::empty_like         1.28%       7.271us         4.95%      28.171us       9.390us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.67%      20.900us         3.67%      20.900us       6.967us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        29.58%     168.293us        29.58%     168.293us      56.098us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.83%       4.750us         0.83%       4.750us       4.750us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 559.770us
-Self CUDA time total: 4.832us
+Self CPU time total: 568.982us
+Self CUDA time total: 4.831us
 
 
 
@@ -4105,19 +4105,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     133.279us      1243.27%     133.279us     133.279us             1  
-                               hf_kernels_causal_conv1d         5.54%     100.182us        99.73%       1.805ms       1.805ms       0.000us         0.00%      14.336us      14.336us             1  
-                                         CausalConv1dFn         4.54%      82.173us        94.20%       1.705ms     568.267us       0.000us         0.00%      14.336us       4.779us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.47%      26.531us        87.96%       1.592ms     530.609us      10.720us       100.00%      14.336us       4.779us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.720us       100.00%      10.720us       3.573us             3  
-                                Activity Buffer Request        77.82%       1.408ms        77.82%       1.408ms       1.408ms       3.616us        33.73%       3.616us       3.616us             1  
-                                       aten::empty_like         0.46%       8.260us         1.70%      30.801us      10.267us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.25%      22.541us         1.25%      22.541us       7.514us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         8.67%     156.947us         8.67%     156.947us      52.316us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.27%       4.830us         0.27%       4.830us       4.830us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     127.232us      1201.21%     127.232us     127.232us             1  
+                               hf_kernels_causal_conv1d         5.54%     101.402us        99.73%       1.824ms       1.824ms       0.000us         0.00%      14.144us      14.144us             1  
+                                         CausalConv1dFn         4.17%      76.211us        94.19%       1.723ms     574.352us       0.000us         0.00%      14.144us       4.715us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.47%      26.892us        88.33%       1.616ms     538.621us      10.592us       100.00%      14.144us       4.715us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.592us       100.00%      10.592us       3.531us             3  
+                                Activity Buffer Request        78.00%       1.427ms        78.00%       1.427ms       1.427ms       3.552us        33.53%       3.552us       3.552us             1  
+                                       aten::empty_like         0.41%       7.500us         1.69%      30.982us      10.327us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.28%      23.482us         1.28%      23.482us       7.827us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.86%     162.122us         8.86%     162.122us      54.041us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.27%       4.930us         0.27%       4.930us       4.930us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.810ms
-Self CUDA time total: 10.720us
+Self CPU time total: 1.829ms
+Self CUDA time total: 10.592us
 
 
 
@@ -4127,19 +4127,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     123.037us      1127.54%     123.037us     123.037us             1  
-                               hf_kernels_causal_conv1d        20.63%     102.765us        99.04%     493.397us     493.397us       0.000us         0.00%      14.592us      14.592us             1  
-                                         CausalConv1dFn        14.78%      73.650us        78.41%     390.632us     130.211us       0.000us         0.00%      14.592us       4.864us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.23%      26.041us        57.43%     286.091us      95.364us      10.912us       100.00%      14.592us       4.864us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.912us       100.00%      10.912us       3.637us             3  
-                                Activity Buffer Request        21.15%     105.364us        21.15%     105.364us     105.364us       3.680us        33.72%       3.680us       3.680us             1  
-                                       aten::empty_like         1.51%       7.510us         6.20%      30.891us      10.297us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.69%      23.381us         4.69%      23.381us       7.794us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        31.05%     154.686us        31.05%     154.686us      51.562us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.96%       4.790us         0.96%       4.790us       4.790us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     125.052us      1149.38%     125.052us     125.052us             1  
+                               hf_kernels_causal_conv1d        19.89%     100.962us        99.07%     502.851us     502.851us       0.000us         0.00%      14.560us      14.560us             1  
+                                         CausalConv1dFn        14.62%      74.213us        79.18%     401.889us     133.963us       0.000us         0.00%      14.560us       4.853us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.11%      25.930us        58.71%     297.996us      99.332us      10.880us       100.00%      14.560us       4.853us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.880us       100.00%      10.880us       3.627us             3  
+                                Activity Buffer Request        22.39%     113.633us        22.39%     113.633us     113.633us       3.680us        33.82%       3.680us       3.680us             1  
+                                       aten::empty_like         1.51%       7.640us         5.85%      29.680us       9.893us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.34%      22.040us         4.34%      22.040us       7.347us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        31.22%     158.433us        31.22%     158.433us      52.811us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.93%       4.700us         0.93%       4.700us       4.700us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 498.187us
-Self CUDA time total: 10.912us
+Self CPU time total: 507.551us
+Self CUDA time total: 10.880us
 
 
 
@@ -4149,19 +4149,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     130.944us      1189.53%     130.944us     130.944us             1  
-                               hf_kernels_causal_conv1d         5.42%      97.593us        99.72%       1.796ms       1.796ms       0.000us         0.00%      14.720us      14.720us             1  
-                                         CausalConv1dFn         4.08%      73.404us        94.31%       1.699ms     566.233us       0.000us         0.00%      14.720us       4.907us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.44%      26.001us        88.45%       1.593ms     531.068us      11.008us       100.00%      14.720us       4.907us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      11.008us       100.00%      11.008us       3.669us             3  
-                                Activity Buffer Request        78.36%       1.411ms        78.36%       1.411ms       1.411ms       3.712us        33.72%       3.712us       3.712us             1  
-                                       aten::empty_like         0.46%       8.350us         1.78%      32.090us      10.697us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.32%      23.740us         1.32%      23.740us       7.913us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         8.65%     155.786us         8.65%     155.786us      51.929us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.28%       4.990us         0.28%       4.990us       4.990us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     137.022us      1248.38%     137.022us     137.022us             1  
+                               hf_kernels_causal_conv1d         5.72%     105.011us        99.74%       1.832ms       1.832ms       0.000us         0.00%      14.656us      14.656us             1  
+                                         CausalConv1dFn         4.21%      77.323us        94.03%       1.727ms     575.663us       0.000us         0.00%      14.656us       4.885us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.50%      27.571us        88.15%       1.619ms     539.695us      10.976us       100.00%      14.656us       4.885us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.976us       100.00%      10.976us       3.659us             3  
+                                Activity Buffer Request        77.70%       1.427ms        77.70%       1.427ms       1.427ms       3.680us        33.53%       3.680us       3.680us             1  
+                                       aten::empty_like         0.44%       8.071us         1.66%      30.580us      10.193us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.23%      22.509us         1.23%      22.509us       7.503us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.95%     164.363us         8.95%     164.363us      54.788us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.26%       4.700us         0.26%       4.700us       4.700us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.801ms
-Self CUDA time total: 11.008us
+Self CPU time total: 1.837ms
+Self CUDA time total: 10.976us
 
 
 
@@ -4171,19 +4171,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     122.014us      1080.15%     122.014us     122.014us             1  
-                               hf_kernels_causal_conv1d        12.40%      73.852us        99.19%     590.511us     590.511us       0.000us         0.00%      15.104us      15.104us             1  
-                                         CausalConv1dFn        12.35%      73.524us        86.78%     516.659us     172.220us       0.000us         0.00%      15.104us       5.035us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.03%      24.020us        69.45%     413.474us     137.825us      11.296us       100.00%      15.104us       5.035us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      11.296us       100.00%      11.296us       3.765us             3  
-                                Activity Buffer Request        38.81%     231.068us        38.81%     231.068us     231.068us       3.808us        33.71%       3.808us       3.808us             1  
-                                       aten::empty_like         1.25%       7.459us         4.98%      29.661us       9.887us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.73%      22.202us         3.73%      22.202us       7.401us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        26.60%     158.386us        26.60%     158.386us      52.795us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.81%       4.840us         0.81%       4.840us       4.840us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     124.064us      1110.89%     124.064us     124.064us             1  
+                               hf_kernels_causal_conv1d        12.84%      76.053us        99.16%     587.503us     587.503us       0.000us         0.00%      14.944us      14.944us             1  
+                                         CausalConv1dFn        12.64%      74.882us        86.32%     511.450us     170.483us       0.000us         0.00%      14.944us       4.981us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.48%      26.570us        68.85%     407.928us     135.976us      11.168us       100.00%      14.944us       4.981us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      11.168us       100.00%      11.168us       3.723us             3  
+                                Activity Buffer Request        35.24%     208.814us        35.24%     208.814us     208.814us       3.776us        33.81%       3.776us       3.776us             1  
+                                       aten::empty_like         1.25%       7.420us         4.83%      28.640us       9.547us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.58%      21.220us         3.58%      21.220us       7.073us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        29.12%     172.544us        29.12%     172.544us      57.515us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.84%       4.980us         0.84%       4.980us       4.980us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 595.351us
-Self CUDA time total: 11.296us
+Self CPU time total: 592.483us
+Self CUDA time total: 11.168us
 
 
 
@@ -4193,19 +4193,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     135.582us       269.70%     135.582us     135.582us             1  
-                               hf_kernels_causal_conv1d        12.51%      76.722us        99.20%     608.371us     608.371us       0.000us         0.00%      83.711us      83.711us             1  
-                                         CausalConv1dFn        13.24%      81.202us        86.69%     531.649us     177.216us       0.000us         0.00%      83.711us      27.904us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.12%      25.291us        68.50%     420.085us     140.028us      50.271us       100.00%      83.711us      27.904us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      50.271us       100.00%      50.271us      16.757us             3  
-                                Activity Buffer Request        38.84%     238.229us        38.84%     238.229us     238.229us      33.440us        66.52%      33.440us      33.440us             1  
-                                       aten::empty_like         1.27%       7.790us         4.95%      30.362us      10.121us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.68%      22.572us         3.68%      22.572us       7.524us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        25.53%     156.565us        25.53%     156.565us      52.188us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.80%       4.910us         0.80%       4.910us       4.910us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     129.567us       258.23%     129.567us     129.567us             1  
+                               hf_kernels_causal_conv1d        11.68%      75.622us        99.26%     642.554us     642.554us       0.000us         0.00%      83.743us      83.743us             1  
+                                         CausalConv1dFn        11.60%      75.092us        87.58%     566.932us     188.977us       0.000us         0.00%      83.743us      27.914us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.19%      27.150us        71.35%     461.850us     153.950us      50.175us       100.00%      83.743us      27.914us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      50.175us       100.00%      50.175us      16.725us             3  
+                                Activity Buffer Request        40.58%     262.656us        40.58%     262.656us     262.656us      33.568us        66.90%      33.568us      33.568us             1  
+                                       aten::empty_like         1.20%       7.749us         4.63%      29.990us       9.997us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.44%      22.241us         3.44%      22.241us       7.414us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        26.58%     172.044us        26.58%     172.044us      57.348us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.74%       4.760us         0.74%       4.760us       4.760us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 613.281us
-Self CUDA time total: 50.271us
+Self CPU time total: 647.314us
+Self CUDA time total: 50.175us
 
 
 
@@ -4215,19 +4215,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     127.740us       248.41%     127.740us     127.740us             1  
-                               hf_kernels_causal_conv1d        15.37%      77.574us        99.04%     499.998us     499.998us       0.000us         0.00%      85.854us      85.854us             1  
-                                         CausalConv1dFn        14.63%      73.842us        83.68%     422.424us     140.808us       0.000us         0.00%      85.854us      28.618us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.23%      26.412us        63.27%     319.402us     106.467us      51.423us       100.00%      85.854us      28.618us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      51.423us       100.00%      51.423us      17.141us             3  
-                                Activity Buffer Request        27.23%     137.484us        27.23%     137.484us     137.484us      34.431us        66.96%      34.431us      34.431us             1  
-                                       aten::empty_like         1.41%       7.140us         5.78%      29.180us       9.727us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.37%      22.040us         4.37%      22.040us       7.347us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        30.80%     155.506us        30.80%     155.506us      51.835us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.96%       4.831us         0.96%       4.831us       4.831us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     126.972us       248.31%     126.972us     126.972us             1  
+                               hf_kernels_causal_conv1d        14.84%      76.974us        99.07%     513.681us     513.681us       0.000us         0.00%      85.597us      85.597us             1  
+                                         CausalConv1dFn        14.19%      73.591us        84.22%     436.707us     145.569us       0.000us         0.00%      85.597us      28.532us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.20%      26.940us        64.23%     333.066us     111.022us      51.134us       100.00%      85.597us      28.532us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      51.134us       100.00%      51.134us      17.045us             3  
+                                Activity Buffer Request        26.85%     139.203us        26.85%     139.203us     139.203us      34.463us        67.40%      34.463us      34.463us             1  
+                                       aten::empty_like         1.42%       7.350us         5.80%      30.050us      10.017us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.38%      22.700us         4.38%      22.700us       7.567us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        32.19%     166.923us        32.19%     166.923us      55.641us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.93%       4.840us         0.93%       4.840us       4.840us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 504.829us
-Self CUDA time total: 51.423us
+Self CPU time total: 518.521us
+Self CUDA time total: 51.134us
 
 
 
@@ -4237,18 +4237,18 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     117.437us      3008.12%     117.437us     117.437us             1  
-                               hf_kernels_causal_conv1d        12.18%      74.242us        99.17%     604.340us     604.340us       0.000us         0.00%       5.152us       5.152us             1  
-                                         CausalConv1dFn        11.66%      71.062us        86.99%     530.098us     176.699us       0.000us         0.00%       5.152us       1.717us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.18%      25.499us        70.51%     429.675us     143.225us       3.904us       100.00%       5.152us       1.717us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     124.062us      3177.82%     124.062us     124.062us             1  
+                               hf_kernels_causal_conv1d        10.24%      77.632us        99.38%     753.346us     753.346us       0.000us         0.00%       5.152us       5.152us             1  
+                                         CausalConv1dFn         9.77%      74.052us        89.14%     675.714us     225.238us       0.000us         0.00%       5.152us       1.717us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.43%      26.030us        75.34%     571.152us     190.384us       3.904us       100.00%       5.152us       1.717us             3  
 void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.904us       100.00%       3.904us       1.301us             3  
-                                Activity Buffer Request        41.02%     249.979us        41.02%     249.979us     249.979us       1.248us        31.97%       1.248us       1.248us             1  
-                                       aten::empty_like         1.33%       8.110us         4.82%      29.361us       9.787us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.49%      21.251us         3.49%      21.251us       7.084us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        25.30%     154.197us        25.30%     154.197us      51.399us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.83%       5.050us         0.83%       5.050us       5.050us       0.000us         0.00%       0.000us       0.000us             1  
+                                Activity Buffer Request        50.23%     380.798us        50.23%     380.798us     380.798us       1.248us        31.97%       1.248us       1.248us             1  
+                                       aten::empty_like         1.10%       8.319us         4.02%      30.510us      10.170us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         2.93%      22.191us         2.93%      22.191us       7.397us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        21.68%     164.324us        21.68%     164.324us      54.775us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.62%       4.730us         0.62%       4.730us       4.730us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 609.390us
+Self CPU time total: 758.076us
 Self CUDA time total: 3.904us
 
 
@@ -4259,19 +4259,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     157.920us      4012.20%     157.920us     157.920us             1  
-                               hf_kernels_causal_conv1d        19.90%     106.583us        99.11%     530.709us     530.709us       0.000us         0.00%       5.216us       5.216us             1  
-                                         CausalConv1dFn        15.55%      83.245us        79.21%     424.126us     141.375us       0.000us         0.00%       5.216us       1.739us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.02%      26.862us        57.76%     309.281us     103.094us       3.936us       100.00%       5.216us       1.739us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.936us       100.00%       3.936us       1.312us             3  
-                                Activity Buffer Request        22.25%     119.154us        22.25%     119.154us     119.154us       1.280us        32.52%       1.280us       1.280us             1  
-                                       aten::empty_like         1.55%       8.320us         5.90%      31.600us      10.533us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.35%      23.280us         4.35%      23.280us       7.760us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        30.49%     163.265us        30.49%     163.265us      54.422us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.89%       4.750us         0.89%       4.750us       4.750us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     150.205us      3785.41%     150.205us     150.205us             1  
+                               hf_kernels_causal_conv1d        15.05%      79.624us        99.10%     524.121us     524.121us       0.000us         0.00%       5.248us       5.248us             1  
+                                         CausalConv1dFn        15.30%      80.921us        84.04%     444.497us     148.166us       0.000us         0.00%       5.248us       1.749us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.86%      25.730us        59.56%     315.036us     105.012us       3.968us       100.00%       5.248us       1.749us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.968us       100.00%       3.968us       1.323us             3  
+                                Activity Buffer Request        23.84%     126.093us        23.84%     126.093us     126.093us       1.280us        32.26%       1.280us       1.280us             1  
+                                       aten::empty_like         1.46%       7.730us         9.18%      48.540us      16.180us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         7.72%      40.810us         7.72%      40.810us      13.603us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        30.86%     163.213us        30.86%     163.213us      54.404us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.90%       4.780us         0.90%       4.780us       4.780us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 535.459us
-Self CUDA time total: 3.936us
+Self CPU time total: 528.901us
+Self CUDA time total: 3.968us
 
 
 
@@ -4281,19 +4281,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     123.777us      2998.47%     123.777us     123.777us             1  
-                               hf_kernels_causal_conv1d        13.54%      78.054us        99.15%     571.700us     571.700us       0.000us         0.00%       5.504us       5.504us             1  
-                                         CausalConv1dFn        12.84%      74.051us        85.62%     493.646us     164.549us       0.000us         0.00%       5.504us       1.835us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.19%      24.152us        67.43%     388.784us     129.595us       4.128us       100.00%       5.504us       1.835us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.128us       100.00%       4.128us       1.376us             3  
-                                Activity Buffer Request        36.14%     208.368us        36.14%     208.368us     208.368us       1.376us        33.33%       1.376us       1.376us             1  
-                                       aten::empty_like         1.37%       7.901us         5.34%      30.811us      10.270us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.97%      22.910us         3.97%      22.910us       7.637us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        27.10%     156.264us        27.10%     156.264us      52.088us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.85%       4.881us         0.85%       4.881us       4.881us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     125.822us      3119.81%     125.822us     125.822us             1  
+                               hf_kernels_causal_conv1d         7.34%      76.781us        99.55%       1.041ms       1.041ms       0.000us         0.00%       5.378us       5.378us             1  
+                                         CausalConv1dFn         7.29%      76.213us        92.21%     964.451us     321.484us       0.000us         0.00%       5.378us       1.793us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         2.57%      26.881us        82.03%     857.988us     285.996us       4.033us       100.00%       5.378us       1.793us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.033us       100.00%       4.033us       1.344us             3  
+                                Activity Buffer Request        64.21%     671.574us        64.21%     671.574us     671.574us       1.345us        33.35%       1.345us       1.345us             1  
+                                       aten::empty_like         0.71%       7.460us         2.89%      30.250us      10.083us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         2.18%      22.790us         2.18%      22.790us       7.597us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        15.25%     159.533us        15.25%     159.533us      53.178us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.45%       4.740us         0.45%       4.740us       4.740us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 576.581us
-Self CUDA time total: 4.128us
+Self CPU time total: 1.046ms
+Self CUDA time total: 4.033us
 
 
 
@@ -4303,19 +4303,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     118.875us      2925.07%     118.875us     118.875us             1  
-                               hf_kernels_causal_conv1d        17.67%      83.134us        98.92%     465.527us     465.527us       0.000us         0.00%       5.440us       5.440us             1  
-                                         CausalConv1dFn        15.04%      70.762us        81.26%     382.393us     127.464us       0.000us         0.00%       5.440us       1.813us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.98%      23.432us        59.87%     281.731us      93.910us       4.064us       100.00%       5.440us       1.813us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.064us       100.00%       4.064us       1.355us             3  
-                                Activity Buffer Request        22.07%     103.873us        22.07%     103.873us     103.873us       1.376us        33.86%       1.376us       1.376us             1  
-                                       aten::empty_like         1.61%       7.590us         6.35%      29.900us       9.967us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.74%      22.310us         4.74%      22.310us       7.437us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        32.82%     154.426us        32.82%     154.426us      51.475us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.08%       5.061us         1.08%       5.061us       5.061us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     122.429us      3011.78%     122.429us     122.429us             1  
+                               hf_kernels_causal_conv1d        17.20%      83.313us        98.89%     479.020us     479.020us       0.000us         0.00%       5.442us       5.442us             1  
+                                         CausalConv1dFn        15.73%      76.221us        81.69%     395.707us     131.902us       0.000us         0.00%       5.442us       1.814us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.53%      26.781us        59.92%     290.246us      96.749us       4.065us       100.00%       5.442us       1.814us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.065us       100.00%       4.065us       1.355us             3  
+                                Activity Buffer Request        21.02%     101.802us        21.02%     101.802us     101.802us       1.377us        33.87%       1.377us       1.377us             1  
+                                       aten::empty_like         1.46%       7.060us         6.04%      29.240us       9.747us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.58%      22.180us         4.58%      22.180us       7.393us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        33.37%     161.663us        33.37%     161.663us      53.888us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.11%       5.400us         1.11%       5.400us       5.400us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 470.588us
-Self CUDA time total: 4.064us
+Self CPU time total: 484.420us
+Self CUDA time total: 4.065us
 
 
 
@@ -4325,19 +4325,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     126.722us      2343.23%     126.722us     126.722us             1  
-                               hf_kernels_causal_conv1d        12.92%     104.393us        99.42%     803.188us     803.188us       0.000us         0.00%       7.264us       7.264us             1  
-                                         CausalConv1dFn         9.39%      75.863us        86.50%     698.795us     232.932us       0.000us         0.00%       7.264us       2.421us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.09%      24.969us        73.13%     590.770us     196.923us       5.408us       100.00%       7.264us       2.421us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.408us       100.00%       5.408us       1.803us             3  
-                                Activity Buffer Request        49.73%     401.794us        49.73%     401.794us     401.794us       1.856us        34.32%       1.856us       1.856us             1  
-                                       aten::empty_like         0.96%       7.780us         3.98%      32.162us      10.721us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.02%      24.382us         3.02%      24.382us       8.127us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        20.30%     164.007us        20.30%     164.007us      54.669us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.58%       4.700us         0.58%       4.700us       4.700us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     129.503us      2408.91%     129.503us     129.503us             1  
+                               hf_kernels_causal_conv1d         4.44%      81.452us        99.70%       1.827ms       1.827ms       0.000us         0.00%       7.232us       7.232us             1  
+                                         CausalConv1dFn         4.25%      77.980us        95.25%       1.746ms     581.952us       0.000us         0.00%       7.232us       2.411us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.46%      26.694us        89.32%       1.637ms     545.686us       5.376us       100.00%       7.232us       2.411us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.376us       100.00%       5.376us       1.792us             3  
+                                Activity Buffer Request        78.58%       1.440ms        78.58%       1.440ms       1.440ms       1.856us        34.52%       1.856us       1.856us             1  
+                                       aten::empty_like         0.42%       7.710us         1.68%      30.820us      10.273us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.26%      23.110us         1.26%      23.110us       7.703us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         9.28%     170.073us         9.28%     170.073us      56.691us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.30%       5.541us         0.30%       5.541us       5.541us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 807.888us
-Self CUDA time total: 5.408us
+Self CPU time total: 1.833ms
+Self CUDA time total: 5.376us
 
 
 
@@ -4347,19 +4347,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     120.446us      2201.13%     120.446us     120.446us             1  
-                               hf_kernels_causal_conv1d        18.67%      89.551us        99.02%     474.966us     474.966us       0.000us         0.00%       7.328us       7.328us             1  
-                                         CausalConv1dFn        15.56%      74.654us        80.35%     385.415us     128.472us       0.000us         0.00%       7.328us       2.443us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.05%      24.231us        58.47%     280.459us      93.486us       5.472us       100.00%       7.328us       2.443us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.472us       100.00%       5.472us       1.824us             3  
-                                Activity Buffer Request        20.97%     100.573us        20.97%     100.573us     100.573us       1.856us        33.92%       1.856us       1.856us             1  
-                                       aten::empty_like         1.52%       7.312us         6.32%      30.302us      10.101us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.79%      22.990us         4.79%      22.990us       7.663us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        32.45%     155.655us        32.45%     155.655us      51.885us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.98%       4.720us         0.98%       4.720us       4.720us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     120.572us      2216.40%     120.572us     120.572us             1  
+                               hf_kernels_causal_conv1d        15.66%      74.763us        98.99%     472.500us     472.500us       0.000us         0.00%       7.296us       7.296us             1  
+                                         CausalConv1dFn        15.51%      74.030us        83.33%     397.737us     132.579us       0.000us         0.00%       7.296us       2.432us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.46%      26.060us        61.54%     293.747us      97.916us       5.440us       100.00%       7.296us       2.432us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.440us       100.00%       5.440us       1.813us             3  
+                                Activity Buffer Request        22.16%     105.753us        22.16%     105.753us     105.753us       1.856us        34.12%       1.856us       1.856us             1  
+                                       aten::empty_like         1.54%       7.370us         6.28%      29.960us       9.987us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.73%      22.590us         4.73%      22.590us       7.530us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        33.93%     161.934us        33.93%     161.934us      53.978us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.01%       4.811us         1.01%       4.811us       4.811us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 479.686us
-Self CUDA time total: 5.472us
+Self CPU time total: 477.311us
+Self CUDA time total: 5.440us
 
 
 
@@ -4369,19 +4369,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     130.208us       742.52%     130.208us     130.208us             1  
-                               hf_kernels_causal_conv1d         5.57%     103.684us        99.74%       1.855ms       1.855ms       0.000us         0.00%      23.424us      23.424us             1  
-                                         CausalConv1dFn         4.08%      75.922us        94.16%       1.751ms     583.780us       0.000us         0.00%      23.424us       7.808us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.22%      22.672us        88.43%       1.645ms     548.249us      17.536us       100.00%      23.424us       7.808us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.536us       100.00%      17.536us       5.845us             3  
-                                Activity Buffer Request        78.77%       1.465ms        78.77%       1.465ms       1.465ms       5.888us        33.58%       5.888us       5.888us             1  
-                                       aten::empty_like         0.43%       7.931us         1.65%      30.671us      10.224us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.22%      22.740us         1.22%      22.740us       7.580us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         8.44%     157.016us         8.44%     157.016us      52.339us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.26%       4.860us         0.26%       4.860us       4.860us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     133.791us       771.35%     133.791us     133.791us             1  
+                               hf_kernels_causal_conv1d         4.36%      79.052us        99.74%       1.806ms       1.806ms       0.000us         0.00%      23.137us      23.137us             1  
+                                         CausalConv1dFn         4.20%      76.082us        95.37%       1.727ms     575.785us       0.000us         0.00%      23.137us       7.712us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.44%      26.131us        89.41%       1.619ms     539.771us      17.345us       100.00%      23.137us       7.712us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.345us       100.00%      17.345us       5.782us             3  
+                                Activity Buffer Request        78.88%       1.429ms        78.88%       1.429ms       1.429ms       5.792us        33.39%       5.792us       5.792us             1  
+                                       aten::empty_like         0.48%       8.620us         1.76%      31.960us      10.653us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.29%      23.340us         1.29%      23.340us       7.780us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         9.09%     164.553us         9.09%     164.553us      54.851us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.26%       4.770us         0.26%       4.770us       4.770us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.860ms
-Self CUDA time total: 17.536us
+Self CPU time total: 1.811ms
+Self CUDA time total: 17.345us
 
 
 
@@ -4391,19 +4391,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     123.970us       691.76%     123.970us     123.970us             1  
-                               hf_kernels_causal_conv1d        18.87%      88.734us        98.86%     464.856us     464.856us       0.000us         0.00%      23.905us      23.905us             1  
-                                         CausalConv1dFn        15.17%      71.352us        79.99%     376.122us     125.374us       0.000us         0.00%      23.905us       7.968us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.25%      24.691us        58.28%     274.030us      91.343us      17.921us       100.00%      23.905us       7.968us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.921us       100.00%      17.921us       5.974us             3  
-                                Activity Buffer Request        19.83%      93.233us        19.83%      93.233us      93.233us       5.984us        33.39%       5.984us       5.984us             1  
-                                       aten::empty_like         1.60%       7.540us         6.54%      30.740us      10.247us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.93%      23.200us         4.93%      23.200us       7.733us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        33.20%     156.106us        33.20%     156.106us      52.035us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.14%       5.350us         1.14%       5.350us       5.350us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     127.164us       713.44%     127.164us     127.164us             1  
+                               hf_kernels_causal_conv1d        19.87%      99.623us        99.05%     496.531us     496.531us       0.000us         0.00%      23.808us      23.808us             1  
+                                         CausalConv1dFn        14.81%      74.221us        79.17%     396.908us     132.303us       0.000us         0.00%      23.808us       7.936us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.31%      26.641us        58.33%     292.406us      97.469us      17.824us       100.00%      23.808us       7.936us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.824us       100.00%      17.824us       5.941us             3  
+                                Activity Buffer Request        20.81%     104.332us        20.81%     104.332us     104.332us       5.984us        33.57%       5.984us       5.984us             1  
+                                       aten::empty_like         1.45%       7.280us         6.04%      30.281us      10.094us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.59%      23.001us         4.59%      23.001us       7.667us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        32.20%     161.433us        32.20%     161.433us      53.811us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.95%       4.781us         0.95%       4.781us       4.781us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 470.206us
-Self CUDA time total: 17.921us
+Self CPU time total: 501.312us
+Self CUDA time total: 17.824us
 
 
 
@@ -4413,19 +4413,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     130.879us       726.50%     130.879us     130.879us             1  
-                               hf_kernels_causal_conv1d         5.43%      99.212us        99.73%       1.824ms       1.824ms       0.000us         0.00%      24.063us      24.063us             1  
-                                         CausalConv1dFn         4.16%      76.013us        94.31%       1.725ms     574.860us       0.000us         0.00%      24.063us       8.021us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.28%      23.352us        88.43%       1.617ms     539.055us      18.015us       100.00%      24.063us       8.021us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      18.015us       100.00%      18.015us       6.005us             3  
-                                Activity Buffer Request        78.67%       1.439ms        78.67%       1.439ms       1.439ms       6.048us        33.57%       6.048us       6.048us             1  
-                                       aten::empty_like         0.41%       7.570us         1.72%      31.401us      10.467us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.30%      23.831us         1.30%      23.831us       7.944us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         8.49%     155.235us         8.49%     155.235us      51.745us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.27%       4.890us         0.27%       4.890us       4.890us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     132.125us       742.61%     132.125us     132.125us             1  
+                               hf_kernels_causal_conv1d         5.56%     102.363us        99.74%       1.835ms       1.835ms       0.000us         0.00%      23.776us      23.776us             1  
+                                         CausalConv1dFn         4.21%      77.520us        94.18%       1.733ms     577.645us       0.000us         0.00%      23.776us       7.925us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.43%      26.260us        88.34%       1.625ms     541.815us      17.792us       100.00%      23.776us       7.925us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.792us       100.00%      17.792us       5.931us             3  
+                                Activity Buffer Request        78.18%       1.439ms        78.18%       1.439ms       1.439ms       5.984us        33.63%       5.984us       5.984us             1  
+                                       aten::empty_like         0.40%       7.411us         1.63%      29.972us       9.991us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.23%      22.561us         1.23%      22.561us       7.520us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.73%     160.604us         8.73%     160.604us      53.535us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.26%       4.730us         0.26%       4.730us       4.730us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.829ms
-Self CUDA time total: 18.015us
+Self CPU time total: 1.840ms
+Self CUDA time total: 17.792us
 
 
 
@@ -4435,19 +4435,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     123.645us       665.08%     123.645us     123.645us             1  
-                               hf_kernels_causal_conv1d        22.59%     109.155us        99.05%     478.537us     478.537us       0.000us         0.00%      24.830us      24.830us             1  
-                                         CausalConv1dFn        15.84%      76.521us        76.45%     369.382us     123.127us       0.000us         0.00%      24.830us       8.277us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.13%      24.791us        54.61%     263.860us      87.953us      18.591us       100.00%      24.830us       8.277us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      18.591us       100.00%      18.591us       6.197us             3  
-                                Activity Buffer Request        17.60%      85.023us        17.60%      85.023us      85.023us       6.239us        33.56%       6.239us       6.239us             1  
-                                       aten::empty_like         1.53%       7.411us         6.00%      29.001us       9.667us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.47%      21.590us         4.47%      21.590us       7.197us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        31.88%     154.046us        31.88%     154.046us      51.349us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.95%       4.601us         0.95%       4.601us       4.601us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     130.846us       704.99%     130.846us     130.846us             1  
+                               hf_kernels_causal_conv1d        20.99%     104.563us        98.97%     492.921us     492.921us       0.000us         0.00%      24.736us      24.736us             1  
+                                         CausalConv1dFn        14.81%      73.740us        77.98%     388.358us     129.453us       0.000us         0.00%      24.736us       8.245us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.56%      27.711us        57.26%     285.166us      95.055us      18.560us       100.00%      24.736us       8.245us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      18.560us       100.00%      18.560us       6.187us             3  
+                                Activity Buffer Request        19.93%      99.242us        19.93%      99.242us      99.242us       6.176us        33.28%       6.176us       6.176us             1  
+                                       aten::empty_like         1.54%       7.672us         5.91%      29.452us       9.817us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.37%      21.780us         4.37%      21.780us       7.260us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        31.77%     158.213us        31.77%     158.213us      52.738us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.03%       5.120us         1.03%       5.120us       5.120us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 483.138us
-Self CUDA time total: 18.591us
+Self CPU time total: 498.041us
+Self CUDA time total: 18.560us
 
 
 
@@ -4457,19 +4457,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         5.67%     104.074us        99.72%       1.829ms       1.829ms       0.000us         0.00%     162.623us     162.623us             1  
-                                         CausalConv1dFn         4.47%      81.893us        94.05%       1.725ms     574.926us       0.000us         0.00%     162.623us      54.208us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.47%      26.950us        87.82%       1.611ms     536.865us      97.823us       100.00%     162.623us      54.208us             3  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     146.719us       149.98%     146.719us     146.719us             1  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      97.823us       100.00%      97.823us      32.608us             3  
-                                Activity Buffer Request        77.02%       1.413ms        77.02%       1.413ms       1.413ms      64.800us        66.24%      64.800us      64.800us             1  
-                                       aten::empty_like         0.45%       8.219us         1.76%      32.292us      10.764us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.31%      24.073us         1.31%      24.073us       8.024us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         9.33%     171.076us         9.33%     171.076us      57.025us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.28%       5.071us         0.28%       5.071us       5.071us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         6.20%     114.892us        99.70%       1.846ms       1.846ms       0.000us         0.00%     163.362us     163.362us             1  
+                                         CausalConv1dFn         4.16%      76.963us        93.49%       1.731ms     577.109us       0.000us         0.00%     163.362us      54.454us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.46%      26.971us        87.71%       1.624ms     541.405us      97.825us       100.00%     163.362us      54.454us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     140.833us       143.96%     140.833us     140.833us             1  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      97.825us       100.00%      97.825us      32.608us             3  
+                                Activity Buffer Request        77.29%       1.431ms        77.29%       1.431ms       1.431ms      65.537us        66.99%      65.537us      65.537us             1  
+                                       aten::empty_like         0.44%       8.169us         1.63%      30.150us      10.050us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.19%      21.981us         1.19%      21.981us       7.327us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.97%     166.033us         8.97%     166.033us      55.344us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.30%       5.600us         0.30%       5.600us       5.600us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.834ms
-Self CUDA time total: 97.823us
+Self CPU time total: 1.852ms
+Self CUDA time total: 97.825us
 
 
 
@@ -4479,19 +4479,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d        19.41%      95.622us        98.95%     487.536us     487.536us       0.000us         0.00%     165.309us     165.309us             1  
-                                         CausalConv1dFn        15.06%      74.214us        79.54%     391.914us     130.638us       0.000us         0.00%     165.309us      55.103us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.22%      25.702us        58.53%     288.390us      96.130us      99.646us       100.00%     165.309us      55.103us             3  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     134.941us       135.42%     134.941us     134.941us             1  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      99.646us       100.00%      99.646us      33.215us             3  
-                                Activity Buffer Request        20.90%     102.993us        20.90%     102.993us     102.993us      65.663us        65.90%      65.663us      65.663us             1  
-                                       aten::empty_like         1.51%       7.430us         5.95%      29.310us       9.770us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.44%      21.880us         4.44%      21.880us       7.293us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        32.41%     159.695us        32.41%     159.695us      53.232us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.05%       5.180us         1.05%       5.180us       5.180us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d        18.90%      99.621us        99.07%     522.281us     522.281us       0.000us         0.00%     164.258us     164.258us             1  
+                                         CausalConv1dFn        18.38%      96.883us        80.18%     422.660us     140.887us       0.000us         0.00%     164.258us      54.753us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.13%      27.060us        56.33%     296.936us      98.979us      98.914us       100.00%     164.258us      54.753us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     161.441us       163.21%     161.441us     161.441us             1  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      98.914us       100.00%      98.914us      32.971us             3  
+                                Activity Buffer Request        20.45%     107.822us        20.45%     107.822us     107.822us      65.344us        66.06%      65.344us      65.344us             1  
+                                       aten::empty_like         1.40%       7.371us         5.47%      28.841us       9.614us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.07%      21.470us         4.07%      21.470us       7.157us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        30.74%     162.054us        30.74%     162.054us      54.018us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.93%       4.890us         0.93%       4.890us       4.890us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 492.716us
-Self CUDA time total: 99.646us
+Self CPU time total: 527.171us
+Self CUDA time total: 98.914us
 
 
 impl                     wl                  p50(ms)  ok
@@ -4527,9 +4527,9 @@ Installed 15 packages in 13ms
 
Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s] -Fetching 11 files: 18%|█▊ | 2/11 [00:00<00:00, 14.15it/s] -Fetching 11 files: 64%|██████▎ | 7/11 [00:02<00:01, 3.20it/s] -Fetching 11 files: 100%|██████████| 11/11 [00:02<00:00, 5.38it/s]
+Fetching 11 files: 18%|█▊ | 2/11 [00:00<00:02, 4.39it/s] +Fetching 11 files: 64%|██████▎ | 7/11 [00:01<00:01, 3.53it/s] +Fetching 11 files: 100%|██████████| 11/11 [00:01<00:00, 5.65it/s]

Artifacts:

causal_conv1d.jsonl