Numba Scan: zero out unwritten buffers

ricardoV94 · ricardoV94 · commit 7a46ac2bdd0f · 2025-11-27T18:39:30.000+01:00
diff --git a/pytensor/link/numba/dispatch/scan.py b/pytensor/link/numba/dispatch/scan.py
@@ -254,6 +254,17 @@ def add_output_storage_post_proc_stmt(
                     """
                 ).strip()
             )
+        else:
+            # Regular loops zero out the unwritten tail of the output buffer (entries from `i + max_offset` on)
+            # Such unwritten entries show up with truncated gradients of while loops
+            output_storage_post_proc_stmts.append(
+                dedent(
+                    f"""
+                    elif {storage_size} > (i + {max_offset}):
+                        {outer_in_name}[i + {max_offset}:] = 0
+                    """
+                ).strip()
+            )
 
     # Special in-loop statements that create (nit-sot) storage arrays after a
     # single iteration is performed.  This is necessary because we don't know
@@ -309,7 +320,7 @@ def add_output_storage_post_proc_stmt(
                     )
 
                 if outer_in_name not in outer_in_mit_mot_names:
-                    # MIT-SOT and NIT-SOT may require buffer rolling/truncation after the main loop
+                    # MIT-SOT and NIT-SOT may require buffer rolling/truncation/zeroing after the main loop
                     max_offset_out_tap = max(output_taps) + max_lookback_inp_tap
                     add_output_storage_post_proc_stmt(
                         storage_name, max_offset_out_tap, storage_size_name
diff --git a/tests/link/numba/test_scan.py b/tests/link/numba/test_scan.py
@@ -673,3 +673,26 @@ def test_higher_order_derivatives():
         [g, gg, ggg],
         [np.array(0.95)],
     )
+
+
+def test_grad_until_and_truncate_sequence_taps():
+    # Truncated gradient of an `until` Scan: a case that needs Scan's special zero-out behavior
+    # Copied from tests.scan.basic.py::TestGradUntil::test_grad_until_and_truncate_sequence_taps
+    x = pt.vector("x")
+    threshold = pt.scalar(name="threshold", dtype="int64")
+
+    r = scan(
+        lambda x, y, u: (x * y, until(y > u)),
+        sequences=dict(input=x, taps=[-2, 0]),
+        outputs_info=[None],
+        non_sequences=[threshold],
+        truncate_gradient=3,
+        return_updates=False,
+    )
+    g = grad(r.sum(), x)
+
+    compare_numba_and_py(
+        [x, threshold],
+        [r, g],
+        [np.arange(15, dtype=x.dtype), 6],
+    )