[Mosaic GPU] Improve checks on the WG strided transfers

apaszke · Google-ML-Automation · commit 33c7a7eb0a4f · 2025-10-02T03:50:00.000-07:00
Our previous code didn't actually verify that the vector loads/stores are safe,
so you could pass in a non-contiguous reference with a weird shape but the right
number of elements, and get nonsensical results. The new check is a bit too
conservative, but it's better to err on the side of safety.

This also factors the address calculation into a common `transfer_strided` class method, following
the same pattern we use for tiled layouts.

PiperOrigin-RevId: 814163433
diff --git a/jax/experimental/mosaic/gpu/fragmented_array.py b/jax/experimental/mosaic/gpu/fragmented_array.py
@@ -966,16 +966,11 @@ def load_strided(
         )
     else:
       layout = WGStridedFragLayout(shape=shape, vec_size=vec_size)
+    registers = np.empty(layout.registers_shape(shape), dtype=object)
     vec_ty = ir.VectorType.get((layout.vec_size,), ref_ty.element_type)
-    try:
-      # Flattening the reference potentially produces simpler PTX but
-      # if the ref is not already 1D and has strided dimensions
-      # flattening won't work.
-      ref_ = mgpu.memref_fold(ref, 0, len(ref_ty.shape))
-      vecs = [vector.load(vec_ty, ref_, [vec_idx]) for vec_idx in layout.linear_thread_idxs()]
-    except NotImplementedError:
-      vecs = [vector.load(vec_ty, ref, vec_idx) for vec_idx in layout.thread_idxs(shape)]
-    return cls(_registers=np.array(vecs), _layout=layout, _is_signed=is_signed)
+    for _get, update, ref, idx in cls.transfer_strided(ref, layout.vec_size):
+      update(registers, vector.load(vec_ty, ref, idx))
+    return cls(_registers=registers, _layout=layout, _is_signed=is_signed)
 
   @classmethod
   def splat(
@@ -2579,8 +2574,10 @@ def store_untiled(
         if isinstance(ref, utils.MultimemRef):
           raise NotImplementedError("Strided layout does not support multimem")
         if swizzle != 16:
-          raise NotImplementedError
-        self._store_untiled_wg_strided(ref)
+          raise ValueError("Only TiledLayouts support swizzling")
+        assert isinstance(self.layout, WGStridedFragLayout)
+        for get, _update, ref, idx in self.transfer_strided(ref, self.layout.vec_size):
+          vector.store(get(self.registers), ref, idx)
       case TiledLayout():
         ref_shape = ir.MemRefType(ref.type).shape
         ref = utils.memref_reshape(ref, (*(1 for _ in ref_shape), *ref_shape))
@@ -2621,8 +2618,8 @@ def load_untiled(
       is_signed: bool | None = None,
       optimized: bool = True,
   ) -> FragmentedArray:
-    ref_shape = ir.MemRefType(ref.type).shape
-    ref = utils.memref_reshape(ref, (*(1 for _ in ref_shape), *ref_shape))
+    ref_ty = ir.MemRefType(ref.type)
+    ref = utils.memref_reshape(ref, (*(1 for _ in ref_ty.shape), *ref_ty.shape))
     return cls.load_tiled(
         ref, swizzle=swizzle, is_signed=is_signed, layout=layout, optimized=optimized
     )
@@ -2653,27 +2650,6 @@ def _store_untiled_splat(self, ref: ir.Value):
     )
     fa.store_untiled(ref)
 
-  def _store_untiled_wg_strided(self, ref: ir.Value):
-    assert isinstance(self.layout, WGStridedFragLayout)
-    ref_ty = ir.MemRefType(ref.type)
-    idxs: Iterable[Sequence[ir.Value]]
-    try:
-      # Flattening the reference potentially produces simpler PTX but
-      # if the ref is not already 1D and has strided dimensions
-      # flattening won't work. We use a different variable for ref in
-      # case `NotImplementedError` is thrown by
-      # .linear_thread_idxs().
-      ref_ = mgpu.memref_fold(ref, 0, len(ref_ty.shape))
-      idxs = ((i,) for i in self.layout.linear_thread_idxs())
-    except NotImplementedError:
-      ref_ = ref
-      idxs = self.layout.thread_idxs(self.shape)
-    ref_shape = tuple(ref_ty.shape)
-    if ref_shape != self.shape:
-      raise ValueError((ref_shape, self.shape))
-    for idx, reg in zip(idxs, self.registers.flat):
-      vector.store(reg, ref_, idx)
-
   def store_tiled(self, ref: ir.Value | utils.MultimemRef, swizzle: int | None, optimized: bool = True):
     if not isinstance(self.layout, TiledLayout):
       raise NotImplementedError(self.layout)
@@ -2731,6 +2707,51 @@ def load_tiled(
       update(registers, loaded_reg)
     return cls(_registers=registers, _layout=layout, _is_signed=is_signed)
 
+  @classmethod
+  def transfer_strided(cls, ref: ir.Value, vec_size: int):
+    """Yields the vector accesses needed to transfer `ref` in a WG strided layout.
+
+    The reference is flattened with `memref_fold` when possible. If folding
+    fails with a `ValueError`, the original reference is indexed with
+    multi-dimensional indices instead, after checking (for `vec_size > 1`)
+    that its shape and strides are compatible with the vector size.
+
+    This is a generator: nothing (including the checks below) runs until the
+    result is iterated.
+
+    Args:
+      ref: The memref to transfer to or from.
+      vec_size: The number of elements covered by each vector access.
+
+    Yields:
+      `(get, update, ref, idx)` tuples. `get(registers)` returns and
+      `update(registers, reg)` assigns the register entry that corresponds to
+      this access; `ref` and `idx` are the (possibly folded) reference and the
+      indices to pass to the vector load/store.
+
+    Raises:
+      ValueError: If the reference can't be folded and, for `vec_size > 1`,
+        it has no unit-stride dimension, its unit-stride dimension is not a
+        multiple of `vec_size`, or a non-unit-stride dimension of size > 1
+        has a stride that is not a multiple of `vec_size`.
+    """
+    ref_ty = ir.MemRefType(ref.type)
+    layout = WGStridedFragLayout(shape=tuple(ref_ty.shape), vec_size=vec_size)
+    try:
+      # Flattening the reference potentially produces simpler PTX but
+      # if the ref is not already 1D and has strided dimensions
+      # flattening won't work.
+      ref = mgpu.memref_fold(ref, 0, len(ref_ty.shape))
+    except ValueError:
+      strides, _ = ref_ty.get_strides_and_offset()
+      if vec_size > 1:
+        # TODO(apaszke): We could fold all the pairs of dims that are contiguous.
+        # This check is too strict as long as we don't do that.
+        has_contiguous_dim = False
+        for size, stride in zip(ref_ty.shape, strides):
+          if stride == 1:
+            has_contiguous_dim = True
+            if size % vec_size != 0:
+              raise ValueError(
+                  "The contiguous dimension of the reference must be a"
+                  f" multiple of the layout's vector size (got {size} and"
+                  f" vector size {vec_size})"
+              ) from None
+          elif size > 1:
+            if stride % vec_size != 0:
+              raise ValueError(
+                  "Non-contiguous dimension of the reference must have strides"
+                  " that are multiples of the layout's vector size (got"
+                  f" {stride} and vector size {vec_size})"
+              ) from None
+        if not has_contiguous_dim:
+          # `from None` hides the folding failure, matching the errors above.
+          raise ValueError(
+              "The reference must have a contiguous dimension when vec_size > 1"
+          ) from None
+      idx_gen = layout.thread_idxs(tuple(ref_ty.shape))
+    else:
+      # The folded reference is 1D, so wrap each linear index in a list.
+      idx_gen = ([idx] for idx in layout.linear_thread_idxs())
+    for i, vec_idx in enumerate(idx_gen):
+      # Bind `i` through a default argument so that each closure keeps its own
+      # register position even if it is called after the loop has advanced.
+      def update(registers, reg, _i=i):
+        registers[_i] = reg
+      def get(registers, _i=i):
+        return registers[_i]
+      yield get, update, ref, vec_idx
+
   @staticmethod
   def transfer_tiled(
       ref: ir.Value,
diff --git a/jax/experimental/mosaic/gpu/utils.py b/jax/experimental/mosaic/gpu/utils.py
@@ -694,7 +694,7 @@ def memref_fold(ref: ir.Value, dim, fold_rank) -> ir.Value:
     new_strides[dim : dim + fold_rank] = [new_strides[dim + fold_rank - 1]]
     new_layout = ir.StridedLayoutAttr.get(offset, new_strides)
   else:
-    raise NotImplementedError(
+    raise ValueError(
         f"strides={ref_ty.get_strides_and_offset()[0]}, {ref_ty.shape=},"
         f" {dim=}, {fold_rank=}"
     )
diff --git a/tests/mosaic/gpu_test.py b/tests/mosaic/gpu_test.py
@@ -413,7 +413,7 @@ def kernel(ctx, inp, out, _):
       # ("overap", (2, 4, 4), (16, 1, 1), 0, 3, True),
   ])
   def test_fold_strided(
-      self, shape, strides, dim, fold_rank, throws_not_impl
+      self, shape, strides, dim, fold_rank, throws
   ):
     expanded_shape = get_packed_shape(strides, shape)
     total_size = np.prod(expanded_shape)
@@ -426,7 +426,7 @@ def np_fold(inp, dim, fold_rank):
       out_shape[dim : dim + fold_rank] = [
           int(np.prod(inp.shape[dim : dim + fold_rank]))
       ]
-      if throws_not_impl:
+      if throws:
         return jax.ShapeDtypeStruct(shape=out_shape, dtype=inp.dtype)
       else:
         return inp.reshape(*out_shape)
@@ -442,12 +442,12 @@ def kernel(ctx, inp, out, _):
           kernel, (1, 1, 1), (128, 1, 1), np_inp, out, ()
       )(np_inp)
       assert (
-          not throws_not_impl
+          not throws
       ), "If it should have thrown it would during the call."
       np.testing.assert_array_equal(y, out)
 
-    if throws_not_impl:
-      with self.assertRaises(NotImplementedError):
+    if throws:
+      with self.assertRaises(ValueError):
         do_test()
     else:
       do_test()
@@ -2937,6 +2937,47 @@ def kernel(ctx, dst, _):
     rtol = 4e-6 if approx else 2e-7
     np.testing.assert_allclose(result, np_op(x), atol=atol, rtol=rtol)
 
+  def test_strided_copy_noncontig_good(self):
+    # Copies a non-contiguous slice of the input through a strided load and an
+    # untiled store, and checks that the result matches the NumPy slice.
+    src_ty = jax.ShapeDtypeStruct((32, 2, 32), jnp.int32)
+    dst_ty = jax.ShapeDtypeStruct((32, 32), jnp.int32)
+
+    def kernel(ctx, src, dst, _):
+      # Pick index 1 along the middle dimension of the source.
+      middle = mgpu.memref_slice(src, (slice(None), 1))
+      loaded = mgpu.FragmentedArray.load_strided(middle, is_signed=True, vec_size=4)
+      loaded.store_untiled(dst)
+
+    copy_fn = mgpu.as_gpu_kernel(
+        kernel, (1, 1, 1), (128, 1, 1), src_ty, dst_ty, ()
+    )
+    num_elems = math.prod(src_ty.shape)
+    x = np.arange(num_elems, dtype=jnp.int32).reshape(src_ty.shape)
+    np.testing.assert_array_equal(copy_fn(x), x[:, 1])
+
+  def test_strided_copy_noncontig_bad(self):
+    # Checks that strided loads from references whose shape or strides are
+    # incompatible with the vector size are rejected with a descriptive error.
+    import re  # Local import: only needed to escape the expected messages.
+
+    def kernel(ctx, src, dst, _):
+      src_slice = mgpu.memref_slice(src, (slice(None), 1))
+      mgpu.FragmentedArray.load_strided(src_slice, is_signed=True, vec_size=2).store_untiled(dst)
+
+    out_shape = jax.ShapeDtypeStruct((256, 7), jnp.int32)
+
+    # NOTE: `assertRaises(..., msg=...)` does not check the exception text
+    # (`msg` is only the message shown when the assertion fails), so use
+    # `assertRaisesRegex` to actually verify which error was raised.
+    in_shape = jax.ShapeDtypeStruct((256, 6, 7), jnp.int32)
+    msg = (
+        "The contiguous dimension of the reference must be a multiple of the"
+        " layout's vector size (got 7 and vector size 2)"
+    )
+    with self.assertRaisesRegex(ValueError, re.escape(msg)):
+      mgpu.as_gpu_kernel(
+          kernel, (1, 1, 1), (128, 1, 1), in_shape, out_shape, ()
+      )
+
+    in_shape = jax.ShapeDtypeStruct((256, 5, 7), jnp.int32)
+    msg = (
+        "Non-contiguous dimension of the reference must have strides that are"
+        " multiples of the layout's vector size (got 35 and vector size 2)"
+    )
+    with self.assertRaisesRegex(ValueError, re.escape(msg)):
+      mgpu.as_gpu_kernel(
+          kernel, (1, 1, 1), (128, 1, 1), in_shape, out_shape, ()
+      )
+
   @parameterized.product(
       dtype=[jnp.float32, jnp.int32],
       m=[128],

Original file line number	Diff line number	Diff line change
`@@ -694,7 +694,7 @@ def memref_fold(ref: ir.Value, dim, fold_rank) -> ir.Value:`
`694`	`694`	`new_strides[dim : dim + fold_rank] = [new_strides[dim + fold_rank - 1]]`
`695`	`695`	`new_layout = ir.StridedLayoutAttr.get(offset, new_strides)`
`696`	`696`	`else:`
`697`		`- raise NotImplementedError(`
	`697`	`+ raise ValueError(`
`698`	`698`	`f"strides={ref_ty.get_strides_and_offset()[0]}, {ref_ty.shape=},"`
`699`	`699`	`f" {dim=}, {fold_rank=}"`
`700`	`700`	`)`