
Commit 5375748

apaszke authored and Google-ML-Automation committed
[Mosaic GPU] Improve checks on the WG strided transfers
Our previous code didn't actually verify that the vector loads/stores are safe, so you could pass in a non-contiguous reference with a weird shape but the right number of elements and get nonsensical results. The current check is a bit too conservative, but it's better to lean this way. This also factors the address calculation into a common `transfer_strided` class method, following the same pattern we use for tiled layouts.

PiperOrigin-RevId: 813722484
1 parent 47d933c
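
To make the new safety condition concrete, the check added to `transfer_strided` below can be paraphrased in plain Python over a memref's shape and strides. This is a sketch only; the helper name `strided_transfer_is_safe` is invented for illustration and is not part of the commit.

def strided_transfer_is_safe(shape, strides, vec_size):
  """Sketch of the new WG strided transfer check (not the real API).

  When the reference cannot be flattened to 1D, vector transfers of width
  vec_size are only accepted if:
    * some dimension is contiguous (stride == 1) and its size is a multiple
      of vec_size, and
    * every other dimension of size > 1 has a stride that is a multiple of
      vec_size, so a vector never straddles a non-contiguous boundary.
  """
  if vec_size == 1:
    return True  # Scalar transfers are always fine.
  has_contiguous_dim = False
  for size, stride in zip(shape, strides):
    if stride == 1:
      has_contiguous_dim = True
      if size % vec_size != 0:
        return False
    elif size > 1 and stride % vec_size != 0:
      return False
  return has_contiguous_dim

# Example: a (4, 8) view with strides (16, 1) is fine for vec_size=4,
# but strides (10, 1) would be rejected because 10 % 4 != 0.
assert strided_transfer_is_safe((4, 8), (16, 1), 4)
assert not strided_transfer_is_safe((4, 8), (10, 1), 4)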

9 files changed (+272, -36 lines)

docs/jax.experimental.pallas.mosaic_gpu.rst

Lines changed: 9 additions & 0 deletions
@@ -86,6 +86,15 @@ Blackwell-specific functions
   try_cluster_cancel
   query_cluster_cancel
 
+Multimem operations
+-------------------
+
+.. autosummary::
+  :toctree: _autosummary
+
+  multimem_store
+  multimem_load_reduce
+
 Aliases
 -------
 

jax/_src/pallas/mosaic_gpu/primitives.py

Lines changed: 85 additions & 0 deletions
@@ -3517,3 +3517,88 @@ def _multimem_store_lowering_rule(
   if ctx.module_ctx.auto_barriers:
     mgpu.warpgroup_barrier()  # Make sure the writes have completed.
   return ()
+
+
+multimem_load_reduce_p = jax_core.Primitive("multimem_load_reduce")
+
+@multimem_load_reduce_p.def_effectful_abstract_eval
+def _multimem_load_reduce_abstract_eval(ref, *avals_flat, tree, collective_axes, reduction_op):
+  del collective_axes, reduction_op
+  _check_ref(ref, "ref", gpu_core.GMEM)
+  shape, dtype = ref.shape, ref.dtype
+  if tree is not None:
+    transforms = jax.tree.unflatten(tree, avals_flat)
+    for t in transforms:
+      shape = t.transform_shape(shape)
+      dtype = t.transform_dtype(dtype)
+  return jax_core.ShapedArray(shape, dtype), {pallas_core.comms_effect}
+
+@lowering.register_lowering_rule(multimem_load_reduce_p, mgpu.LoweringSemantics.Lane)
+def _multimem_load_reduce_lowering_rule(
+    ctx: lowering.LoweringRuleContext, ref, *transforms_leaves, tree, collective_axes, reduction_op,
+):
+  if (mesh_info := ctx.module_ctx.mesh_info) is None:
+    raise ValueError(
+        "JAX device mesh is required by multimem_load_reduce, but not defined."
+    )
+  if set(collective_axes) != set(mesh_info.axis_names):
+    raise NotImplementedError(
+        "Only collective_axes that include all JAX device mesh"
+        f" ({mesh_info.axis_names}) axes are supported, but got"
+        f" {collective_axes}"
+    )
+  if ctx.out_layout_hint is None:
+    raise RuntimeError(
+        "Failed to infer the output layout of multimem_load_reduce. Please apply"
+        " plgpu.layout_cast to its output right after its creation."
+    )
+  dtype = ctx.avals_out[0].dtype
+  transforms = tree.unflatten(transforms_leaves)
+  ref, transforms = lowering._handle_transforms(ctx, ref, transforms, allow_peer_refs=False)
+  if transforms:
+    raise NotImplementedError(
+        f"Unhandled transforms for multimem_load_reduce: {transforms}"
+    )
+  multi_ref = ctx.launch_ctx.to_remote_multicast(ref)
+  is_signed = mgpu_utils.is_signed(dtype)
+  arr = mgpu.FragmentedArray.load_reduce_untiled(
+      multi_ref,
+      layout=ctx.out_layout_hint,
+      is_signed=is_signed,
+      reduction=reduction_op,
+  )
+  return arr
+
+def multimem_load_reduce(
+    ref: _Ref,
+    *,
+    collective_axes: Hashable | tuple[Hashable, ...],
+    reduction_op: mgpu.MultimemReductionOp,
+) -> jax.Array:
+  """Loads from a GMEM reference on all devices present in collective_axes and reduces the loaded values.
+
+  The supported dtypes are: ``jnp.float32``, ``jnp.float16``, ``jnp.bfloat16``,
+  ``jnp.float8_e5m2``, ``jnp.float8_e4m3fn``, ``jnp.int32`` and ``jnp.int64``.
+
+  8-bit floating point dtypes are only supported on Blackwell GPUs.
+
+  Args:
+    ref: The GMEM reference to load from.
+    collective_axes: The JAX mesh axes indicating the devices to load from.
+    reduction_op: The reduction operation to perform on the loaded values. The
+      allowed values are add (all dtypes), min, max (all dtypes but f32), as
+      well as and, or and xor (integer types only).
+  """
+  ref, ref_transforms = state_primitives.get_ref_and_transforms(
+      ref, None, "multimem_load_reduce"
+  )
+  flat_ref_transforms, ref_transforms_treedef = tree_util.tree_flatten(
+      ref_transforms
+  )
+  return multimem_load_reduce_p.bind(
+      ref,
+      *flat_ref_transforms,
+      tree=ref_transforms_treedef,
+      collective_axes=collective_axes,
+      reduction_op=reduction_op,
+  )

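For orientation, here is a minimal kernel-body sketch of how the new primitive is intended to be called. It is not taken from the commit: the mesh axis name "x" is an assumption and `layout` stands in for whatever register layout you cast to; the full working setup, including shard_map, GMEM placement and symmetric-memory aliasing, is in the distributed test at the bottom of this commit.

# Sketch only: reduce x_ref across every device on the "x" mesh axis.
# `layout` is a placeholder for a register layout accepted by
# plgpu.layout_cast; the cast right after the load is what lets the
# lowering infer the output layout, as the lowering rule above requires.
def kernel(x_ref, y_ref):
  y_ref[...] = plgpu.layout_cast(
      plgpu.multimem_load_reduce(
          x_ref, collective_axes="x", reduction_op="add"
      ),
      layout,
  )
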
jax/experimental/mosaic/gpu/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -86,6 +86,7 @@
     Partition1D as Partition1D,
     SemaphoreRef as SemaphoreRef,
     ThreadSubset as ThreadSubset,
+    MultimemReductionOp as MultimemReductionOp,
     bitwidth as bitwidth,
     bytewidth as bytewidth,
     c as c,

jax/experimental/mosaic/gpu/fragmented_array.py

Lines changed: 55 additions & 34 deletions
@@ -966,16 +966,11 @@ def load_strided(
       )
     else:
       layout = WGStridedFragLayout(shape=shape, vec_size=vec_size)
+      registers = np.empty(layout.registers_shape(shape), dtype=object)
       vec_ty = ir.VectorType.get((layout.vec_size,), ref_ty.element_type)
-      try:
-        # Flattening the reference potentially produces simpler PTX but
-        # if the ref is not already 1D and has strided dimensions
-        # flattening won't work.
-        ref_ = mgpu.memref_fold(ref, 0, len(ref_ty.shape))
-        vecs = [vector.load(vec_ty, ref_, [vec_idx]) for vec_idx in layout.linear_thread_idxs()]
-      except NotImplementedError:
-        vecs = [vector.load(vec_ty, ref, vec_idx) for vec_idx in layout.thread_idxs(shape)]
-      return cls(_registers=np.array(vecs), _layout=layout, _is_signed=is_signed)
+      for _get, update, ref, idx in cls.transfer_strided(ref, layout.vec_size):
+        update(registers, vector.load(vec_ty, ref, idx))
+      return cls(_registers=registers, _layout=layout, _is_signed=is_signed)
 
   @classmethod
   def splat(
@@ -2579,8 +2574,10 @@ def store_untiled(
         if isinstance(ref, utils.MultimemRef):
           raise NotImplementedError("Strided layout does not support multimem")
         if swizzle != 16:
-          raise NotImplementedError
-        self._store_untiled_wg_strided(ref)
+          raise ValueError("Only TiledLayouts support swizzling")
+        assert isinstance(self.layout, WGStridedFragLayout)
+        for get, _update, ref, idx in self.transfer_strided(ref, self.layout.vec_size):
+          vector.store(get(self.registers), ref, idx)
       case TiledLayout():
         ref_shape = ir.MemRefType(ref.type).shape
         ref = utils.memref_reshape(ref, (*(1 for _ in ref_shape), *ref_shape))
@@ -2621,8 +2618,8 @@ def load_untiled(
       is_signed: bool | None = None,
      optimized: bool = True,
  ) -> FragmentedArray:
-    ref_shape = ir.MemRefType(ref.type).shape
-    ref = utils.memref_reshape(ref, (*(1 for _ in ref_shape), *ref_shape))
+    ref_ty = ir.MemRefType(ref.type)
+    ref = utils.memref_reshape(ref, (*(1 for _ in ref_ty.shape), *ref_ty.shape))
     return cls.load_tiled(
         ref, swizzle=swizzle, is_signed=is_signed, layout=layout, optimized=optimized
     )
@@ -2653,27 +2650,6 @@ def _store_untiled_splat(self, ref: ir.Value):
     )
     fa.store_untiled(ref)
 
-  def _store_untiled_wg_strided(self, ref: ir.Value):
-    assert isinstance(self.layout, WGStridedFragLayout)
-    ref_ty = ir.MemRefType(ref.type)
-    idxs: Iterable[Sequence[ir.Value]]
-    try:
-      # Flattening the reference potentially produces simpler PTX but
-      # if the ref is not already 1D and has strided dimensions
-      # flattening won't work. We use a different variable for ref in
-      # case `NotImplementedError` is thrown by
-      # .linear_thread_idxs().
-      ref_ = mgpu.memref_fold(ref, 0, len(ref_ty.shape))
-      idxs = ((i,) for i in self.layout.linear_thread_idxs())
-    except NotImplementedError:
-      ref_ = ref
-      idxs = self.layout.thread_idxs(self.shape)
-    ref_shape = tuple(ref_ty.shape)
-    if ref_shape != self.shape:
-      raise ValueError((ref_shape, self.shape))
-    for idx, reg in zip(idxs, self.registers.flat):
-      vector.store(reg, ref_, idx)
-
   def store_tiled(self, ref: ir.Value | utils.MultimemRef, swizzle: int | None, optimized: bool = True):
     if not isinstance(self.layout, TiledLayout):
       raise NotImplementedError(self.layout)
@@ -2731,6 +2707,51 @@ def load_tiled(
       update(registers, loaded_reg)
     return cls(_registers=registers, _layout=layout, _is_signed=is_signed)
 
+  @classmethod
+  def transfer_strided(self, ref: ir.Value, vec_size: int):
+    ref_ty = ir.MemRefType(ref.type)
+    layout = WGStridedFragLayout(shape=tuple(ref_ty.shape), vec_size=vec_size)
+    try:
+      # Flattening the reference potentially produces simpler PTX but
+      # if the ref is not already 1D and has strided dimensions
+      # flattening won't work.
+      ref = mgpu.memref_fold(ref, 0, len(ref_ty.shape))
+    except ValueError:
+      strides, _ = ref_ty.get_strides_and_offset()
+      if vec_size > 1:
+        # TODO(apaszke): We could fold all the pairs of dims that are contiguous
+        # This check is a too strict if we don't do that.
+        has_contiguous_dim = False
+        for size, stride in zip(ref_ty.shape, strides):
+          if stride == 1:
+            has_contiguous_dim = True
+            if size % vec_size != 0:
+              raise ValueError(
+                  "The contiguous dimension of the reference must be a"
+                  f" multiple of the layout's vector size (got {size} and"
+                  f" vector size {vec_size})"
+              ) from None
+          elif size > 1:
+            if stride % vec_size != 0:
+              raise ValueError(
+                  "Non-contiguous dimension of the reference must have strides"
+                  " that are multiples of the layout's vector size (got"
+                  f" {stride} and vector size {vec_size})"
+              ) from None
+        if not has_contiguous_dim:
+          raise ValueError(
+              "The reference must have a contiguous dimension when vec_size > 1"
+          )
+      idx_gen = layout.thread_idxs(tuple(ref_ty.shape))
+    else:
+      idx_gen = map(lambda x: [x], layout.linear_thread_idxs())
+    for i, vec_idx in enumerate(idx_gen):
+      def update(registers, reg, _i=i):
+        registers[_i] = reg
+      def get(registers, _i=i):
+        return registers[_i]
+      yield get, update, ref, vec_idx
+
   @staticmethod
   def transfer_tiled(
       ref: ir.Value,

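To spell out the refactor described in the commit message, the two call sites above consume `transfer_strided` like this (restated here as comments): only the register callback differs between the load and the store, while the address computation and the new stride checks live in one place, mirroring how `transfer_tiled` is used.

# Load (load_strided): fill a fresh object array of registers.
#   for _get, update, ref, idx in cls.transfer_strided(ref, layout.vec_size):
#     update(registers, vector.load(vec_ty, ref, idx))
#
# Store (store_untiled, WGStridedFragLayout case): read each register back.
#   for get, _update, ref, idx in self.transfer_strided(ref, self.layout.vec_size):
#     vector.store(get(self.registers), ref, idx)
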
jax/experimental/mosaic/gpu/utils.py

Lines changed: 1 addition & 1 deletion
@@ -694,7 +694,7 @@ def memref_fold(ref: ir.Value, dim, fold_rank) -> ir.Value:
     new_strides[dim : dim + fold_rank] = [new_strides[dim + fold_rank - 1]]
     new_layout = ir.StridedLayoutAttr.get(offset, new_strides)
   else:
-    raise NotImplementedError(
+    raise ValueError(
        f"strides={ref_ty.get_strides_and_offset()[0]}, {ref_ty.shape=},"
        f" {dim=}, {fold_rank=}"
    )

jax/experimental/pallas/mosaic_gpu.py

Lines changed: 1 addition & 0 deletions
@@ -68,6 +68,7 @@
 from jax._src.pallas.mosaic_gpu.primitives import inline_mgpu as inline_mgpu
 from jax._src.pallas.mosaic_gpu.primitives import load as load
 from jax._src.pallas.mosaic_gpu.primitives import multimem_store as multimem_store
+from jax._src.pallas.mosaic_gpu.primitives import multimem_load_reduce as multimem_load_reduce
 from jax._src.pallas.mosaic_gpu.primitives import print_layout as print_layout
 from jax._src.pallas.mosaic_gpu.primitives import query_cluster_cancel as query_cluster_cancel
 from jax._src.pallas.mosaic_gpu.primitives import RefType as RefType

tests/mosaic/gpu_test.py

Lines changed: 1 addition & 1 deletion
@@ -447,7 +447,7 @@ def kernel(ctx, inp, out, _):
       np.testing.assert_array_equal(y, out)
 
     if throws_not_impl:
-      with self.assertRaises(NotImplementedError):
+      with self.assertRaises(ValueError):
        do_test()
    else:
      do_test()

tests/pallas/gpu_pallas_distributed_test.py

Lines changed: 118 additions & 0 deletions
@@ -29,6 +29,7 @@
 import jax.experimental.mosaic.gpu as mgpu
 import jax.numpy as jnp
 import numpy as np
+import jax.experimental.mosaic.gpu.fragmented_array as fa
 
 
 P = jax.sharding.PartitionSpec
@@ -259,6 +260,123 @@ def _store():
     ref = lax.broadcasted_iota(jnp.int32, (128, 128), 1)
     np.testing.assert_array_equal(y, np.concat([ref, ref], axis=0))
 
+  @parameterized.parameters(
+      (jnp.int32, 1, "add"),
+      (jnp.int32, 1, "min"),
+      (jnp.int32, 1, "max"),
+      (jnp.int32, 1, "and"),
+      (jnp.int32, 1, "or"),
+      (jnp.int32, 1, "xor"),
+      (jnp.float32, 1, "add"),
+      (jnp.float32, 2, "add"),
+      (jnp.float32, 4, "add"),
+      (jnp.float16, 2, "add"),
+      (jnp.float16, 2, "min"),
+      (jnp.float16, 4, "max"),
+      (jnp.float16, 8, "add"),
+      (jnp.bfloat16, 2, "max"),
+      (jnp.bfloat16, 8, "add"),
+      (jnp.float8_e5m2, 4, "add"),
+      (jnp.float8_e5m2, 8, "min"),
+      (jnp.float8_e5m2, 16, "max"),
+      (jnp.float8_e4m3fn, 4, "min"),
+      (jnp.float8_e4m3fn, 8, "max"),
+      (jnp.float8_e4m3fn, 16, "add"),
+  )
+  def test_multimem_load_reduce(self, dtype, vector_length, reduction):
+    if dtype in (
+        jnp.float8_e5m2,
+        jnp.float8_e4m3fn,
+    ) and not jtu.is_cuda_compute_capability_at_least("10.0"):
+      self.skipTest("Only works on GPU with capability >= sm100")
+    if jax.process_index() > 2:
+      return  # Only 2 processes needed.
+    devices = jax.devices()[:2]
+
+    def kernel(x_ref, y_ref, _, sem_ref):
+      layout = plgpu.Layout.TILED(
+          fa.Tiling(
+              (
+                  (64, 2 * vector_length),
+                  (16, 2 * vector_length),
+                  (vector_length,),
+              )
+          ),
+          warp_dims=(-5,),
+          lane_dims=(-3, -2),
+          vector_dim=-1,
+      )
+      y_ref[...] = plgpu.layout_cast(
+          plgpu.multimem_load_reduce(
+              x_ref.at[16:-16], collective_axes="x", reduction_op=reduction,
+          ),
+          layout
+      )
+      my_device = lax.axis_index("x")
+      other_device = 1 - my_device
+      pl.semaphore_signal(sem_ref, 1, device_id=other_device)
+      pl.semaphore_wait(sem_ref)
+
+    # The rounding we see in low precision types seems to be different from
+    # what JAX/XLA use.
+    match jnp.dtype(dtype).itemsize:
+      case 4:
+        bound = 800000
+      case 2:
+        bound = 128
+      case 1:
+        bound = 4
+      case _:
+        raise ValueError(f"Unsupported dtype: {dtype}")
+    x_local = jax.random.randint(
+        jax.random.key(1234), (128 + 64, 32), dtype=jnp.int32, minval=-bound, maxval=bound,
+    ).astype(dtype)
+    mesh = jax.sharding.Mesh(devices, ("x",))
+    x_shard = jax.ShapeDtypeStruct((64 + 32, 32), dtype)
+    y_shape = jax.ShapeDtypeStruct((64, 32), dtype)
+    y, _ = jax.jit(
+        shard_map.shard_map(
+            pl.pallas_call(
+                kernel,
+                in_specs=[pl.BlockSpec(memory_space=plgpu.GMEM)],
+                out_specs=[
+                    pl.BlockSpec(memory_space=plgpu.SMEM),
+                    pl.BlockSpec(memory_space=plgpu.GMEM),
+                ],
+                out_shape=(y_shape, x_shard),
+                scratch_shapes=[plgpu.SemaphoreType.REGULAR],
+                # TODO(b/448323639): Without aliasing XLA doesn't actually
+                # insert the copy that puts the operand in symmetric memory,
+                # which causes the kernel to crash.
+                input_output_aliases={0: 1},
+            ),
+            mesh=mesh,
+            in_specs=P("x"),
+            out_specs=P("x"),  # Not really, but lets us test.
+            check_rep=False,
+        )
+    )(x_local)
+    y = multihost_utils.process_allgather(y, tiled=True)
+    match reduction:
+      case "add":
+        np_reduction = jnp.add
+      case "min":
+        np_reduction = jnp.minimum
+      case "max":
+        np_reduction = jnp.maximum
+      case "and":
+        np_reduction = jnp.bitwise_and
+      case "or":
+        np_reduction = jnp.bitwise_or
+      case "xor":
+        np_reduction = jnp.bitwise_xor
+      case _:
+        raise ValueError(reduction)
+    np.testing.assert_array_equal(
+        y.astype(jnp.float32),
+        np.tile(np_reduction(x_local[16:64+16], x_local[64+48:128+48]), (2, 1)),
+    )
+
 
 if __name__ == '__main__':
   # This test doesn't work with the platform allocator, so we override it
