Commit 47d933c

apaszke authored and Google-ML-Automation committed
[Pallas:MGPU] Add support for multimem stores in Pallas
PiperOrigin-RevId: 813711270
1 parent 036d7a1 commit 47d933c

File tree

4 files changed: +115 -0 lines changed


jax/_src/pallas/mosaic_gpu/primitives.py

Lines changed: 83 additions & 0 deletions
@@ -3434,3 +3434,86 @@ def query_cluster_cancel(
       grid_names=grid_names,
       transforms_tree=result_transforms_tree)
   return tuple(result[:-1]), result[-1]
+
+
+multimem_store_p = jax_core.Primitive("multimem_store")
+multimem_store_p.multiple_results = True
+
+
+def multimem_store(source: jax.Array, ref: _Ref, collective_axes: Hashable | tuple[Hashable, ...]):
+  """Stores the value to ref on all devices present in collective_axes.
+
+  The store is done using multimem instructions, meaning that the data is
+  only transferred to the switch once and then broadcast to all other
+  devices from there.
+
+  Args:
+    source: The value to store.
+    ref: The GMEM reference to store the value to.
+    collective_axes: The JAX mesh axes indicating the devices to store to.
+  """
+  if isinstance(ref, pallas_core.TransformedRef):
+    transforms_leaves, transforms_tree = jax.tree.flatten(ref.transforms)
+    ref = ref.ref
+  else:
+    transforms_leaves, transforms_tree = [], None
+  multimem_store_p.bind(
+      source,
+      ref,
+      *transforms_leaves,
+      collective_axes=collective_axes,
+      transforms_tree=transforms_tree,
+  )
+
+
+@multimem_store_p.def_effectful_abstract_eval
+def _multimem_store_abstract_eval(source, ref, *transforms_leaves, transforms_tree, **_):
+  _check_ref(ref, "ref", gpu_core.GMEM)
+  shape, dtype = ref.shape, ref.dtype
+  if transforms_tree is not None:
+    transforms = jax.tree.unflatten(transforms_tree, transforms_leaves)
+    for t in transforms:
+      shape = t.transform_shape(shape)
+      dtype = t.transform_dtype(dtype)
+  if source.dtype != dtype:
+    raise ValueError(f"Value dtype {source.dtype} does not match ref dtype {dtype}")
+  if source.shape != shape:
+    raise ValueError(f"Value shape {source.shape} does not match ref shape {shape}")
+  return [], {pallas_core.comms_effect}
+
+
+@lowering.register_lowering_rule(multimem_store_p, mgpu.LoweringSemantics.Lane)
+def _multimem_store_lowering_rule(
+    ctx: lowering.LoweringRuleContext, value, local_ref, *transforms_leaves, transforms_tree, collective_axes,
+):
+  if (mesh_info := ctx.module_ctx.mesh_info) is None:
+    raise ValueError(
+        "JAX device mesh is required by multimem_store, but not defined."
+    )
+  if set(collective_axes) != set(mesh_info.axis_names):
+    raise NotImplementedError(
+        "Only collective_axes that include all JAX device mesh"
+        f" ({mesh_info.axis_names}) axes are supported, but got"
+        f" {collective_axes}"
+    )
+  if not isinstance(value, mgpu.FragmentedArray):
+    raise TypeError(f"Can only store arrays (got {value}).")
+  if transforms_tree is not None:
+    transforms = tree_util.tree_unflatten(transforms_tree, transforms_leaves)
+    local_ref, transforms = lowering._handle_transforms(
+        ctx, local_ref, transforms, allow_peer_refs=False
+    )
+    if transforms:
+      raise NotImplementedError(
+          f"Unhandled transforms for multimem_store: {transforms}"
+      )
+  multi_ref = ctx.launch_ctx.to_remote_multicast(local_ref)
+  if not ctx.avals_in[0].shape:
+    multi_ref.store(lowering._ensure_ir_value(value, ctx.avals_out[0].dtype), [])
+  else:
+    value.store_untiled(multi_ref, optimized=False)
+  if ctx.module_ctx.auto_barriers:
+    mgpu.warpgroup_barrier()  # Make sure the writes have completed.
+  return ()
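
For orientation, the snippet below condenses how the new primitive is meant to be called from a kernel. It is a sketch distilled from the distributed test added later in this commit; the mesh axis name 'x', the (128, 128) int32 shape, and the semaphore-based synchronization are illustrative assumptions taken from that test, not requirements of the API.

import jax.numpy as jnp
from jax import lax
from jax.experimental import pallas as pl
from jax.experimental.pallas import mosaic_gpu as plgpu


def kernel(y_ref, sem):
  # Only one device along the 'x' mesh axis issues the store; the multimem
  # instruction broadcasts the data into y_ref on every device in that axis.
  @pl.when(lax.axis_index('x') == 0)
  def _store():
    value = plgpu.layout_cast(
        lax.broadcasted_iota(jnp.int32, (128, 128), 1), plgpu.Layout.WGMMA)
    plgpu.multimem_store(value, y_ref, 'x')
  # The receiving device waits until the broadcast has landed before the
  # kernel finishes (assumes two devices along 'x', as in the test).
  pl.semaphore_signal(sem, 1, device_id=1 - lax.axis_index('x'))
  pl.semaphore_wait(sem)

The full pallas_call and shard_map setup, plus verification via process_allgather, appears in the new test in tests/pallas/gpu_pallas_distributed_test.py below.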

jax/experimental/pallas/mosaic_gpu.py

Lines changed: 1 addition & 0 deletions
@@ -67,6 +67,7 @@
 from jax._src.pallas.mosaic_gpu.primitives import copy_smem_to_gmem as copy_smem_to_gmem
 from jax._src.pallas.mosaic_gpu.primitives import inline_mgpu as inline_mgpu
 from jax._src.pallas.mosaic_gpu.primitives import load as load
+from jax._src.pallas.mosaic_gpu.primitives import multimem_store as multimem_store
 from jax._src.pallas.mosaic_gpu.primitives import print_layout as print_layout
 from jax._src.pallas.mosaic_gpu.primitives import query_cluster_cancel as query_cluster_cancel
 from jax._src.pallas.mosaic_gpu.primitives import RefType as RefType
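
With this re-export, the primitive is reachable through the public alias used in the tests; a trivial sketch of the import path (nothing here beyond what the diff adds):

from jax.experimental.pallas import mosaic_gpu as plgpu

# plgpu.multimem_store is now part of the public Pallas:MGPU surface,
# alongside copy_smem_to_gmem, load, and the other re-exported primitives.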

tests/pallas/gpu_pallas_distributed_test.py

Lines changed: 30 additions & 0 deletions
@@ -22,6 +22,7 @@
 from jax import lax
 from jax._src import test_util as jtu
 from jax._src import test_multiprocess as jt_multiprocess
+from jax.experimental import multihost_utils
 from jax.experimental import pallas as pl
 from jax.experimental import shard_map
 from jax.experimental.pallas import mosaic_gpu as plgpu
@@ -229,6 +230,35 @@ def kernel(y_ref, sem):
     with self.assertRaisesRegex(NotImplementedError, msg):
       f()

+  def test_multimem_store(self):
+    if jax.process_index() > 2:
+      return  # Only 2 processes needed.
+
+    def kernel(y_ref, sem):
+      @pl.when(lax.axis_index('x') == 0)
+      def _store():
+        output = plgpu.layout_cast(lax.broadcasted_iota(jnp.int32, (128, 128), 1), plgpu.Layout.WGMMA)
+        plgpu.multimem_store(output, y_ref, 'x')
+      other_dev_id = 1 - lax.axis_index('x')
+      pl.semaphore_signal(sem, 1, device_id=other_dev_id)
+      pl.semaphore_wait(sem)
+
+    kernel_call = pl.pallas_call(
+        kernel,
+        out_specs=pl.BlockSpec(memory_space=plgpu.GMEM),
+        out_shape=jax.ShapeDtypeStruct((128, 128), jnp.int32),
+        scratch_shapes=[plgpu.SemaphoreType.REGULAR],
+    )
+    mesh = jax.sharding.Mesh(jax.devices(), ['x'])
+    y = jax.jit(
+        shard_map.shard_map(
+            kernel_call, mesh, in_specs=(), out_specs=P("x"), check_rep=False,
+        )
+    )()
+    y = multihost_utils.process_allgather(y, tiled=True)
+    ref = lax.broadcasted_iota(jnp.int32, (128, 128), 1)
+    np.testing.assert_array_equal(y, np.concat([ref, ref], axis=0))
+

 if __name__ == '__main__':
   # This test doesn't work with the platform allocator, so we override it

tests/pallas/mosaic_gpu_test.py

Lines changed: 1 addition & 0 deletions
@@ -2603,6 +2603,7 @@ def test_missing_primitive_lowerings_are_tracked(self):
         mgpu_primitives.semaphore_signal_parallel_p,
         mgpu_primitives.try_cluster_cancel_p,
         mgpu_primitives.query_cluster_cancel_p,
+        mgpu_primitives.multimem_store_p,
         lax.slice_p,
         lax.iota_p,
         pallas_core.core_map_p,
