[JAX] inspect_array: thread probe name through FFI to per-probe filenames

tdophung · tdophung · commit b697a866234c · 2026-06-04T16:30:08.000-07:00
Previously the FFI hardcoded the output path to my_tensor_gpu{N}.bin and
ignored the `name` argument on the Python side, so every probe call in
a program overwrote the same files; the only surviving on-disk dumps
were whichever probe happened to fire last per rank. That made
multi-probe debugging (e.g. wiring TE_MOE_INSPECT through several fwd
and bwd steps of an MoE block) impossible to do offline -- only the
live printf log could be correlated, and only by shape/dtype.

Pass `name` through as an XLA FFI string attribute. On the C++ side it
is sanitised to a POSIX-safe filename component (characters in
`[A-Za-z0-9._-]` are preserved, everything else is mapped to `_`, and an
empty name becomes `anon`) and used as a suffix:

  my_tensor_gpu{device}_{sanitized_name}.bin
  my_tensor_gpu{device}_{sanitized_name}_meta.json

The unsanitised name is echoed verbatim in the JSON metadata and in the
printed log line so probe identity survives the rename.

On the Python side `name` is carried as a custom_vjp nondiff arg, threaded
into the InspectPrimitive bind as a static kwarg, and surfaced through
abstract / lowering / impl / partition / shardy_sharding_rule.
diff --git a/transformer_engine/jax/csrc/extensions/inspect.cpp b/transformer_engine/jax/csrc/extensions/inspect.cpp
@@ -5,18 +5,43 @@
  ************************************************************************/
 #include <cuda_runtime.h>
 
+#include <algorithm>
 #include <fstream>
 #include <iostream>
+#include <string>
+#include <string_view>
 
 #include "../extensions.h"
 #include "xla/ffi/api/c_api.h"
 
 namespace transformer_engine {
 namespace jax {
 
+// Sanitize a probe name for use as a filename component: every character
+// outside [A-Za-z0-9._-] is replaced with '_', so a name such as
+// "fwd/sparse_probs_after_fused_topk" becomes
+// "fwd_sparse_probs_after_fused_topk". An empty name yields "anon".
+// Distinct names can collide after sanitization (e.g. "a/b" and "a_b").
+static std::string SanitizeProbeName(std::string_view name) {
+  std::string out;
+  out.reserve(name.size());
+  for (char c : name) {
+    if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '.' ||
+        c == '_' || c == '-') {
+      out.push_back(c);
+    } else {
+      out.push_back('_');
+    }
+  }
+  if (out.empty()) {
+    out = "anon";
+  }
+  return out;
+}
+
 Error_Type InspectFFI(cudaStream_t stream, Buffer_Type input_buf, Buffer_Type min_buf,
                       Buffer_Type max_buf, Buffer_Type mean_buf, Buffer_Type std_buf,
-                      Result_Type output_buf) {
+                      Result_Type output_buf, std::string_view name) {
   NVTE_CHECK(input_buf.untyped_data() != nullptr, "Input must be provided for inspect operation");
   NVTE_CHECK(output_buf->untyped_data() != nullptr,
              "Output must be provided for inspect operation");
@@ -42,18 +67,25 @@ Error_Type InspectFFI(cudaStream_t stream, Buffer_Type input_buf, Buffer_Type mi
   int device;
   NVTE_CHECK_CUDA(cudaGetDevice(&device));
 
-  // Write the tensor data to a file as a binary blob
-  std::string filename = "my_tensor_gpu" + std::to_string(device) + ".bin";
+  // Per-probe filenames: my_tensor_gpu{device}_{sanitized_name}.bin /
+  // ..._meta.json. With distinct names, the on-disk dumps survive across
+  // probes instead of being overwritten on every call, so a single test
+  // run produces one .bin per probe per rank ready for offline analysis.
+  std::string safe_name = SanitizeProbeName(name);
+  std::string device_str = std::to_string(device);
+  std::string filename = "my_tensor_gpu" + device_str + "_" + safe_name + ".bin";
   std::ofstream file(filename, std::ios::binary);
   NVTE_CHECK(file.is_open(), "Failed to create file: ", filename);
   file.write(reinterpret_cast<const char *>(input_data.data()), input_data.size());
   file.close();
 
-  // Write out a metadata file
-  std::string meta_filename = "my_tensor_gpu" + std::to_string(device) + "_meta.json";
+  std::string meta_filename = "my_tensor_gpu" + device_str + "_" + safe_name + "_meta.json";
   std::ofstream meta_file(meta_filename);
   NVTE_CHECK(meta_file.is_open(), "Failed to create file: ", meta_filename);
   meta_file << "{";
+  // Echo the original (un-sanitized) probe name so analysis tools can recover the label.
+  // NOTE(review): written without JSON escaping; quotes or backslashes in it break the file.
+  meta_file << "\"name\": \"" << name << "\", ";
   meta_file << "\"shape\": [";
   for (size_t i = 0; i < input_buf.dimensions().size(); ++i) {
     meta_file << input_buf.dimensions()[i];
@@ -70,8 +102,11 @@ Error_Type InspectFFI(cudaStream_t stream, Buffer_Type input_buf, Buffer_Type mi
   meta_file << "}";
   meta_file.close();
 
-  // Log the tensor metadata to the console
-  printf("[gpu%d]: Tensor data written to %s (shape: [", device, filename.c_str());
+  // Surface the probe name in the live log alongside the file path, so
+  // analysing a multi-probe trace doesn't require correlating by
+  // shape/dtype guesswork.
+  printf("[gpu%d %.*s]: written to %s (shape: [", device, static_cast<int>(name.size()),
+         name.data(), filename.c_str());
   for (size_t i = 0; i < input_buf.dimensions().size(); ++i) {
     printf("%zu", static_cast<size_t>(input_buf.dimensions()[i]));
     if (i < input_buf.dimensions().size() - 1) {
@@ -86,13 +121,14 @@ Error_Type InspectFFI(cudaStream_t stream, Buffer_Type input_buf, Buffer_Type mi
 
 XLA_FFI_DEFINE_HANDLER_SYMBOL(InspectHandler, InspectFFI,
                               FFI::Bind()
-                                  .Ctx<FFI_Stream_Type>()  // stream
-                                  .Arg<Buffer_Type>()      // input
-                                  .Arg<Buffer_Type>()      // min
-                                  .Arg<Buffer_Type>()      // max
-                                  .Arg<Buffer_Type>()      // mean
-                                  .Arg<Buffer_Type>()      // std
-                                  .Ret<Buffer_Type>()      // output
+                                  .Ctx<FFI_Stream_Type>()           // stream
+                                  .Arg<Buffer_Type>()               // input
+                                  .Arg<Buffer_Type>()               // min
+                                  .Arg<Buffer_Type>()               // max
+                                  .Arg<Buffer_Type>()               // mean
+                                  .Arg<Buffer_Type>()               // std
+                                  .Ret<Buffer_Type>()               // output
+                                  .Attr<std::string_view>("name")   // probe name
 );
 
 }  // namespace jax
diff --git a/transformer_engine/jax/debug/experimental/inspect.py b/transformer_engine/jax/debug/experimental/inspect.py
@@ -33,10 +33,13 @@ def abstract(
         x_max_aval,
         x_mean_aval,
         x_std_aval,
+        *,
+        name,
     ):
         """
         inspect abstract
         """
+        del name
         assert (
             x_min_aval.shape == () and x_min_aval.dtype == jnp.float32
         ), "x_min must be a scalar with dtype float32"
@@ -59,6 +62,8 @@ def lowering(
         x_max,
         x_mean,
         x_std,
+        *,
+        name,
     ):
         """
         inspect lowering rules
@@ -74,6 +79,7 @@ def lowering(
             x_max,
             x_mean,
             x_std,
+            name=name,
         )
 
     @staticmethod
@@ -83,6 +89,8 @@ def impl(
         x_max,
         x_mean,
         x_std,
+        *,
+        name,
     ):
         """
         inspect implementation
@@ -94,11 +102,12 @@ def impl(
             x_max,
             x_mean,
             x_std,
+            name=name,
         )
         return x
 
     @staticmethod
-    def partition(mesh, arg_infos, result_infos):
+    def partition(mesh, arg_infos, result_infos, *, name):
         """
         Identity in sharding: the output carries the same sharding as ``x``;
         the four scalar stats (x_min, x_max, x_mean, x_std) are fully
@@ -119,25 +128,26 @@ def partition(mesh, arg_infos, result_infos):
         out_sharding = x_sharding
 
         def sharded_impl(x, x_min, x_max, x_mean, x_std):
-            return InspectPrimitive.impl(x, x_min, x_max, x_mean, x_std)
+            return InspectPrimitive.impl(x, x_min, x_max, x_mean, x_std, name=name)
 
         return mesh, sharded_impl, out_sharding, arg_shardings
 
     @staticmethod
-    def shardy_sharding_rule(*args):
+    def shardy_sharding_rule(*args, **kwargs):
         """
         Five operands, one output. ``x`` and the output carry the same
         wildcard rank; the four scalar stats are rank-0 (empty operand
-        entries between commas).
+        entries between commas). The ``name`` keyword attribute does not
+        participate in the rule.
         """
-        del args
+        del args, kwargs
         return "..., , , , -> ..."
 
 
 register_primitive(InspectPrimitive)
 
 
-def _inspect_array_inner(x: jnp.ndarray) -> jnp.ndarray:
+def _inspect_array_inner(x: jnp.ndarray, name: str) -> jnp.ndarray:
     assert InspectPrimitive.outer_primitive is not None, (
         "InspectPrimitive FFI is not registered. Please ensure the C++ extension is properly built"
         " and registered."
@@ -148,50 +158,62 @@ def _inspect_array_inner(x: jnp.ndarray) -> jnp.ndarray:
         jnp.max(x).astype(jnp.float32),
         jnp.mean(x.astype(jnp.float32)),
         jnp.std(x.astype(jnp.float32)),
+        name=name,
     )
 
 
-@partial(jax.custom_vjp, nondiff_argnums=())
-def _inspect(
-    x,
-):
+# ``name`` is a Python string and must not be traced through jax — it is
+# carried as a custom_vjp nondiff argument so it stays static at compile
+# time, threads into the primitive bind as a kwarg, and lands on the
+# FFI as a string attribute.
+@partial(jax.custom_vjp, nondiff_argnums=(1,))
+def _inspect(x, name):
     """ """
-    output, _ = _inspect_fwd_rule(
-        x,
-    )
+    output, _ = _inspect_fwd_rule(x, name)
     return output
 
 
-def _inspect_fwd_rule(
-    x,
-):
+def _inspect_fwd_rule(x, name):
     """"""
     ctx = ()
-    x = _inspect_array_inner(x)
+    x = _inspect_array_inner(x, name)
     return x, ctx
 
 
-def _inspect_bwd_rule(
-    ctx,
-    grad,
-):
+def _inspect_bwd_rule(name, ctx, grad):
     """"""
-    del ctx
+    del name, ctx
     return (grad,)
 
 
 _inspect.defvjp(_inspect_fwd_rule, _inspect_bwd_rule)
 
 
 def inspect_array(x: jnp.ndarray, name: str) -> jnp.ndarray:
-    """Utility function to inspect JAX arrays by printing their name, shape, dtype, and statistics.
+    """Inspect a JAX array by dumping its data and stats to disk per-rank.
+
+    On every call the FFI synchronises the input device buffer to host
+    and writes two files per rank, **keyed by ``name``** so multiple
+    probes in the same program produce distinct dumps:
+
+    * ``my_tensor_gpu{device}_{sanitized_name}.bin``      – raw bytes.
+    * ``my_tensor_gpu{device}_{sanitized_name}_meta.json`` – ``name``,
+      shape, dtype, and min/max/mean/std summary stats.
+
+    A line is also printed to stdout including the probe ``name`` so
+    multi-probe traces are easy to follow in a live log.
+
+    ``name`` is treated as a static (non-traced) attribute, so the same
+    probe name must be passed in every (re-)trace of an enclosing
+    ``jax.jit``; characters outside ``[A-Za-z0-9._-]`` are mapped to
+    ``_`` when forming the filename, but the unsanitised name is echoed
+    verbatim in the JSON metadata and the printed log line.
 
     Args:
         x (jnp.ndarray): The JAX array to inspect.
-        name (str): The name of the array for identification in the output.
+        name (str): Identifier for this probe; used in filenames and logs.
     """
-    del name  # Name is currently unused, but can be included in the future for more informative output
-    return _inspect(x)
+    return _inspect(x, name)
 
 
 def load_array_dump(filename: str, shape: tuple, dtype: jnp.dtype) -> jnp.ndarray: