Commit 49ba0b9

[JAX SC] perf: Refactor COO grouping key and add optimization for constant weights.

~2x improvement from avoiding random memory accesses.

The 64-bit grouping key now uses the lower bits to store either the original index (if variable weights are present) or the row_id (if weights are constant). This allows the grouping pass to skip the memory lookup into the `CooFormat` array when feature weights are always `1.0`, improving performance. The key unpacking logic is updated to use new static helper functions.

PiperOrigin-RevId: 836364065
1 parent 8b38bd6 commit 49ba0b9
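
Note on the layout: the commit packs three fields into one 64-bit key, and only the meaning of the low 26 bits changes. Below is a minimal, self-contained C++ sketch of that packing; the constants and names here are illustrative stand-ins, not the library code (the real constants and helpers are in the coo_format.h diff below).

// Illustrative sketch of the 64-bit grouping-key layout; assumes
// 6 minibatching-bucket bits, matching the layout comment in coo_format.h.
#include <cstdint>
#include <cstdio>

constexpr uint32_t kBucketBits = 6;               // bucket_id at [63:58]
constexpr uint32_t kDataBits = 32 - kBucketBits;  // index or row_id at [25:0]
constexpr uint32_t kDataMask = (1u << kDataBits) - 1;
constexpr uint32_t kColIdOffset = kDataBits;      // rotated col_id at [57:26]
constexpr uint32_t kBucketOffset = kColIdOffset + 32;

uint64_t PackKey(uint32_t bucket_id, uint32_t rotated_col_id, uint32_t data) {
  return (uint64_t{bucket_id} << kBucketOffset) |
         (uint64_t{rotated_col_id} << kColIdOffset) |
         uint64_t{data & kDataMask};
}

int main() {
  // With constant weights, `data` is the row_id itself, so a consumer of the
  // sorted keys never has to gather from the original CooFormat array.
  const uint64_t key = PackKey(/*bucket_id=*/3, /*rotated_col_id=*/0xABCD1234u,
                               /*data=*/42);
  std::printf("bucket=%u col=0x%X data=%u\n",
              static_cast<unsigned>(key >> kBucketOffset),
              static_cast<unsigned>((key >> kColIdOffset) & 0xFFFFFFFFu),
              static_cast<unsigned>(key & kDataMask));
}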

7 files changed: +122 −41 lines changed

jax_tpu_embedding/sparsecore/lib/core/abstract_input_batch.h

Lines changed: 3 additions & 1 deletion

@@ -49,7 +49,9 @@ class AbstractInputBatch {
   // Return the batch size or the number of samples in this input batch.
   virtual ssize_t size() const = 0;

-  // Extract COO Tensors.
+  // Returns true if the input batch has variable weights.
+  virtual bool HasVariableWeights() const { return true; }
+
   virtual void ExtractCooTensors(
       const ExtractCooTensorsOptions& options,
       ExtractedCooTensors& extracted_coo_tensors) = 0;
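
For contrast with the two `false` overrides later in this commit (ragged_tensor_input_batch.h, sparse_coo_input_batch.h), a hypothetical subclass that carries real per-id weights simply keeps the conservative default. Sketch only, assuming abstract_input_batch.h and its option types are in scope; `MyWeightedInputBatch` is not part of the library.

// Hypothetical input batch with genuine per-id weights: no override needed,
// since the base-class default of `true` selects the always-correct
// variable-weight path.
class MyWeightedInputBatch : public AbstractInputBatch {
 public:
  ssize_t size() const override { return batch_size_; }

  // HasVariableWeights() intentionally not overridden; the default is true.

  void ExtractCooTensors(const ExtractCooTensorsOptions& options,
                         ExtractedCooTensors& extracted_coo_tensors) override {
    // ... emit (row_id, col_id, gain) triples where gain may differ from 1.0 ...
  }

 private:
  ssize_t batch_size_ = 0;
};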

jax_tpu_embedding/sparsecore/lib/core/coo_format.h

Lines changed: 27 additions & 11 deletions

@@ -58,10 +58,14 @@ struct CooFormat {
   // Bits taken by minibatching bucket ID.
   static constexpr uint32_t kMinibatchingBucketBits =
       absl::bit_width(kMaxMinibatchingBuckets - 1);
-  // Bits for Index
-  static constexpr uint32_t kIndexBits = 32 - kMinibatchingBucketBits;
-  // Index Mask
-  static constexpr uint32_t kIndexMask = (1 << kIndexBits) - 1;
+  // Bits for variable data (index or row_id).
+  static constexpr uint32_t kDataBits = 32 - kMinibatchingBucketBits;
+  // Mask for variable data (index or row_id).
+  static constexpr uint32_t kDataMask = (1 << kDataBits) - 1;
+  // Bit offset for rotated_col_id in grouping key.
+  static constexpr uint32_t kRotatedColIdOffset = kDataBits;
+  // Bit offset for bucket_id in grouping key.
+  static constexpr uint32_t kBucketIdOffset = kRotatedColIdOffset + 32;

   // A deterministic hash function eventually used to compute mini-batching
   // bucket id as `hash(col_id) % bucket_count`.

@@ -136,24 +140,36 @@ struct CooFormat {
   // Computes a 64-bit sorting key with the following layout:
   // [63:58] bucket_id (6 bits)
   // [57:26] {global_sc_id, local_embedding_id} (32 bits) <- rotated col_id
-  // [25:0] index (26 bits)
+  // [25:0] index or row_id (26 bits)
   // The key is used to group and sort COO tensors for efficient processing.
   uint64_t GetGroupingKey(const uint32_t num_scs_bit, const int index,
-                          const bool create_buckets = false,
-                          HashFn hash_fn = HighwayHash) const {
+                          const bool create_buckets,
+                          HashFn hash_fn = HighwayHash,
+                          const bool has_variable_weights = true) const {
     // This structure ensures tensors are sorted first by bucket_id, then by
     // sparse core, and finally by embedding ID.
     const uint32_t bucket_id = create_buckets ? GetBucketId(hash_fn) : 0;

-    DCHECK_LE(index, kIndexMask);
+    const uint32_t data = has_variable_weights ? index : row_id;
+    DCHECK_LE(data, kDataMask);

     // [global_sc_id, local_embedding_id]
    uint32_t rotated_col_id =
        absl::rotr(static_cast<uint32_t>(col_id), num_scs_bit);

-    return (uint64_t{bucket_id} << (64 - kMinibatchingBucketBits)) |
-           (uint64_t{rotated_col_id} << (32 - kMinibatchingBucketBits)) |
-           static_cast<uint64_t>(index);
+    return (uint64_t{bucket_id} << kBucketIdOffset) |
+           (uint64_t{rotated_col_id} << kRotatedColIdOffset) |
+           static_cast<uint64_t>(data);
+  }
+
+  static uint32_t GetDataFromKey(uint64_t key) { return key & kDataMask; }
+
+  static uint32_t GetRotatedColIdFromKey(uint64_t key) {
+    return (key >> kRotatedColIdOffset) & 0xFFFFFFFF;
+  }
+
+  static uint32_t GetBucketIdFromKey(uint64_t key) {
+    return key >> kBucketIdOffset;
   }
 };
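
A round-trip sketch of the new helpers, assuming coo_format.h is included, that `CooFormat` is constructible as `(row_id, col_id, gain)` (as the grouping code below does), and that `HighwayHash` is reachable under this qualification; treat the names and qualifications as assumptions, not the library's documented API.

#include <cstdint>
#include "absl/numeric/bits.h"
#include "jax_tpu_embedding/sparsecore/lib/core/coo_format.h"

void RoundTripExample() {
  const uint32_t num_sc_bits = 2;  // hypothetical: 4 global SparseCores
  const CooFormat coo(/*row_id=*/7, /*col_id=*/123, /*gain=*/1.0f);

  // Constant-weight path: has_variable_weights = false stores row_id in the
  // low bits instead of the tensor's position in the COO array.
  const uint64_t key =
      coo.GetGroupingKey(num_sc_bits, /*index=*/0, /*create_buckets=*/false,
                         /*hash_fn=*/CooFormat::HighwayHash,
                         /*has_variable_weights=*/false);

  // Unpacking touches only the key; no lookup into the CooFormat array.
  const uint32_t row_id = CooFormat::GetDataFromKey(key);  // == 7
  const uint32_t col_id =
      absl::rotl(CooFormat::GetRotatedColIdFromKey(key), num_sc_bits);  // == 123
  const uint32_t bucket_id = CooFormat::GetBucketIdFromKey(key);        // == 0
  (void)row_id; (void)col_id; (void)bucket_id;
}

This is the same decode sequence the grouping loop in sort_and_group_coo_tensors_impl.h (below) performs per key.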

jax_tpu_embedding/sparsecore/lib/core/input_preprocessing.cc

Lines changed: 52 additions & 11 deletions

@@ -113,12 +113,34 @@ void CheckDeviceBatchSize(int batch_size_for_device, int num_sc_per_device,
       batch_size_for_device, stacked_table_name, num_sc_per_device);
 }

+// We consider a stack to have variable weights if any feature in the stack
+// has explicitly variable weights or if any feature uses a row combiner
+// other than 'sum' (e.g., 'mean' or 'sqrtn').
+bool StackHasVariableWeights(
+    absl::Span<std::unique_ptr<AbstractInputBatch>> input_batches,
+    absl::Span<const StackedTableMetadata> stacked_table_metadata) {
+  for (const auto& metadata : stacked_table_metadata) {
+    // `kHasVariableWeights` must be true if any feature in the stack:
+    //   1. Is explicitly marked as having variable weights.
+    //   2. Uses a row combiner other than 'sum'. Non-'sum' combiners (e.g.,
+    //      'mean', 'sqrtn') adjust gains during `ExtractCooTensors`. This
+    //      means the gains in `coo_tensors` are not always 1.0, even with
+    //      unity input weights.
+    if (input_batches[metadata.feature_index]->HasVariableWeights() ||
+        metadata.row_combiner != RowCombiner::kSum) {
+      return true;
+    }
+  }
+  return false;
+}
+
 // Holds the state for processing a single stacked table across all local
 // devices. This includes extracted COO tensors, partitioned COO tensors,
 // CSR arrays, and statistics.
 struct TableState {
   absl::string_view stacked_table_name;
   absl::Span<const StackedTableMetadata> stacked_table_metadata;
+  bool has_variable_weights;
   int coo_buffer_size_per_device;
   CsrArraysPerHost csr_arrays_per_host;
   StatsPerHost stats_per_host;

@@ -131,10 +153,12 @@

   TableState(absl::string_view name,
              absl::Span<const StackedTableMetadata> metadata,
+             bool has_variable_weights,
              const PreprocessSparseDenseMatmulInputOptions& options,
              int num_scs, int row_pointers_size_per_bucket)
       : stacked_table_name(name),
         stacked_table_metadata(metadata),
+        has_variable_weights(has_variable_weights),
         coo_buffer_size_per_device(ComputeCooBufferSizePerDevice(
             num_scs, options.num_sc_per_device, metadata, options.batch_number,
             options.enable_minibatching)),

@@ -154,6 +178,24 @@
   }
 };

+template <typename SplitType>
+void SortAndGroupCooTensorsForTableState(
+    TableState& state, int local_device,
+    const PreprocessSparseDenseMatmulInputOptions& options,
+    internal::StatsPerDevice& stats, SplitType& split) {
+  if (state.has_variable_weights) {
+    state.partitioned_coo_tensors_per_device[local_device] =
+        SortAndGroupCooTensorsPerLocalDevice<true>(
+            state.extracted_coo_tensors_per_device[local_device],
+            state.stacked_table_metadata[0], options, stats, split);
+  } else {
+    state.partitioned_coo_tensors_per_device[local_device] =
+        SortAndGroupCooTensorsPerLocalDevice<false>(
+            state.extracted_coo_tensors_per_device[local_device],
+            state.stacked_table_metadata[0], options, stats, split);
+  }
+}
+
 // Extracts, sorts, and groups COO tensors for a single stacked table across
 // all local devices. This function populates
 // `state.extracted_coo_tensors_per_device` and

@@ -180,11 +222,9 @@ void ExtractSortAndGroupCooTensorsForTable(

     internal::StatsPerDevice stats_per_device =
         state.stats_per_host.GetStatsPerDevice(local_device);
-    state.partitioned_coo_tensors_per_device[local_device] =
-        SortAndGroupCooTensorsPerLocalDevice(
-            state.extracted_coo_tensors_per_device[local_device],
-            state.stacked_table_metadata[0], options, stats_per_device,
-            state.table_minibatching_required);
+    SortAndGroupCooTensorsForTableState(
+        state, local_device, options, stats_per_device,
+        state.table_minibatching_required);
     state.dropped_id_count_per_device[local_device] =
         stats_per_device.dropped_id_count;
     counter.DecrementCount();

@@ -230,11 +270,9 @@ void CreateMinibatchingBucketsForTable(
                                  options.num_sc_per_device);
     internal::StatsPerDevice dummy_stats =
         dummy_stats_host.GetStatsPerDevice(0);
-    state.partitioned_coo_tensors_per_device[local_device] =
-        SortAndGroupCooTensorsPerLocalDevice(
-            state.extracted_coo_tensors_per_device[local_device],
-            state.stacked_table_metadata[0], options, dummy_stats,
-            state.table_minibatching_split);
+    SortAndGroupCooTensorsForTableState(state, local_device, options,
+                                        dummy_stats,
+                                        state.table_minibatching_split);
     state.dropped_id_count_per_device[local_device] =
         dummy_stats.dropped_id_count;
     counter.DecrementCount();

@@ -538,8 +576,11 @@ PreprocessSparseDenseMatmulInput(
   table_states.reserve(stacked_tables.size());
   for (const auto& [stacked_table_name, stacked_table_metadata] :
        stacked_tables) {
+    const bool stack_has_weights =
+        StackHasVariableWeights(input_batches, stacked_table_metadata);
     table_states.emplace_back(stacked_table_name, stacked_table_metadata,
-                              options, num_scs, row_pointers_size_per_bucket);
+                              stack_has_weights, options, num_scs,
+                              row_pointers_size_per_bucket);
   }

   // Stage 1: COO Extraction and Initial Sort/Group
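
The `SortAndGroupCooTensorsForTableState` helper above is a standard runtime-to-compile-time dispatch: the `has_variable_weights` flag is branched once per device call, and each branch instantiates a distinct specialization so the per-tensor loop can use `if constexpr` at zero runtime cost. A self-contained sketch of the idiom, with illustrative names rather than the library's:

#include <cstdio>
#include <vector>

template <bool kHasVariableWeights>
int ProcessKeys(const std::vector<int>& keys) {
  int acc = 0;
  for (int key : keys) {
    if constexpr (kHasVariableWeights) {
      acc += key * 2;  // stand-in for the "gather from CooFormat array" path
    } else {
      acc += key;      // stand-in for the cheaper constant-weight path
    }
  }
  return acc;
}

int Dispatch(bool has_variable_weights, const std::vector<int>& keys) {
  // One branch outside the hot loop selects the specialization.
  return has_variable_weights ? ProcessKeys<true>(keys)
                              : ProcessKeys<false>(keys);
}

int main() {
  const std::vector<int> keys = {1, 2, 3};
  std::printf("%d %d\n", Dispatch(true, keys), Dispatch(false, keys));
}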

jax_tpu_embedding/sparsecore/lib/core/input_preprocessing_test.cc

Lines changed: 1 addition & 1 deletion

@@ -608,7 +608,7 @@ TEST_P(MinibatchingTest, KeysAreSorted) {
 TEST_P(MinibatchingTest, IndexFromKeyIsCorrect) {
   std::vector<uint64_t> keys = GenerateGroupingKeys();
   for (int i = 0; i < keys.size(); ++i) {
-    EXPECT_EQ(keys[i] & CooFormat::kIndexMask, i);
+    EXPECT_EQ(CooFormat::GetDataFromKey(keys[i]), i);
   }
 }

jax_tpu_embedding/sparsecore/lib/core/ragged_tensor_input_batch.h

Lines changed: 3 additions & 0 deletions

@@ -75,6 +75,9 @@ class RaggedTensorInputBatch : public AbstractInputBatch {
         max_vocab_id_(max_vocab_id) {}

   int64_t size() const override { return row_offsets_.size() - 1; }
+
+  bool HasVariableWeights() const override { return false; }
+
   void ExtractCooTensors(const ExtractCooTensorsOptions& options,
                          ExtractedCooTensors& coo_tensors) override {
     SparseCsrInputBatchStream<int64_t, EmbeddingIdsView, RowOffsetsView>

jax_tpu_embedding/sparsecore/lib/core/sort_and_group_coo_tensors_impl.h

Lines changed: 34 additions & 17 deletions

@@ -84,13 +84,13 @@ inline void ValidateMaxIdsOrDie(
 }

 inline void ValidateKeyCapacity(const int local_sc_id, const int key_count) {
-  // Index = 0 to kIndexMask giving us a count of kIndexMask + 1.
-  if (key_count > 1 + CooFormat::kIndexMask) {
+  // Index = 0 to kDataMask giving us a count of kDataMask + 1.
+  if (key_count > 1 + CooFormat::kDataMask) {
     LOG(ERROR) << absl::StrFormat(
         "Too many tensors for SparseCore #%d: got %d, limit: "
         "%d. Preprocessed output may not be reliable and cause undefined "
         "behavior.",
-        local_sc_id, key_count, CooFormat::kIndexMask);
+        local_sc_id, key_count, CooFormat::kDataMask);
   }
 }

@@ -177,6 +177,7 @@ struct LocalSparseCoreTensorGroupingContext {
   const PreprocessSparseDenseMatmulInputOptions& options;
   const bool create_buckets;
   const int32_t local_sc_id;
+  const int32_t num_sc_bits;

   // Outputs.
   PartitionedCooTensors& grouped_coo_tensors;

@@ -187,6 +188,7 @@
   MatrixXi& kept_unique_ids_per_partition_per_bucket;
 };

+template <bool kHasVariableWeights>
 inline void GroupAndDeduplicateCooTensorsForLocalSparseCore(
     LocalSparseCoreTensorGroupingContext context) {
   // Unpack context for readability.

@@ -214,17 +216,24 @@ inline void GroupAndDeduplicateCooTensorsForLocalSparseCore(
   // capacity. This decision is sticky for all tensors with the same `col_id`
   // within the same bucket.
   bool dropping_current_unique_col_id = false;
+  const int num_sc_bits = context.num_sc_bits;
   for (const uint64_t key : context.keys) {
     // Step 1: Unpack key to get tensor coordinates.
-    const uint32_t index = key & CooFormat::kIndexMask;
-    const CooFormat& coo_tensor = coo_tensors[index];
-    const uint32_t col_id = coo_tensor.col_id;
-    const uint32_t global_sc_id = coo_tensor.col_id & (global_sc_count - 1);
-    const uint32_t bucket_id =
-        context.create_buckets
-            ? coo_tensor.GetBucketId(options.minibatching_bucketing_hash_fn)
-            : 0;
-    const uint32_t row_id = coo_tensor.row_id;
+    const uint32_t bucket_id = CooFormat::GetBucketIdFromKey(key);
+    const uint32_t col_id =
+        absl::rotl(CooFormat::GetRotatedColIdFromKey(key), num_sc_bits);
+    const uint32_t global_sc_id = col_id & (global_sc_count - 1);
+
+    uint32_t row_id;
+    CooFormat coo_tensor(0, 0, 0.0f);
+    if constexpr (kHasVariableWeights) {
+      const uint32_t index = CooFormat::GetDataFromKey(key);
+      coo_tensor = coo_tensors[index];
+      row_id = coo_tensor.row_id;
+    } else {
+      row_id = CooFormat::GetDataFromKey(key);
+      coo_tensor = CooFormat(row_id, col_id, 1.0f);
+    }

     // Step 2: Handle duplicates.
     // An ID that is a duplicate of a previously non-dropped ID is merged.

@@ -298,7 +307,7 @@
 // NOTE: We use output buffers `max_ids_per_sc`, `max_unique_ids_per_sc`, and
 // `required_buffer_size_per_sc` because we fill values in a loop to a bigger
 // array.
-template <typename SplitType>
+template <bool kHasVariableWeights = true, typename SplitType>
 PartitionedCooTensors SortAndGroupCooTensorsPerLocalDevice(
     const ExtractedCooTensors& extracted_coo_tensors,
     const StackedTableMetadata& stacked_table_metadata,

@@ -364,25 +373,30 @@ PartitionedCooTensors SortAndGroupCooTensorsPerLocalDevice(
          coo_tensors[coo_tensor_index].row_id <
              (local_sc_id + 1) * batch_size_per_sc;
          coo_tensor_index++) {
+      const CooFormat& coo_tensor = coo_tensors[coo_tensor_index];
       // The key here is [bucket_id(6 bits), global_sc_id(num_scs bits),
       // local_embedding_id(32-num_scs bits), index(26 bits)].
       // Note that this assumes `num_scs` is a power of 2.
-      keys.push_back(coo_tensors[coo_tensor_index].GetGroupingKey(
+      keys.push_back(coo_tensor.GetGroupingKey(
           num_sc_bits, coo_tensor_index, create_buckets,
-          options.minibatching_bucketing_hash_fn));
+          options.minibatching_bucketing_hash_fn, kHasVariableWeights));
+      DCHECK(kHasVariableWeights || coo_tensors[coo_tensor_index].gain == 1.0f)
+          << "kHasVariableWeights: " << kHasVariableWeights
+          << ", coo: " << coo_tensor;
     }

     // The expected allocation size may be uninitialized.
     DCHECK(expected_keys_size == 0 || keys.size() == expected_keys_size);
     hwy::VQSort(keys.data(), keys.size(), hwy::SortAscending());

-    internal::GroupAndDeduplicateCooTensorsForLocalSparseCore({
+    const internal::LocalSparseCoreTensorGroupingContext context = {
         .keys = keys,
         .coo_tensors = coo_tensors,
         .stacked_table_metadata = stacked_table_metadata,
         .options = options,
         .create_buckets = create_buckets,
         .local_sc_id = local_sc_id,
+        .num_sc_bits = num_sc_bits,
         .grouped_coo_tensors = grouped_coo_tensors,
         .ids_per_sc_partition_per_bucket = ids_per_sc_partition_per_bucket,
         .unique_ids_per_partition_per_bucket =

@@ -392,7 +406,10 @@
             kept_ids_per_sc_partition_per_bucket,
         .kept_unique_ids_per_partition_per_bucket =
             kept_unique_ids_per_partition_per_bucket,
-    });
+    };
+
+    internal::GroupAndDeduplicateCooTensorsForLocalSparseCore<
+        kHasVariableWeights>(context);

     grouped_coo_tensors.FillRemainingScBuckets();
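
Why this buys ~2x: after `hwy::VQSort`, keys are consumed in sorted order, but in the variable-weight path the low bits index back into `coo_tensors`, which is laid out in extraction (row) order, so nearly every iteration is a dependent random load. In the constant-weight path the low bits already are the row_id and the gain is known to be 1.0, so the loop only reads the sequential key array. A self-contained sketch of the two access patterns (illustrative, not the library code):

#include <cstdint>
#include <cstdio>
#include <vector>

struct Coo { uint32_t row_id; uint32_t col_id; float gain; };

// Variable-weight path: one random gather per key.
uint64_t SumRowIdsWithLookup(const std::vector<uint64_t>& sorted_keys,
                             const std::vector<Coo>& coo_tensors,
                             uint32_t data_mask) {
  uint64_t sum = 0;
  for (uint64_t key : sorted_keys) sum += coo_tensors[key & data_mask].row_id;
  return sum;
}

// Constant-weight path: the row_id is decoded straight from the key.
uint64_t SumRowIdsFromKeys(const std::vector<uint64_t>& sorted_keys,
                           uint32_t data_mask) {
  uint64_t sum = 0;
  for (uint64_t key : sorted_keys) sum += key & data_mask;
  return sum;
}

int main() {
  const uint32_t mask = (1u << 26) - 1;  // 26 data bits, as in coo_format.h
  const std::vector<Coo> coos = {{5, 0, 1.0f}, {9, 0, 1.0f}};
  const std::vector<uint64_t> keys_by_index = {0, 1};  // low bits = index
  const std::vector<uint64_t> keys_by_row = {5, 9};    // low bits = row_id
  std::printf("%llu %llu\n",
              static_cast<unsigned long long>(
                  SumRowIdsWithLookup(keys_by_index, coos, mask)),
              static_cast<unsigned long long>(
                  SumRowIdsFromKeys(keys_by_row, mask)));
}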

jax_tpu_embedding/sparsecore/lib/core/sparse_coo_input_batch.h

Lines changed: 2 additions & 0 deletions

@@ -59,6 +59,8 @@ class PySparseCooInputBatch : public AbstractInputBatch {
   // Returns the number of rows in the current slice.
   int64_t size() const override { return batch_size_; }

+  bool HasVariableWeights() const override { return false; }
+
   // Extracts COO tensors for each SparseCore.
   void ExtractCooTensors(const ExtractCooTensorsOptions& options,
                          ExtractedCooTensors& coo_tensors) override;
