Skip to content

Commit fa3e031

Browse files
SS-JIA
authored and committed
[ET-VK] Add alignment fields to PackedDimInfo for padded size calculation
Pull Request resolved: #17170 This change introduces separate alignment fields to PackedDimInfo, decoupling the alignment used for padding tensor dimensions from the block size used for packing. Previously, `calculate_padded_sizes` used `packed_dim_block_size` and `outer_packed_dim_block_size` directly to determine how much to pad tensor dimensions. This works but limits flexibility - there are scenarios where we want to pad dimensions to a larger alignment than the block size for performance reasons, such as ensuring loads are aligned to cache lines or removing the need for bounds checking in shaders. The new fields `packed_dim_align` and `outer_packed_dim_align` allow specifying the alignment independently. For now, these are initialized to match the corresponding block sizes, preserving existing behavior. Future changes can set larger alignment values when beneficial for performance. Authored with Claude. ghstack-source-id: 338638551 @exported-using-ghexport Differential Revision: [D92196649](https://our.internmc.facebook.com/intern/diff/D92196649/)
1 parent 9b19a91 commit fa3e031

3 files changed

Lines changed: 116 additions & 126 deletions

File tree

backends/vulkan/runtime/api/containers/Tensor.cpp

Lines changed: 106 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,17 @@ namespace api {
1717
PackedDimInfo::PackedDimInfo(
1818
const int32_t dim,
1919
const int32_t dim_block_size,
20+
const int32_t dim_align,
2021
const int32_t outer_dim,
2122
const int32_t outer_dim_block_size,
23+
const int32_t outer_dim_align,
2224
const bool is_block_transposed)
2325
: packed_dim(dim),
2426
packed_dim_block_size(dim_block_size),
27+
packed_dim_align(dim_align),
2528
outer_packed_dim(outer_dim),
2629
outer_packed_dim_block_size(outer_dim_block_size),
30+
outer_packed_dim_align(outer_dim_align),
2731
block_transposed(is_block_transposed),
2832
block_numel(packed_dim_block_size * outer_packed_dim_block_size) {
2933
// Packed dims must be different
@@ -33,32 +37,105 @@ PackedDimInfo::PackedDimInfo(
3337
PackedDimInfo calculate_packed_dim_info(
3438
const utils::GPUMemoryLayout memory_layout,
3539
const utils::StorageType storage_type) {
36-
const int32_t packed_dim = utils::to_packed_dim<int32_t>(memory_layout);
37-
const int32_t outer_packed_dim =
38-
utils::to_outer_packed_dim<int32_t>(memory_layout);
39-
const int32_t packed_dim_block_size =
40-
utils::to_packed_dim_block_size<int32_t>(memory_layout, storage_type);
41-
const int32_t outer_packed_dim_block_size =
42-
utils::to_outer_packed_dim_block_size<int32_t>(memory_layout);
43-
const bool is_block_transposed =
44-
utils::is_block_transposed_layout(memory_layout);
45-
46-
const int32_t block_numel =
47-
packed_dim_block_size * outer_packed_dim_block_size;
48-
if (storage_type != utils::kBuffer) {
40+
const bool is_buffer = storage_type == utils::kBuffer;
41+
42+
PackedDimInfo packed_dim_info(0, 1, 1, 1, 1, 1, false);
43+
switch (memory_layout) {
44+
case utils::kWidthPacked:
45+
packed_dim_info = PackedDimInfo(
46+
/*dim=*/0,
47+
/*dim_block_size=*/is_buffer ? 1 : 4,
48+
/*dim_align=*/is_buffer ? 1 : 4,
49+
/*outer_dim=*/1,
50+
/*outer_dim_block_size=*/1,
51+
/*outer_dim_align=*/1,
52+
/*is_block_transposed=*/false);
53+
break;
54+
case utils::kHeightPacked:
55+
packed_dim_info = PackedDimInfo(
56+
/*dim=*/1,
57+
/*dim_block_size=*/is_buffer ? 1 : 4,
58+
/*dim_align=*/is_buffer ? 1 : 4,
59+
/*outer_dim=*/0,
60+
/*outer_dim_block_size=*/1,
61+
/*outer_dim_align=*/1,
62+
/*is_block_transposed=*/false);
63+
break;
64+
case utils::kChannelsPacked:
65+
packed_dim_info = PackedDimInfo(
66+
/*dim=*/2,
67+
/*dim_block_size=*/is_buffer ? 1 : 4,
68+
/*dim_align=*/is_buffer ? 1 : 4,
69+
/*outer_dim=*/0,
70+
/*outer_dim_block_size=*/1,
71+
/*outer_dim_align=*/1,
72+
/*is_block_transposed=*/false);
73+
break;
74+
case utils::kPackedInt8_4W:
75+
packed_dim_info = PackedDimInfo(
76+
/*dim=*/0,
77+
/*dim_block_size=*/is_buffer ? 4 : 16,
78+
/*dim_align=*/is_buffer ? 4 : 16,
79+
/*outer_dim=*/1,
80+
/*outer_dim_block_size=*/1,
81+
/*outer_dim_align=*/1,
82+
/*is_block_transposed=*/false);
83+
break;
84+
case utils::kPackedInt8_4C:
85+
packed_dim_info = PackedDimInfo(
86+
/*dim=*/2,
87+
/*dim_block_size=*/is_buffer ? 4 : 16,
88+
/*dim_align=*/is_buffer ? 4 : 16,
89+
/*outer_dim=*/0,
90+
/*outer_dim_block_size=*/1,
91+
/*outer_dim_align=*/1,
92+
/*is_block_transposed=*/false);
93+
break;
94+
case utils::kPackedInt8_4W4C:
95+
packed_dim_info = PackedDimInfo(
96+
/*dim=*/2,
97+
/*dim_block_size=*/4,
98+
/*dim_align=*/4,
99+
/*outer_dim=*/0,
100+
/*outer_dim_block_size=*/4,
101+
/*outer_dim_align=*/4,
102+
/*is_block_transposed=*/false);
103+
break;
104+
case utils::kPackedInt8_4H4W:
105+
packed_dim_info = PackedDimInfo(
106+
/*dim=*/0,
107+
/*dim_block_size=*/4,
108+
/*dim_align=*/4,
109+
/*outer_dim=*/1,
110+
/*outer_dim_block_size=*/4,
111+
/*outer_dim_align=*/4,
112+
/*is_block_transposed=*/false);
113+
break;
114+
case utils::kPackedInt8_4C1W:
115+
packed_dim_info = PackedDimInfo(
116+
/*dim=*/2,
117+
/*dim_block_size=*/is_buffer ? 4 : 16,
118+
/*dim_align=*/is_buffer ? 4 : 16,
119+
/*outer_dim=*/0,
120+
/*outer_dim_block_size=*/1,
121+
/*outer_dim_align=*/1,
122+
/*is_block_transposed=*/true);
123+
break;
124+
default:
125+
VK_THROW("Unknown GPUMemoryLayout");
126+
}
127+
128+
if (!is_buffer) {
129+
const int32_t block_numel = packed_dim_info.packed_dim_block_size *
130+
packed_dim_info.outer_packed_dim_block_size;
49131
if (is_packed_int8_layout(memory_layout)) {
50132
VK_CHECK_COND(block_numel == 16);
51133
} else {
52134
VK_CHECK_COND(block_numel == 4);
53135
}
54136
}
55137

56-
return PackedDimInfo(
57-
packed_dim,
58-
packed_dim_block_size,
59-
outer_packed_dim,
60-
outer_packed_dim_block_size,
61-
is_block_transposed);
138+
return packed_dim_info;
62139
}
63140

64141
/*
@@ -297,7 +374,8 @@ utils::ivec4 flip_and_unsqueeze_ivec4(
297374
* for GPU storage in the following ways:
298375
*
299376
* 1. The dimensionality of the tensor will be padded to a multiple of 4.
300-
* 2. The size of the packed dimension will be padded to a multiple of 4.
377+
* 2. The size of the packed dimension will be padded to a multiple of the
378+
* packed dimension's alignment value.
301379
*
302380
* The "packed dimension" is determined based on the utils::GPUMemoryLayout
303381
* argument.
@@ -317,23 +395,23 @@ std::vector<int64_t> calculate_padded_sizes(
317395
padded_sizes.at(i) = utils::val_at(i - ndim_up4, sizes);
318396
}
319397

320-
// Pad the packed dim to the block size
321-
if (packed_dim_info.packed_dim_block_size > 1) {
398+
// Pad the packed dim to the alignment
399+
if (packed_dim_info.packed_dim_align > 1) {
322400
const int64_t dim_offset = packed_dim_info.packed_dim + 1;
323401
const int64_t padded_dim_size = utils::val_at(-dim_offset, sizes);
324402
padded_sizes.at(ndim_up4 - dim_offset) = utils::align_up(
325403
padded_dim_size,
326-
static_cast<int64_t>(packed_dim_info.packed_dim_block_size));
404+
static_cast<int64_t>(packed_dim_info.packed_dim_align));
327405
}
328406

329-
// Also pad the outer packed dimension if it's different from the inner packed
330-
// dimension and is marked as padded.
331-
if (packed_dim_info.outer_packed_dim_block_size > 1) {
407+
// Also pad the outer packed dimension if it has alignment > 1.
408+
if (packed_dim_info.outer_packed_dim_align > 1) {
332409
const int64_t outer_dim_offset = packed_dim_info.outer_packed_dim + 1;
333410
const int64_t outer_padded_dim_size =
334411
utils::val_at(-outer_dim_offset, sizes);
335-
padded_sizes.at(ndim_up4 - outer_dim_offset) =
336-
utils::align_up_4(outer_padded_dim_size);
412+
padded_sizes.at(ndim_up4 - outer_dim_offset) = utils::align_up(
413+
outer_padded_dim_size,
414+
static_cast<int64_t>(packed_dim_info.outer_packed_dim_align));
337415
}
338416

339417
return padded_sizes;

backends/vulkan/runtime/api/containers/Tensor.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,12 @@ struct PackedDimInfo {
6767
// In physical memory, the size of the packed dim is aligned to this size to
6868
// ensure that data for the packed dim aligns with texel/block boundaries.
6969
int32_t packed_dim_block_size;
70+
// In physical memory, the size of the packed dimension will be aligned to be
71+
// a multiple of this value. This value must be a multiple of the packed_dim's
72+
// block size, and is selected for performance reasons i.e. to ensure loads
73+
// along the packed dim are aligned to cache lines, or to enable performance
74+
// optimizations in shaders, i.e. remove the need for bounds checking.
75+
int32_t packed_dim_align;
7076
// For block-packed layouts, represents the second tensor dimension that forms
7177
// the "width" dimension of the MxN square that is kept contiguous in memory.
7278
// For non block-packed layouts, represent the dimension with the next lowest
@@ -77,6 +83,8 @@ struct PackedDimInfo {
7783
// 4H4W, represents the "height" of the square block that is kept contiguous
7884
// in memory.
7985
int32_t outer_packed_dim_block_size;
86+
// See packed_dim_align
87+
int32_t outer_packed_dim_align;
8088
// Typically the blocks of the tensor will be arranged such that the inner
8189
// dim of the block (i.e. the packed dim) has the lowest stride, and the
8290
// outer dim of the block (i.e. the outer packed dim) has the next lowest
@@ -94,8 +102,10 @@ struct PackedDimInfo {
94102
PackedDimInfo(
95103
const int32_t dim,
96104
const int32_t dim_block_size,
105+
const int32_t dim_align,
97106
const int32_t outer_dim,
98107
const int32_t outer_dim_block_size,
108+
const int32_t outer_dim_align,
99109
const bool is_block_transposed);
100110
};
101111

backends/vulkan/runtime/utils/StorageUtils.h

Lines changed: 0 additions & 98 deletions
Original file line numberDiff line numberDiff line change
@@ -139,104 +139,6 @@ static constexpr GPUMemoryLayout kPackedInt8_4H4W =
139139
static constexpr GPUMemoryLayout kPackedInt8_4C1W =
140140
GPUMemoryLayout::TENSOR_PACKED_INT8_4C1W;
141141

142-
template <typename T>
143-
T to_packed_dim(const GPUMemoryLayout layout) {
144-
switch (layout) {
145-
case kWidthPacked:
146-
return 0;
147-
case kHeightPacked:
148-
return 1;
149-
case kChannelsPacked:
150-
return 2;
151-
case kPackedInt8_4W:
152-
return 0;
153-
case kPackedInt8_4C:
154-
return 2;
155-
case kPackedInt8_4W4C:
156-
return 2;
157-
case kPackedInt8_4H4W:
158-
return 0;
159-
case kPackedInt8_4C1W:
160-
return 2;
161-
};
162-
// Should be unreachable
163-
return 0;
164-
}
165-
166-
template <typename T>
167-
T to_outer_packed_dim(const GPUMemoryLayout layout) {
168-
switch (layout) {
169-
case kWidthPacked:
170-
return 1;
171-
case kHeightPacked:
172-
return 0;
173-
case kChannelsPacked:
174-
return 0;
175-
case kPackedInt8_4W:
176-
return 1;
177-
case kPackedInt8_4C:
178-
return 0;
179-
case kPackedInt8_4W4C:
180-
return 0;
181-
case kPackedInt8_4H4W:
182-
return 1;
183-
case kPackedInt8_4C1W:
184-
return 0;
185-
};
186-
// Should be unreachable
187-
return 1;
188-
}
189-
190-
template <typename T>
191-
T to_packed_dim_block_size(
192-
const GPUMemoryLayout layout,
193-
const StorageType storage) {
194-
switch (layout) {
195-
case kWidthPacked:
196-
return storage == kBuffer ? 1 : 4;
197-
case kHeightPacked:
198-
return storage == kBuffer ? 1 : 4;
199-
case kChannelsPacked:
200-
return storage == kBuffer ? 1 : 4;
201-
case kPackedInt8_4W:
202-
return storage == kBuffer ? 4 : 16;
203-
case kPackedInt8_4C:
204-
return storage == kBuffer ? 4 : 16;
205-
case kPackedInt8_4W4C:
206-
return 4;
207-
case kPackedInt8_4H4W:
208-
return 4;
209-
case kPackedInt8_4C1W:
210-
return storage == kBuffer ? 4 : 16;
211-
};
212-
// Should be unreachable
213-
return 1;
214-
}
215-
216-
template <typename T>
217-
T to_outer_packed_dim_block_size(const GPUMemoryLayout layout) {
218-
switch (layout) {
219-
case kWidthPacked:
220-
return 1;
221-
case kHeightPacked:
222-
return 1;
223-
case kChannelsPacked:
224-
return 1;
225-
case kPackedInt8_4W:
226-
return 1;
227-
case kPackedInt8_4C:
228-
return 1;
229-
case kPackedInt8_4W4C:
230-
return 4;
231-
case kPackedInt8_4H4W:
232-
return 4;
233-
case kPackedInt8_4C1W:
234-
return 1;
235-
};
236-
// Should be unreachable
237-
return 1;
238-
}
239-
240142
bool is_block_transposed_layout(const GPUMemoryLayout layout);
241143

242144
bool is_packed_int8_layout(const GPUMemoryLayout layout);

0 commit comments

Comments (0)