134 changes: 106 additions & 28 deletions backends/vulkan/runtime/api/containers/Tensor.cpp
@@ -17,13 +17,17 @@ namespace api {
PackedDimInfo::PackedDimInfo(
    const int32_t dim,
    const int32_t dim_block_size,
+   const int32_t dim_align,
    const int32_t outer_dim,
    const int32_t outer_dim_block_size,
+   const int32_t outer_dim_align,
    const bool is_block_transposed)
    : packed_dim(dim),
      packed_dim_block_size(dim_block_size),
+     packed_dim_align(dim_align),
      outer_packed_dim(outer_dim),
      outer_packed_dim_block_size(outer_dim_block_size),
+     outer_packed_dim_align(outer_dim_align),
      block_transposed(is_block_transposed),
      block_numel(packed_dim_block_size * outer_packed_dim_block_size) {
  // Packed dims must be different
@@ -33,32 +37,105 @@ PackedDimInfo::PackedDimInfo(
PackedDimInfo calculate_packed_dim_info(
    const utils::GPUMemoryLayout memory_layout,
    const utils::StorageType storage_type) {
-  const int32_t packed_dim = utils::to_packed_dim<int32_t>(memory_layout);
-  const int32_t outer_packed_dim =
-      utils::to_outer_packed_dim<int32_t>(memory_layout);
-  const int32_t packed_dim_block_size =
-      utils::to_packed_dim_block_size<int32_t>(memory_layout, storage_type);
-  const int32_t outer_packed_dim_block_size =
-      utils::to_outer_packed_dim_block_size<int32_t>(memory_layout);
-  const bool is_block_transposed =
-      utils::is_block_transposed_layout(memory_layout);
-
-  const int32_t block_numel =
-      packed_dim_block_size * outer_packed_dim_block_size;
-  if (storage_type != utils::kBuffer) {
+  const bool is_buffer = storage_type == utils::kBuffer;
+
+  PackedDimInfo packed_dim_info(0, 1, 1, 1, 1, 1, false);
+  switch (memory_layout) {
+    case utils::kWidthPacked:
+      packed_dim_info = PackedDimInfo(
+          /*dim=*/0,
+          /*dim_block_size=*/is_buffer ? 1 : 4,
+          /*dim_align=*/is_buffer ? 1 : 4,
+          /*outer_dim=*/1,
+          /*outer_dim_block_size=*/1,
+          /*outer_dim_align=*/1,
+          /*is_block_transposed=*/false);
+      break;
+    case utils::kHeightPacked:
+      packed_dim_info = PackedDimInfo(
+          /*dim=*/1,
+          /*dim_block_size=*/is_buffer ? 1 : 4,
+          /*dim_align=*/is_buffer ? 1 : 4,
+          /*outer_dim=*/0,
+          /*outer_dim_block_size=*/1,
+          /*outer_dim_align=*/1,
+          /*is_block_transposed=*/false);
+      break;
+    case utils::kChannelsPacked:
+      packed_dim_info = PackedDimInfo(
+          /*dim=*/2,
+          /*dim_block_size=*/is_buffer ? 1 : 4,
+          /*dim_align=*/is_buffer ? 1 : 4,
+          /*outer_dim=*/0,
+          /*outer_dim_block_size=*/1,
+          /*outer_dim_align=*/1,
+          /*is_block_transposed=*/false);
+      break;
+    case utils::kPackedInt8_4W:
+      packed_dim_info = PackedDimInfo(
+          /*dim=*/0,
+          /*dim_block_size=*/is_buffer ? 4 : 16,
+          /*dim_align=*/is_buffer ? 4 : 16,
+          /*outer_dim=*/1,
+          /*outer_dim_block_size=*/1,
+          /*outer_dim_align=*/1,
+          /*is_block_transposed=*/false);
+      break;
+    case utils::kPackedInt8_4C:
+      packed_dim_info = PackedDimInfo(
+          /*dim=*/2,
+          /*dim_block_size=*/is_buffer ? 4 : 16,
+          /*dim_align=*/is_buffer ? 4 : 16,
+          /*outer_dim=*/0,
+          /*outer_dim_block_size=*/1,
+          /*outer_dim_align=*/1,
+          /*is_block_transposed=*/false);
+      break;
+    case utils::kPackedInt8_4W4C:
+      packed_dim_info = PackedDimInfo(
+          /*dim=*/2,
+          /*dim_block_size=*/4,
+          /*dim_align=*/4,
+          /*outer_dim=*/0,
+          /*outer_dim_block_size=*/4,
+          /*outer_dim_align=*/4,
+          /*is_block_transposed=*/false);
+      break;
+    case utils::kPackedInt8_4H4W:
+      packed_dim_info = PackedDimInfo(
+          /*dim=*/0,
+          /*dim_block_size=*/4,
+          /*dim_align=*/4,
+          /*outer_dim=*/1,
+          /*outer_dim_block_size=*/4,
+          /*outer_dim_align=*/4,
+          /*is_block_transposed=*/false);
+      break;
+    case utils::kPackedInt8_4C1W:
+      packed_dim_info = PackedDimInfo(
+          /*dim=*/2,
+          /*dim_block_size=*/is_buffer ? 4 : 16,
+          /*dim_align=*/is_buffer ? 4 : 16,
+          /*outer_dim=*/0,
+          /*outer_dim_block_size=*/1,
+          /*outer_dim_align=*/1,
+          /*is_block_transposed=*/true);
+      break;
+    default:
+      VK_THROW("Unknown GPUMemoryLayout");
+  }
+
+  if (!is_buffer) {
+    const int32_t block_numel = packed_dim_info.packed_dim_block_size *
+        packed_dim_info.outer_packed_dim_block_size;
    if (is_packed_int8_layout(memory_layout)) {
      VK_CHECK_COND(block_numel == 16);
    } else {
      VK_CHECK_COND(block_numel == 4);
    }
  }

-  return PackedDimInfo(
-      packed_dim,
-      packed_dim_block_size,
-      outer_packed_dim,
-      outer_packed_dim_block_size,
-      is_block_transposed);
+  return packed_dim_info;
}

/*
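To make the new mapping concrete, here is an illustrative sketch (not part of the diff) of what calculate_packed_dim_info returns for two representative layouts, based on the switch above. It assumes Tensor.h is included and that utils::kTexture3D names the texture storage type.

// Constructor argument order: dim, dim_block_size, dim_align, outer_dim,
// outer_dim_block_size, outer_dim_align, is_block_transposed.

// Width-packed texture: the width dim (0) is packed into texels of 4, so
// both its block size and its alignment are 4.
const api::PackedDimInfo wp =
    api::calculate_packed_dim_info(utils::kWidthPacked, utils::kTexture3D);
// wp.packed_dim == 0, wp.packed_dim_block_size == 4, wp.packed_dim_align == 4
// wp.block_numel == 4, so the texture-storage check above passes.

// 4W4C int8 layout: 4x4 blocks spanning channels (dim 2) and width (dim 0),
// with the same block shape for buffer and texture storage.
const api::PackedDimInfo b44 =
    api::calculate_packed_dim_info(utils::kPackedInt8_4W4C, utils::kBuffer);
// b44.packed_dim == 2, b44.outer_packed_dim == 0, b44.block_numel == 16

Note that the block_numel check (16 for packed-int8 layouts, 4 otherwise) only runs for texture storage, per the !is_buffer guard.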
@@ -297,7 +374,8 @@ utils::ivec4 flip_and_unsqueeze_ivec4(
 * for GPU storage in the following ways:
 *
 * 1. The dimensionality of the tensor will be padded to a multiple of 4.
- * 2. The size of the packed dimension will be padded to a multiple of 4.
+ * 2. The size of the packed dimension will be padded to a multiple of the
+ *    packed dimension's alignment value.
 *
 * The "packed dimension" is determined based on the utils::GPUMemoryLayout
 * argument.
@@ -317,23 +395,23 @@ std::vector<int64_t> calculate_padded_sizes(
    padded_sizes.at(i) = utils::val_at(i - ndim_up4, sizes);
  }

-  // Pad the packed dim to the block size
-  if (packed_dim_info.packed_dim_block_size > 1) {
+  // Pad the packed dim to the alignment
+  if (packed_dim_info.packed_dim_align > 1) {
    const int64_t dim_offset = packed_dim_info.packed_dim + 1;
    const int64_t padded_dim_size = utils::val_at(-dim_offset, sizes);
    padded_sizes.at(ndim_up4 - dim_offset) = utils::align_up(
        padded_dim_size,
-        static_cast<int64_t>(packed_dim_info.packed_dim_block_size));
+        static_cast<int64_t>(packed_dim_info.packed_dim_align));
  }

-  // Also pad the outer packed dimension if it's different from the inner packed
-  // dimension and is marked as padded.
-  if (packed_dim_info.outer_packed_dim_block_size > 1) {
+  // Also pad the outer packed dimension if it has alignment > 1.
+  if (packed_dim_info.outer_packed_dim_align > 1) {
    const int64_t outer_dim_offset = packed_dim_info.outer_packed_dim + 1;
    const int64_t outer_padded_dim_size =
        utils::val_at(-outer_dim_offset, sizes);
-    padded_sizes.at(ndim_up4 - outer_dim_offset) =
-        utils::align_up_4(outer_padded_dim_size);
+    padded_sizes.at(ndim_up4 - outer_dim_offset) = utils::align_up(
+        outer_padded_dim_size,
+        static_cast<int64_t>(packed_dim_info.outer_packed_dim_align));
  }

  return padded_sizes;
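As a worked example of the updated padding rules, consider a kPackedInt8_4W4C tensor. This is a sketch, not code from the change; it assumes calculate_padded_sizes receives the sizes and the layout's PackedDimInfo, and that the sizes are given in NCHW order.

#include <vector>

// kPackedInt8_4W4C: packed_dim = 2 (channels, align 4) and
// outer_packed_dim = 0 (width, align 4). utils::val_at(-dim_offset, sizes)
// indexes from the innermost dim, so for {N, C, H, W}: dim 0 = W, dim 2 = C.
const std::vector<int64_t> sizes = {2, 5, 3, 7}; // N=2, C=5, H=3, W=7
// Channels: 5 aligns up to 8. Width: 7 aligns up to 8. The dimensionality
// (4) is already a multiple of 4, so the result would be:
// calculate_padded_sizes(...) == {2, 8, 3, 8}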
10 changes: 10 additions & 0 deletions backends/vulkan/runtime/api/containers/Tensor.h
@@ -67,6 +67,12 @@ struct PackedDimInfo {
  // In physical memory, the size of the packed dim is aligned to this size to
  // ensure that data for the packed dim aligns with texel/block boundaries.
  int32_t packed_dim_block_size;
+  // In physical memory, the size of the packed dimension will be aligned to a
+  // multiple of this value. It must be a multiple of the packed_dim's block
+  // size and is chosen for performance, e.g. to keep loads along the packed
+  // dim aligned to cache lines, or to enable shader optimizations such as
+  // removing the need for bounds checking.
+  int32_t packed_dim_align;
  // For block-packed layouts, represents the second tensor dimension that forms
  // the "width" dimension of the MxN square that is kept contiguous in memory.
  // For non block-packed layouts, represents the dimension with the next lowest
@@ -77,6 +83,8 @@
  // 4H4W, represents the "height" of the square block that is kept contiguous
  // in memory.
  int32_t outer_packed_dim_block_size;
+  // See packed_dim_align.
+  int32_t outer_packed_dim_align;
  // Typically the blocks of the tensor will be arranged such that the inner
  // dim of the block (i.e. the packed dim) has the lowest stride, and the
  // outer dim of the block (i.e. the outer packed dim) has the next lowest
@@ -94,8 +102,10 @@
  PackedDimInfo(
      const int32_t dim,
      const int32_t dim_block_size,
+      const int32_t dim_align,
      const int32_t outer_dim,
      const int32_t outer_dim_block_size,
+      const int32_t outer_dim_align,
      const bool is_block_transposed);
};

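Every layout in this change sets the alignment equal to the block size, but the comment on packed_dim_align allows a larger value. A hypothetical sketch of that flexibility follows; the specific numbers are assumptions, not taken from this diff.

// Hypothetical: keep 4-wide texel blocks along the packed dim but request
// 16-element alignment, so each packed-dim row starts on a cache-line
// boundary. 16 is a multiple of the block size 4, preserving the documented
// invariant.
const int32_t packed_dim_block_size = 4;
const int32_t packed_dim_align = 16;
// calculate_padded_sizes would then round a width of 7 up to 16 instead of 8:
const int64_t padded = utils::align_up(int64_t{7}, int64_t{16}); // == 16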
98 changes: 0 additions & 98 deletions backends/vulkan/runtime/utils/StorageUtils.h
@@ -139,104 +139,6 @@ static constexpr GPUMemoryLayout kPackedInt8_4H4W =
static constexpr GPUMemoryLayout kPackedInt8_4C1W =
    GPUMemoryLayout::TENSOR_PACKED_INT8_4C1W;

-template <typename T>
-T to_packed_dim(const GPUMemoryLayout layout) {
-  switch (layout) {
-    case kWidthPacked:
-      return 0;
-    case kHeightPacked:
-      return 1;
-    case kChannelsPacked:
-      return 2;
-    case kPackedInt8_4W:
-      return 0;
-    case kPackedInt8_4C:
-      return 2;
-    case kPackedInt8_4W4C:
-      return 2;
-    case kPackedInt8_4H4W:
-      return 0;
-    case kPackedInt8_4C1W:
-      return 2;
-  };
-  // Should be unreachable
-  return 0;
-}
-
-template <typename T>
-T to_outer_packed_dim(const GPUMemoryLayout layout) {
-  switch (layout) {
-    case kWidthPacked:
-      return 1;
-    case kHeightPacked:
-      return 0;
-    case kChannelsPacked:
-      return 0;
-    case kPackedInt8_4W:
-      return 1;
-    case kPackedInt8_4C:
-      return 0;
-    case kPackedInt8_4W4C:
-      return 0;
-    case kPackedInt8_4H4W:
-      return 1;
-    case kPackedInt8_4C1W:
-      return 0;
-  };
-  // Should be unreachable
-  return 1;
-}
-
-template <typename T>
-T to_packed_dim_block_size(
-    const GPUMemoryLayout layout,
-    const StorageType storage) {
-  switch (layout) {
-    case kWidthPacked:
-      return storage == kBuffer ? 1 : 4;
-    case kHeightPacked:
-      return storage == kBuffer ? 1 : 4;
-    case kChannelsPacked:
-      return storage == kBuffer ? 1 : 4;
-    case kPackedInt8_4W:
-      return storage == kBuffer ? 4 : 16;
-    case kPackedInt8_4C:
-      return storage == kBuffer ? 4 : 16;
-    case kPackedInt8_4W4C:
-      return 4;
-    case kPackedInt8_4H4W:
-      return 4;
-    case kPackedInt8_4C1W:
-      return storage == kBuffer ? 4 : 16;
-  };
-  // Should be unreachable
-  return 1;
-}
-
-template <typename T>
-T to_outer_packed_dim_block_size(const GPUMemoryLayout layout) {
-  switch (layout) {
-    case kWidthPacked:
-      return 1;
-    case kHeightPacked:
-      return 1;
-    case kChannelsPacked:
-      return 1;
-    case kPackedInt8_4W:
-      return 1;
-    case kPackedInt8_4C:
-      return 1;
-    case kPackedInt8_4W4C:
-      return 4;
-    case kPackedInt8_4H4W:
-      return 4;
-    case kPackedInt8_4C1W:
-      return 1;
-  };
-  // Should be unreachable
-  return 1;
-}

bool is_block_transposed_layout(const GPUMemoryLayout layout);

bool is_packed_int8_layout(const GPUMemoryLayout layout);
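The four deleted template helpers are subsumed by the switch in calculate_packed_dim_info shown earlier. Below is a hedged sketch of the call-site migration this implies; the "before" lines use the deleted API, and the "after" lines assume only what this diff shows.

// Before: one template helper query per packing property.
const int32_t packed_dim = utils::to_packed_dim<int32_t>(layout);
const int32_t block_size =
    utils::to_packed_dim_block_size<int32_t>(layout, storage);

// After: a single PackedDimInfo carries every packing property, including
// the new alignment fields.
const api::PackedDimInfo info =
    api::calculate_packed_dim_info(layout, storage);
const int32_t packed_dim_v2 = info.packed_dim; // replaces to_packed_dim
const int32_t block_size_v2 = info.packed_dim_block_size;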