
Commit 7758fb1

Author: ssjia (committed)
Update base for Update on "[ET-VK] Layout-flexible impl of quantized binary"
This refactors the quantized binary add operator to support all PackedInt8 memory layouts (4W, 4C, 4W4C, 4H4W, 4C1W) instead of being hardcoded to 4W4C. The shader is rewritten to use the block indexing framework (BlockConfig, block_int8x4_load/store) and BufferMetadata for layout-agnostic tensor access, replacing the previous linear dispatch that assumed 4W4C ordering.

Key changes:
- Renames shader from binary_q8ta_q8ta_q8to to q8ta_binary, and op from add_q8ta_q8ta_q8to to q8ta_add
- Shader now uses contiguous_block_idx_to_tensor4d_idx_with_block_config for dispatch and generated load/store functions for layout-flexible int8x4 access
- C++ dispatch uses pick_linear_global_wg_with_block_config and passes BufferMetadata UBOs for output and both inputs, plus hashed_layout specialization constants
- Moves the test operator into a separate TestQ8taBinary.cpp file that parameterizes on GPUMemoryLayout, testing all 5 layouts
- Updates op_registry to accept PACKED_INT8_BUFFER (all layouts) instead of just PACKED_INT8_4W4C_BUFFER

This diff was authored with Claude.

Differential Revision: [D93000170](https://our.internmc.facebook.com/intern/diff/D93000170/)

[ghstack-poisoned]
1 parent f21bfb9 commit 7758fb1
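
Note: to illustrate the op_registry change called out in the last bullet, here is a hedged sketch. The storage-set names PACKED_INT8_BUFFER and PACKED_INT8_4W4C_BUFFER come from the commit message above; the surrounding q8ta_add registration code (and whether these constants live in utils) is assumed, not shown on this page.

    # Hypothetical registration fragment for the quantized binary add op.
    OpFeatures(
        # Before this stack: only the 4W4C packed-int8 buffer layout was accepted.
        #     inputs_storage=utils.PACKED_INT8_4W4C_BUFFER,
        # After: any PackedInt8 buffer layout (4W, 4C, 4W4C, 4H4W, 4C1W).
        inputs_storage=utils.PACKED_INT8_BUFFER,
    )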

File tree
5 files changed: +54 -0 lines changed


backends/vulkan/op_registry.py

Lines changed: 19 additions & 0 deletions
@@ -37,6 +37,8 @@ class OpFeatures:
         # bool indicating if the operator has a resize function, which allows it to
         # support models with dynamic shape
         "supports_resize",
+        # bool indicating if the operator supports tensors with more than 4 dimensions
+        "supports_highdim",
         # bool indicating if the operator handles its own prepacking. If this is True,
         # then the insert_prepack_nodes pass will not insert prepack nodes for the args
         # of the op.
@@ -60,6 +62,7 @@ def __init__(
             Union[utils.TensorRepSet, List[utils.TensorRepSet]]
         ] = None,
         supports_resize: bool = False,
+        supports_highdim: bool = False,
         supports_prepacking: bool = False,
         are_node_inputs_supported_fn: Optional[Callable] = allow_node,
         pick_io_storage_fn: Optional[Callable] = None,
@@ -85,6 +88,7 @@ def __init__(
             self.outputs_storage = utils.TensorRepSetList(self.inputs_storage[0])

         self.supports_resize = supports_resize
+        self.supports_highdim = supports_highdim
         self.supports_prepacking = supports_prepacking

         self.are_node_inputs_supported_fn = are_node_inputs_supported_fn
@@ -239,6 +243,7 @@ def register_binaryop_cpp_ops():
         inputs_storage=utils.ANY_STORAGE,
         inputs_dtypes=utils.FP_INT_T,
         supports_resize=True,
+        supports_highdim=True,
     )


@@ -253,6 +258,7 @@ def register_pow_tensor_scalar():
         inputs_storage=utils.ANY_STORAGE,
         inputs_dtypes=utils.FP_T,
         supports_resize=True,
+        supports_highdim=True,
     )


@@ -635,6 +641,7 @@ def register_reduce_cpp_ops():
         inputs_storage=utils.ANY_TEXTURE,
         inputs_dtypes=utils.FP_T,
         supports_resize=True,
+        supports_highdim=True,
         are_node_inputs_supported_fn=is_reduce_node_supported,
         pick_io_storage_fn=pick_storage_for_reduce,
     )
@@ -656,6 +663,7 @@ def register_argreduce_cpp_ops():
         inputs_storage=utils.ANY_TEXTURE,
         inputs_dtypes=utils.FP_T,
         supports_resize=True,
+        supports_highdim=True,
         are_node_inputs_supported_fn=is_reduce_node_supported,
         pick_io_storage_fn=pick_storage_for_reduce,
     )
@@ -851,6 +859,7 @@ def register_apply_rotary_emb():
         inputs_storage=utils.CONTIGUOUS_ANY,
         inputs_dtypes=utils.FP_T,
         supports_resize=True,
+        supports_highdim=True,
     )


@@ -874,6 +883,7 @@ def register_permute_copy():
         inputs_storage=utils.ANY_STORAGE,
         inputs_dtypes=utils.FP_INT_BOOL_T,
         supports_resize=True,
+        supports_highdim=True,
     )


@@ -888,6 +898,7 @@ def register_view_copy():
         inputs_storage=utils.ANY_STORAGE,
         inputs_dtypes=utils.FP_INT_BOOL_T,
         supports_resize=True,
+        supports_highdim=True,
     )


@@ -897,6 +908,7 @@ def register_to_dim_order_copy():
         inputs_storage=utils.ANY_BUFFER,
         inputs_dtypes=utils.FP_INT_BOOL_T,
         supports_resize=True,
+        supports_highdim=True,
     )


@@ -911,6 +923,7 @@ def register_squeeze_copy():
         inputs_storage=utils.ANY_STORAGE,
         inputs_dtypes=utils.FP_INT_BOOL_T,
         supports_resize=True,
+        supports_highdim=True,
     )


@@ -925,6 +938,7 @@ def register_unsqueeze_copy():
         inputs_storage=utils.ANY_STORAGE,
         inputs_dtypes=utils.FP_INT_BOOL_T,
         supports_resize=True,
+        supports_highdim=True,
     )


@@ -939,6 +953,7 @@ def register_clone():
         inputs_storage=utils.ANY_STORAGE,
         inputs_dtypes=utils.FP_INT_BOOL_T,
         supports_resize=True,
+        supports_highdim=True,
     )


@@ -978,6 +993,7 @@ def register_expand_copy():
         inputs_storage=utils.ANY_BUFFER,
         inputs_dtypes=utils.FP_INT_BOOL_T,
         supports_resize=False,
+        supports_highdim=True,
     )


@@ -1006,6 +1022,7 @@ def register_select_copy():
         inputs_storage=utils.ANY_STORAGE,
         inputs_dtypes=utils.FP_INT_BOOL_T,
         supports_resize=True,
+        supports_highdim=True,
     )


@@ -1020,6 +1037,7 @@ def register_slice_copy():
         inputs_storage=utils.ANY_STORAGE,
         inputs_dtypes=utils.FP_INT_BOOL_T,
         supports_resize=True,
+        supports_highdim=True,
     )


@@ -1034,6 +1052,7 @@ def register_split_with_sizes_copy():
         inputs_storage=utils.ANY_STORAGE,
         inputs_dtypes=utils.FP_INT_BOOL_T,
         supports_resize=True,
+        supports_highdim=True,
     )

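For orientation, a minimal sketch of the registration pattern these hunks extend, assuming the register_* helpers above ultimately build an OpFeatures with the keyword arguments shown (the decorator plumbing is not part of this diff, and the op name below is hypothetical):

    def register_my_copy_like_op():
        # Hypothetical op following the pattern of the register_*_copy hunks above.
        return OpFeatures(
            inputs_storage=utils.ANY_STORAGE,
            inputs_dtypes=utils.FP_INT_BOOL_T,
            supports_resize=True,
            # New flag added in this diff: the op can handle tensors with more
            # than 4 dimensions, so the partitioner will not skip high-dim nodes.
            supports_highdim=True,
        )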

backends/vulkan/partitioner/vulkan_partitioner.py

Lines changed: 4 additions & 0 deletions
@@ -266,6 +266,10 @@ def _is_node_supported(self, node: torch.fx.Node) -> bool:  # noqa: C901
             self.log_skip(node, "op args not supported")
             return False

+        if not features.supports_highdim and utils.op_contains_high_dim_tensor(node):
+            self.log_skip(node, "op does not support high dim tensors")
+            return False
+
         if self.require_dynamic_shapes and not features.supports_resize:
             self.log_skip(node, "no dynamic shape support")
             return False
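As a concrete, hypothetical illustration of the new gate: a node that touches a tensor with more than 4 dimensions stays in the Vulkan partition only if its op was registered with supports_highdim=True (for example permute_copy in the op_registry hunks above); otherwise it is skipped with "op does not support high dim tensors". A minimal sketch of such a graph:

    import torch

    class HighDimPermute(torch.nn.Module):
        def forward(self, x):
            # x has 5 dims, so op_contains_high_dim_tensor() is True for this node;
            # it is only partitioned because permute_copy opts in via
            # supports_highdim=True in the op_registry change above.
            return x.permute(0, 1, 2, 4, 3)

    example_inputs = (torch.randn(2, 3, 4, 5, 6),)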

backends/vulkan/runtime/api/containers/StagingBuffer.cpp

Lines changed: 5 additions & 0 deletions
@@ -159,6 +159,11 @@ void StagingBuffer::cast_half_to_float_and_copy_from(
   for (size_t i = 0; i < numel; ++i) {
     dst[i] = half_to_float(src[i]);
   }
+  vmaFlushAllocation(
+      vulkan_buffer_.vma_allocator(),
+      vulkan_buffer_.allocation(),
+      0u,
+      VK_WHOLE_SIZE);
 }

 void StagingBuffer::cast_float_to_half_and_copy_to(

backends/vulkan/runtime/api/containers/StagingBuffer.h

Lines changed: 10 additions & 0 deletions
@@ -88,6 +88,11 @@ class StagingBuffer final {
     for (size_t i = 0; i < numel; ++i) {
       dst[i] = static_cast<DST_T>(src[i]);
     }
+    vmaFlushAllocation(
+        vulkan_buffer_.vma_allocator(),
+        vulkan_buffer_.allocation(),
+        0u,
+        VK_WHOLE_SIZE);
   }

   void cast_half_to_float_and_copy_from(
@@ -109,6 +114,11 @@ class StagingBuffer final {
   template <typename SRC_T, typename DST_T>
   void cast_and_copy_to(DST_T* dst, const size_t numel) {
     VK_CHECK_COND(numel <= this->numel());
+    vmaInvalidateAllocation(
+        vulkan_buffer_.vma_allocator(),
+        vulkan_buffer_.allocation(),
+        0u,
+        VK_WHOLE_SIZE);
     const SRC_T* src = reinterpret_cast<const SRC_T*>(data());
     for (size_t i = 0; i < numel; ++i) {
       dst[i] = static_cast<DST_T>(src[i]);

backends/vulkan/utils.py

Lines changed: 16 additions & 0 deletions
@@ -468,6 +468,22 @@ def op_contains_bool_tensor(node: torch.fx.Node) -> bool:
     return False


+def op_contains_high_dim_tensor(node: torch.fx.Node) -> bool:
+    """
+    Returns true if the operator used to compute the given node contains a tensor
+    with more than 4 dimensions
+    """
+    if is_tensor_node(node) and tensor_node_is_high_dim(node):
+        return True
+
+    for arg_node in node.args:
+        # pyre-ignore[6]
+        if is_tensor_node(arg_node) and tensor_node_is_high_dim(arg_node):
+            return True
+
+    return False
+
+
 def get_primary_arg_idx(self, node: torch.fx.Node) -> Optional[int]:
     primary_arg_idx: Optional[int] = None
     for i, arg_node in enumerate(node.args):
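The helper tensor_node_is_high_dim referenced above is not added by this diff. A plausible sketch of what it checks, assuming the FX node carries FakeTensor metadata in node.meta["val"] and "high dim" means more than 4 dimensions, as the new OpFeatures comment states:

    import torch

    def tensor_node_is_high_dim(node: torch.fx.Node) -> bool:
        # Hypothetical sketch, not the actual implementation: treat a node as
        # high-dim if its tensor metadata reports more than 4 dimensions.
        val = node.meta.get("val", None)
        if isinstance(val, torch.Tensor):
            return val.dim() > 4
        return False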
