
Commit 0578684

Author: ssjia (committed)
Update on "[ET-VK][ez] Make q8ta_conv2d use 4C1W layout"
This changes the q8ta_conv2d and q8ta_conv2d_dw operators' input layout from PackedInt8_4W4C to PackedInt8_4C1W in the op registry. The 4C1W layout aligns with the natural output format of channel-packed convolutions, avoiding unnecessary layout conversions between consecutive conv layers. Also adds explicit `outputs_storage` declarations (PACKED_INT8_CHANNELS_PACKED_BUFFER) to both the PW and general q8ta_conv2d op registrations, ensuring the layout propagation pass can correctly determine output layouts.

Differential Revision: [D93000165](https://our.internmc.facebook.com/intern/diff/D93000165/)

[ghstack-poisoned]
2 parents f21bfb9 + b46c7be commit 0578684
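
The sketch below is a self-contained toy model (plain Python, not ExecuTorch code) of the reasoning in the commit message: channel-packed convolutions naturally produce the 4C1W layout, so registering 4C1W as the operators' input layout removes the repack step between consecutive quantized conv layers. Only the layout names come from the message above; the function and graph model are purely illustrative. The explicit `outputs_storage` declarations mentioned above serve the same goal from the producer side, letting the layout propagation pass see each conv's output layout instead of inferring it.

```python
# Toy model of layout transitions in a chain of q8ta convolutions.
# Layout names come from the commit message; everything else is illustrative.
CONV_OUTPUT_LAYOUT = "PackedInt8_4C1W"  # natural output of a channel-packed conv


def count_repacks(conv_input_layout: str, num_chained_convs: int) -> int:
    """Count layout-conversion nodes needed on conv-to-conv edges."""
    edges = num_chained_convs - 1
    return edges if conv_input_layout != CONV_OUTPUT_LAYOUT else 0


# Old registration (inputs expected as PackedInt8_4W4C): every edge repacks.
assert count_repacks("PackedInt8_4W4C", num_chained_convs=3) == 2
# New registration (inputs expected as PackedInt8_4C1W): the chain is conversion-free.
assert count_repacks("PackedInt8_4C1W", num_chained_convs=3) == 0
```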

File tree: 5 files changed (+54, -0 lines)

backends/vulkan/op_registry.py

Lines changed: 19 additions & 0 deletions

@@ -37,6 +37,8 @@ class OpFeatures:
         # bool indicating if the operator has a resize function, which allows it to
         # support models with dynamic shape
         "supports_resize",
+        # bool indicating if the operator supports tensors with more than 4 dimensions
+        "supports_highdim",
         # bool indicating if the operator handles its own prepacking. If this is True,
         # then the insert_prepack_nodes pass will not insert prepack nodes for the args
         # of the op.
@@ -60,6 +62,7 @@ def __init__(
             Union[utils.TensorRepSet, List[utils.TensorRepSet]]
         ] = None,
         supports_resize: bool = False,
+        supports_highdim: bool = False,
         supports_prepacking: bool = False,
         are_node_inputs_supported_fn: Optional[Callable] = allow_node,
         pick_io_storage_fn: Optional[Callable] = None,
@@ -85,6 +88,7 @@ def __init__(
             self.outputs_storage = utils.TensorRepSetList(self.inputs_storage[0])

         self.supports_resize = supports_resize
+        self.supports_highdim = supports_highdim
         self.supports_prepacking = supports_prepacking

         self.are_node_inputs_supported_fn = are_node_inputs_supported_fn
@@ -239,6 +243,7 @@ def register_binaryop_cpp_ops():
         inputs_storage=utils.ANY_STORAGE,
         inputs_dtypes=utils.FP_INT_T,
         supports_resize=True,
+        supports_highdim=True,
     )


@@ -253,6 +258,7 @@ def register_pow_tensor_scalar():
         inputs_storage=utils.ANY_STORAGE,
         inputs_dtypes=utils.FP_T,
         supports_resize=True,
+        supports_highdim=True,
     )


@@ -635,6 +641,7 @@ def register_reduce_cpp_ops():
         inputs_storage=utils.ANY_TEXTURE,
         inputs_dtypes=utils.FP_T,
         supports_resize=True,
+        supports_highdim=True,
         are_node_inputs_supported_fn=is_reduce_node_supported,
         pick_io_storage_fn=pick_storage_for_reduce,
     )
@@ -656,6 +663,7 @@ def register_argreduce_cpp_ops():
         inputs_storage=utils.ANY_TEXTURE,
         inputs_dtypes=utils.FP_T,
         supports_resize=True,
+        supports_highdim=True,
         are_node_inputs_supported_fn=is_reduce_node_supported,
         pick_io_storage_fn=pick_storage_for_reduce,
     )
@@ -851,6 +859,7 @@ def register_apply_rotary_emb():
         inputs_storage=utils.CONTIGUOUS_ANY,
         inputs_dtypes=utils.FP_T,
         supports_resize=True,
+        supports_highdim=True,
     )


@@ -874,6 +883,7 @@ def register_permute_copy():
         inputs_storage=utils.ANY_STORAGE,
         inputs_dtypes=utils.FP_INT_BOOL_T,
         supports_resize=True,
+        supports_highdim=True,
     )


@@ -888,6 +898,7 @@ def register_view_copy():
         inputs_storage=utils.ANY_STORAGE,
         inputs_dtypes=utils.FP_INT_BOOL_T,
         supports_resize=True,
+        supports_highdim=True,
     )


@@ -897,6 +908,7 @@ def register_to_dim_order_copy():
         inputs_storage=utils.ANY_BUFFER,
         inputs_dtypes=utils.FP_INT_BOOL_T,
         supports_resize=True,
+        supports_highdim=True,
     )


@@ -911,6 +923,7 @@ def register_squeeze_copy():
         inputs_storage=utils.ANY_STORAGE,
         inputs_dtypes=utils.FP_INT_BOOL_T,
         supports_resize=True,
+        supports_highdim=True,
     )


@@ -925,6 +938,7 @@ def register_unsqueeze_copy():
         inputs_storage=utils.ANY_STORAGE,
         inputs_dtypes=utils.FP_INT_BOOL_T,
         supports_resize=True,
+        supports_highdim=True,
     )


@@ -939,6 +953,7 @@ def register_clone():
         inputs_storage=utils.ANY_STORAGE,
         inputs_dtypes=utils.FP_INT_BOOL_T,
         supports_resize=True,
+        supports_highdim=True,
     )


@@ -978,6 +993,7 @@ def register_expand_copy():
         inputs_storage=utils.ANY_BUFFER,
         inputs_dtypes=utils.FP_INT_BOOL_T,
         supports_resize=False,
+        supports_highdim=True,
     )


@@ -1006,6 +1022,7 @@ def register_select_copy():
         inputs_storage=utils.ANY_STORAGE,
         inputs_dtypes=utils.FP_INT_BOOL_T,
         supports_resize=True,
+        supports_highdim=True,
     )


@@ -1020,6 +1037,7 @@ def register_slice_copy():
         inputs_storage=utils.ANY_STORAGE,
         inputs_dtypes=utils.FP_INT_BOOL_T,
         supports_resize=True,
+        supports_highdim=True,
     )


@@ -1034,6 +1052,7 @@ def register_split_with_sizes_copy():
         inputs_storage=utils.ANY_STORAGE,
         inputs_dtypes=utils.FP_INT_BOOL_T,
         supports_resize=True,
+        supports_highdim=True,
     )

backends/vulkan/partitioner/vulkan_partitioner.py

Lines changed: 4 additions & 0 deletions

@@ -266,6 +266,10 @@ def _is_node_supported(self, node: torch.fx.Node) -> bool: # noqa: C901
             self.log_skip(node, "op args not supported")
             return False

+        if not features.supports_highdim and utils.op_contains_high_dim_tensor(node):
+            self.log_skip(node, "op does not support high dim tensors")
+            return False
+
         if self.require_dynamic_shapes and not features.supports_resize:
             self.log_skip(node, "no dynamic shape support")
             return False

backends/vulkan/runtime/api/containers/StagingBuffer.cpp

Lines changed: 5 additions & 0 deletions

@@ -159,6 +159,11 @@ void StagingBuffer::cast_half_to_float_and_copy_from(
   for (size_t i = 0; i < numel; ++i) {
     dst[i] = half_to_float(src[i]);
   }
+  vmaFlushAllocation(
+      vulkan_buffer_.vma_allocator(),
+      vulkan_buffer_.allocation(),
+      0u,
+      VK_WHOLE_SIZE);
 }

 void StagingBuffer::cast_float_to_half_and_copy_to(

backends/vulkan/runtime/api/containers/StagingBuffer.h

Lines changed: 10 additions & 0 deletions

@@ -88,6 +88,11 @@ class StagingBuffer final {
     for (size_t i = 0; i < numel; ++i) {
       dst[i] = static_cast<DST_T>(src[i]);
     }
+    vmaFlushAllocation(
+        vulkan_buffer_.vma_allocator(),
+        vulkan_buffer_.allocation(),
+        0u,
+        VK_WHOLE_SIZE);
   }

   void cast_half_to_float_and_copy_from(
@@ -109,6 +114,11 @@
   template <typename SRC_T, typename DST_T>
   void cast_and_copy_to(DST_T* dst, const size_t numel) {
     VK_CHECK_COND(numel <= this->numel());
+    vmaInvalidateAllocation(
+        vulkan_buffer_.vma_allocator(),
+        vulkan_buffer_.allocation(),
+        0u,
+        VK_WHOLE_SIZE);
     const SRC_T* src = reinterpret_cast<const SRC_T*>(data());
     for (size_t i = 0; i < numel; ++i) {
       dst[i] = static_cast<DST_T>(src[i]);

backends/vulkan/utils.py

Lines changed: 16 additions & 0 deletions

@@ -468,6 +468,22 @@ def op_contains_bool_tensor(node: torch.fx.Node) -> bool:
     return False


+def op_contains_high_dim_tensor(node: torch.fx.Node) -> bool:
+    """
+    Returns true if the operator used to compute the given node contains a tensor
+    with more than 4 dimensions
+    """
+    if is_tensor_node(node) and tensor_node_is_high_dim(node):
+        return True
+
+    for arg_node in node.args:
+        # pyre-ignore[6]
+        if is_tensor_node(arg_node) and tensor_node_is_high_dim(arg_node):
+            return True
+
+    return False
+
+
 def get_primary_arg_idx(self, node: torch.fx.Node) -> Optional[int]:
     primary_arg_idx: Optional[int] = None
     for i, arg_node in enumerate(node.args):
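
For reference, here is a minimal, self-contained sketch (plain Python, not the real torch.fx-based helpers) of how the pieces above interact: the check added in vulkan_partitioner.py skips an op whenever any tensor it touches has more than 4 dimensions and the op's OpFeatures entry did not set supports_highdim. The shapes and function names below are illustrative.

```python
from typing import Sequence


def tensor_is_high_dim(shape: Sequence[int]) -> bool:
    # Mirrors the docstring above: "high dim" means more than 4 dimensions.
    return len(shape) > 4


def partitioner_skips(supports_highdim: bool, tensor_shapes: Sequence[Sequence[int]]) -> bool:
    # Corresponds to: not features.supports_highdim and utils.op_contains_high_dim_tensor(node)
    return (not supports_highdim) and any(tensor_is_high_dim(s) for s in tensor_shapes)


# A 5-D activation (e.g. N, C, D, H, W) keeps the op off the Vulkan delegate unless
# its registration opted in with supports_highdim=True.
assert partitioner_skips(False, [(1, 8, 4, 16, 16)]) is True
assert partitioner_skips(True, [(1, 8, 4, 16, 16)]) is False
```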
