diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 874b2a65168..5a006e2e751 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -1138,6 +1138,7 @@ jobs:
         ./cmake-out/backends/vulkan/test/custom_ops/q4gsw_linear
         ./cmake-out/backends/vulkan/test/custom_ops/choose_qparams_per_row
         ./cmake-out/backends/vulkan/test/custom_ops/test_q8ta_qdq
+        ./cmake-out/backends/vulkan/test/custom_ops/test_q8ta_clone
         ./cmake-out/backends/vulkan/test/custom_ops/q8ta_q8ta_q8to_add
 
         # "Classic" Operator tests
diff --git a/backends/vulkan/runtime/graph/ops/glsl/q8ta_clone.glsl b/backends/vulkan/runtime/graph/ops/glsl/q8ta_clone.glsl
new file mode 100644
index 00000000000..0006311e13c
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/q8ta_clone.glsl
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+${define_active_storage_type("buffer")}
+
+layout(std430) buffer;
+
+#include "indexing.glslh"
+
+// Output buffer: packed int8x4 values
+${layout_declare_tensor(B, "w", "t_outp", "int", "buffer")}
+// Input buffer: packed int8x4 values
+${layout_declare_tensor(B, "r", "t_inp", "int", "buffer")}
+
+// Metadata for output tensor
+${layout_declare_ubo(B, "BufferMetadata", "outp")}
+// Metadata for input tensor
+${layout_declare_ubo(B, "BufferMetadata", "inp")}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+${layout_declare_spec_const(C, "int", "inp_layout", "CONTIG_LAYOUT_INT")}
+${layout_declare_spec_const(C, "int", "outp_layout", "CONTIG_LAYOUT_INT")}
+${layout_declare_spec_const(C, "int", "inp_block_config", "0")}
+${layout_declare_spec_const(C, "int", "outp_block_config", "0")}
+
+#include "block_indexing.glslh"
+#include "block_int8x4_load.glslh"
+#include "block_int8x4_store.glslh"
+
+// Generate loading functions for t_inp buffer
+define_load_int8x4_buffer_fns(t_inp)
+
+// Generate storing functions for t_outp buffer
+define_store_int8x4_buffer_fns(t_outp)
+
+void main() {
+  TensorIndex4D tidx;
+
+  // Buffer storage: use linear dispatch
+  const uint contig_block_idx = gl_GlobalInvocationID.x;
+  tidx = contiguous_block_idx_to_tensor4d_idx_with_block_config(
+      inp, contig_block_idx, inp_block_config);
+
+  if (out_of_bounds(tidx, inp)) {
+    return;
+  }
+
+  // Load int8x4 block from input using the thread's block index
+  const int inp_block_outer_dim = get_block_outer_dim(inp_block_config);
+  ivec4 int8_block = load_int8x4_block_from_t_inp(
+      inp, tidx, inp_layout, inp_block_outer_dim);
+
+  // If input and output have different block configs (different packed dims),
+  // transpose the block to match output's layout
+  if (inp_block_config != outp_block_config) {
+    int8_block = transpose_int8x4_block(int8_block);
+  }
+
+  // Store values to output buffer using output's block config
+  const int outp_block_outer_dim = get_block_outer_dim(outp_block_config);
+  store_int8x4_block_to_t_outp(
+      outp, tidx, outp_layout, outp_block_outer_dim, int8_block);
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/q8ta_clone.yaml b/backends/vulkan/runtime/graph/ops/glsl/q8ta_clone.yaml
new file mode 100644
index 00000000000..6548ee3b484
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/q8ta_clone.yaml
@@ -0,0 +1,11 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+q8ta_clone:
+  parameter_names_with_default_values:
+    DTYPE: int
+  shader_variants:
+    - NAME: q8ta_clone
diff --git a/backends/vulkan/runtime/graph/ops/impl/Clone.cpp b/backends/vulkan/runtime/graph/ops/impl/Clone.cpp
index a64cb0143a9..bf2cac6d220 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Clone.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Clone.cpp
@@ -11,6 +11,7 @@
 #include <executorch/backends/vulkan/runtime/graph/Logging.h>
 
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Q8taClone.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/View.h>
 
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
@@ -132,6 +133,13 @@ void clone(ComputeGraph& graph, const std::vector<ValueRef>& args) {
 
   const utils::StorageType src_storage = graph.storage_type_of(src);
   const utils::StorageType dst_storage = graph.storage_type_of(dst);
+
+  // Handle int8x4 (quantized) tensors with block-based clone
+  if (graph.dtype_of(src) == vkapi::kInt8x4 &&
+      graph.dtype_of(dst) == vkapi::kInt8x4) {
+    return add_q8ta_clone_node(graph, src, dst);
+  }
+
   if (src_storage == utils::kTexture3D && dst_storage == utils::kTexture3D) {
     if (graph.hashed_layout_of(src) == graph.hashed_layout_of(dst)) {
       return add_clone_node(graph, src, dst);
diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taClone.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taClone.cpp
new file mode 100644
index 00000000000..6c688af802d
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/Q8taClone.cpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Q8taClone.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+
+namespace vkcompute {
+
+void add_q8ta_clone_node(
+    ComputeGraph& graph,
+    const ValueRef packed_int8_input,
+    const ValueRef packed_int8_output) {
+  VK_CHECK_COND(graph.dtype_of(packed_int8_input) == vkapi::kInt8x4);
+  VK_CHECK_COND(graph.dtype_of(packed_int8_output) == vkapi::kInt8x4);
+
+  // Build shader name - always buffer-to-buffer int8x4 clone
+  std::string kernel_name = "q8ta_clone";
+
+  // Pass metadata for both output and input tensors
+  // Both are always buffer-backed int8x4 tensors
+  vkapi::ParamsBindList param_buffers;
+  param_buffers.append(graph.buffer_meta_ubo(packed_int8_output));
+  param_buffers.append(graph.buffer_meta_ubo(packed_int8_input));
+
+  // Create block config for output tensor: inner_dim = output's packed_dim
+  const BlockConfig outp_block_config = create_block_config_from_io_packed_dims(
+      graph, packed_int8_output, packed_int8_input);
+
+  // Create block config for input tensor: based on outp_block_config but with
+  // inner_dim = input's packed_dim. If input and output have different packed
+  // dims, the block axes are transposed.
+  const BlockConfig inp_block_config = create_block_config_from_other(
+      graph, packed_int8_input, outp_block_config);
+
+  // Cast block config to ValueRef for pick_*_global_wg_with_block_config
+  // Use inp_block_config since shader uses inp_block_config for indexing
+  const ValueRef block_config_ref =
+      static_cast<ValueRef>(inp_block_config.as_packed_int());
+
+  // Use linear dispatch for buffer-backed int8x4 tensors
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
+      graph,
+      VK_KERNEL_FROM_STR(kernel_name),
+      pick_linear_global_wg_with_block_config,
+      pick_square_local_wg_with_block_config,
+      // Inputs and Outputs
+      {{packed_int8_output, vkapi::kWrite}, {packed_int8_input, vkapi::kRead}},
+      // Shader params buffers
+      param_buffers,
+      // Push Constants
+      {},
+      // Specialization Constants
+      {graph.hashed_layout_of(packed_int8_input),
+       graph.hashed_layout_of(packed_int8_output),
+       inp_block_config.as_packed_int(),
+       outp_block_config.as_packed_int()},
+      // Resize args
+      {block_config_ref}));
+}
+
+} // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taClone.h b/backends/vulkan/runtime/graph/ops/impl/Q8taClone.h
new file mode 100644
index 00000000000..91d14797400
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/Q8taClone.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/backends/vulkan/runtime/graph/ComputeGraph.h>
+
+namespace vkcompute {
+
+//
+// Clone for int8x4 tensors (memory layout agnostic)
+//
+
+void add_q8ta_clone_node(
+    ComputeGraph& graph,
+    const ValueRef packed_int8_input,
+    const ValueRef packed_int8_output);
+
+} // namespace vkcompute
diff --git a/backends/vulkan/test/custom_ops/CMakeLists.txt b/backends/vulkan/test/custom_ops/CMakeLists.txt
index 3108565361e..0deea46c292 100644
--- a/backends/vulkan/test/custom_ops/CMakeLists.txt
+++ b/backends/vulkan/test/custom_ops/CMakeLists.txt
@@ -98,6 +98,7 @@ if(TARGET vulkan_backend)
   add_operator_prototype(q4gsw_linear)
   add_operator_prototype(choose_qparams_per_row)
   add_operator_prototype(test_q8ta_qdq)
+  add_operator_prototype(test_q8ta_clone)
   add_operator_prototype(q8ta_q8csw_q8to_conv2d)
   add_operator_prototype(test_q8ta_conv2d_dw)
   add_operator_prototype(q8ta_q8ta_q8to_add)
diff --git a/backends/vulkan/test/custom_ops/impl/TestQ8taClone.cpp b/backends/vulkan/test/custom_ops/impl/TestQ8taClone.cpp
new file mode 100644
index 00000000000..9d4c46e79b0
--- /dev/null
+++ b/backends/vulkan/test/custom_ops/impl/TestQ8taClone.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Q8taClone.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Q8taQuantizeDequantize.h>
+
+namespace vkcompute {
+
+void q8ta_clone_test(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  int32_t idx = 0;
+  const ValueRef fp_input = args.at(idx++);
+  const ValueRef scale = args.at(idx++);
+  const ValueRef zero_point = args.at(idx++);
+  const ValueRef inp_layout_int = args.at(idx++);
+  const ValueRef outp_layout_int = args.at(idx++);
+  const ValueRef fp_output = args.at(idx++);
+
+  // Extract the layout parameters and cast to GPUMemoryLayout
+  int32_t inp_layout_value = graph.extract_scalar<int32_t>(inp_layout_int);
+  utils::GPUMemoryLayout inp_layout =
+      static_cast<utils::GPUMemoryLayout>(inp_layout_value);
+
+  int32_t outp_layout_value = graph.extract_scalar<int32_t>(outp_layout_int);
+  utils::GPUMemoryLayout outp_layout =
+      static_cast<utils::GPUMemoryLayout>(outp_layout_value);
+
+  // Create temporary tensor for quantized input with input layout
+  TmpTensor packed_int8_input(
+      &graph,
+      graph.sizes_of(fp_input),
+      vkapi::kInt8x4,
+      utils::kBuffer,
+      inp_layout);
+
+  // Create temporary tensor for quantized output with output layout
+  TmpTensor packed_int8_output(
+      &graph,
+      graph.sizes_of(fp_input),
+      vkapi::kInt8x4,
+      utils::kBuffer,
+      outp_layout);
+
+  // Quantize: FP -> int8x4 with input layout
+  add_q8ta_quantize_node(graph, fp_input, scale, zero_point, packed_int8_input);
+
+  // Clone: int8x4 (input layout) -> int8x4 (output layout)
+  add_q8ta_clone_node(graph, packed_int8_input, packed_int8_output);
+
+  // Dequantize: int8x4 with output layout -> FP
+  add_q8ta_dequantize_node(
+      graph, packed_int8_output, scale, zero_point, fp_output);
+}
+
+REGISTER_OPERATORS {
+  VK_REGISTER_OP(test_etvk.q8ta_clone_test.default, q8ta_clone_test);
+}
+
+} // namespace vkcompute
diff --git a/backends/vulkan/test/custom_ops/targets.bzl b/backends/vulkan/test/custom_ops/targets.bzl
index 5e556ba6f08..1bf8be41854 100644
--- a/backends/vulkan/test/custom_ops/targets.bzl
+++ b/backends/vulkan/test/custom_ops/targets.bzl
@@ -92,6 +92,7 @@ def define_common_targets(is_fbcode = False):
     define_custom_op_test_binary("choose_qparams_per_row")
     define_custom_op_test_binary("q4gsw_linear")
     define_custom_op_test_binary("test_q8ta_qdq")
+    define_custom_op_test_binary("test_q8ta_clone")
     define_custom_op_test_binary("q8ta_q8csw_q8to_conv2d")
     define_custom_op_test_binary("test_q8ta_conv2d_dw")
     define_custom_op_test_binary("q8ta_q8ta_q8to_add")
diff --git a/backends/vulkan/test/custom_ops/test_q8ta_clone.cpp b/backends/vulkan/test/custom_ops/test_q8ta_clone.cpp
new file mode 100644
index 00000000000..b872169cc7f
--- /dev/null
+++ b/backends/vulkan/test/custom_ops/test_q8ta_clone.cpp
@@ -0,0 +1,344 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+#include <algorithm>
+#include <cmath>
+#include <iostream>
+#include <vector>
+#include "utils.h"
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>
+
+// #define DEBUG_MODE
+
+using namespace executorch::vulkan::prototyping;
+using namespace vkcompute;
+
+static constexpr int64_t kRefDimSizeLimit = 512;
+
+// Configuration struct for q8ta clone testing
+struct Q8taCloneConfig {
+  std::vector<int64_t> shape; // Tensor shape (can be any dimensionality)
+  std::string test_case_name = "placeholder";
+  std::string op_name = "q8ta_clone_test";
+};
+
+// Utility function to create a test case from a Q8taCloneConfig
+TestCase create_test_case_from_config(
+    const Q8taCloneConfig& config,
+    utils::StorageType storage_type,
+    vkapi::ScalarType input_dtype,
+    utils::GPUMemoryLayout fp_memory_layout,
+    utils::GPUMemoryLayout inp_quant_layout,
+    utils::GPUMemoryLayout outp_quant_layout) {
+  TestCase test_case;
+
+  // Create a descriptive name for the test case
+  std::string shape_str = shape_string(config.shape);
+  std::string test_name = config.test_case_name + "  I=" + shape_str + "  " +
+      repr_str(storage_type, fp_memory_layout) + "->" +
+      repr_str(utils::kBuffer, inp_quant_layout) + "->" +
+      repr_str(utils::kBuffer, outp_quant_layout);
+  test_case.set_name(test_name);
+
+  // Set the operator name for the test case
+  std::string operator_name = "test_etvk." + config.op_name + ".default";
+  test_case.set_operator_name(operator_name);
+
+  // Input tensor (float) - any dimensionality
+  ValueSpec input_tensor(
+      config.shape,
+      input_dtype,
+      storage_type,
+      fp_memory_layout,
+      DataGenType::RANDOM);
+
+  float scale_val = 0.007112;
+  ValueSpec scale(scale_val);
+
+  // Zero point for quantization
+  int32_t zero_point_val = 0;
+  ValueSpec zero_point(zero_point_val);
+
+  // Input and output quantized layouts as integers
+  int32_t inp_layout_int = static_cast<int32_t>(inp_quant_layout);
+  ValueSpec inp_layout_spec(inp_layout_int);
+
+  int32_t outp_layout_int = static_cast<int32_t>(outp_quant_layout);
+  ValueSpec outp_layout_spec(outp_layout_int);
+
+  // Output tensor (float) - same shape as input
+  ValueSpec output_tensor(
+      config.shape,
+      input_dtype,
+      storage_type,
+      fp_memory_layout,
+      DataGenType::ZEROS);
+
+  // Add all specs to test case
+  test_case.add_input_spec(input_tensor);
+  test_case.add_input_spec(scale);
+  test_case.add_input_spec(zero_point);
+  test_case.add_input_spec(inp_layout_spec);
+  test_case.add_input_spec(outp_layout_spec);
+  test_case.add_output_spec(output_tensor);
+
+  test_case.set_abs_tolerance(scale_val + 1e-4);
+
+  // Use layout-only filter for this test since clone IS the operation being
+  // tested
+  test_case.set_shader_filter({
+      "nchw_to",
+      "to_nchw",
+      "q8ta_quantize",
+      "q8ta_dequantize",
+  });
+
+  return test_case;
+}
+
+// Generate easy test cases for q8ta_clone operation (for debugging)
+std::vector<TestCase> generate_q8ta_clone_easy_cases() {
+  std::vector<TestCase> test_cases;
+
+  // Single simple configuration for debugging
+  Q8taCloneConfig config = {
+      {1, 16, 16, 16}, // shape: [N, C, H, W]
+      "ACCU", // test_case_name
+  };
+
+  // FP memory layouts to test
+  std::vector<utils::GPUMemoryLayout> fp_layouts = {
+      utils::kWidthPacked,
+      utils::kChannelsPacked,
+  };
+
+  // Quantized memory layouts to test
+  std::vector<utils::GPUMemoryLayout> quant_layouts = {
+      utils::kPackedInt8_4W,
+      utils::kPackedInt8_4C,
+      utils::kPackedInt8_4W4C,
+      utils::kPackedInt8_4H4W,
+      utils::kPackedInt8_4C1W,
+  };
+
+  std::vector<utils::StorageType> storage_types = {utils::kBuffer};
+  std::vector<vkapi::ScalarType> float_types = {vkapi::kFloat};
+
+  // Generate test cases for each combination (same layout for input and output)
+  for (const auto& fp_layout : fp_layouts) {
+    for (const auto& quant_layout : quant_layouts) {
+      for (const auto& storage_type : storage_types) {
+        for (const auto& input_dtype : float_types) {
+          // Same layout: should be a simple copy
+          test_cases.push_back(create_test_case_from_config(
+              config,
+              storage_type,
+              input_dtype,
+              fp_layout,
+              quant_layout,
+              quant_layout));
+        }
+      }
+    }
+  }
+
+  return test_cases;
+}
+
+// Generate test cases for q8ta_clone operation
+std::vector<TestCase> generate_q8ta_clone_test_cases() {
+  std::vector<TestCase> test_cases;
+
+  // Shapes to test
+  std::vector<std::vector<int64_t>> shapes = {
+      // Small test cases for correctness
+      {1, 3, 16, 16},
+      {1, 8, 32, 32},
+      {1, 16, 24, 24},
+      {1, 32, 12, 12},
+      {1, 1, 64, 64},
+      {1, 3, 64, 64},
+      {1, 4, 16, 16},
+
+      // Different tensor sizes
+      {1, 8, 20, 20},
+      {1, 16, 14, 14},
+      {1, 8, 28, 28},
+
+      // Odd tensor sizes
+      {1, 3, 15, 15},
+      {1, 13, 31, 31},
+      {1, 17, 23, 23},
+
+      // Performance test cases (larger tensors)
+      {1, 64, 128, 128},
+      {1, 32, 64, 64},
+      {1, 128, 56, 56},
+      {1, 128, 128, 128},
+  };
+
+  // FP memory layouts to test
+  std::vector<utils::GPUMemoryLayout> fp_layouts = {
+      utils::kWidthPacked,
+      utils::kChannelsPacked,
+  };
+
+  // Quantized memory layouts to test
+  std::vector<utils::GPUMemoryLayout> quant_layouts = {
+      utils::kPackedInt8_4W,
+      utils::kPackedInt8_4C,
+      utils::kPackedInt8_4W4C,
+      utils::kPackedInt8_4H4W,
+      utils::kPackedInt8_4C1W,
+  };
+
+  // Test with buffer storage only
+  std::vector<utils::StorageType> storage_types = {utils::kBuffer};
+
+  // Generate all combinations
+  for (const auto& shape : shapes) {
+    // Generate test case name prefix from shape dimensions
+    std::string prefix = "ACCU";
+    for (const auto& dim : shape) {
+      if (dim > kRefDimSizeLimit) {
+        prefix = "PERF";
+        break;
+      }
+    }
+
+    for (const auto& fp_layout : fp_layouts) {
+      for (const auto& inp_quant_layout : quant_layouts) {
+        for (const auto& outp_quant_layout : quant_layouts) {
+          for (const auto& storage_type : storage_types) {
+            Q8taCloneConfig config;
+            config.shape = shape;
+            config.test_case_name = prefix;
+
+            test_cases.push_back(create_test_case_from_config(
+                config,
+                storage_type,
+                vkapi::kFloat,
+                fp_layout,
+                inp_quant_layout,
+                outp_quant_layout));
+          }
+        }
+      }
+    }
+  }
+
+  return test_cases;
+}
+
+// Reference implementation for q8ta_clone operation
+// Since clone just copies data, the result should be the same as
+// quantize-dequantize
+void q8ta_clone_reference_impl(TestCase& test_case) {
+  int32_t idx = 0;
+  const ValueSpec& input_spec = test_case.inputs()[idx++];
+  const ValueSpec& scale_spec = test_case.inputs()[idx++];
+  const ValueSpec& zero_point_spec = test_case.inputs()[idx++];
+  const ValueSpec& inp_layout_spec = test_case.inputs()[idx++];
+  const ValueSpec& outp_layout_spec = test_case.inputs()[idx++];
+  (void)inp_layout_spec; // Not used in reference implementation
+  (void)outp_layout_spec; // Not used in reference implementation
+
+  // Extract output specification
+  ValueSpec& output_spec = test_case.outputs()[0];
+
+  // Get tensor dimensions (arbitrary dimensionality)
+  auto input_sizes = input_spec.get_tensor_sizes();
+
+  // Calculate total number of elements
+  int64_t num_elements = 1;
+  for (const auto& dim : input_sizes) {
+    num_elements *= dim;
+  }
+
+  // Skip for large tensors since computation time will be extremely slow
+  for (const auto& dim : input_sizes) {
+    if (dim > kRefDimSizeLimit) {
+      throw std::invalid_argument(
+          "One or more dimensions exceed the allowed limit for reference "
+          "implementation.");
+    }
+  }
+
+  if (input_spec.dtype != vkapi::kFloat) {
+    throw std::invalid_argument("Unsupported dtype");
+  }
+
+  // Get raw data pointers
+  auto& input_data = input_spec.get_float_data();
+
+  // Extract the randomized scale and zero point values
+  float scale = scale_spec.get_float_value();
+  int32_t zero_point = zero_point_spec.get_int_value();
+  int32_t quant_min = -128;
+  int32_t quant_max = 127;
+
+  // Prepare output data
+  auto& ref_data = output_spec.get_ref_float_data();
+  ref_data.resize(num_elements);
+
+  // Perform quantize-clone-dequantize operation on each element
+  // Clone preserves the quantized values, so result is same as Q-DQ
+  for (int64_t i = 0; i < num_elements; ++i) {
+    float input_val = input_data[i];
+
+    // Quantize: quantized = round(input / scale + zero_point)
+    float quantized_float = std::round(input_val / scale) + zero_point;
+
+    // Clamp to quantization range
+    quantized_float = std::max(quantized_float, static_cast<float>(quant_min));
+    quantized_float = std::min(quantized_float, static_cast<float>(quant_max));
+
+    int32_t quantized_int = static_cast<int32_t>(quantized_float);
+
+    // Dequantize: output = (quantized - zero_point) * scale
+    float dequantized = (quantized_int - zero_point) * scale;
+
+    ref_data[i] = dequantized;
+  }
+}
+
+int main(int argc, char* argv[]) {
+  set_debugging(false);
+  set_print_output(false);
+#ifdef DEBUG_MODE
+  set_print_latencies(false);
+#else
+  set_print_latencies(false);
+#endif
+  set_use_gpu_timestamps(true);
+
+  print_performance_header();
+  std::cout << "Q8TA Clone Operation Prototyping Framework" << std::endl;
+  print_separator();
+
+  ReferenceComputeFunc ref_fn = q8ta_clone_reference_impl;
+
+  auto results = execute_test_cases(
+#ifdef DEBUG_MODE
+      generate_q8ta_clone_easy_cases,
+#else
+      generate_q8ta_clone_test_cases,
+#endif
+      "Q8taClone",
+#ifdef DEBUG_MODE
+      0,
+      1,
+#else
+      3,
+      10,
+#endif
+      ref_fn);
+
+  return 0;
+}