diff --git a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp
index 7a484df9acf0e8..bd13391029b1f7 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp
@@ -8,6 +8,66 @@
 #include "openvino/runtime/make_tensor.hpp"  // get_tensor_impl
 #include "util_xarch.hpp"
 
+namespace {
+
+struct FoldedTrailingLayout {
+    ov::Shape shape;
+    ov::Strides src_strides;
+    ov::Strides dst_strides;
+};
+
+// Folds the maximal COMMON trailing segment where:
+//   src_stride == dst_stride == default_stride (packed / contiguous-by-bytes)
+// into a single last dimension.
+// This is the only segment eligible for flattening to speed up row-wise memmove.
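+// Illustrative example (assuming f16 data, elem_size == 2): for shape {1, 8, 16, 32} with a fully
+// packed src layout {8192, 1024, 64, 2} and a dst layout with padding between dim-1 slices,
+// e.g. {16384, 2048, 64, 2}, only the trailing {16, 32} segment matches the default strides in
+// both layouts, so it folds into a single dimension of 512 elements: folded shape {1, 8, 512},
+// i.e. 1024-byte rows for memmove. If dst were fully packed as well, the whole tensor would fold
+// into a single {4096} row.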
int4/uint4) are not supported"); + + void* base_data = src_tensor->data(); + void* dst_data = dst_tensor->data(); + OPENVINO_ASSERT(base_data && dst_data); + OPENVINO_ASSERT(base_data == dst_data); + + const auto& shape0 = src_tensor->get_shape(); + const auto& dst_shape0 = dst_tensor->get_shape(); + OPENVINO_ASSERT(shape0 == dst_shape0); + + const size_t rank0 = shape0.size(); + if (rank0 == 0) { + return; + } + + for (size_t d = 0; d < rank0; ++d) { + if (shape0[d] == 0) { + return; + } + } + + const size_t total_elems = src_tensor->get_size(); + OPENVINO_ASSERT(total_elems != 0); + + const size_t elem_size = src_tensor->get_byte_size() / total_elems; + + const ov::Strides src_strides0 = src_tensor->get_strides(); + const ov::Strides dst_strides0 = dst_tensor->get_strides(); + OPENVINO_ASSERT(src_strides0.size() == rank0); + OPENVINO_ASSERT(dst_strides0.size() == rank0); + + // The last dimension is packed in both src and dst. + OPENVINO_ASSERT(src_strides0[rank0 - 1] == elem_size && dst_strides0[rank0 - 1] == elem_size, + "src/dst last dimension is not packed"); + + auto* base = static_cast(base_data); + + const auto folded = fold_common_trailing_packed_segment(shape0, src_strides0, dst_strides0, elem_size); + + const size_t rank = folded.shape.size(); + OPENVINO_ASSERT(rank >= 1); + + const size_t row_elems = folded.shape[rank - 1]; + const size_t row_bytes = row_elems * elem_size; + if (row_bytes == 0) { + return; + } + + const size_t outer_rank = rank - 1; + + ov::Shape outer(outer_rank, 0); + for (size_t d = 0; d < outer_rank; ++d) { + outer[d] = folded.shape[d] - 1; + } + + auto compute_outer_offset = [&](const ov::Shape& o, const ov::Strides& strides_bytes) -> size_t { + size_t off = 0; + for (size_t d = 0; d < o.size(); ++d) { + off += o[d] * strides_bytes[d]; + } + return off; + }; + + size_t src_off = compute_outer_offset(outer, folded.src_strides); + size_t dst_off = compute_outer_offset(outer, folded.dst_strides); + + auto step_prev_outer = [&](size_t& off, const ov::Strides& strides_bytes, size_t dim) { + off -= strides_bytes[dim]; + }; + + auto wrap_outer_dim = + [&](size_t& off, const ov::Shape& shape_folded, const ov::Strides& strides_bytes, size_t dim) { + off += (shape_folded[dim] - 1) * strides_bytes[dim]; + }; + + auto dec_outer_and_offsets = [&]() -> bool { + for (int d = static_cast(outer_rank) - 1; d >= 0; --d) { + const size_t ud = static_cast(d); + if (outer[ud] > 0) { + --outer[ud]; + step_prev_outer(src_off, folded.src_strides, ud); + step_prev_outer(dst_off, folded.dst_strides, ud); + return true; + } + outer[ud] = folded.shape[ud] - 1; + wrap_outer_dim(src_off, folded.shape, folded.src_strides, ud); + wrap_outer_dim(dst_off, folded.shape, folded.dst_strides, ud); + } + return false; + }; + + while (true) { + uint8_t* src_ptr = base + src_off; + uint8_t* dst_ptr = base + dst_off; + if (src_ptr != dst_ptr) { + std::memmove(dst_ptr, src_ptr, row_bytes); + } + + if (!dec_outer_and_offsets()) { + break; + } + } +} + +// In-place move along kv_dim when src/dst share the same buffer. 
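+    // Rows are visited from the last outer index (offsets computed above) down to the first.
+    // In the intended in-place widening case dst strides are >= src strides, so every destination
+    // row starts at or beyond its source row; moving rows in decreasing index order therefore never
+    // overwrites a source row that is still to be read (overlap within a single row is handled by
+    // memmove itself).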
+    while (true) {
+        uint8_t* src_ptr = base + src_off;
+        uint8_t* dst_ptr = base + dst_off;
+        if (src_ptr != dst_ptr) {
+            std::memmove(dst_ptr, src_ptr, row_bytes);
+        }
+
+        if (!dec_outer_and_offsets()) {
+            break;
+        }
+    }
+}
+
+// In-place move along kv_dim when src/dst share the same buffer.
+// Requirements:
+//   - kv_dim_src == kv_dim_dst
+//   - src_tensor->data() == dst_tensor->data()
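+// Illustrative usage (hypothetical names; both views alias one shared KV buffer and differ only
+// in strides):
+//   auto src = ov::npuw::util::make_tensor_slice(shared_kv_packed_view, kv_dim, 0u, n_tokens);
+//   auto dst = ov::npuw::util::make_tensor_slice(shared_kv_strided_view, kv_dim, 0u, n_tokens);
+//   ov::npuw::util::copy_tensor_inplace_by_dim(src, dst, kv_dim, kv_dim);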
+void ov::npuw::util::copy_tensor_inplace_by_dim(const ov::SoPtr<ov::ITensor> src_tensor,
+                                                ov::SoPtr<ov::ITensor> dst_tensor,
+                                                uint32_t kv_dim_src,
+                                                uint32_t kv_dim_dst) {
+    OPENVINO_ASSERT(src_tensor);
+    OPENVINO_ASSERT(dst_tensor);
+
+    OPENVINO_ASSERT(kv_dim_src == kv_dim_dst, "copy_tensor_inplace_by_dim supports only kv_dim_src == kv_dim_dst");
+
+    void* base_data = src_tensor->data();
+    void* dst_data = dst_tensor->data();
+    OPENVINO_ASSERT(base_data);
+    OPENVINO_ASSERT(dst_data);
+    OPENVINO_ASSERT(base_data == dst_data);
+
+    const auto& src_shape = src_tensor->get_shape();
+    const auto& dst_shape = dst_tensor->get_shape();
+    OPENVINO_ASSERT(src_shape == dst_shape);
+    OPENVINO_ASSERT(kv_dim_src < src_shape.size());
+
+    copy_inplace_generic_rows(src_tensor, dst_tensor);
+}
+
 std::optional<size_t> ov::npuw::util::find_port_by_name(
     const std::vector<ov::Output<const ov::Node>>& ports,
     const std::string& name) {
diff --git a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.hpp b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.hpp
index bd1d89ae8444c7..84538f3b322a1c 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.hpp
@@ -33,6 +33,13 @@ void copy_tensor_by_dim(ov::SoPtr<ov::ITensor> src_tensor,
                         uint32_t kv_dim_src,
                         uint32_t kv_dim_dst);
 
+void copy_inplace_generic_rows(const ov::SoPtr<ov::ITensor> src_tensor, ov::SoPtr<ov::ITensor> dst_tensor);
+
+void copy_tensor_inplace_by_dim(const ov::SoPtr<ov::ITensor> src_tensor,
+                                ov::SoPtr<ov::ITensor> dst_tensor,
+                                uint32_t kv_dim_src,
+                                uint32_t kv_dim_dst);
+
 std::optional<size_t> find_port_by_name(const std::vector<ov::Output<const ov::Node>>& ports, const std::string& name);
 
 /**
diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
index a82bd878a11ffe..1eb131fe733eda 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
@@ -11,6 +11,7 @@
 #include "logging.hpp"
 #include "openvino/core/parallel.hpp"
 #include "openvino/runtime/iasync_infer_request.hpp"
+#include "perf.hpp"
 #include "util.hpp"
 
 namespace {
@@ -505,7 +506,6 @@ void ov::npuw::LLMInferRequest::copy_kvcache() {
     // Part 1: The KV results from loops 1 to n-1 have been copied into the 'past' KV input tensor
     // Part 2: The kv results from the last loop remain in the 'present' KV output tensor
    // The task is to copy both parts into the KV-cache input tensor for the decoding process
-
     // Copy part 1 KV results
     // tokens_in_past_chunks may be 0 in case short prompts are prefilled in single chunk
     auto tokens_in_past_chunks = kvcache_desc.num_stored_tokens - m_tokens_in_present_chunk;
@@ -513,33 +513,47 @@ void ov::npuw::LLMInferRequest::copy_kvcache() {
         // Create backup of past KV tensor when buffer sharing is enabled to prevent data corruption
         // This is necessary because subsequent copy operations would overwrite the shared buffer
         auto prefill_past_kv = m_prefill_request->get_tensor(m_prefill_in_ports.at(input_name));
-        ov::SoPtr<ov::ITensor> tmp_dense_kv_tensor;
-        ov::SoPtr<ov::ITensor> prefill_past_kv_chunks;
-        if (m_past_kv_bound) {
-            tmp_dense_kv_tensor = ov::npuw::util::allocMem(prefill_past_kv->get_element_type(),
-                                                           prefill_past_kv->get_shape(),
-                                                           m_pre_alloc_device,
-                                                           m_npuw_llm_compiled_model->get_plugin());
-            prefill_past_kv->copy_to(tmp_dense_kv_tensor._ptr);
-            prefill_past_kv_chunks = make_tensor_slice(tmp_dense_kv_tensor,
-                                                       pre_kv_dim,
-                                                       0u,
-                                                       static_cast<uint32_t>(tokens_in_past_chunks));
-        } else {
-            prefill_past_kv_chunks = make_tensor_slice(prefill_past_kv,
-                                                       pre_kv_dim,
-                                                       0u,
-                                                       static_cast<uint32_t>(tokens_in_past_chunks));
-        }
-
         auto kvcache_past_kv_chunks = uu::make_tensor_slice(kvcache_in_tensor,
                                                             gen_kv_dim,
                                                             0u,
                                                             static_cast<uint32_t>(tokens_in_past_chunks));
-
-        uu::copy_tensor_by_dim(prefill_past_kv_chunks, kvcache_past_kv_chunks, pre_kv_dim, gen_kv_dim);
+
+        ov::SoPtr<ov::ITensor> prefill_past_kv_chunks;
+        // In-place KV copy is only safe/possible when the source and destination KV layouts match.
+        // When we have mixed v-transpose settings across models (prefill vs generate: v-transpose OFF/ON),
+        // the effective KV "token" dimension differs (pre_kv_dim != gen_kv_dim), so an in-place move/copy
+        // would corrupt data. Therefore, we only use in-place copy when pre_kv_dim == gen_kv_dim;
+        // otherwise we must copy via a temporary tensor.
+        if (m_past_kv_bound) {
+            if (pre_kv_dim == gen_kv_dim) {
+                prefill_past_kv_chunks = uu::make_tensor_slice(prefill_past_kv,
+                                                               pre_kv_dim,
+                                                               0u,
+                                                               static_cast<uint32_t>(tokens_in_past_chunks));
+
+                uu::copy_tensor_inplace_by_dim(prefill_past_kv_chunks,
+                                               kvcache_past_kv_chunks,
+                                               pre_kv_dim,
+                                               gen_kv_dim);
+            } else {
+                auto tmp_dense_kv_tensor = ov::npuw::util::allocMem(prefill_past_kv->get_element_type(),
+                                                                    prefill_past_kv->get_shape(),
+                                                                    m_pre_alloc_device,
+                                                                    m_npuw_llm_compiled_model->get_plugin());
+                prefill_past_kv->copy_to(tmp_dense_kv_tensor._ptr);
+                prefill_past_kv_chunks = uu::make_tensor_slice(tmp_dense_kv_tensor,
+                                                               pre_kv_dim,
+                                                               0u,
+                                                               static_cast<uint32_t>(tokens_in_past_chunks));
+                uu::copy_tensor_by_dim(prefill_past_kv_chunks, kvcache_past_kv_chunks, pre_kv_dim, gen_kv_dim);
+            }
+        } else {
+            prefill_past_kv_chunks = uu::make_tensor_slice(prefill_past_kv,
+                                                           pre_kv_dim,
+                                                           0u,
+                                                           static_cast<uint32_t>(tokens_in_past_chunks));
+            uu::copy_tensor_by_dim(prefill_past_kv_chunks, kvcache_past_kv_chunks, pre_kv_dim, gen_kv_dim);
+        }
     }
-
     // Copy part 2 KV results
     auto prefill_present_kv_chunk = uu::make_tensor_slice(prefill_out_tensor,
@@ -846,7 +860,14 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
     if (!m_generate_initialized) {
         LOG_DEBUG("Copy kv-cache from prefill to generate model.");
         if (kvcache_desc.num_stored_tokens > 0) {
-            copy_kvcache();
+            using MS = ov::npuw::perf::metric<float>;
+            MS m_ms_copy_kvcache("copy_kvcache", /*active*/ true);
+
+            m_ms_copy_kvcache += ov::npuw::perf::ms_to_run([&]() {
+                copy_kvcache();
+            });
+
+            LOG_INFO("cost of copy_kvcache(): " << m_ms_copy_kvcache.med() << " ms");
         }
 
         LOG_DEBUG("Prepare inputs.");
diff --git a/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp
new file mode 100644
index 00000000000000..ae66c127b375d8
--- /dev/null
+++ b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp
@@ -0,0 +1,178 @@
+// Copyright (C) 2026 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#ifdef HAVE_AVX2
+#    include "copy_inplace.hpp"
+
+namespace {
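+// Builds byte strides that keep the innermost dimension packed but add `pad_elems` elements of
+// padding along `kv_dim`; the padding propagates into every outer stride. For example, with f32,
+// shape {1, 8, 16, 32}, kv_dim == 1 and pad_elems == 13, the default strides {16384, 2048, 128, 4}
+// become {29696, 3712, 128, 4}.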
+static ov::Strides make_padded_strides_keep_tail_default(const ov::Shape& shape,
+                                                         const ov::element::Type& et,
+                                                         size_t kv_dim,
+                                                         size_t pad_elems) {
+    ov::Strides s = copy_inplace_details::default_byte_strides(shape, et);
+
+    const size_t rank = shape.size();
+    if (rank == 0) {
+        return s;
+    }
+    if (rank == 1) {
+        if (kv_dim == 0) {
+            s[0] += pad_elems * et.size();
+        }
+        return s;
+    }
+
+    s[rank - 1] = et.size();
+    for (size_t d = rank - 1; d-- > 0;) {
+        s[d] = s[d + 1] * shape[d + 1];
+        if (d == kv_dim) {
+            s[d] += pad_elems * s[d + 1];
+        }
+    }
+
+    return s;
+}
+
+static std::vector<int8_t> to_i8(const std::vector<uint8_t>& v) {
+    std::vector<int8_t> out(v.size());
+    std::memcpy(out.data(), v.data(), v.size());
+    return out;
+}
+
+void CopyInplaceTestsBase::make_input() {
+    const auto elem_bytes = copy_inplace_details::elem_size_bytes(type);
+    const auto total_elems = ov::shape_size(shape);
+    ASSERT_GT(total_elems, 0u);
+
+    auto max_offset = [&](const ov::Strides& strides) -> size_t {
+        size_t off = 0;
+        for (size_t d = 0; d < shape.size(); ++d) {
+            off += (shape[d] - 1) * strides[d];
+        }
+        return off;
+    };
+
+    const size_t src_max = max_offset(src_strides);
+    const size_t dst_max = max_offset(dst_strides);
+    const size_t byte_size = std::max(src_max, dst_max) + elem_bytes;
+
+    base_bytes_initial.resize(byte_size);
+    ref_bytes.assign(byte_size, 0);
+    out_bytes.assign(byte_size, 0);
+
+    std::mt19937 rng(42);
+    std::uniform_int_distribution<int> dist(0, 255);
+    for (size_t i = 0; i < base_bytes_initial.size(); ++i) {
+        base_bytes_initial[i] = static_cast<uint8_t>(dist(rng));
+    }
+
+    baseTensor = ov::Tensor(ov::element::u8, ov::Shape{byte_size}, base_bytes_initial.data());
+}
+
+bool CopyInplaceTestsBase::isNegative() const {
+    if (shape.size() < 2) {
+        return true;
+    }
+    if (kv_dim >= shape.size()) {
+        return true;
+    }
+    if (type.bitwidth() < 8) {
+        return true;
+    }
+    return false;
+}
+
+void CopyInplaceTestsBase::make_ref_output() {
+    ref_bytes = base_bytes_initial;
+
+    const auto elem_bytes = copy_inplace_details::elem_size_bytes(type);
+    const uint8_t* base_in = base_bytes_initial.data();
+
+    std::vector<uint8_t> tmp_out = base_bytes_initial;
+
+    ov::Shape idx(shape.size(), 0);
+    std::vector<uint8_t> elem(elem_bytes);
+
+    for (;;) {
+        copy_inplace_details::read_elem_bytes(base_in, idx, src_strides, elem_bytes, elem.data());
+        copy_inplace_details::write_elem_bytes(tmp_out.data(), idx, dst_strides, elem_bytes, elem.data());
+
+        if (!copy_inplace_details::next_index(idx, shape)) {
+            break;
+        }
+    }
+
+    ref_bytes = std::move(tmp_out);
+}
+
+void CopyInplaceTestsBase::SetUp(const CopyInplaceTestsParams& getParam) {
+    ShapesInitializer shapeInit;
+    ov::element::Type_t t;
+    std::tie(t, shapeInit, kv_dim) = getParam;
+
+    type = ov::element::Type(t);
+
+    std::vector<int> dims;
+    shapeInit(dims);
+    shape = ov::Shape{dims.begin(), dims.end()};
+
+    src_strides = copy_inplace_details::default_byte_strides(shape, type);
+    const size_t pad_elems = 13;
+    dst_strides = make_padded_strides_keep_tail_default(shape, type, kv_dim, pad_elems);
+
+    make_input();
+
+    void* base_ptr = baseTensor.data();
+    ASSERT_NE(base_ptr, nullptr);
+    srcView = ov::Tensor(type, shape, base_ptr, src_strides);
+    dstView = ov::Tensor(type, shape, base_ptr, dst_strides);
+
+    if (!isNegative()) {
+        make_ref_output();
+    }
+}
+
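+// The test aliases one raw buffer with two views: srcView uses default (packed) strides, dstView
+// uses strides padded along kv_dim. make_ref_output() derives the expected bytes with a scalar
+// element-by-element gather/scatter over the pristine buffer, and the check below compares the
+// whole buffer after the in-place call against that reference.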
+TEST_P(CopyInplaceTests, copy_tensor_inplace_by_dim_correctness) {
+    ASSERT_NO_THROW_IF(!isNegative(), {
+        auto src_it = ov::get_tensor_impl(srcView);
+        auto dst_it = ov::get_tensor_impl(dstView);
+
+        ov::npuw::util::copy_tensor_inplace_by_dim(src_it,
+                                                   dst_it,
+                                                   static_cast<uint32_t>(kv_dim),
+                                                   static_cast<uint32_t>(kv_dim));
+
+        uint8_t* base_ptr = baseTensor.data<uint8_t>();
+        ASSERT_NE(base_ptr, nullptr);
+        out_bytes.assign(base_ptr, base_ptr + out_bytes.size());
+
+        ASSERT_TRUE(details::ArraysMatch(to_i8(out_bytes), to_i8(ref_bytes)));
+    });
+}
+
+// Test cases
+const auto TestCases =
+    ::testing::Combine(::testing::ValuesIn({ov::element::Type_t::i8, ov::element::Type_t::f16, ov::element::Type_t::f32}),
+                       details::ShapesIn({Tensors{ input = {1, 2, 3, 4}; },
+                                          Tensors{ input = {1, 8, 16, 32}; },
+                                          Tensors{ input = {1, 16, 33, 64}; },
+                                          Tensors{ input = {1, 4, 128, 16}; }}),
+                       ::testing::Values(0, 1, 2, 3));
+
+INSTANTIATE_TEST_SUITE_P(CopyInplaceTests, CopyInplaceTests, TestCases, CopyInplaceTests::getTestCaseName);
+
+}  // namespace
+
+#endif  // HAVE_AVX2
diff --git a/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.hpp b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.hpp
new file mode 100644
index 00000000000000..96ee02f961a4bd
--- /dev/null
+++ b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.hpp
@@ -0,0 +1,141 @@
+// Copyright (C) 2026 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <random>
+#include <sstream>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include "infer_request_utils.hpp"
+#include "openvino/runtime/make_tensor.hpp"
+#include "openvino/runtime/tensor.hpp"
+#include "test_utils.hpp"
+
+namespace {
+
+using CopyInplaceTestsParams = std::tuple<ov::element::Type_t, ShapesInitializer, std::size_t>;
+
+namespace copy_inplace_details {
+
+inline ov::Strides default_byte_strides(const ov::Shape& shape, const ov::element::Type& et) {
+    ov::Strides strides(shape.size(), 0);
+    if (!strides.empty()) {
+        strides.back() = et.size();
+        for (size_t i = shape.size() - 1; i > 0; --i) {
+            strides[i - 1] = strides[i] * shape[i];
+        }
+    }
+    return strides;
+}
+
+inline size_t elem_size_bytes(const ov::element::Type& et) {
+    return et.size();
+}
+
+inline void read_elem_bytes(const uint8_t* base,
+                            const ov::Shape& idx,
+                            const ov::Strides& strides,
+                            size_t elem_bytes,
+                            uint8_t* out_elem) {
+    size_t off = 0;
+    for (size_t d = 0; d < idx.size(); ++d) {
+        off += idx[d] * strides[d];
+    }
+    std::memcpy(out_elem, base + off, elem_bytes);
+}
+
+inline void write_elem_bytes(uint8_t* base,
+                             const ov::Shape& idx,
+                             const ov::Strides& strides,
+                             size_t elem_bytes,
+                             const uint8_t* elem) {
+    size_t off = 0;
+    for (size_t d = 0; d < idx.size(); ++d) {
+        off += idx[d] * strides[d];
+    }
+    std::memcpy(base + off, elem, elem_bytes);
+}
+
+inline bool next_index(ov::Shape& idx, const ov::Shape& shape) {
+    for (int d = static_cast<int>(shape.size()) - 1; d >= 0; --d) {
+        const size_t ud = static_cast<size_t>(d);
+        if (++idx[ud] < shape[ud]) {
+            return true;
+        }
+        idx[ud] = 0;
+    }
+    return false;
+}
+
+}  // namespace copy_inplace_details
+
+class CopyInplaceTestsBase {
+protected:
+    ov::element::Type type;
+    ov::Tensor baseTensor;
+    ov::Tensor srcView;
+    ov::Tensor dstView;
+    ov::Shape shape;
+
+    std::vector<uint8_t> base_bytes_initial;
+    std::vector<uint8_t> ref_bytes;
+    std::vector<uint8_t> out_bytes;
+
+    std::size_t kv_dim = 0;
+
+    ov::Strides src_strides;
+    ov::Strides dst_strides;
+
+    void make_input();
+    void make_ref_output();
+    bool isNegative() const;
+
+public:
+    void SetUp(const CopyInplaceTestsParams& getParam);
+};
+
+template <typename T>
+class CopyInplaceTestsTmpl : public ::testing::Test,
+                             public T,
+                             public ::testing::WithParamInterface<CopyInplaceTestsParams> {
+protected:
+    void SetUp() override {
+        T::SetUp(GetParam());
+    }
+
+public:
+    static std::string getTestCaseName(const testing::TestParamInfo<CopyInplaceTestsParams>& obj) {
+        ov::element::Type_t t;
+        ShapesInitializer shapeInit;
+        std::size_t kv_dim = 0;
+        std::tie(t, shapeInit, kv_dim) = obj.param;
+
+        std::vector<int> dims;
+        shapeInit(dims);
+
+        std::ostringstream oss;
+        oss << "S";
+        for (size_t i = 0; i < dims.size(); ++i) {
+            oss << dims[i];
+            if (i + 1 != dims.size())
+                oss << "x";
+        }
+        oss << "_T" << ov::element::Type(t) << "_KV" << kv_dim;
+        return oss.str();
+    }
+};
+
+using CopyInplaceTests = CopyInplaceTestsTmpl<CopyInplaceTestsBase>;
+
+}  // anonymous namespace