From 35789cc7bb2ebfd507790852622d585255e420a1 Mon Sep 17 00:00:00 2001 From: DingZhangIntel Date: Thu, 11 Dec 2025 16:17:45 +0800 Subject: [PATCH 01/13] Implement inplace kv cache copy when it's shared --- .../src/plugin/npuw/infer_request_utils.cpp | 91 +++++++++++++ .../src/plugin/npuw/infer_request_utils.hpp | 5 + .../src/plugin/npuw/llm_infer_request.cpp | 120 ++++++++++++++++++ 3 files changed, 216 insertions(+) diff --git a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp index d4e4ff66371dbc..df9805579fb2e3 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp @@ -140,6 +140,97 @@ void ov::npuw::util::copy_tensor_by_dim(ov::SoPtr src_tensor, } } +// In-place move along kv_dim when src/dst share the same buffer. +// Requirements: +// - kv_dim_src == kv_dim_dst, otherwise throws +// - src_tensor->data() == dst_tensor->data() +void ov::npuw::util::move_tensor_inplace_by_dim(ov::SoPtr src_tensor, + ov::SoPtr dst_tensor, + uint32_t kv_dim_src, + uint32_t kv_dim_dst) { + OPENVINO_ASSERT(src_tensor); + OPENVINO_ASSERT(dst_tensor); + + if (kv_dim_src != kv_dim_dst) { + OPENVINO_THROW("move_tensor_inplace_by_dim currently supports only kv_dim_src == kv_dim_dst"); + } + + void* base_data = src_tensor->data(); + void* dst_data = dst_tensor->data(); + OPENVINO_ASSERT(base_data); + OPENVINO_ASSERT(dst_data); + OPENVINO_ASSERT(base_data == dst_data); + + const auto& shape = src_tensor->get_shape(); + const auto& dst_shape = dst_tensor->get_shape(); + OPENVINO_ASSERT(shape.size() == dst_shape.size()); + OPENVINO_ASSERT(shape == dst_shape); + OPENVINO_ASSERT(kv_dim_src < shape.size()); + + const auto& src_strides = src_tensor->get_strides(); + const auto& dst_strides = dst_tensor->get_strides(); + + const size_t total_elems = src_tensor->get_size(); + const size_t elem_size = src_tensor->get_byte_size() / total_elems; + + if (src_strides == dst_strides) { + LOG_INFO("identical strides, skip"); + return; + } + + for (size_t d = 0; d < shape.size(); ++d) { + if (shape[d] == 0) { + LOG_INFO("zero-sized dimension, nothing to move"); + return; + } + } + + auto* base = static_cast(base_data); + const size_t rank = shape.size(); + + std::vector idx(rank); + for (size_t d = 0; d < rank; ++d) { + idx[d] = shape[d] - 1; + } + + size_t src_off = 0; + size_t dst_off = 0; + for (size_t d = 0; d < rank; ++d) { + src_off += idx[d] * src_strides[d]; + dst_off += idx[d] * dst_strides[d]; + } + + auto dec_index_and_update_offsets = [&]() -> bool { + for (int d = static_cast(rank) - 1; d >= 0; --d) { + const size_t old = idx[static_cast(d)]; + if (old > 0) { + idx[static_cast(d)] = old - 1; + src_off -= src_strides[static_cast(d)]; + dst_off -= dst_strides[static_cast(d)]; + return true; + } else { + idx[static_cast(d)] = shape[static_cast(d)] - 1; + src_off += src_strides[static_cast(d)] * (shape[static_cast(d)] - 1); + dst_off += dst_strides[static_cast(d)] * (shape[static_cast(d)] - 1); + } + } + return false; + }; + + while (true) { + uint8_t* src_ptr = base + src_off; + uint8_t* dst_ptr = base + dst_off; + + if (src_ptr != dst_ptr) { + std::memmove(dst_ptr, src_ptr, elem_size); + } + + if (!dec_index_and_update_offsets()) { + break; + } + } +} + std::optional> ov::npuw::util::find_port_by_name( const std::vector>& ports, const std::string& name) { diff --git a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.hpp 
b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.hpp index 022d49b56a140c..f526328cf12943 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.hpp @@ -31,6 +31,11 @@ void copy_tensor_by_dim(ov::SoPtr src_tensor, uint32_t kv_dim_src, uint32_t kv_dim_dst); +void move_tensor_inplace_by_dim(ov::SoPtr src_tensor, + ov::SoPtr dst_tensor, + uint32_t kv_dim_src, + uint32_t kv_dim_dst); + std::optional> find_port_by_name(const std::vector>& ports, const std::string& name); diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp index 6f7be5664975b1..9efaf3348b9e00 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp @@ -587,10 +587,125 @@ void ov::npuw::LLMInferRequest::copy_kvcache() { // Part 1: The KV results from loops 1 to n-1 have been copied into the 'past' KV input tensor // Part 2: The kv results from the last loop remain in the 'present' KV output tensor // The task is to copy both parts into the KV-cache input tensor for the decoding process + // Copy part 1 KV results + // tokens_in_past_chunks may be 0 in case short prompts are prefilled in single chunk + auto tokens_in_past_chunks = kvcache_desc.num_stored_tokens - m_tokens_in_present_chunk; + // Start counting time. + auto t_start = std::chrono::high_resolution_clock::now(); + if (tokens_in_past_chunks > 0) { + // Create backup of past KV tensor when buffer sharing is enabled to prevent data corruption + // This is necessary because subsequent copy operations would overwrite the shared buffer + auto prefill_past_kv = m_prefill_request->get_tensor(m_prefill_in_ports.at(input_name)); + auto kvcache_past_kv_chunks = uu::make_tensor_slice(kvcache_in_tensor, + gen_kv_dim, + 0u, + static_cast(tokens_in_past_chunks)); + ov::SoPtr prefill_past_kv_chunks; + // move_tensor_inplace_by_dim currently supports only kv_dim_src == kv_dim_dst. + if (m_past_kv_bound) { + if (pre_kv_dim == gen_kv_dim) { + prefill_past_kv_chunks = make_tensor_slice(prefill_past_kv, + pre_kv_dim, + 0u, + static_cast(tokens_in_past_chunks)); + + uu::move_tensor_inplace_by_dim(prefill_past_kv_chunks, + kvcache_past_kv_chunks, + pre_kv_dim, + gen_kv_dim); + } else { + auto tmp_dense_kv_tensor = ov::npuw::util::allocMem(prefill_past_kv->get_element_type(), + prefill_past_kv->get_shape(), + m_pre_alloc_device, + m_npuw_llm_compiled_model->get_plugin()); + prefill_past_kv->copy_to(tmp_dense_kv_tensor._ptr); + prefill_past_kv_chunks = make_tensor_slice(tmp_dense_kv_tensor, + pre_kv_dim, + 0u, + static_cast(tokens_in_past_chunks)); + uu::copy_tensor_by_dim(prefill_past_kv_chunks, kvcache_past_kv_chunks, pre_kv_dim, gen_kv_dim); + } + } else { + prefill_past_kv_chunks = make_tensor_slice(prefill_past_kv, + pre_kv_dim, + 0u, + static_cast(tokens_in_past_chunks)); + uu::copy_tensor_by_dim(prefill_past_kv_chunks, kvcache_past_kv_chunks, pre_kv_dim, gen_kv_dim); + } + } + // End counting time. 
+ auto t_end = std::chrono::high_resolution_clock::now(); + auto duration_ms = std::chrono::duration_cast(t_end - t_start).count(); + LOG_INFO("############tokens_in_past_chunks cost: " << duration_ms << " ms"); + // Copy part 2 KV results + auto prefill_present_kv_chunk = + uu::make_tensor_slice(prefill_out_tensor, + pre_kv_dim, + static_cast(prefill_chunk_size - m_tokens_in_present_chunk), + static_cast(prefill_chunk_size)); + + auto kvcache_last_kv_chunk = uu::make_tensor_slice(kvcache_in_tensor, + gen_kv_dim, + static_cast(tokens_in_past_chunks), + kvcache_desc.num_stored_tokens); + + uu::copy_tensor_by_dim(prefill_present_kv_chunk, kvcache_last_kv_chunk, pre_kv_dim, gen_kv_dim); + } else { + auto prefill_out_slice = + uu::make_tensor_slice(prefill_out_tensor, + pre_kv_dim, + kvcache_desc.max_prompt_size - kvcache_desc.num_stored_tokens, + kvcache_desc.max_prompt_size); + + auto kvcache_in_slice = + uu::make_tensor_slice(kvcache_in_tensor, gen_kv_dim, 0u, kvcache_desc.num_stored_tokens); + + uu::copy_tensor_by_dim(prefill_out_slice, kvcache_in_slice, pre_kv_dim, gen_kv_dim); + } + }); + LOG_DEBUG("Done."); +} + +/* +void ov::npuw::LLMInferRequest::copy_kvcache() { + namespace uu = ov::npuw::util; + LOG_DEBUG("Copying kv-cache from prefill to generate model."); + LOG_BLOCK(); + auto& kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc; + const auto& kvcache_compiled = m_kvcache_request->get_compiled_model(); + // FIXME: Find only matching by names outputs and copy them, having previously checked that such inputs exist + ov::parallel_for(kvcache_compiled->outputs().size() - layer_ids::kStartOutputKVCacheLayers, [&](size_t out_idx) { + const std::size_t i = layer_ids::kStartOutputKVCacheLayers + out_idx; + const auto& output_name = kvcache_compiled->outputs()[i].get_any_name(); + auto prefill_out_tensor = m_prefill_request->get_tensor(m_prefill_out_ports.at(output_name)); + + const auto& input_name = std::regex_replace(output_name, std::regex("present"), layer_names::past_key_values); + if (m_kvcache_in_ports.find(input_name) == m_kvcache_in_ports.end()) { + // FIXME: Totally wrong debug message. input_name is an invalid name of input layer. + LOG_DEBUG("Input name " << input_name << " doesn't contain kv cache. Skipping."); + return; + } + const auto is_value_tensor = output_name.find("value") != std::string::npos; + const auto kv_dim = [&](bool v_trans) -> uint32_t { + return (is_value_tensor && v_trans) ? 3u : kvcache_desc.dim; + }; + const auto& pre_kv_dim = kv_dim(kvcache_desc.v_tensors_transposed_pre); + const auto& gen_kv_dim = kv_dim(kvcache_desc.v_tensors_transposed_gen); + auto kvcache_in_tensor = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(input_name)); + + const auto prefill_chunk_size = m_npuw_llm_compiled_model->m_prefill_chunk_size; + const bool use_chunk_prefill = m_npuw_llm_compiled_model->m_use_chunk_prefill; + if (use_chunk_prefill) { + // The chunk prefilled KV results are divided into two parts: + // Part 1: The KV results from loops 1 to n-1 have been copied into the 'past' KV input tensor + // Part 2: The kv results from the last loop remain in the 'present' KV output tensor + // The task is to copy both parts into the KV-cache input tensor for the decoding process // Copy part 1 KV results // tokens_in_past_chunks may be 0 in case short prompts are prefilled in single chunk auto tokens_in_past_chunks = kvcache_desc.num_stored_tokens - m_tokens_in_present_chunk; + // Start counting time. 
+ auto t_start = std::chrono::high_resolution_clock::now(); if (tokens_in_past_chunks > 0) { // Create backup of past KV tensor when buffer sharing is enabled to prevent data corruption // This is necessary because subsequent copy operations would overwrite the shared buffer @@ -621,6 +736,10 @@ void ov::npuw::LLMInferRequest::copy_kvcache() { uu::copy_tensor_by_dim(prefill_past_kv_chunks, kvcache_past_kv_chunks, pre_kv_dim, gen_kv_dim); } + // End counting time. + auto t_end = std::chrono::high_resolution_clock::now(); + auto duration_ms = std::chrono::duration_cast(t_end - t_start).count(); + LOG_INFO("########################################tokens_in_past_chunks cost: " << duration_ms << " ms"); // Copy part 2 KV results auto prefill_present_kv_chunk = @@ -650,6 +769,7 @@ void ov::npuw::LLMInferRequest::copy_kvcache() { }); LOG_DEBUG("Done."); } +*/ void ov::npuw::LLMInferRequest::update_kvcache_for( std::shared_ptr request, From 5eada49f8afdfaacb17e0e631ec7847a90c6caa5 Mon Sep 17 00:00:00 2001 From: DingZhangIntel Date: Tue, 16 Dec 2025 17:36:24 +0800 Subject: [PATCH 02/13] Optimize and Fix --- .../src/plugin/npuw/infer_request_utils.cpp | 199 ++++++++++++++---- .../src/plugin/npuw/infer_request_utils.hpp | 8 +- .../src/plugin/npuw/llm_infer_request.cpp | 3 +- 3 files changed, 170 insertions(+), 40 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp index df9805579fb2e3..da6069d28c87a2 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp @@ -140,35 +140,81 @@ void ov::npuw::util::copy_tensor_by_dim(ov::SoPtr src_tensor, } } -// In-place move along kv_dim when src/dst share the same buffer. 
-// Requirements: -// - kv_dim_src == kv_dim_dst, otherwise throws -// - src_tensor->data() == dst_tensor->data() -void ov::npuw::util::move_tensor_inplace_by_dim(ov::SoPtr src_tensor, - ov::SoPtr dst_tensor, - uint32_t kv_dim_src, - uint32_t kv_dim_dst) { - OPENVINO_ASSERT(src_tensor); - OPENVINO_ASSERT(dst_tensor); +void ov::npuw::util::copy_inplace_columns_by_row_chunks(ov::SoPtr src, ov::SoPtr& dst) { + const auto& src_shape = src->get_shape(); - if (kv_dim_src != kv_dim_dst) { - OPENVINO_THROW("move_tensor_inplace_by_dim currently supports only kv_dim_src == kv_dim_dst"); + OPENVINO_ASSERT(src_shape.size() == 4u); + OPENVINO_ASSERT(src_shape == dst->get_shape()); + OPENVINO_ASSERT(src->get_byte_size() == dst->get_byte_size()); + + const auto& src_strides = src->get_strides(); + const auto& dst_strides = dst->get_strides(); + const auto elem_size = src->get_byte_size() / src->get_size(); + + const auto C = src_shape[1]; + const auto H = src_shape[2]; + const auto W = src_shape[3]; + + const auto IS_H = src_strides[2]; + const auto OS_H = dst_strides[2]; + + const size_t chunk_byte_size = W * elem_size; + + const auto* src_p = static_cast(src->data()); + auto* dst_p = static_cast(dst->data()); + + const size_t num_chunks = C * H; + if (num_chunks == 0 || chunk_byte_size == 0) { + return; } - void* base_data = src_tensor->data(); - void* dst_data = dst_tensor->data(); - OPENVINO_ASSERT(base_data); - OPENVINO_ASSERT(dst_data); - OPENVINO_ASSERT(base_data == dst_data); + for (size_t i = num_chunks; i-- > 0;) { + const size_t src_offset = i * IS_H; + const size_t dst_offset = i * OS_H; + std::memmove(dst_p + dst_offset, src_p + src_offset, chunk_byte_size); + } +} + +void ov::npuw::util::copy_inplace_by_planes(ov::SoPtr src_tensor, ov::SoPtr dst_tensor) { + // [1, H, S1, E] -> [1, H, S2, E] + const int N = 0; + const int H = 1; + const int S = 2; + const int E = 3; + OPENVINO_ASSERT(src_tensor->get_shape()[N] == dst_tensor->get_shape()[N]); + OPENVINO_ASSERT(src_tensor->get_shape()[H] == dst_tensor->get_shape()[H]); + OPENVINO_ASSERT(src_tensor->get_shape()[E] == dst_tensor->get_shape()[E]); + OPENVINO_ASSERT(src_tensor->get_element_type() == dst_tensor->get_element_type()); + OPENVINO_ASSERT(src_tensor->get_shape()[N] == 1u); + OPENVINO_ASSERT(src_tensor->get_shape().size() == 4u); + + const auto* src_base = reinterpret_cast(src_tensor->data()); + auto* dst_base = reinterpret_cast(dst_tensor->data()); + + const auto num_planes = src_tensor->get_shape()[H]; + const auto src_plane_stride = src_tensor->get_strides()[H]; + const auto dst_plane_stride = dst_tensor->get_strides()[H]; + const auto plane_size_in_bytes = src_tensor->get_strides()[S] * src_tensor->get_shape()[S]; + + if (num_planes == 0 || plane_size_in_bytes == 0) { + return; + } + + for (size_t i = num_planes; i-- > 0;) { + const auto* src_ptr = src_base + i * src_plane_stride; + auto* dst_ptr = dst_base + i * dst_plane_stride; + std::memmove(dst_ptr, src_ptr, plane_size_in_bytes); + } +} + +void ov::npuw::util::copy_inplace(ov::SoPtr src_tensor, ov::SoPtr dst_tensor) { const auto& shape = src_tensor->get_shape(); - const auto& dst_shape = dst_tensor->get_shape(); - OPENVINO_ASSERT(shape.size() == dst_shape.size()); - OPENVINO_ASSERT(shape == dst_shape); - OPENVINO_ASSERT(kv_dim_src < shape.size()); - const auto& src_strides = src_tensor->get_strides(); - const auto& dst_strides = dst_tensor->get_strides(); + auto* base = static_cast(src_tensor->data()); + + auto src_strides = src_tensor->get_strides(); + auto dst_strides = 
dst_tensor->get_strides(); const size_t total_elems = src_tensor->get_size(); const size_t elem_size = src_tensor->get_byte_size() / total_elems; @@ -185,33 +231,74 @@ void ov::npuw::util::move_tensor_inplace_by_dim(ov::SoPtr src_tenso } } - auto* base = static_cast(base_data); - const size_t rank = shape.size(); + auto rank = shape.size(); + + ov::Shape cur_pos{0}; + ov::Shape max_pos{1}; - std::vector idx(rank); - for (size_t d = 0; d < rank; ++d) { - idx[d] = shape[d] - 1; + if (src_tensor->get_element_type().bitwidth() < 8 || (is_scalar(shape))) { + // Doesn't support strides for LP types + // or both tensors have default strides + // Strides and positions already initialized + } else { + ov::Strides src_str, dst_str; + // Calculate src and dst shapes + bool found_step = false; + for (size_t inverted_idx = rank - 1; inverted_idx < rank; --inverted_idx) { + if (!found_step) { + if (src_strides[inverted_idx] == dst_strides[inverted_idx]) { + continue; + } else { + found_step = true; + size_t strides_size = inverted_idx + 1; + // Set right size + src_str.resize(strides_size + 1); + dst_str.resize(strides_size + 1); + max_pos.resize(strides_size + 1); + cur_pos.resize(strides_size + 1); + // In case of default continuous strides we can copy several elements + // In other case only one element + size_t dim = 1; + size_t strides = elem_size; + + if (strides_size < src_strides.size()) { + strides = src_strides[strides_size]; + dim = shape[strides_size]; + } + src_str[strides_size] = strides; + dst_str[strides_size] = strides; + max_pos[strides_size] = dim; + cur_pos[strides_size] = max_pos[strides_size] - 1; + } + } + src_str[inverted_idx] = src_strides[inverted_idx]; + dst_str[inverted_idx] = dst_strides[inverted_idx]; + max_pos[inverted_idx] = shape[inverted_idx]; + cur_pos[inverted_idx] = max_pos[inverted_idx] - 1; + } + src_strides = std::move(src_str); + dst_strides = std::move(dst_str); } size_t src_off = 0; size_t dst_off = 0; - for (size_t d = 0; d < rank; ++d) { - src_off += idx[d] * src_strides[d]; - dst_off += idx[d] * dst_strides[d]; + for (size_t d = 0; d < max_pos.size(); ++d) { + src_off += cur_pos[d] * src_strides[d]; + dst_off += cur_pos[d] * dst_strides[d]; } auto dec_index_and_update_offsets = [&]() -> bool { - for (int d = static_cast(rank) - 1; d >= 0; --d) { - const size_t old = idx[static_cast(d)]; + for (int d = static_cast(max_pos.size()) - 1; d >= 0; --d) { + const size_t old = cur_pos[static_cast(d)]; if (old > 0) { - idx[static_cast(d)] = old - 1; + cur_pos[static_cast(d)] = old - 1; src_off -= src_strides[static_cast(d)]; dst_off -= dst_strides[static_cast(d)]; return true; } else { - idx[static_cast(d)] = shape[static_cast(d)] - 1; - src_off += src_strides[static_cast(d)] * (shape[static_cast(d)] - 1); - dst_off += dst_strides[static_cast(d)] * (shape[static_cast(d)] - 1); + cur_pos[static_cast(d)] = max_pos[static_cast(d)] - 1; + src_off += src_strides[static_cast(d)] * (max_pos[static_cast(d)] - 1); + dst_off += dst_strides[static_cast(d)] * (max_pos[static_cast(d)] - 1); } } return false; @@ -222,7 +309,7 @@ void ov::npuw::util::move_tensor_inplace_by_dim(ov::SoPtr src_tenso uint8_t* dst_ptr = base + dst_off; if (src_ptr != dst_ptr) { - std::memmove(dst_ptr, src_ptr, elem_size); + std::memmove(dst_ptr, src_ptr, src_strides[src_strides.size() - 1]); } if (!dec_index_and_update_offsets()) { @@ -231,6 +318,42 @@ void ov::npuw::util::move_tensor_inplace_by_dim(ov::SoPtr src_tenso } } +// In-place move along kv_dim when src/dst share the same buffer. 
+// Requirements: +// - kv_dim_src == kv_dim_dst, otherwise throws +// - src_tensor->data() == dst_tensor->data() +void ov::npuw::util::copy_tensor_inplace_by_dim(ov::SoPtr src_tensor, + ov::SoPtr dst_tensor, + uint32_t kv_dim_src, + uint32_t kv_dim_dst) { + OPENVINO_ASSERT(src_tensor); + OPENVINO_ASSERT(dst_tensor); + + if (kv_dim_src != kv_dim_dst) { + OPENVINO_THROW("move_tensor_inplace_by_dim currently supports only kv_dim_src == kv_dim_dst"); + } + + void* base_data = src_tensor->data(); + void* dst_data = dst_tensor->data(); + OPENVINO_ASSERT(base_data); + OPENVINO_ASSERT(dst_data); + OPENVINO_ASSERT(base_data == dst_data); + + const auto& src_shape = src_tensor->get_shape(); + const auto& dst_shape = dst_tensor->get_shape(); + OPENVINO_ASSERT(src_shape.size() == dst_shape.size()); + OPENVINO_ASSERT(src_shape == dst_shape); + OPENVINO_ASSERT(kv_dim_src < src_shape.size()); + + if (kv_dim_src == 3u) { + copy_inplace_columns_by_row_chunks(src_tensor, dst_tensor); + } else if (kv_dim_src == 2u) { + copy_inplace_by_planes(src_tensor, dst_tensor); + } else { + copy_inplace(src_tensor, dst_tensor); + } +} + std::optional> ov::npuw::util::find_port_by_name( const std::vector>& ports, const std::string& name) { diff --git a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.hpp b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.hpp index f526328cf12943..fa53959b1280b4 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.hpp @@ -31,7 +31,13 @@ void copy_tensor_by_dim(ov::SoPtr src_tensor, uint32_t kv_dim_src, uint32_t kv_dim_dst); -void move_tensor_inplace_by_dim(ov::SoPtr src_tensor, +void copy_inplace_columns_by_row_chunks(ov::SoPtr src, ov::SoPtr& dst); + +void copy_inplace_by_planes(ov::SoPtr src_tensor, ov::SoPtr dst_tensor); + +void copy_inplace(ov::SoPtr src_tensor, ov::SoPtr dst_tensor); + +void copy_tensor_inplace_by_dim(ov::SoPtr src_tensor, ov::SoPtr dst_tensor, uint32_t kv_dim_src, uint32_t kv_dim_dst); diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp index 9efaf3348b9e00..bffed9211b042a 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp @@ -582,6 +582,7 @@ void ov::npuw::LLMInferRequest::copy_kvcache() { const auto prefill_chunk_size = m_npuw_llm_compiled_model->m_prefill_chunk_size; const bool use_chunk_prefill = m_npuw_llm_compiled_model->m_use_chunk_prefill; + LOG_INFO("############pre_kv_dim and gen_kv_dim" << pre_kv_dim << " " << gen_kv_dim << ";"); if (use_chunk_prefill) { // The chunk prefilled KV results are divided into two parts: // Part 1: The KV results from loops 1 to n-1 have been copied into the 'past' KV input tensor @@ -609,7 +610,7 @@ void ov::npuw::LLMInferRequest::copy_kvcache() { 0u, static_cast(tokens_in_past_chunks)); - uu::move_tensor_inplace_by_dim(prefill_past_kv_chunks, + uu::copy_tensor_inplace_by_dim(prefill_past_kv_chunks, kvcache_past_kv_chunks, pre_kv_dim, gen_kv_dim); From 40d955ab8d19af7d61933c487c54063794261735 Mon Sep 17 00:00:00 2001 From: DingZhangIntel Date: Thu, 18 Dec 2025 16:24:22 +0800 Subject: [PATCH 03/13] Fix and optimize --- .../src/plugin/npuw/llm_infer_request.cpp | 118 +----------------- 1 file changed, 6 insertions(+), 112 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp 
b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp index bffed9211b042a..407c662abc338c 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp @@ -582,7 +582,6 @@ void ov::npuw::LLMInferRequest::copy_kvcache() { const auto prefill_chunk_size = m_npuw_llm_compiled_model->m_prefill_chunk_size; const bool use_chunk_prefill = m_npuw_llm_compiled_model->m_use_chunk_prefill; - LOG_INFO("############pre_kv_dim and gen_kv_dim" << pre_kv_dim << " " << gen_kv_dim << ";"); if (use_chunk_prefill) { // The chunk prefilled KV results are divided into two parts: // Part 1: The KV results from loops 1 to n-1 have been copied into the 'past' KV input tensor @@ -591,8 +590,6 @@ void ov::npuw::LLMInferRequest::copy_kvcache() { // Copy part 1 KV results // tokens_in_past_chunks may be 0 in case short prompts are prefilled in single chunk auto tokens_in_past_chunks = kvcache_desc.num_stored_tokens - m_tokens_in_present_chunk; - // Start counting time. - auto t_start = std::chrono::high_resolution_clock::now(); if (tokens_in_past_chunks > 0) { // Create backup of past KV tensor when buffer sharing is enabled to prevent data corruption // This is necessary because subsequent copy operations would overwrite the shared buffer @@ -634,114 +631,6 @@ void ov::npuw::LLMInferRequest::copy_kvcache() { uu::copy_tensor_by_dim(prefill_past_kv_chunks, kvcache_past_kv_chunks, pre_kv_dim, gen_kv_dim); } } - // End counting time. - auto t_end = std::chrono::high_resolution_clock::now(); - auto duration_ms = std::chrono::duration_cast(t_end - t_start).count(); - LOG_INFO("############tokens_in_past_chunks cost: " << duration_ms << " ms"); - // Copy part 2 KV results - auto prefill_present_kv_chunk = - uu::make_tensor_slice(prefill_out_tensor, - pre_kv_dim, - static_cast(prefill_chunk_size - m_tokens_in_present_chunk), - static_cast(prefill_chunk_size)); - - auto kvcache_last_kv_chunk = uu::make_tensor_slice(kvcache_in_tensor, - gen_kv_dim, - static_cast(tokens_in_past_chunks), - kvcache_desc.num_stored_tokens); - - uu::copy_tensor_by_dim(prefill_present_kv_chunk, kvcache_last_kv_chunk, pre_kv_dim, gen_kv_dim); - } else { - auto prefill_out_slice = - uu::make_tensor_slice(prefill_out_tensor, - pre_kv_dim, - kvcache_desc.max_prompt_size - kvcache_desc.num_stored_tokens, - kvcache_desc.max_prompt_size); - - auto kvcache_in_slice = - uu::make_tensor_slice(kvcache_in_tensor, gen_kv_dim, 0u, kvcache_desc.num_stored_tokens); - - uu::copy_tensor_by_dim(prefill_out_slice, kvcache_in_slice, pre_kv_dim, gen_kv_dim); - } - }); - LOG_DEBUG("Done."); -} - -/* -void ov::npuw::LLMInferRequest::copy_kvcache() { - namespace uu = ov::npuw::util; - LOG_DEBUG("Copying kv-cache from prefill to generate model."); - LOG_BLOCK(); - auto& kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc; - const auto& kvcache_compiled = m_kvcache_request->get_compiled_model(); - // FIXME: Find only matching by names outputs and copy them, having previously checked that such inputs exist - ov::parallel_for(kvcache_compiled->outputs().size() - layer_ids::kStartOutputKVCacheLayers, [&](size_t out_idx) { - const std::size_t i = layer_ids::kStartOutputKVCacheLayers + out_idx; - const auto& output_name = kvcache_compiled->outputs()[i].get_any_name(); - auto prefill_out_tensor = m_prefill_request->get_tensor(m_prefill_out_ports.at(output_name)); - - const auto& input_name = std::regex_replace(output_name, std::regex("present"), layer_names::past_key_values); - if 
(m_kvcache_in_ports.find(input_name) == m_kvcache_in_ports.end()) { - // FIXME: Totally wrong debug message. input_name is an invalid name of input layer. - LOG_DEBUG("Input name " << input_name << " doesn't contain kv cache. Skipping."); - return; - } - const auto is_value_tensor = output_name.find("value") != std::string::npos; - const auto kv_dim = [&](bool v_trans) -> uint32_t { - return (is_value_tensor && v_trans) ? 3u : kvcache_desc.dim; - }; - - const auto& pre_kv_dim = kv_dim(kvcache_desc.v_tensors_transposed_pre); - const auto& gen_kv_dim = kv_dim(kvcache_desc.v_tensors_transposed_gen); - auto kvcache_in_tensor = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(input_name)); - - const auto prefill_chunk_size = m_npuw_llm_compiled_model->m_prefill_chunk_size; - const bool use_chunk_prefill = m_npuw_llm_compiled_model->m_use_chunk_prefill; - if (use_chunk_prefill) { - // The chunk prefilled KV results are divided into two parts: - // Part 1: The KV results from loops 1 to n-1 have been copied into the 'past' KV input tensor - // Part 2: The kv results from the last loop remain in the 'present' KV output tensor - // The task is to copy both parts into the KV-cache input tensor for the decoding process - // Copy part 1 KV results - // tokens_in_past_chunks may be 0 in case short prompts are prefilled in single chunk - auto tokens_in_past_chunks = kvcache_desc.num_stored_tokens - m_tokens_in_present_chunk; - // Start counting time. - auto t_start = std::chrono::high_resolution_clock::now(); - if (tokens_in_past_chunks > 0) { - // Create backup of past KV tensor when buffer sharing is enabled to prevent data corruption - // This is necessary because subsequent copy operations would overwrite the shared buffer - auto prefill_past_kv = m_prefill_request->get_tensor(m_prefill_in_ports.at(input_name)); - ov::SoPtr tmp_dense_kv_tensor; - ov::SoPtr prefill_past_kv_chunks; - if (m_past_kv_bound) { - tmp_dense_kv_tensor = ov::npuw::util::allocMem(prefill_past_kv->get_element_type(), - prefill_past_kv->get_shape(), - m_pre_alloc_device, - m_npuw_llm_compiled_model->get_plugin()); - prefill_past_kv->copy_to(tmp_dense_kv_tensor._ptr); - prefill_past_kv_chunks = make_tensor_slice(tmp_dense_kv_tensor, - pre_kv_dim, - 0u, - static_cast(tokens_in_past_chunks)); - } else { - prefill_past_kv_chunks = make_tensor_slice(prefill_past_kv, - pre_kv_dim, - 0u, - static_cast(tokens_in_past_chunks)); - } - - auto kvcache_past_kv_chunks = uu::make_tensor_slice(kvcache_in_tensor, - gen_kv_dim, - 0u, - static_cast(tokens_in_past_chunks)); - - uu::copy_tensor_by_dim(prefill_past_kv_chunks, kvcache_past_kv_chunks, pre_kv_dim, gen_kv_dim); - } - // End counting time. - auto t_end = std::chrono::high_resolution_clock::now(); - auto duration_ms = std::chrono::duration_cast(t_end - t_start).count(); - LOG_INFO("########################################tokens_in_past_chunks cost: " << duration_ms << " ms"); - // Copy part 2 KV results auto prefill_present_kv_chunk = uu::make_tensor_slice(prefill_out_tensor, @@ -770,7 +659,6 @@ void ov::npuw::LLMInferRequest::copy_kvcache() { }); LOG_DEBUG("Done."); } -*/ void ov::npuw::LLMInferRequest::update_kvcache_for( std::shared_ptr request, @@ -1077,7 +965,13 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids, if (!m_generate_initialized) { LOG_DEBUG("Copy kv-cache from prefill to generate model."); if (kvcache_desc.num_stored_tokens > 0) { + // Start counting time. 
+ auto t_start = std::chrono::high_resolution_clock::now(); copy_kvcache(); + // End counting time. + auto t_end = std::chrono::high_resolution_clock::now(); + auto duration_ms = std::chrono::duration_cast(t_end - t_start).count(); + LOG_INFO("cost of copy_kvcache(): " << duration_ms << " ms"); } LOG_DEBUG("Prepare inputs."); From afd44182b76f543fabfdbe44216ea970a566d55c Mon Sep 17 00:00:00 2001 From: DingZhangIntel Date: Tue, 6 Jan 2026 16:35:07 +0800 Subject: [PATCH 04/13] Refactor --- .../src/plugin/npuw/infer_request_utils.cpp | 419 ++++++++++++------ .../src/plugin/npuw/infer_request_utils.hpp | 8 +- .../intel_npu/src/plugin/npuw/util.cpp | 3 +- 3 files changed, 282 insertions(+), 148 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp index ce81501ed56e96..131f093289f4e8 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp @@ -1,3 +1,165 @@ +// // Copyright (C) 2025 Intel Corporation +// // SPDX-License-Identifier: Apache-2.0 +// // + +// #include "infer_request_utils.hpp" + +// #include "logging.hpp" +// #include "openvino/runtime/make_tensor.hpp" // get_tensor_impl +// #include "util_xarch.hpp" + +// // FIXME: Use ov::npuw::util::view instead +// ov::SoPtr ov::npuw::util::make_tensor_slice(ov::SoPtr tensor, +// uint32_t dim, +// uint32_t start_pos, +// uint32_t end_pos) { +// ov::Shape start_shape(std::vector(tensor->get_shape().size(), 0u)); +// start_shape[dim] = start_pos; +// ov::Shape end_shape = tensor->get_shape(); +// end_shape[dim] = end_pos; +// return ov::get_tensor_impl(ov::Tensor(ov::make_tensor(tensor), start_shape, end_shape)); +// } + +// void ov::npuw::util::copy_to_right(const ov::SoPtr& src, const ov::SoPtr& dst) { +// OPENVINO_ASSERT(src->get_byte_size() <= dst->get_byte_size()); +// std::copy_n(reinterpret_cast(src->data()), +// src->get_byte_size(), +// reinterpret_cast(dst->data()) + dst->get_byte_size() - src->get_byte_size()); +// } + +// void ov::npuw::util::copy_by_planes(ov::SoPtr src_tensor, ov::SoPtr dst_tensor) { +// // [1, H, S1, E] -> [1, H, S2, E] +// const int N = 0; +// const int H = 1; +// const int S = 2; +// const int E = 3; + +// OPENVINO_ASSERT(src_tensor->get_shape()[N] == dst_tensor->get_shape()[N]); +// OPENVINO_ASSERT(src_tensor->get_shape()[H] == dst_tensor->get_shape()[H]); +// OPENVINO_ASSERT(src_tensor->get_shape()[E] == dst_tensor->get_shape()[E]); +// OPENVINO_ASSERT(src_tensor->get_element_type() == dst_tensor->get_element_type()); +// OPENVINO_ASSERT(src_tensor->get_shape()[N] == 1u); +// OPENVINO_ASSERT(src_tensor->get_shape().size() == 4u); + +// const auto* src_tensor_data = reinterpret_cast(src_tensor->data()); +// auto* dst_tensor_data = reinterpret_cast(dst_tensor->data()); + +// const auto num_planes = src_tensor->get_shape()[H]; +// const auto src_plane_stride = src_tensor->get_strides()[H]; +// const auto dst_plane_stride = dst_tensor->get_strides()[H]; +// const auto plane_size_in_bytes = src_tensor->get_strides()[S] * src_tensor->get_shape()[S]; + +// for (size_t i = 0; i < num_planes; ++i) { +// std::copy_n(src_tensor_data, plane_size_in_bytes, dst_tensor_data); +// dst_tensor_data += dst_plane_stride; +// src_tensor_data += src_plane_stride; +// } +// } + +// void ov::npuw::util::copy_columns_by_row_chunks(ov::SoPtr src, ov::SoPtr& dst) { +// /* +// src/dst layout: [1, heads, emb_size, seq_len] + +// X[*,i] - embedding for i-th 
token, +// Instead of copy columns, copy rows X[i,*] + +// [[X00 X01 ... X0n] [[X00 X01 ... X0n] +// [X10 X11 ... X1n] [X10 X11 ... X1n] +// [X20 X21 ... X2n] ... [X20 X21 ... X2n] +// ... ... +// [Xm0 Xm1 ... Xmn]] [Xm0 Xm1 ... Xmn]] +// */ + +// const auto& src_shape = src->get_shape(); + +// OPENVINO_ASSERT(src_shape.size() == 4u); +// OPENVINO_ASSERT(src_shape == dst->get_shape()); +// OPENVINO_ASSERT(src->get_byte_size() == dst->get_byte_size()); + +// const auto& src_strides = src->get_strides(); +// const auto& dst_strides = dst->get_strides(); +// const auto elem_size = src->get_byte_size() / src->get_size(); + +// const auto C = src_shape[1]; +// const auto H = src_shape[2]; +// const auto W = src_shape[3]; + +// const auto IS_H = src_strides[2]; +// const auto OS_H = dst_strides[2]; + +// const size_t chunk_byte_size = W * elem_size; + +// const auto* src_p = static_cast(src->data()); +// auto* dst_p = static_cast(dst->data()); + +// for (size_t i = 0; i < C * H; ++i) { +// const size_t src_offset = i * IS_H; +// const size_t dst_offset = i * OS_H; +// std::copy_n(src_p + src_offset, chunk_byte_size, dst_p + dst_offset); +// } +// } + +// void ov::npuw::util::copy_tensor_by_dim(ov::SoPtr src_tensor, +// ov::SoPtr dst_tensor, +// uint32_t kv_dim_src, +// uint32_t kv_dim_dst) { +// if (kv_dim_src != kv_dim_dst) { +// // new case - do a generic copy for now (in fact it is a permute) +// // Example: +// // kv_dim_src kv_dim_dst +// // v v +// // [1,8,256,128] --> [1,8,128,256] +// const auto& src_shape = src_tensor->get_shape(); +// const auto& dst_shape = dst_tensor->get_shape(); +// NPUW_ASSERT(src_shape.size() == 4); +// NPUW_ASSERT(dst_shape.size() == 4); +// NPUW_ASSERT(kv_dim_src < 4); +// NPUW_ASSERT(kv_dim_dst < 4); +// NPUW_ASSERT(src_shape[kv_dim_src] == dst_shape[kv_dim_dst]); + +// std::array axis = {0, 1, 2, 3}; +// // Remap like 0,1,2,3 => 0,1,3,2 (see example) +// std::swap(axis[kv_dim_src], axis[kv_dim_dst]); +// ov::npuw::util::permute_i4d(src_tensor, dst_tensor, axis); +// return; +// } +// // Old behavior +// NPUW_ASSERT(kv_dim_src == kv_dim_dst); +// if (kv_dim_src == 3u) { +// // Asserting that we work with last dimenston here: +// const auto& src_shape = src_tensor->get_shape(); +// OPENVINO_ASSERT(src_shape.size() == 4); +// // If last dimenstion of src_tensor is equal to 1, then we can squeeze +// // src_shape from [1, heads, d_v, seq_len=1] to [heads, d_v]. +// // We can then treat src_tensor as a continuous tensor of row value vectors +// // for multiple heads, while dst_tensor will still have [1, heads, d_v, seq_len!=1], +// // shape, awaiting updates at column dimension, as value vectors are columns now. 
+// if (src_shape[kv_dim_src] == 1 && src_tensor->is_continuous()) { +// // FIXME: ov::npuw::util::XARCH::copy_row_as_column(src_tensor, dst_tensor) throws when used here +// copy_columns_by_row_chunks(src_tensor, dst_tensor); +// } else { +// copy_columns_by_row_chunks(src_tensor, dst_tensor); +// } +// } else if (kv_dim_src == 2u) { +// copy_by_planes(src_tensor, dst_tensor); +// } else { +// src_tensor->copy_to(dst_tensor._ptr); +// } +// } + +// std::optional> ov::npuw::util::find_port_by_name( +// const std::vector>& ports, +// const std::string& name) { +// auto it = std::find_if(ports.begin(), ports.end(), [&](const auto& port) { +// return port.get_names().count(name) != 0; +// }); +// if (it == ports.end()) { +// return std::nullopt; +// } +// return std::make_optional(*it); +// } + +////////////////////////////////////////////////////////////////////// // Copyright (C) 2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -147,179 +309,158 @@ void ov::npuw::util::copy_tensor_by_dim(ov::SoPtr src_tensor, } } -void ov::npuw::util::copy_inplace_columns_by_row_chunks(ov::SoPtr src, ov::SoPtr& dst) { - const auto& src_shape = src->get_shape(); - - OPENVINO_ASSERT(src_shape.size() == 4u); - OPENVINO_ASSERT(src_shape == dst->get_shape()); - OPENVINO_ASSERT(src->get_byte_size() == dst->get_byte_size()); - - const auto& src_strides = src->get_strides(); - const auto& dst_strides = dst->get_strides(); - const auto elem_size = src->get_byte_size() / src->get_size(); - - const auto C = src_shape[1]; - const auto H = src_shape[2]; - const auto W = src_shape[3]; - - const auto IS_H = src_strides[2]; - const auto OS_H = dst_strides[2]; +void ov::npuw::util::copy_inplace_generic_rows(const ov::SoPtr src_tensor, + ov::SoPtr dst_tensor) { + OPENVINO_ASSERT(src_tensor); + OPENVINO_ASSERT(dst_tensor); - const size_t chunk_byte_size = W * elem_size; + void* base_data = src_tensor->data(); + void* dst_data = dst_tensor->data(); + OPENVINO_ASSERT(base_data && dst_data); + OPENVINO_ASSERT(base_data == dst_data); - const auto* src_p = static_cast(src->data()); - auto* dst_p = static_cast(dst->data()); + const auto& shape0 = src_tensor->get_shape(); + const auto& dst_shape0 = dst_tensor->get_shape(); + OPENVINO_ASSERT(shape0 == dst_shape0); - const size_t num_chunks = C * H; - if (num_chunks == 0 || chunk_byte_size == 0) { + const size_t rank0 = shape0.size(); + if (rank0 == 0) { return; } - for (size_t i = num_chunks; i-- > 0;) { - const size_t src_offset = i * IS_H; - const size_t dst_offset = i * OS_H; - std::memmove(dst_p + dst_offset, src_p + src_offset, chunk_byte_size); + for (size_t d = 0; d < rank0; ++d) { + if (shape0[d] == 0) { + return; + } } -} -void ov::npuw::util::copy_inplace_by_planes(ov::SoPtr src_tensor, ov::SoPtr dst_tensor) { - // [1, H, S1, E] -> [1, H, S2, E] - const int N = 0; - const int H = 1; - const int S = 2; - const int E = 3; + const size_t total_elems = src_tensor->get_size(); + OPENVINO_ASSERT(total_elems != 0); + const size_t elem_size = src_tensor->get_byte_size() / total_elems; - OPENVINO_ASSERT(src_tensor->get_shape()[N] == dst_tensor->get_shape()[N]); - OPENVINO_ASSERT(src_tensor->get_shape()[H] == dst_tensor->get_shape()[H]); - OPENVINO_ASSERT(src_tensor->get_shape()[E] == dst_tensor->get_shape()[E]); - OPENVINO_ASSERT(src_tensor->get_element_type() == dst_tensor->get_element_type()); - OPENVINO_ASSERT(src_tensor->get_shape()[N] == 1u); - OPENVINO_ASSERT(src_tensor->get_shape().size() == 4u); + ov::Strides src_strides0 = src_tensor->get_strides(); + 
ov::Strides dst_strides0 = dst_tensor->get_strides(); + OPENVINO_ASSERT(src_strides0.size() == rank0); + OPENVINO_ASSERT(dst_strides0.size() == rank0); - const auto* src_base = reinterpret_cast(src_tensor->data()); - auto* dst_base = reinterpret_cast(dst_tensor->data()); + // Build default byte strides for given shape (same as ov::ITensor::copy_to logic). + ov::Strides default_strides(rank0, 0); + default_strides[rank0 - 1] = elem_size; + for (size_t i = rank0 - 1; i > 0; --i) { + default_strides[i - 1] = default_strides[i] * shape0[i]; + } - const auto num_planes = src_tensor->get_shape()[H]; - const auto src_plane_stride = src_tensor->get_strides()[H]; - const auto dst_plane_stride = dst_tensor->get_strides()[H]; - const auto plane_size_in_bytes = src_tensor->get_strides()[S] * src_tensor->get_shape()[S]; + // Your explicit preconditions: + OPENVINO_ASSERT(src_strides0[rank0 - 1] == elem_size); + OPENVINO_ASSERT(dst_strides0[rank0 - 1] == elem_size); + OPENVINO_ASSERT(default_strides[rank0 - 1] == elem_size); - if (num_planes == 0 || plane_size_in_bytes == 0) { - return; + if (rank0 >= 2) { + const size_t packed = shape0[rank0 - 1] * elem_size; + OPENVINO_ASSERT(src_strides0[rank0 - 2] == packed); + OPENVINO_ASSERT(dst_strides0[rank0 - 2] == packed); + OPENVINO_ASSERT(default_strides[rank0 - 2] == packed); } - for (size_t i = num_planes; i-- > 0;) { - const auto* src_ptr = src_base + i * src_plane_stride; - auto* dst_ptr = dst_base + i * dst_plane_stride; - std::memmove(dst_ptr, src_ptr, plane_size_in_bytes); + // Find the COMMON trailing segment where src_stride == dst_stride == default_stride. + // This is the only part eligible for flattening. + size_t cut = rank0 - 1; // at worst, we can always copy along last dim + for (size_t inverted_idx = rank0 - 1; inverted_idx < rank0; --inverted_idx) { + const bool ok = (src_strides0[inverted_idx] == default_strides[inverted_idx]) && + (dst_strides0[inverted_idx] == default_strides[inverted_idx]) && + (src_strides0[inverted_idx] == dst_strides0[inverted_idx]); + if (ok) { + cut = inverted_idx; + if (inverted_idx == 0) { + break; + } + continue; + } + break; } -} -void ov::npuw::util::copy_inplace(ov::SoPtr src_tensor, ov::SoPtr dst_tensor) { - const auto& shape = src_tensor->get_shape(); + // Fold [cut..rank0-1] into a single last dimension. + ov::Shape shape; + ov::Strides src_strides; + ov::Strides dst_strides; - auto* base = static_cast(src_tensor->data()); + shape.reserve(cut + 1); + src_strides.reserve(cut + 1); + dst_strides.reserve(cut + 1); - auto src_strides = src_tensor->get_strides(); - auto dst_strides = dst_tensor->get_strides(); + for (size_t d = 0; d < cut; ++d) { + shape.push_back(shape0[d]); + src_strides.push_back(src_strides0[d]); + dst_strides.push_back(dst_strides0[d]); + } - const size_t total_elems = src_tensor->get_size(); - const size_t elem_size = src_tensor->get_byte_size() / total_elems; + size_t folded_last = 1; + for (size_t d = cut; d < rank0; ++d) { + folded_last *= shape0[d]; + } + shape.push_back(folded_last); + + // For the folded last dim, the step is element-size (bytes per element). + // (Since the whole folded tail is default-contiguous, this holds.) 
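+    // Illustration (hypothetical shape, for reasoning only): for {1, 8, 128, 64} with a
+    // dst view padded only along dim 1, dims 2..3 match the default strides in both
+    // views, so cut == 2 and folded_last == 128 * 64; each memmove below then transfers
+    // one contiguous 128 * 64 * elem_size-byte row instead of per-element pieces.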
+ src_strides.push_back(elem_size); + dst_strides.push_back(elem_size); + + const size_t rank = shape.size(); + OPENVINO_ASSERT(rank >= 1); - if (src_strides == dst_strides) { - LOG_INFO("identical strides, skip"); + const size_t row_elems = shape[rank - 1]; + const size_t row_bytes = row_elems * elem_size; + if (row_bytes == 0) { return; } - for (size_t d = 0; d < shape.size(); ++d) { - if (shape[d] == 0) { - LOG_INFO("zero-sized dimension, nothing to move"); - return; - } + // Iterate outer coordinates in reverse lexicographic order for overlap-safe memmove. + size_t num_rows = 1; + for (size_t d = 0; d + 1 < rank; ++d) { + num_rows *= shape[d]; + } + if (num_rows == 0) { + return; } - auto rank = shape.size(); - - ov::Shape cur_pos{0}; - ov::Shape max_pos{1}; + auto* base = static_cast(base_data); - if (src_tensor->get_element_type().bitwidth() < 8 || (is_scalar(shape))) { - // Doesn't support strides for LP types - // or both tensors have default strides - // Strides and positions already initialized - } else { - ov::Strides src_str, dst_str; - // Calculate src and dst shapes - bool found_step = false; - for (size_t inverted_idx = rank - 1; inverted_idx < rank; --inverted_idx) { - if (!found_step) { - if (src_strides[inverted_idx] == dst_strides[inverted_idx]) { - continue; - } else { - found_step = true; - size_t strides_size = inverted_idx + 1; - // Set right size - src_str.resize(strides_size + 1); - dst_str.resize(strides_size + 1); - max_pos.resize(strides_size + 1); - cur_pos.resize(strides_size + 1); - // In case of default continuous strides we can copy several elements - // In other case only one element - size_t dim = 1; - size_t strides = elem_size; - - if (strides_size < src_strides.size()) { - strides = src_strides[strides_size]; - dim = shape[strides_size]; - } - src_str[strides_size] = strides; - dst_str[strides_size] = strides; - max_pos[strides_size] = dim; - cur_pos[strides_size] = max_pos[strides_size] - 1; - } - } - src_str[inverted_idx] = src_strides[inverted_idx]; - dst_str[inverted_idx] = dst_strides[inverted_idx]; - max_pos[inverted_idx] = shape[inverted_idx]; - cur_pos[inverted_idx] = max_pos[inverted_idx] - 1; - } - src_strides = std::move(src_str); - dst_strides = std::move(dst_str); + ov::Shape idx(rank - 1, 0); + for (size_t d = 0; d + 1 < rank; ++d) { + idx[d] = shape[d] - 1; } - size_t src_off = 0; - size_t dst_off = 0; - for (size_t d = 0; d < max_pos.size(); ++d) { - src_off += cur_pos[d] * src_strides[d]; - dst_off += cur_pos[d] * dst_strides[d]; - } + auto compute_offset = [&](const ov::Shape& outer, const ov::Strides& strides_bytes) -> size_t { + size_t off = 0; + for (size_t d = 0; d < outer.size(); ++d) { + off += outer[d] * strides_bytes[d]; + } + return off; + }; - auto dec_index_and_update_offsets = [&]() -> bool { - for (int d = static_cast(max_pos.size()) - 1; d >= 0; --d) { - const size_t old = cur_pos[static_cast(d)]; - if (old > 0) { - cur_pos[static_cast(d)] = old - 1; - src_off -= src_strides[static_cast(d)]; - dst_off -= dst_strides[static_cast(d)]; + auto dec_outer = [&]() -> bool { + for (int d = static_cast(rank) - 2; d >= 0; --d) { + const size_t ud = static_cast(d); + if (idx[ud] > 0) { + --idx[ud]; return true; - } else { - cur_pos[static_cast(d)] = max_pos[static_cast(d)] - 1; - src_off += src_strides[static_cast(d)] * (max_pos[static_cast(d)] - 1); - dst_off += dst_strides[static_cast(d)] * (max_pos[static_cast(d)] - 1); } + idx[ud] = shape[ud] - 1; } return false; }; while (true) { + const size_t src_off = compute_offset(idx, 
src_strides); + const size_t dst_off = compute_offset(idx, dst_strides); + uint8_t* src_ptr = base + src_off; uint8_t* dst_ptr = base + dst_off; - if (src_ptr != dst_ptr) { - std::memmove(dst_ptr, src_ptr, src_strides[src_strides.size() - 1]); + std::memmove(dst_ptr, src_ptr, row_bytes); } - if (!dec_index_and_update_offsets()) { + if (!dec_outer()) { break; } } @@ -329,7 +470,7 @@ void ov::npuw::util::copy_inplace(ov::SoPtr src_tensor, ov::SoPtrdata() == dst_tensor->data() -void ov::npuw::util::copy_tensor_inplace_by_dim(ov::SoPtr src_tensor, +void ov::npuw::util::copy_tensor_inplace_by_dim(const ov::SoPtr src_tensor, ov::SoPtr dst_tensor, uint32_t kv_dim_src, uint32_t kv_dim_dst) { @@ -352,13 +493,9 @@ void ov::npuw::util::copy_tensor_inplace_by_dim(ov::SoPtr src_tenso OPENVINO_ASSERT(src_shape == dst_shape); OPENVINO_ASSERT(kv_dim_src < src_shape.size()); - if (kv_dim_src == 3u) { - copy_inplace_columns_by_row_chunks(src_tensor, dst_tensor); - } else if (kv_dim_src == 2u) { - copy_inplace_by_planes(src_tensor, dst_tensor); - } else { - copy_inplace(src_tensor, dst_tensor); - } + // One generic implementation for all kv_dim. + // We rely on row-wise memmove on the (possibly flattened) last dimension and stride-based addressing. + copy_inplace_generic_rows(src_tensor, dst_tensor); } std::optional> ov::npuw::util::find_port_by_name( @@ -371,4 +508,4 @@ std::optional> ov::npuw::util::find_port_by_name( return std::nullopt; } return std::make_optional(*it); -} +} \ No newline at end of file diff --git a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.hpp b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.hpp index b47bea1579bdec..d15c841b117ab7 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.hpp @@ -33,13 +33,9 @@ void copy_tensor_by_dim(ov::SoPtr src_tensor, uint32_t kv_dim_src, uint32_t kv_dim_dst); -void copy_inplace_columns_by_row_chunks(ov::SoPtr src, ov::SoPtr& dst); +void copy_inplace_generic_rows(const ov::SoPtr src_tensor, ov::SoPtr dst_tensor); -void copy_inplace_by_planes(ov::SoPtr src_tensor, ov::SoPtr dst_tensor); - -void copy_inplace(ov::SoPtr src_tensor, ov::SoPtr dst_tensor); - -void copy_tensor_inplace_by_dim(ov::SoPtr src_tensor, +void copy_tensor_inplace_by_dim(const ov::SoPtr src_tensor, ov::SoPtr dst_tensor, uint32_t kv_dim_src, uint32_t kv_dim_dst); diff --git a/src/plugins/intel_npu/src/plugin/npuw/util.cpp b/src/plugins/intel_npu/src/plugin/npuw/util.cpp index 80eb0aeeb590f0..56917dc8cc835c 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/util.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/util.cpp @@ -280,8 +280,9 @@ void ov::npuw::util::unpack(const ov::SoPtr& from, unpack_nf4f16(from, scale, to, unpack_options); } else if (type_from == ov::element::f8e4m3 || type_from == ov::element::f8e5m2 || type_from == ov::element::f8e8m0) { - // FIXME: Implement XARCH::unpack + LOG_INFO("######################## unpack_f8f16"); unpack_f8f16(from, scale, to, unpack_options); + //ov::npuw::util::XARCH::unpack_f8f16_scale(from, scale, to, unpack_options); } else if (type_from == ov::element::f16) { // FIXME: Implement XARCH::unpack unpack_f16f16(from, scale, to, unpack_options); From aa539a2330876b705d2174cc13decd2f2cb0b94d Mon Sep 17 00:00:00 2001 From: DingZhangIntel Date: Tue, 6 Jan 2026 20:56:00 +0800 Subject: [PATCH 05/13] Add tests --- .../tests/unit/npuw/copy_inplace.cpp | 208 ++++++++++++++++++ .../tests/unit/npuw/copy_inplace.hpp | 149 +++++++++++++ 2 files 
changed, 357 insertions(+) create mode 100644 src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp create mode 100644 src/plugins/intel_npu/tests/unit/npuw/copy_inplace.hpp diff --git a/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp new file mode 100644 index 00000000000000..fb3225b7c5d4aa --- /dev/null +++ b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp @@ -0,0 +1,208 @@ +// Copyright (C) 2026 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#ifdef HAVE_AVX2 +# include "copy_inplace.hpp" + +namespace { + +static ov::Strides make_padded_strides_keep_tail_default(const ov::Shape& shape, + const ov::element::Type& et, + size_t kv_dim, + size_t pad_elems) { + ov::Strides s = copy_inplace_details::default_byte_strides(shape, et); + + // Keep last 2 dims default contiguous explicitly. + if (shape.size() >= 1) { + s.back() = et.size(); + } + if (shape.size() >= 2) { + s[shape.size() - 2] = shape.back() * et.size(); + } + + const size_t rank = shape.size(); + if (rank <= 2) { + return s; + } + + const size_t last2_begin = rank - 2; + for (size_t d = 0; d < last2_begin; ++d) { + if (d <= kv_dim) { + s[d] += pad_elems * et.size(); + } + } + return s; +} + +static std::vector to_i8(const std::vector& v) { + std::vector out(v.size()); + std::memcpy(out.data(), v.data(), v.size()); + return out; +} + +void CopyInplaceTestsBase::make_input() { + const auto elem_bytes = copy_inplace_details::elem_size_bytes(type); + const auto total_elems = ov::shape_size(shape); + ASSERT_GT(total_elems, 0u); + + auto max_offset = [&](const ov::Strides& strides) -> size_t { + size_t off = 0; + for (size_t d = 0; d < shape.size(); ++d) { + off += (shape[d] - 1) * strides[d]; + } + return off; + }; + + const size_t src_max = max_offset(src_strides); + const size_t dst_max = max_offset(dst_strides); + const size_t byte_size = std::max(src_max, dst_max) + elem_bytes; + + base_bytes_initial.resize(byte_size); + ref_bytes.assign(byte_size, 0); + out_bytes.assign(byte_size, 0); + + std::mt19937 rng(42); + std::uniform_int_distribution dist(0, 255); + for (size_t i = 0; i < base_bytes_initial.size(); ++i) { + base_bytes_initial[i] = static_cast(dist(rng)); + } + + // External-memory tensor (safe for unit test lifetime). 
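+    // Note: this ov::Tensor constructor wraps base_bytes_initial.data() without copying,
+    // so the src/dst views built over baseTensor later alias one allocation, which is
+    // the shared-buffer precondition copy_tensor_inplace_by_dim asserts.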
+ baseTensor = ov::Tensor(ov::element::u8, ov::Shape{byte_size}, base_bytes_initial.data()); +} + +void CopyInplaceTestsBase::make_views() { + src_strides = copy_inplace_details::default_byte_strides(shape, type); + + const size_t pad_elems = 13; + dst_strides = make_padded_strides_keep_tail_default(shape, type, kv_dim, pad_elems); + + void* base_ptr = baseTensor.data(); + ASSERT_NE(base_ptr, nullptr); + + srcView = ov::Tensor(type, shape, base_ptr, src_strides); + dstView = ov::Tensor(type, shape, base_ptr, dst_strides); +} + +bool CopyInplaceTestsBase::isNegative() const { + if (shape.size() < 2) { + return true; + } + if (kv_dim >= shape.size()) { + return true; + } + if (type.bitwidth() < 8) { + return true; + } + return false; +} + +void CopyInplaceTestsBase::make_ref_output() { + ref_bytes = base_bytes_initial; + + const auto elem_bytes = copy_inplace_details::elem_size_bytes(type); + const uint8_t* base_in = base_bytes_initial.data(); + + std::vector tmp_out = base_bytes_initial; + + ov::Shape idx(shape.size(), 0); + std::vector elem(elem_bytes); + + for (;;) { + copy_inplace_details::read_elem_bytes(base_in, idx, src_strides, elem_bytes, elem.data()); + copy_inplace_details::write_elem_bytes(tmp_out.data(), idx, dst_strides, elem_bytes, elem.data()); + + if (!copy_inplace_details::next_index(idx, shape)) { + break; + } + } + + ref_bytes = std::move(tmp_out); +} + +void CopyInplaceTestsBase::SetUp(const CopyInplaceTestsParams& getParam) { + ShapesInitializer shapeInit; + ov::element::Type_t t; + std::tie(t, shapeInit, kv_dim) = getParam; + + type = ov::element::Type(t); + + std::vector dims; + shapeInit(dims); + shape = ov::Shape{dims.begin(), dims.end()}; + + // Precompute strides first (no base pointer needed) + src_strides = copy_inplace_details::default_byte_strides(shape, type); + const size_t pad_elems = 13; + dst_strides = make_padded_strides_keep_tail_default(shape, type, kv_dim, pad_elems); + + // Now allocate/fill buffer + make_input(); + + // Create views (needs baseTensor pointer) + void* base_ptr = baseTensor.data(); + ASSERT_NE(base_ptr, nullptr); + srcView = ov::Tensor(type, shape, base_ptr, src_strides); + dstView = ov::Tensor(type, shape, base_ptr, dst_strides); + + if (!isNegative()) { + make_ref_output(); + } +} + +std::string CopyInplaceTestsBase::ToString() const { + std::ostringstream oss; + oss << "["; + for (size_t i = 0; i < shape.size(); ++i) { + oss << shape[i] << ((i + 1 == shape.size()) ? 
"" : "x"); + } + oss << "]" + << "_type_" << type << "_kv_" << kv_dim; + return oss.str(); +} + +TEST_P(CopyInplaceTests, copy_tensor_inplace_by_dim_correctness) { + ASSERT_NO_THROW_IF(!isNegative(), { + auto src_it = ov::get_tensor_impl(srcView); + auto dst_it = ov::get_tensor_impl(dstView); + + ov::npuw::util::copy_tensor_inplace_by_dim(src_it, + dst_it, + static_cast(kv_dim), + static_cast(kv_dim)); + + uint8_t* base_ptr = baseTensor.data(); + ASSERT_NE(base_ptr, nullptr); + out_bytes.assign(base_ptr, base_ptr + out_bytes.size()); + + // test_utils.hpp defines details::ArraysMatch for vector + ASSERT_TRUE(details::ArraysMatch(to_i8(out_bytes), to_i8(ref_bytes))); + }); +} + +// Test cases +const auto TestCases = ::testing::Combine( + ::testing::ValuesIn({ov::element::Type_t::i8, ov::element::Type_t::f16, ov::element::Type_t::f32}), + details::ShapesIn({ + Tensors{ input = {1, 2, 3, 4}; +} // namespace +, Tensors { + input = {1, 8, 16, 32}; +} +, Tensors { + input = {1, 16, 33, 64}; +} +, Tensors { + input = {1, 4, 128, 16}; +} +, +}), + ::testing::Values(0, 1, 2, 3) +); + +INSTANTIATE_TEST_SUITE_P(CopyInplaceTests, CopyInplaceTests, TestCases, CopyInplaceTests::getTestCaseName); + +} // namespace + +#endif // HAVE_AVX2 \ No newline at end of file diff --git a/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.hpp b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.hpp new file mode 100644 index 00000000000000..b98d355396f333 --- /dev/null +++ b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.hpp @@ -0,0 +1,149 @@ +// Copyright (C) 2026 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "infer_request_utils.hpp" // copy_tensor_inplace_by_dim +#include "openvino/runtime/make_tensor.hpp" +#include "openvino/runtime/tensor.hpp" +#include "test_utils.hpp" + +namespace { + +// NOTE: do NOT redefine ASSERT_NO_THROW_* macros here. +// They already exist in test_utils.hpp and warnings are treated as errors. + +// (type, shape, kv_dim) +using CopyInplaceTestsParams = std::tuple; + +namespace copy_inplace_details { + +inline ov::Strides default_byte_strides(const ov::Shape& shape, const ov::element::Type& et) { + ov::Strides strides(shape.size(), 0); + if (!strides.empty()) { + strides.back() = et.size(); + for (size_t i = shape.size() - 1; i > 0; --i) { + strides[i - 1] = strides[i] * shape[i]; + } + } + return strides; +} + +inline size_t elem_size_bytes(const ov::element::Type& et) { + return et.size(); +} + +inline void read_elem_bytes(const uint8_t* base, + const ov::Shape& idx, + const ov::Strides& strides, + size_t elem_bytes, + uint8_t* out_elem) { + size_t off = 0; + for (size_t d = 0; d < idx.size(); ++d) { + off += idx[d] * strides[d]; + } + std::memcpy(out_elem, base + off, elem_bytes); +} + +inline void write_elem_bytes(uint8_t* base, + const ov::Shape& idx, + const ov::Strides& strides, + size_t elem_bytes, + const uint8_t* elem) { + size_t off = 0; + for (size_t d = 0; d < idx.size(); ++d) { + off += idx[d] * strides[d]; + } + std::memcpy(base + off, elem, elem_bytes); +} + +// Enumerate ND index in lexicographic order. 
+inline bool next_index(ov::Shape& idx, const ov::Shape& shape) { + // shape is assumed non-empty and all dims > 0 in this test suite + for (int d = static_cast(shape.size()) - 1; d >= 0; --d) { + const size_t ud = static_cast(d); + if (++idx[ud] < shape[ud]) { + return true; + } + idx[ud] = 0; + } + return false; +} + +} // namespace copy_inplace_details + +class CopyInplaceTestsBase { +protected: + ov::element::Type type; + ov::Tensor baseTensor; // shared buffer owner (u8) + ov::Tensor srcView; + ov::Tensor dstView; + ov::Shape shape; + + std::vector base_bytes_initial; + std::vector ref_bytes; + std::vector out_bytes; + + std::size_t kv_dim = 0; + + ov::Strides src_strides; + ov::Strides dst_strides; + + void make_input(); + void make_views(); + void make_ref_output(); + bool isNegative() const; + +public: + void SetUp(const CopyInplaceTestsParams& getParam); + std::string ToString() const; +}; + +template +class CopyInplaceTestsTmpl : public ::testing::Test, + public T, + public ::testing::WithParamInterface { +protected: + void SetUp() override { + T::SetUp(GetParam()); + } + +public: + static std::string getTestCaseName(const testing::TestParamInfo& obj) { + ov::element::Type_t t; + ShapesInitializer shapeInit; + std::size_t kv_dim = 0; + std::tie(t, shapeInit, kv_dim) = obj.param; + + std::vector dims; + shapeInit(dims); + + std::ostringstream oss; + oss << "S"; + for (size_t i = 0; i < dims.size(); ++i) { + oss << dims[i]; + if (i + 1 != dims.size()) + oss << "x"; + } + oss << "_T" << ov::element::Type(t) << "_KV" << kv_dim; + return oss.str(); + } +}; + +using CopyInplaceTests = CopyInplaceTestsTmpl; + +} // anonymous namespace \ No newline at end of file From c874ad422db73e3f0578b863687a2eeb7951352e Mon Sep 17 00:00:00 2001 From: DingZhangIntel Date: Thu, 15 Jan 2026 17:08:14 +0800 Subject: [PATCH 06/13] add unit tests and optimize --- .../src/plugin/npuw/infer_request_utils.cpp | 101 +++++++++++------- .../src/plugin/npuw/llm_infer_request.cpp | 40 +++---- .../intel_npu/src/plugin/npuw/util.cpp | 3 +- .../tests/unit/npuw/copy_inplace.cpp | 35 ++++-- 4 files changed, 112 insertions(+), 67 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp index 131f093289f4e8..9e734064b331cc 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp @@ -343,25 +343,66 @@ void ov::npuw::util::copy_inplace_generic_rows(const ov::SoPtr src_ OPENVINO_ASSERT(src_strides0.size() == rank0); OPENVINO_ASSERT(dst_strides0.size() == rank0); - // Build default byte strides for given shape (same as ov::ITensor::copy_to logic). ov::Strides default_strides(rank0, 0); default_strides[rank0 - 1] = elem_size; for (size_t i = rank0 - 1; i > 0; --i) { default_strides[i - 1] = default_strides[i] * shape0[i]; } - // Your explicit preconditions: + auto* base = static_cast(base_data); + + auto compute_offset = [&](const ov::Shape& ix, const ov::Strides& strides_bytes) -> size_t { + size_t off = 0; + for (size_t d = 0; d < ix.size(); ++d) { + off += ix[d] * strides_bytes[d]; + } + return off; + }; + + // --------------------------------------------------------------------- + // Last dimension not packed in either src or dst. + // We cannot memmove row_bytes as a contiguous block. Do element-wise memmove. + // Keep reverse lexicographic order to be overlap-safe for in-place move. 
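+    // (Rationale, under the intended KV-cache assumption that the destination view's strides are
+    //  equal to or larger than the source view's: every element's destination offset is then >= its
+    //  source offset, so visiting indices from last to first never overwrites a source element that
+    //  has not been read yet.)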
+ // --------------------------------------------------------------------- + if (src_strides0[rank0 - 1] != elem_size || dst_strides0[rank0 - 1] != elem_size) { + ov::Shape idx(shape0.size(), 0); + for (size_t d = 0; d < rank0; ++d) { + idx[d] = shape0[d] - 1; + } + + auto dec_idx = [&]() -> bool { + for (int d = static_cast(rank0) - 1; d >= 0; --d) { + const size_t ud = static_cast(d); + if (idx[ud] > 0) { + --idx[ud]; + return true; + } + idx[ud] = shape0[ud] - 1; + } + return false; + }; + + while (true) { + const size_t src_off = compute_offset(idx, src_strides0); + const size_t dst_off = compute_offset(idx, dst_strides0); + + uint8_t* src_ptr = base + src_off; + uint8_t* dst_ptr = base + dst_off; + if (src_ptr != dst_ptr) { + std::memmove(dst_ptr, src_ptr, elem_size); + } + + if (!dec_idx()) { + break; + } + } + return; + } + OPENVINO_ASSERT(src_strides0[rank0 - 1] == elem_size); OPENVINO_ASSERT(dst_strides0[rank0 - 1] == elem_size); OPENVINO_ASSERT(default_strides[rank0 - 1] == elem_size); - if (rank0 >= 2) { - const size_t packed = shape0[rank0 - 1] * elem_size; - OPENVINO_ASSERT(src_strides0[rank0 - 2] == packed); - OPENVINO_ASSERT(dst_strides0[rank0 - 2] == packed); - OPENVINO_ASSERT(default_strides[rank0 - 2] == packed); - } - // Find the COMMON trailing segment where src_stride == dst_stride == default_stride. // This is the only part eligible for flattening. size_t cut = rank0 - 1; // at worst, we can always copy along last dim @@ -401,7 +442,6 @@ void ov::npuw::util::copy_inplace_generic_rows(const ov::SoPtr src_ shape.push_back(folded_last); // For the folded last dim, the step is element-size (bytes per element). - // (Since the whole folded tail is default-contiguous, this holds.) src_strides.push_back(elem_size); dst_strides.push_back(elem_size); @@ -414,45 +454,34 @@ void ov::npuw::util::copy_inplace_generic_rows(const ov::SoPtr src_ return; } - // Iterate outer coordinates in reverse lexicographic order for overlap-safe memmove. 
- size_t num_rows = 1; - for (size_t d = 0; d + 1 < rank; ++d) { - num_rows *= shape[d]; - } - if (num_rows == 0) { - return; - } - - auto* base = static_cast(base_data); - - ov::Shape idx(rank - 1, 0); + ov::Shape outer(rank - 1, 0); for (size_t d = 0; d + 1 < rank; ++d) { - idx[d] = shape[d] - 1; + outer[d] = shape[d] - 1; } - auto compute_offset = [&](const ov::Shape& outer, const ov::Strides& strides_bytes) -> size_t { - size_t off = 0; - for (size_t d = 0; d < outer.size(); ++d) { - off += outer[d] * strides_bytes[d]; - } - return off; - }; - auto dec_outer = [&]() -> bool { for (int d = static_cast(rank) - 2; d >= 0; --d) { const size_t ud = static_cast(d); - if (idx[ud] > 0) { - --idx[ud]; + if (outer[ud] > 0) { + --outer[ud]; return true; } - idx[ud] = shape[ud] - 1; + outer[ud] = shape[ud] - 1; } return false; }; + auto compute_outer_offset = [&](const ov::Shape& o, const ov::Strides& strides_bytes) -> size_t { + size_t off = 0; + for (size_t d = 0; d < o.size(); ++d) { + off += o[d] * strides_bytes[d]; + } + return off; + }; + while (true) { - const size_t src_off = compute_offset(idx, src_strides); - const size_t dst_off = compute_offset(idx, dst_strides); + const size_t src_off = compute_outer_offset(outer, src_strides); + const size_t dst_off = compute_outer_offset(outer, dst_strides); uint8_t* src_ptr = base + src_off; uint8_t* dst_ptr = base + dst_off; diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp index 4bc9a28b72335b..21a82380d072af 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp @@ -12,6 +12,7 @@ #include "openvino/core/parallel.hpp" #include "openvino/runtime/iasync_infer_request.hpp" #include "util.hpp" +#include "perf.hpp" namespace { @@ -597,10 +598,10 @@ void ov::npuw::LLMInferRequest::copy_kvcache() { // move_tensor_inplace_by_dim currently supports only kv_dim_src == kv_dim_dst. if (m_past_kv_bound) { if (pre_kv_dim == gen_kv_dim) { - prefill_past_kv_chunks = make_tensor_slice(prefill_past_kv, - pre_kv_dim, - 0u, - static_cast(tokens_in_past_chunks)); + prefill_past_kv_chunks = uu::make_tensor_slice(prefill_past_kv, + pre_kv_dim, + 0u, + static_cast(tokens_in_past_chunks)); uu::copy_tensor_inplace_by_dim(prefill_past_kv_chunks, kvcache_past_kv_chunks, @@ -612,17 +613,17 @@ void ov::npuw::LLMInferRequest::copy_kvcache() { m_pre_alloc_device, m_npuw_llm_compiled_model->get_plugin()); prefill_past_kv->copy_to(tmp_dense_kv_tensor._ptr); - prefill_past_kv_chunks = make_tensor_slice(tmp_dense_kv_tensor, - pre_kv_dim, - 0u, - static_cast(tokens_in_past_chunks)); + prefill_past_kv_chunks = uu::make_tensor_slice(tmp_dense_kv_tensor, + pre_kv_dim, + 0u, + static_cast(tokens_in_past_chunks)); uu::copy_tensor_by_dim(prefill_past_kv_chunks, kvcache_past_kv_chunks, pre_kv_dim, gen_kv_dim); } } else { - prefill_past_kv_chunks = make_tensor_slice(prefill_past_kv, - pre_kv_dim, - 0u, - static_cast(tokens_in_past_chunks)); + prefill_past_kv_chunks = uu::make_tensor_slice(prefill_past_kv, + pre_kv_dim, + 0u, + static_cast(tokens_in_past_chunks)); uu::copy_tensor_by_dim(prefill_past_kv_chunks, kvcache_past_kv_chunks, pre_kv_dim, gen_kv_dim); } } @@ -975,13 +976,14 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids, if (!m_generate_initialized) { LOG_DEBUG("Copy kv-cache from prefill to generate model."); if (kvcache_desc.num_stored_tokens > 0) { - // Start counting time. 
- auto t_start = std::chrono::high_resolution_clock::now(); - copy_kvcache(); - // End counting time. - auto t_end = std::chrono::high_resolution_clock::now(); - auto duration_ms = std::chrono::duration_cast(t_end - t_start).count(); - LOG_INFO("cost of copy_kvcache(): " << duration_ms << " ms"); + using MS = ov::npuw::perf::metric; + MS m_ms_copy_kvcache("copy_kvcache", /*active*/ true); + + m_ms_copy_kvcache += ov::npuw::perf::ms_to_run([&]() { + copy_kvcache(); + }); + + LOG_INFO("cost of copy_kvcache(): " << m_ms_copy_kvcache.med() << " ms"); } LOG_DEBUG("Prepare inputs."); diff --git a/src/plugins/intel_npu/src/plugin/npuw/util.cpp b/src/plugins/intel_npu/src/plugin/npuw/util.cpp index 56917dc8cc835c..80eb0aeeb590f0 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/util.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/util.cpp @@ -280,9 +280,8 @@ void ov::npuw::util::unpack(const ov::SoPtr& from, unpack_nf4f16(from, scale, to, unpack_options); } else if (type_from == ov::element::f8e4m3 || type_from == ov::element::f8e5m2 || type_from == ov::element::f8e8m0) { - LOG_INFO("######################## unpack_f8f16"); + // FIXME: Implement XARCH::unpack unpack_f8f16(from, scale, to, unpack_options); - //ov::npuw::util::XARCH::unpack_f8f16_scale(from, scale, to, unpack_options); } else if (type_from == ov::element::f16) { // FIXME: Implement XARCH::unpack unpack_f16f16(from, scale, to, unpack_options); diff --git a/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp index fb3225b7c5d4aa..5f8c4f1de9c07e 100644 --- a/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp +++ b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp @@ -6,32 +6,47 @@ # include "copy_inplace.hpp" namespace { - static ov::Strides make_padded_strides_keep_tail_default(const ov::Shape& shape, const ov::element::Type& et, size_t kv_dim, size_t pad_elems) { ov::Strides s = copy_inplace_details::default_byte_strides(shape, et); - // Keep last 2 dims default contiguous explicitly. - if (shape.size() >= 1) { - s.back() = et.size(); + const size_t rank = shape.size(); + if (rank == 0) { + return s; } - if (shape.size() >= 2) { - s[shape.size() - 2] = shape.back() * et.size(); + + // Keep last 2 dims default contiguous explicitly. + s[rank - 1] = et.size(); + if (rank >= 2) { + s[rank - 2] = shape[rank - 1] * et.size(); } - const size_t rank = shape.size(); if (rank <= 2) { return s; } const size_t last2_begin = rank - 2; - for (size_t d = 0; d < last2_begin; ++d) { - if (d <= kv_dim) { - s[d] += pad_elems * et.size(); + + // If kv_dim is in the last 2 dims, "keep tail default" means we should not pad there. + if (kv_dim >= last2_begin) { + // Recompute outer strides consistently (no padding) + for (size_t d = last2_begin; d-- > 0;) { + s[d] = s[d + 1] * shape[d + 1]; + } + return s; + } + + // Recompute strides from inner to outer; at kv_dim insert a gap measured in *inner blocks*. 
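+    // (Illustrative numbers: shape {1, 8, 16, 32}, f32, kv_dim = 1, pad_elems = 13 yields default
+    //  byte strides {16384, 2048, 128, 4} and padded byte strides {29696, 3712, 128, 4}.)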
+ for (size_t d = last2_begin; d-- > 0;) { + s[d] = s[d + 1] * shape[d + 1]; + if (d == kv_dim) { + // pad_elems is number of extra "inner blocks" after each index-step in kv_dim + s[d] += pad_elems * s[d + 1]; } } + return s; } From 42f5cf5c8d5604256f11ae7fa8bb3cc848072cf4 Mon Sep 17 00:00:00 2001 From: DingZhangIntel Date: Thu, 15 Jan 2026 17:43:39 +0800 Subject: [PATCH 07/13] Fix --- .../src/plugin/npuw/infer_request_utils.cpp | 162 ------------------ 1 file changed, 162 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp index 9e734064b331cc..c51381b5c04eb1 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp @@ -1,165 +1,3 @@ -// // Copyright (C) 2025 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 -// // - -// #include "infer_request_utils.hpp" - -// #include "logging.hpp" -// #include "openvino/runtime/make_tensor.hpp" // get_tensor_impl -// #include "util_xarch.hpp" - -// // FIXME: Use ov::npuw::util::view instead -// ov::SoPtr ov::npuw::util::make_tensor_slice(ov::SoPtr tensor, -// uint32_t dim, -// uint32_t start_pos, -// uint32_t end_pos) { -// ov::Shape start_shape(std::vector(tensor->get_shape().size(), 0u)); -// start_shape[dim] = start_pos; -// ov::Shape end_shape = tensor->get_shape(); -// end_shape[dim] = end_pos; -// return ov::get_tensor_impl(ov::Tensor(ov::make_tensor(tensor), start_shape, end_shape)); -// } - -// void ov::npuw::util::copy_to_right(const ov::SoPtr& src, const ov::SoPtr& dst) { -// OPENVINO_ASSERT(src->get_byte_size() <= dst->get_byte_size()); -// std::copy_n(reinterpret_cast(src->data()), -// src->get_byte_size(), -// reinterpret_cast(dst->data()) + dst->get_byte_size() - src->get_byte_size()); -// } - -// void ov::npuw::util::copy_by_planes(ov::SoPtr src_tensor, ov::SoPtr dst_tensor) { -// // [1, H, S1, E] -> [1, H, S2, E] -// const int N = 0; -// const int H = 1; -// const int S = 2; -// const int E = 3; - -// OPENVINO_ASSERT(src_tensor->get_shape()[N] == dst_tensor->get_shape()[N]); -// OPENVINO_ASSERT(src_tensor->get_shape()[H] == dst_tensor->get_shape()[H]); -// OPENVINO_ASSERT(src_tensor->get_shape()[E] == dst_tensor->get_shape()[E]); -// OPENVINO_ASSERT(src_tensor->get_element_type() == dst_tensor->get_element_type()); -// OPENVINO_ASSERT(src_tensor->get_shape()[N] == 1u); -// OPENVINO_ASSERT(src_tensor->get_shape().size() == 4u); - -// const auto* src_tensor_data = reinterpret_cast(src_tensor->data()); -// auto* dst_tensor_data = reinterpret_cast(dst_tensor->data()); - -// const auto num_planes = src_tensor->get_shape()[H]; -// const auto src_plane_stride = src_tensor->get_strides()[H]; -// const auto dst_plane_stride = dst_tensor->get_strides()[H]; -// const auto plane_size_in_bytes = src_tensor->get_strides()[S] * src_tensor->get_shape()[S]; - -// for (size_t i = 0; i < num_planes; ++i) { -// std::copy_n(src_tensor_data, plane_size_in_bytes, dst_tensor_data); -// dst_tensor_data += dst_plane_stride; -// src_tensor_data += src_plane_stride; -// } -// } - -// void ov::npuw::util::copy_columns_by_row_chunks(ov::SoPtr src, ov::SoPtr& dst) { -// /* -// src/dst layout: [1, heads, emb_size, seq_len] - -// X[*,i] - embedding for i-th token, -// Instead of copy columns, copy rows X[i,*] - -// [[X00 X01 ... X0n] [[X00 X01 ... X0n] -// [X10 X11 ... X1n] [X10 X11 ... X1n] -// [X20 X21 ... X2n] ... [X20 X21 ... X2n] -// ... ... -// [Xm0 Xm1 ... 
Xmn]] [Xm0 Xm1 ... Xmn]] -// */ - -// const auto& src_shape = src->get_shape(); - -// OPENVINO_ASSERT(src_shape.size() == 4u); -// OPENVINO_ASSERT(src_shape == dst->get_shape()); -// OPENVINO_ASSERT(src->get_byte_size() == dst->get_byte_size()); - -// const auto& src_strides = src->get_strides(); -// const auto& dst_strides = dst->get_strides(); -// const auto elem_size = src->get_byte_size() / src->get_size(); - -// const auto C = src_shape[1]; -// const auto H = src_shape[2]; -// const auto W = src_shape[3]; - -// const auto IS_H = src_strides[2]; -// const auto OS_H = dst_strides[2]; - -// const size_t chunk_byte_size = W * elem_size; - -// const auto* src_p = static_cast(src->data()); -// auto* dst_p = static_cast(dst->data()); - -// for (size_t i = 0; i < C * H; ++i) { -// const size_t src_offset = i * IS_H; -// const size_t dst_offset = i * OS_H; -// std::copy_n(src_p + src_offset, chunk_byte_size, dst_p + dst_offset); -// } -// } - -// void ov::npuw::util::copy_tensor_by_dim(ov::SoPtr src_tensor, -// ov::SoPtr dst_tensor, -// uint32_t kv_dim_src, -// uint32_t kv_dim_dst) { -// if (kv_dim_src != kv_dim_dst) { -// // new case - do a generic copy for now (in fact it is a permute) -// // Example: -// // kv_dim_src kv_dim_dst -// // v v -// // [1,8,256,128] --> [1,8,128,256] -// const auto& src_shape = src_tensor->get_shape(); -// const auto& dst_shape = dst_tensor->get_shape(); -// NPUW_ASSERT(src_shape.size() == 4); -// NPUW_ASSERT(dst_shape.size() == 4); -// NPUW_ASSERT(kv_dim_src < 4); -// NPUW_ASSERT(kv_dim_dst < 4); -// NPUW_ASSERT(src_shape[kv_dim_src] == dst_shape[kv_dim_dst]); - -// std::array axis = {0, 1, 2, 3}; -// // Remap like 0,1,2,3 => 0,1,3,2 (see example) -// std::swap(axis[kv_dim_src], axis[kv_dim_dst]); -// ov::npuw::util::permute_i4d(src_tensor, dst_tensor, axis); -// return; -// } -// // Old behavior -// NPUW_ASSERT(kv_dim_src == kv_dim_dst); -// if (kv_dim_src == 3u) { -// // Asserting that we work with last dimenston here: -// const auto& src_shape = src_tensor->get_shape(); -// OPENVINO_ASSERT(src_shape.size() == 4); -// // If last dimenstion of src_tensor is equal to 1, then we can squeeze -// // src_shape from [1, heads, d_v, seq_len=1] to [heads, d_v]. -// // We can then treat src_tensor as a continuous tensor of row value vectors -// // for multiple heads, while dst_tensor will still have [1, heads, d_v, seq_len!=1], -// // shape, awaiting updates at column dimension, as value vectors are columns now. 
-// if (src_shape[kv_dim_src] == 1 && src_tensor->is_continuous()) { -// // FIXME: ov::npuw::util::XARCH::copy_row_as_column(src_tensor, dst_tensor) throws when used here -// copy_columns_by_row_chunks(src_tensor, dst_tensor); -// } else { -// copy_columns_by_row_chunks(src_tensor, dst_tensor); -// } -// } else if (kv_dim_src == 2u) { -// copy_by_planes(src_tensor, dst_tensor); -// } else { -// src_tensor->copy_to(dst_tensor._ptr); -// } -// } - -// std::optional> ov::npuw::util::find_port_by_name( -// const std::vector>& ports, -// const std::string& name) { -// auto it = std::find_if(ports.begin(), ports.end(), [&](const auto& port) { -// return port.get_names().count(name) != 0; -// }); -// if (it == ports.end()) { -// return std::nullopt; -// } -// return std::make_optional(*it); -// } - -////////////////////////////////////////////////////////////////////// // Copyright (C) 2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // From d807b919b6d914259fd215b7aa66d3b1e4afdc7d Mon Sep 17 00:00:00 2001 From: DingZhangIntel Date: Thu, 15 Jan 2026 17:57:13 +0800 Subject: [PATCH 08/13] Fix --- src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp | 2 +- src/plugins/intel_npu/tests/unit/npuw/copy_inplace.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp index 5f8c4f1de9c07e..f1ebe0197a2a8a 100644 --- a/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp +++ b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp @@ -220,4 +220,4 @@ INSTANTIATE_TEST_SUITE_P(CopyInplaceTests, CopyInplaceTests, TestCases, CopyInpl } // namespace -#endif // HAVE_AVX2 \ No newline at end of file +#endif // HAVE_AVX2 diff --git a/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.hpp b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.hpp index b98d355396f333..d3ec609ca51f68 100644 --- a/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.hpp +++ b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.hpp @@ -146,4 +146,4 @@ class CopyInplaceTestsTmpl : public ::testing::Test, using CopyInplaceTests = CopyInplaceTestsTmpl; -} // anonymous namespace \ No newline at end of file +} // anonymous namespace From e4e9fad36e86b2445136cba1127d630685e25526 Mon Sep 17 00:00:00 2001 From: DingZhangIntel Date: Fri, 16 Jan 2026 17:07:58 +0800 Subject: [PATCH 09/13] Format --- src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp index 775b57491b0815..1063d3768b4606 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp @@ -11,8 +11,8 @@ #include "logging.hpp" #include "openvino/core/parallel.hpp" #include "openvino/runtime/iasync_infer_request.hpp" -#include "util.hpp" #include "perf.hpp" +#include "util.hpp" namespace { From 03137612d0fbc2cc20adcf524b035a42fceb0bfc Mon Sep 17 00:00:00 2001 From: DingZhangIntel Date: Fri, 16 Jan 2026 17:44:26 +0800 Subject: [PATCH 10/13] Fix --- .../tests/unit/npuw/copy_inplace.cpp | 24 ------------------- .../tests/unit/npuw/copy_inplace.hpp | 2 -- 2 files changed, 26 deletions(-) diff --git a/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp index f1ebe0197a2a8a..59263206fde808 100644 --- 
a/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp +++ b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp @@ -87,19 +87,6 @@ void CopyInplaceTestsBase::make_input() { baseTensor = ov::Tensor(ov::element::u8, ov::Shape{byte_size}, base_bytes_initial.data()); } -void CopyInplaceTestsBase::make_views() { - src_strides = copy_inplace_details::default_byte_strides(shape, type); - - const size_t pad_elems = 13; - dst_strides = make_padded_strides_keep_tail_default(shape, type, kv_dim, pad_elems); - - void* base_ptr = baseTensor.data(); - ASSERT_NE(base_ptr, nullptr); - - srcView = ov::Tensor(type, shape, base_ptr, src_strides); - dstView = ov::Tensor(type, shape, base_ptr, dst_strides); -} - bool CopyInplaceTestsBase::isNegative() const { if (shape.size() < 2) { return true; @@ -166,17 +153,6 @@ void CopyInplaceTestsBase::SetUp(const CopyInplaceTestsParams& getParam) { } } -std::string CopyInplaceTestsBase::ToString() const { - std::ostringstream oss; - oss << "["; - for (size_t i = 0; i < shape.size(); ++i) { - oss << shape[i] << ((i + 1 == shape.size()) ? "" : "x"); - } - oss << "]" - << "_type_" << type << "_kv_" << kv_dim; - return oss.str(); -} - TEST_P(CopyInplaceTests, copy_tensor_inplace_by_dim_correctness) { ASSERT_NO_THROW_IF(!isNegative(), { auto src_it = ov::get_tensor_impl(srcView); diff --git a/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.hpp b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.hpp index d3ec609ca51f68..f9e556a72861a9 100644 --- a/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.hpp +++ b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.hpp @@ -104,13 +104,11 @@ class CopyInplaceTestsBase { ov::Strides dst_strides; void make_input(); - void make_views(); void make_ref_output(); bool isNegative() const; public: void SetUp(const CopyInplaceTestsParams& getParam); - std::string ToString() const; }; template From ce64e7ecf7f00810c4faaf520e9bfdf2d02544f4 Mon Sep 17 00:00:00 2001 From: DingZhangIntel Date: Mon, 19 Jan 2026 18:26:31 +0800 Subject: [PATCH 11/13] Optimize offset computing and clean up --- .../src/plugin/npuw/infer_request_utils.cpp | 88 +++++++++++++------ .../tests/unit/npuw/copy_inplace.cpp | 33 ++----- .../tests/unit/npuw/copy_inplace.hpp | 10 +-- 3 files changed, 70 insertions(+), 61 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp index 9a0baedeb032cf..be36cc535eb0b0 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp @@ -151,6 +151,7 @@ void ov::npuw::util::copy_inplace_generic_rows(const ov::SoPtr src_ ov::SoPtr dst_tensor) { OPENVINO_ASSERT(src_tensor); OPENVINO_ASSERT(dst_tensor); + OPENVINO_ASSERT(src_tensor->get_element_type() == dst_tensor->get_element_type()); void* base_data = src_tensor->data(); void* dst_data = dst_tensor->data(); @@ -198,39 +199,60 @@ void ov::npuw::util::copy_inplace_generic_rows(const ov::SoPtr src_ }; // --------------------------------------------------------------------- - // Last dimension not packed in either src or dst. + // Fallback: last dimension not packed in either src or dst. // We cannot memmove row_bytes as a contiguous block. Do element-wise memmove. - // Keep reverse lexicographic order to be overlap-safe for in-place move. 
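+    // (Note: the offsets below are maintained incrementally - stepping dimension d down subtracts
+    //  strides[d], and wrapping dimension d back to its maximum adds (shape[d] - 1) * strides[d] -
+    //  instead of recomputing the full index-times-stride sum for every element.)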
// --------------------------------------------------------------------- if (src_strides0[rank0 - 1] != elem_size || dst_strides0[rank0 - 1] != elem_size) { - ov::Shape idx(shape0.size(), 0); + ov::Shape idx(rank0, 0); for (size_t d = 0; d < rank0; ++d) { idx[d] = shape0[d] - 1; } - auto dec_idx = [&]() -> bool { + size_t src_off = compute_offset(idx, src_strides0); + size_t dst_off = compute_offset(idx, dst_strides0); + + auto step_prev = [&](size_t& off, const ov::Strides& strides_bytes, size_t dim) { + off -= strides_bytes[dim]; + }; + + auto wrap_dim = [&](size_t& off, const ov::Shape& shape, const ov::Strides& strides_bytes, size_t dim) { + off += (shape[dim] - 1) * strides_bytes[dim]; + }; + + auto dec_idx_and_offsets = [&]() -> bool { for (int d = static_cast(rank0) - 1; d >= 0; --d) { const size_t ud = static_cast(d); if (idx[ud] > 0) { --idx[ud]; + step_prev(src_off, src_strides0, ud); + step_prev(dst_off, dst_strides0, ud); return true; } idx[ud] = shape0[ud] - 1; + wrap_dim(src_off, shape0, src_strides0, ud); + wrap_dim(dst_off, shape0, dst_strides0, ud); } return false; }; while (true) { - const size_t src_off = compute_offset(idx, src_strides0); - const size_t dst_off = compute_offset(idx, dst_strides0); - uint8_t* src_ptr = base + src_off; uint8_t* dst_ptr = base + dst_off; if (src_ptr != dst_ptr) { - std::memmove(dst_ptr, src_ptr, elem_size); + // If no overlap, memcpy is enough (faster). Otherwise use memmove. + const uint8_t* s0 = src_ptr; + const uint8_t* s1 = src_ptr + elem_size; + uint8_t* d0 = dst_ptr; + uint8_t* d1 = dst_ptr + elem_size; + const bool overlap = !(d1 <= s0 || s1 <= d0); + if (!overlap) { + std::memcpy(dst_ptr, src_ptr, elem_size); + } else { + std::memmove(dst_ptr, src_ptr, elem_size); + } } - if (!dec_idx()) { + if (!dec_idx_and_offsets()) { break; } } @@ -279,7 +301,6 @@ void ov::npuw::util::copy_inplace_generic_rows(const ov::SoPtr src_ } shape.push_back(folded_last); - // For the folded last dim, the step is element-size (bytes per element). 
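+    // (After the fold, shape.back() * elem_size contiguous bytes form one "row" in both views, so
+    //  the copy below degenerates to a single memmove per outer index.)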
src_strides.push_back(elem_size); dst_strides.push_back(elem_size); @@ -292,42 +313,57 @@ void ov::npuw::util::copy_inplace_generic_rows(const ov::SoPtr src_ return; } - ov::Shape outer(rank - 1, 0); - for (size_t d = 0; d + 1 < rank; ++d) { + const size_t outer_rank = rank - 1; + + ov::Shape outer(outer_rank, 0); + for (size_t d = 0; d < outer_rank; ++d) { outer[d] = shape[d] - 1; } - auto dec_outer = [&]() -> bool { - for (int d = static_cast(rank) - 2; d >= 0; --d) { + auto compute_outer_offset = [&](const ov::Shape& o, const ov::Strides& strides_bytes) -> size_t { + size_t off = 0; + for (size_t d = 0; d < o.size(); ++d) { + off += o[d] * strides_bytes[d]; + } + return off; + }; + + size_t src_off = compute_outer_offset(outer, src_strides); + size_t dst_off = compute_outer_offset(outer, dst_strides); + + auto step_prev_outer = [&](size_t& off, const ov::Strides& strides_bytes, size_t dim) { + off -= strides_bytes[dim]; + }; + + auto wrap_outer_dim = + [&](size_t& off, const ov::Shape& shape_folded, const ov::Strides& strides_bytes, size_t dim) { + off += (shape_folded[dim] - 1) * strides_bytes[dim]; + }; + + auto dec_outer_and_offsets = [&]() -> bool { + for (int d = static_cast(outer_rank) - 1; d >= 0; --d) { const size_t ud = static_cast(d); if (outer[ud] > 0) { --outer[ud]; + step_prev_outer(src_off, src_strides, ud); + step_prev_outer(dst_off, dst_strides, ud); return true; } outer[ud] = shape[ud] - 1; + wrap_outer_dim(src_off, shape, src_strides, ud); + wrap_outer_dim(dst_off, shape, dst_strides, ud); } return false; }; - auto compute_outer_offset = [&](const ov::Shape& o, const ov::Strides& strides_bytes) -> size_t { - size_t off = 0; - for (size_t d = 0; d < o.size(); ++d) { - off += o[d] * strides_bytes[d]; - } - return off; - }; - while (true) { - const size_t src_off = compute_outer_offset(outer, src_strides); - const size_t dst_off = compute_outer_offset(outer, dst_strides); - uint8_t* src_ptr = base + src_off; uint8_t* dst_ptr = base + dst_off; if (src_ptr != dst_ptr) { std::memmove(dst_ptr, src_ptr, row_bytes); } - if (!dec_outer()) { + if (!dec_outer_and_offsets()) { break; } } diff --git a/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp index 59263206fde808..ae66c127b375d8 100644 --- a/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp +++ b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp @@ -16,33 +16,17 @@ static ov::Strides make_padded_strides_keep_tail_default(const ov::Shape& shape, if (rank == 0) { return s; } - - // Keep last 2 dims default contiguous explicitly. - s[rank - 1] = et.size(); - if (rank >= 2) { - s[rank - 2] = shape[rank - 1] * et.size(); - } - - if (rank <= 2) { - return s; - } - - const size_t last2_begin = rank - 2; - - // If kv_dim is in the last 2 dims, "keep tail default" means we should not pad there. - if (kv_dim >= last2_begin) { - // Recompute outer strides consistently (no padding) - for (size_t d = last2_begin; d-- > 0;) { - s[d] = s[d + 1] * shape[d + 1]; + if (rank == 1) { + if (kv_dim == 0) { + s[0] += pad_elems * et.size(); } return s; } - // Recompute strides from inner to outer; at kv_dim insert a gap measured in *inner blocks*. 
- for (size_t d = last2_begin; d-- > 0;) { + s[rank - 1] = et.size(); + for (size_t d = rank - 1; d-- > 0;) { s[d] = s[d + 1] * shape[d + 1]; if (d == kv_dim) { - // pad_elems is number of extra "inner blocks" after each index-step in kv_dim s[d] += pad_elems * s[d + 1]; } } @@ -83,7 +67,6 @@ void CopyInplaceTestsBase::make_input() { base_bytes_initial[i] = static_cast(dist(rng)); } - // External-memory tensor (safe for unit test lifetime). baseTensor = ov::Tensor(ov::element::u8, ov::Shape{byte_size}, base_bytes_initial.data()); } @@ -134,15 +117,12 @@ void CopyInplaceTestsBase::SetUp(const CopyInplaceTestsParams& getParam) { shapeInit(dims); shape = ov::Shape{dims.begin(), dims.end()}; - // Precompute strides first (no base pointer needed) src_strides = copy_inplace_details::default_byte_strides(shape, type); const size_t pad_elems = 13; dst_strides = make_padded_strides_keep_tail_default(shape, type, kv_dim, pad_elems); - // Now allocate/fill buffer make_input(); - // Create views (needs baseTensor pointer) void* base_ptr = baseTensor.data(); ASSERT_NE(base_ptr, nullptr); srcView = ov::Tensor(type, shape, base_ptr, src_strides); @@ -167,7 +147,6 @@ TEST_P(CopyInplaceTests, copy_tensor_inplace_by_dim_correctness) { ASSERT_NE(base_ptr, nullptr); out_bytes.assign(base_ptr, base_ptr + out_bytes.size()); - // test_utils.hpp defines details::ArraysMatch for vector ASSERT_TRUE(details::ArraysMatch(to_i8(out_bytes), to_i8(ref_bytes))); }); } @@ -177,7 +156,7 @@ const auto TestCases = ::testing::Combine( ::testing::ValuesIn({ov::element::Type_t::i8, ov::element::Type_t::f16, ov::element::Type_t::f32}), details::ShapesIn({ Tensors{ input = {1, 2, 3, 4}; -} // namespace +} , Tensors { input = {1, 8, 16, 32}; } diff --git a/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.hpp b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.hpp index f9e556a72861a9..96ee02f961a4bd 100644 --- a/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.hpp +++ b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.hpp @@ -17,17 +17,13 @@ #include #include -#include "infer_request_utils.hpp" // copy_tensor_inplace_by_dim +#include "infer_request_utils.hpp" #include "openvino/runtime/make_tensor.hpp" #include "openvino/runtime/tensor.hpp" #include "test_utils.hpp" namespace { -// NOTE: do NOT redefine ASSERT_NO_THROW_* macros here. -// They already exist in test_utils.hpp and warnings are treated as errors. - -// (type, shape, kv_dim) using CopyInplaceTestsParams = std::tuple; namespace copy_inplace_details { @@ -71,9 +67,7 @@ inline void write_elem_bytes(uint8_t* base, std::memcpy(base + off, elem, elem_bytes); } -// Enumerate ND index in lexicographic order. 
inline bool next_index(ov::Shape& idx, const ov::Shape& shape) { - // shape is assumed non-empty and all dims > 0 in this test suite for (int d = static_cast(shape.size()) - 1; d >= 0; --d) { const size_t ud = static_cast(d); if (++idx[ud] < shape[ud]) { @@ -89,7 +83,7 @@ inline bool next_index(ov::Shape& idx, const ov::Shape& shape) { class CopyInplaceTestsBase { protected: ov::element::Type type; - ov::Tensor baseTensor; // shared buffer owner (u8) + ov::Tensor baseTensor; ov::Tensor srcView; ov::Tensor dstView; ov::Shape shape; From f58df15bea7aefd63ab241940373846f788d7803 Mon Sep 17 00:00:00 2001 From: DingZhangIntel Date: Wed, 28 Jan 2026 14:48:19 +0800 Subject: [PATCH 12/13] Add necessary comments --- src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp index b78a23e441406c..1eb131fe733eda 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp @@ -518,7 +518,11 @@ void ov::npuw::LLMInferRequest::copy_kvcache() { 0u, static_cast(tokens_in_past_chunks)); ov::SoPtr prefill_past_kv_chunks; - // move_tensor_inplace_by_dim currently supports only kv_dim_src == kv_dim_dst. + // In-place KV copy is only safe/possible when the source and destination KV layouts match. + // When we have mixed v-transpose settings across models (prefill vs generate: v-transpose OFF/ON), + // the effective KV "token" dimension differs (pre_kv_dim != gen_kv_dim), so an in-place move/copy + // would corrupt data. Therefore, we only use in-place copy when pre_kv_dim == gen_kv_dim; + // otherwise we must copy via a temporary tensor. if (m_past_kv_bound) { if (pre_kv_dim == gen_kv_dim) { prefill_past_kv_chunks = uu::make_tensor_slice(prefill_past_kv, From 9067950afdea03c0e66905f60394d1c34d057d1e Mon Sep 17 00:00:00 2001 From: DingZhangIntel Date: Thu, 29 Jan 2026 15:46:16 +0800 Subject: [PATCH 13/13] Fix and refatore --- .../src/plugin/npuw/infer_request_utils.cpp | 220 +++++++----------- 1 file changed, 83 insertions(+), 137 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp index be36cc535eb0b0..bd13391029b1f7 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp @@ -8,6 +8,66 @@ #include "openvino/runtime/make_tensor.hpp" // get_tensor_impl #include "util_xarch.hpp" +namespace { + +struct FoldedTrailingLayout { + ov::Shape shape; + ov::Strides src_strides; + ov::Strides dst_strides; +}; + +// Folds the maximal COMMON trailing segment where: +// src_stride == dst_stride == default_stride (packed / contiguous-by-bytes) +// into a single last dimension. +// This is the only segment eligible for flattening to speed up row-wise memmove. 
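+// (Illustrative example: for shape {1, 8, 256, 128} where only dim 1 is padded in the destination
+//  view, dims 2 and 3 are packed in both views and fold into one trailing dimension of 256 * 128
+//  elements, so each row can later be moved with a single memmove.)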
+FoldedTrailingLayout fold_common_trailing_packed_segment(const ov::Shape& shape0, + const ov::Strides& src_strides0, + const ov::Strides& dst_strides0, + size_t elem_size) { + const size_t rank0 = shape0.size(); + OPENVINO_ASSERT(rank0 > 0); + + ov::Strides default_strides(rank0, 0); + default_strides[rank0 - 1] = elem_size; + for (size_t i = rank0 - 1; i > 0; --i) { + default_strides[i - 1] = default_strides[i] * shape0[i]; + } + + size_t cut = rank0 - 1; + for (size_t inverted_idx = rank0; inverted_idx-- > 0;) { + const bool ok = (src_strides0[inverted_idx] == default_strides[inverted_idx]) && + (dst_strides0[inverted_idx] == default_strides[inverted_idx]) && + (src_strides0[inverted_idx] == dst_strides0[inverted_idx]); + if (!ok) { + break; + } + cut = inverted_idx; + } + + FoldedTrailingLayout out; + out.shape.reserve(cut + 1); + out.src_strides.reserve(cut + 1); + out.dst_strides.reserve(cut + 1); + + for (size_t d = 0; d < cut; ++d) { + out.shape.push_back(shape0[d]); + out.src_strides.push_back(src_strides0[d]); + out.dst_strides.push_back(dst_strides0[d]); + } + + size_t folded_last = 1; + for (size_t d = cut; d < rank0; ++d) { + folded_last *= shape0[d]; + } + out.shape.push_back(folded_last); + out.src_strides.push_back(elem_size); + out.dst_strides.push_back(elem_size); + + return out; +} + +} // namespace + // FIXME: Use ov::npuw::util::view instead ov::SoPtr ov::npuw::util::make_tensor_slice(ov::SoPtr tensor, uint32_t dim, @@ -153,6 +213,10 @@ void ov::npuw::util::copy_inplace_generic_rows(const ov::SoPtr src_ OPENVINO_ASSERT(dst_tensor); OPENVINO_ASSERT(src_tensor->get_element_type() == dst_tensor->get_element_type()); + // KV-cache values are byte-addressable in the current flow. Sub-byte element types (int4/uint4) are unsupported. + const auto et = src_tensor->get_element_type(); + OPENVINO_ASSERT(et.bitwidth() % 8u == 0u, "sub-byte element types (e.g. int4/uint4) are not supported"); + void* base_data = src_tensor->data(); void* dst_data = dst_tensor->data(); OPENVINO_ASSERT(base_data && dst_data); @@ -175,139 +239,26 @@ void ov::npuw::util::copy_inplace_generic_rows(const ov::SoPtr src_ const size_t total_elems = src_tensor->get_size(); OPENVINO_ASSERT(total_elems != 0); + const size_t elem_size = src_tensor->get_byte_size() / total_elems; - ov::Strides src_strides0 = src_tensor->get_strides(); - ov::Strides dst_strides0 = dst_tensor->get_strides(); + const ov::Strides src_strides0 = src_tensor->get_strides(); + const ov::Strides dst_strides0 = dst_tensor->get_strides(); OPENVINO_ASSERT(src_strides0.size() == rank0); OPENVINO_ASSERT(dst_strides0.size() == rank0); - ov::Strides default_strides(rank0, 0); - default_strides[rank0 - 1] = elem_size; - for (size_t i = rank0 - 1; i > 0; --i) { - default_strides[i - 1] = default_strides[i] * shape0[i]; - } + // The last dimension is packed in both src and dst. + OPENVINO_ASSERT(src_strides0[rank0 - 1] == elem_size && dst_strides0[rank0 - 1] == elem_size, + "src/dst last dimension is not packed"); auto* base = static_cast(base_data); - auto compute_offset = [&](const ov::Shape& ix, const ov::Strides& strides_bytes) -> size_t { - size_t off = 0; - for (size_t d = 0; d < ix.size(); ++d) { - off += ix[d] * strides_bytes[d]; - } - return off; - }; + const auto folded = fold_common_trailing_packed_segment(shape0, src_strides0, dst_strides0, elem_size); - // --------------------------------------------------------------------- - // Fallback: last dimension not packed in either src or dst. 
- // We cannot memmove row_bytes as a contiguous block. Do element-wise memmove. - // --------------------------------------------------------------------- - if (src_strides0[rank0 - 1] != elem_size || dst_strides0[rank0 - 1] != elem_size) { - ov::Shape idx(rank0, 0); - for (size_t d = 0; d < rank0; ++d) { - idx[d] = shape0[d] - 1; - } - - size_t src_off = compute_offset(idx, src_strides0); - size_t dst_off = compute_offset(idx, dst_strides0); - - auto step_prev = [&](size_t& off, const ov::Strides& strides_bytes, size_t dim) { - off -= strides_bytes[dim]; - }; - - auto wrap_dim = [&](size_t& off, const ov::Shape& shape, const ov::Strides& strides_bytes, size_t dim) { - off += (shape[dim] - 1) * strides_bytes[dim]; - }; - - auto dec_idx_and_offsets = [&]() -> bool { - for (int d = static_cast(rank0) - 1; d >= 0; --d) { - const size_t ud = static_cast(d); - if (idx[ud] > 0) { - --idx[ud]; - step_prev(src_off, src_strides0, ud); - step_prev(dst_off, dst_strides0, ud); - return true; - } - idx[ud] = shape0[ud] - 1; - wrap_dim(src_off, shape0, src_strides0, ud); - wrap_dim(dst_off, shape0, dst_strides0, ud); - } - return false; - }; - - while (true) { - uint8_t* src_ptr = base + src_off; - uint8_t* dst_ptr = base + dst_off; - if (src_ptr != dst_ptr) { - // If no overlap, memcpy is enough (faster). Otherwise use memmove. - const uint8_t* s0 = src_ptr; - const uint8_t* s1 = src_ptr + elem_size; - uint8_t* d0 = dst_ptr; - uint8_t* d1 = dst_ptr + elem_size; - const bool overlap = !(d1 <= s0 || s1 <= d0); - if (!overlap) { - std::memcpy(dst_ptr, src_ptr, elem_size); - } else { - std::memmove(dst_ptr, src_ptr, elem_size); - } - } - - if (!dec_idx_and_offsets()) { - break; - } - } - return; - } - - OPENVINO_ASSERT(src_strides0[rank0 - 1] == elem_size); - OPENVINO_ASSERT(dst_strides0[rank0 - 1] == elem_size); - OPENVINO_ASSERT(default_strides[rank0 - 1] == elem_size); - - // Find the COMMON trailing segment where src_stride == dst_stride == default_stride. - // This is the only part eligible for flattening. - size_t cut = rank0 - 1; // at worst, we can always copy along last dim - for (size_t inverted_idx = rank0 - 1; inverted_idx < rank0; --inverted_idx) { - const bool ok = (src_strides0[inverted_idx] == default_strides[inverted_idx]) && - (dst_strides0[inverted_idx] == default_strides[inverted_idx]) && - (src_strides0[inverted_idx] == dst_strides0[inverted_idx]); - if (ok) { - cut = inverted_idx; - if (inverted_idx == 0) { - break; - } - continue; - } - break; - } - - // Fold [cut..rank0-1] into a single last dimension. 
- ov::Shape shape; - ov::Strides src_strides; - ov::Strides dst_strides; - - shape.reserve(cut + 1); - src_strides.reserve(cut + 1); - dst_strides.reserve(cut + 1); - - for (size_t d = 0; d < cut; ++d) { - shape.push_back(shape0[d]); - src_strides.push_back(src_strides0[d]); - dst_strides.push_back(dst_strides0[d]); - } - - size_t folded_last = 1; - for (size_t d = cut; d < rank0; ++d) { - folded_last *= shape0[d]; - } - shape.push_back(folded_last); - - src_strides.push_back(elem_size); - dst_strides.push_back(elem_size); - - const size_t rank = shape.size(); + const size_t rank = folded.shape.size(); OPENVINO_ASSERT(rank >= 1); - const size_t row_elems = shape[rank - 1]; + const size_t row_elems = folded.shape[rank - 1]; const size_t row_bytes = row_elems * elem_size; if (row_bytes == 0) { return; @@ -317,7 +268,7 @@ void ov::npuw::util::copy_inplace_generic_rows(const ov::SoPtr src_ ov::Shape outer(outer_rank, 0); for (size_t d = 0; d < outer_rank; ++d) { - outer[d] = shape[d] - 1; + outer[d] = folded.shape[d] - 1; } auto compute_outer_offset = [&](const ov::Shape& o, const ov::Strides& strides_bytes) -> size_t { @@ -328,8 +279,8 @@ void ov::npuw::util::copy_inplace_generic_rows(const ov::SoPtr src_ return off; }; - size_t src_off = compute_outer_offset(outer, src_strides); - size_t dst_off = compute_outer_offset(outer, dst_strides); + size_t src_off = compute_outer_offset(outer, folded.src_strides); + size_t dst_off = compute_outer_offset(outer, folded.dst_strides); auto step_prev_outer = [&](size_t& off, const ov::Strides& strides_bytes, size_t dim) { off -= strides_bytes[dim]; @@ -345,13 +296,13 @@ void ov::npuw::util::copy_inplace_generic_rows(const ov::SoPtr src_ const size_t ud = static_cast(d); if (outer[ud] > 0) { --outer[ud]; - step_prev_outer(src_off, src_strides, ud); - step_prev_outer(dst_off, dst_strides, ud); + step_prev_outer(src_off, folded.src_strides, ud); + step_prev_outer(dst_off, folded.dst_strides, ud); return true; } - outer[ud] = shape[ud] - 1; - wrap_outer_dim(src_off, shape, src_strides, ud); - wrap_outer_dim(dst_off, shape, dst_strides, ud); + outer[ud] = folded.shape[ud] - 1; + wrap_outer_dim(src_off, folded.shape, folded.src_strides, ud); + wrap_outer_dim(dst_off, folded.shape, folded.dst_strides, ud); } return false; }; @@ -371,7 +322,7 @@ void ov::npuw::util::copy_inplace_generic_rows(const ov::SoPtr src_ // In-place move along kv_dim when src/dst share the same buffer. // Requirements: -// - kv_dim_src == kv_dim_dst, otherwise throws +// - kv_dim_src == kv_dim_dst // - src_tensor->data() == dst_tensor->data() void ov::npuw::util::copy_tensor_inplace_by_dim(const ov::SoPtr src_tensor, ov::SoPtr dst_tensor, @@ -380,9 +331,7 @@ void ov::npuw::util::copy_tensor_inplace_by_dim(const ov::SoPtr src OPENVINO_ASSERT(src_tensor); OPENVINO_ASSERT(dst_tensor); - if (kv_dim_src != kv_dim_dst) { - OPENVINO_THROW("move_tensor_inplace_by_dim currently supports only kv_dim_src == kv_dim_dst"); - } + OPENVINO_ASSERT(kv_dim_src == kv_dim_dst, "copy_tensor_inplace_by_dim supports only kv_dim_src == kv_dim_dst"); void* base_data = src_tensor->data(); void* dst_data = dst_tensor->data(); @@ -392,12 +341,9 @@ void ov::npuw::util::copy_tensor_inplace_by_dim(const ov::SoPtr src const auto& src_shape = src_tensor->get_shape(); const auto& dst_shape = dst_tensor->get_shape(); - OPENVINO_ASSERT(src_shape.size() == dst_shape.size()); OPENVINO_ASSERT(src_shape == dst_shape); OPENVINO_ASSERT(kv_dim_src < src_shape.size()); - // One generic implementation for all kv_dim. 
- // We rely on row-wise memmove on the (possibly flattened) last dimension and stride-based addressing. copy_inplace_generic_rows(src_tensor, dst_tensor); }
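
For reference, below is a minimal usage sketch of the helper introduced by this series. It is not part of the patch: it drives ov::npuw::util::copy_tensor_inplace_by_dim the same way the unit test does, with two strided views sharing one buffer. The shape, stride and padding values, the buffer sizing, and the standalone main() are illustrative assumptions.

#include <cstdint>
#include <vector>

#include "infer_request_utils.hpp"           // ov::npuw::util::copy_tensor_inplace_by_dim (this series)
#include "openvino/runtime/make_tensor.hpp"  // ov::get_tensor_impl
#include "openvino/runtime/tensor.hpp"

int main() {
    const ov::Shape shape{1, 8, 16, 32};
    const ov::element::Type et = ov::element::f32;  // 4 bytes per element

    // Source view: default (packed) byte strides.
    const ov::Strides src_strides{8 * 16 * 32 * 4, 16 * 32 * 4, 32 * 4, 4};
    // Destination view: same shape, but dim 1 carries a gap of 13 extra inner blocks (padded layout).
    const ov::Strides dst_strides{8 * (16 + 13) * 32 * 4, (16 + 13) * 32 * 4, 32 * 4, 4};

    // One shared buffer, sized for the larger (padded) view; both views alias it.
    std::vector<uint8_t> buffer(8 * (16 + 13) * 32 * 4, 0);

    ov::Tensor src_view(et, shape, buffer.data(), src_strides);
    ov::Tensor dst_view(et, shape, buffer.data(), dst_strides);

    // The function requires identical shapes and element types, a packed last dimension,
    // the same underlying data pointer, and the same kv dimension on both sides.
    const uint32_t kv_dim = 1;
    ov::npuw::util::copy_tensor_inplace_by_dim(ov::get_tensor_impl(src_view),
                                               ov::get_tensor_impl(dst_view),
                                               kv_dim,
                                               kv_dim);
    return 0;
}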