From 35789cc7bb2ebfd507790852622d585255e420a1 Mon Sep 17 00:00:00 2001 From: DingZhangIntel Date: Thu, 11 Dec 2025 16:17:45 +0800 Subject: [PATCH 01/13] Implement inplace kv cache copy when it's shared --- .../src/plugin/npuw/infer_request_utils.cpp | 91 +++++++++++++ .../src/plugin/npuw/infer_request_utils.hpp | 5 + .../src/plugin/npuw/llm_infer_request.cpp | 120 ++++++++++++++++++ 3 files changed, 216 insertions(+) diff --git a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp index d4e4ff66371dbc..df9805579fb2e3 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp @@ -140,6 +140,97 @@ void ov::npuw::util::copy_tensor_by_dim(ov::SoPtr src_tensor, } } +// In-place move along kv_dim when src/dst share the same buffer. +// Requirements: +// - kv_dim_src == kv_dim_dst, otherwise throws +// - src_tensor->data() == dst_tensor->data() +void ov::npuw::util::move_tensor_inplace_by_dim(ov::SoPtr src_tensor, + ov::SoPtr dst_tensor, + uint32_t kv_dim_src, + uint32_t kv_dim_dst) { + OPENVINO_ASSERT(src_tensor); + OPENVINO_ASSERT(dst_tensor); + + if (kv_dim_src != kv_dim_dst) { + OPENVINO_THROW("move_tensor_inplace_by_dim currently supports only kv_dim_src == kv_dim_dst"); + } + + void* base_data = src_tensor->data(); + void* dst_data = dst_tensor->data(); + OPENVINO_ASSERT(base_data); + OPENVINO_ASSERT(dst_data); + OPENVINO_ASSERT(base_data == dst_data); + + const auto& shape = src_tensor->get_shape(); + const auto& dst_shape = dst_tensor->get_shape(); + OPENVINO_ASSERT(shape.size() == dst_shape.size()); + OPENVINO_ASSERT(shape == dst_shape); + OPENVINO_ASSERT(kv_dim_src < shape.size()); + + const auto& src_strides = src_tensor->get_strides(); + const auto& dst_strides = dst_tensor->get_strides(); + + const size_t total_elems = src_tensor->get_size(); + const size_t elem_size = src_tensor->get_byte_size() / total_elems; + + if (src_strides == dst_strides) { + LOG_INFO("identical strides, skip"); + return; + } + + for (size_t d = 0; d < shape.size(); ++d) { + if (shape[d] == 0) { + LOG_INFO("zero-sized dimension, nothing to move"); + return; + } + } + + auto* base = static_cast(base_data); + const size_t rank = shape.size(); + + std::vector idx(rank); + for (size_t d = 0; d < rank; ++d) { + idx[d] = shape[d] - 1; + } + + size_t src_off = 0; + size_t dst_off = 0; + for (size_t d = 0; d < rank; ++d) { + src_off += idx[d] * src_strides[d]; + dst_off += idx[d] * dst_strides[d]; + } + + auto dec_index_and_update_offsets = [&]() -> bool { + for (int d = static_cast(rank) - 1; d >= 0; --d) { + const size_t old = idx[static_cast(d)]; + if (old > 0) { + idx[static_cast(d)] = old - 1; + src_off -= src_strides[static_cast(d)]; + dst_off -= dst_strides[static_cast(d)]; + return true; + } else { + idx[static_cast(d)] = shape[static_cast(d)] - 1; + src_off += src_strides[static_cast(d)] * (shape[static_cast(d)] - 1); + dst_off += dst_strides[static_cast(d)] * (shape[static_cast(d)] - 1); + } + } + return false; + }; + + while (true) { + uint8_t* src_ptr = base + src_off; + uint8_t* dst_ptr = base + dst_off; + + if (src_ptr != dst_ptr) { + std::memmove(dst_ptr, src_ptr, elem_size); + } + + if (!dec_index_and_update_offsets()) { + break; + } + } +} + std::optional> ov::npuw::util::find_port_by_name( const std::vector>& ports, const std::string& name) { diff --git a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.hpp 
b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.hpp index 022d49b56a140c..f526328cf12943 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.hpp @@ -31,6 +31,11 @@ void copy_tensor_by_dim(ov::SoPtr src_tensor, uint32_t kv_dim_src, uint32_t kv_dim_dst); +void move_tensor_inplace_by_dim(ov::SoPtr src_tensor, + ov::SoPtr dst_tensor, + uint32_t kv_dim_src, + uint32_t kv_dim_dst); + std::optional> find_port_by_name(const std::vector>& ports, const std::string& name); diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp index 6f7be5664975b1..9efaf3348b9e00 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp @@ -587,10 +587,125 @@ void ov::npuw::LLMInferRequest::copy_kvcache() { // Part 1: The KV results from loops 1 to n-1 have been copied into the 'past' KV input tensor // Part 2: The kv results from the last loop remain in the 'present' KV output tensor // The task is to copy both parts into the KV-cache input tensor for the decoding process + // Copy part 1 KV results + // tokens_in_past_chunks may be 0 in case short prompts are prefilled in single chunk + auto tokens_in_past_chunks = kvcache_desc.num_stored_tokens - m_tokens_in_present_chunk; + // Start counting time. + auto t_start = std::chrono::high_resolution_clock::now(); + if (tokens_in_past_chunks > 0) { + // Create backup of past KV tensor when buffer sharing is enabled to prevent data corruption + // This is necessary because subsequent copy operations would overwrite the shared buffer + auto prefill_past_kv = m_prefill_request->get_tensor(m_prefill_in_ports.at(input_name)); + auto kvcache_past_kv_chunks = uu::make_tensor_slice(kvcache_in_tensor, + gen_kv_dim, + 0u, + static_cast(tokens_in_past_chunks)); + ov::SoPtr prefill_past_kv_chunks; + // move_tensor_inplace_by_dim currently supports only kv_dim_src == kv_dim_dst. + if (m_past_kv_bound) { + if (pre_kv_dim == gen_kv_dim) { + prefill_past_kv_chunks = make_tensor_slice(prefill_past_kv, + pre_kv_dim, + 0u, + static_cast(tokens_in_past_chunks)); + + uu::move_tensor_inplace_by_dim(prefill_past_kv_chunks, + kvcache_past_kv_chunks, + pre_kv_dim, + gen_kv_dim); + } else { + auto tmp_dense_kv_tensor = ov::npuw::util::allocMem(prefill_past_kv->get_element_type(), + prefill_past_kv->get_shape(), + m_pre_alloc_device, + m_npuw_llm_compiled_model->get_plugin()); + prefill_past_kv->copy_to(tmp_dense_kv_tensor._ptr); + prefill_past_kv_chunks = make_tensor_slice(tmp_dense_kv_tensor, + pre_kv_dim, + 0u, + static_cast(tokens_in_past_chunks)); + uu::copy_tensor_by_dim(prefill_past_kv_chunks, kvcache_past_kv_chunks, pre_kv_dim, gen_kv_dim); + } + } else { + prefill_past_kv_chunks = make_tensor_slice(prefill_past_kv, + pre_kv_dim, + 0u, + static_cast(tokens_in_past_chunks)); + uu::copy_tensor_by_dim(prefill_past_kv_chunks, kvcache_past_kv_chunks, pre_kv_dim, gen_kv_dim); + } + } + // End counting time. 
+ auto t_end = std::chrono::high_resolution_clock::now(); + auto duration_ms = std::chrono::duration_cast(t_end - t_start).count(); + LOG_INFO("############tokens_in_past_chunks cost: " << duration_ms << " ms"); + // Copy part 2 KV results + auto prefill_present_kv_chunk = + uu::make_tensor_slice(prefill_out_tensor, + pre_kv_dim, + static_cast(prefill_chunk_size - m_tokens_in_present_chunk), + static_cast(prefill_chunk_size)); + + auto kvcache_last_kv_chunk = uu::make_tensor_slice(kvcache_in_tensor, + gen_kv_dim, + static_cast(tokens_in_past_chunks), + kvcache_desc.num_stored_tokens); + + uu::copy_tensor_by_dim(prefill_present_kv_chunk, kvcache_last_kv_chunk, pre_kv_dim, gen_kv_dim); + } else { + auto prefill_out_slice = + uu::make_tensor_slice(prefill_out_tensor, + pre_kv_dim, + kvcache_desc.max_prompt_size - kvcache_desc.num_stored_tokens, + kvcache_desc.max_prompt_size); + + auto kvcache_in_slice = + uu::make_tensor_slice(kvcache_in_tensor, gen_kv_dim, 0u, kvcache_desc.num_stored_tokens); + + uu::copy_tensor_by_dim(prefill_out_slice, kvcache_in_slice, pre_kv_dim, gen_kv_dim); + } + }); + LOG_DEBUG("Done."); +} + +/* +void ov::npuw::LLMInferRequest::copy_kvcache() { + namespace uu = ov::npuw::util; + LOG_DEBUG("Copying kv-cache from prefill to generate model."); + LOG_BLOCK(); + auto& kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc; + const auto& kvcache_compiled = m_kvcache_request->get_compiled_model(); + // FIXME: Find only matching by names outputs and copy them, having previously checked that such inputs exist + ov::parallel_for(kvcache_compiled->outputs().size() - layer_ids::kStartOutputKVCacheLayers, [&](size_t out_idx) { + const std::size_t i = layer_ids::kStartOutputKVCacheLayers + out_idx; + const auto& output_name = kvcache_compiled->outputs()[i].get_any_name(); + auto prefill_out_tensor = m_prefill_request->get_tensor(m_prefill_out_ports.at(output_name)); + + const auto& input_name = std::regex_replace(output_name, std::regex("present"), layer_names::past_key_values); + if (m_kvcache_in_ports.find(input_name) == m_kvcache_in_ports.end()) { + // FIXME: Totally wrong debug message. input_name is an invalid name of input layer. + LOG_DEBUG("Input name " << input_name << " doesn't contain kv cache. Skipping."); + return; + } + const auto is_value_tensor = output_name.find("value") != std::string::npos; + const auto kv_dim = [&](bool v_trans) -> uint32_t { + return (is_value_tensor && v_trans) ? 3u : kvcache_desc.dim; + }; + const auto& pre_kv_dim = kv_dim(kvcache_desc.v_tensors_transposed_pre); + const auto& gen_kv_dim = kv_dim(kvcache_desc.v_tensors_transposed_gen); + auto kvcache_in_tensor = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(input_name)); + + const auto prefill_chunk_size = m_npuw_llm_compiled_model->m_prefill_chunk_size; + const bool use_chunk_prefill = m_npuw_llm_compiled_model->m_use_chunk_prefill; + if (use_chunk_prefill) { + // The chunk prefilled KV results are divided into two parts: + // Part 1: The KV results from loops 1 to n-1 have been copied into the 'past' KV input tensor + // Part 2: The kv results from the last loop remain in the 'present' KV output tensor + // The task is to copy both parts into the KV-cache input tensor for the decoding process // Copy part 1 KV results // tokens_in_past_chunks may be 0 in case short prompts are prefilled in single chunk auto tokens_in_past_chunks = kvcache_desc.num_stored_tokens - m_tokens_in_present_chunk; + // Start counting time. 
+ auto t_start = std::chrono::high_resolution_clock::now(); if (tokens_in_past_chunks > 0) { // Create backup of past KV tensor when buffer sharing is enabled to prevent data corruption // This is necessary because subsequent copy operations would overwrite the shared buffer @@ -621,6 +736,10 @@ void ov::npuw::LLMInferRequest::copy_kvcache() { uu::copy_tensor_by_dim(prefill_past_kv_chunks, kvcache_past_kv_chunks, pre_kv_dim, gen_kv_dim); } + // End counting time. + auto t_end = std::chrono::high_resolution_clock::now(); + auto duration_ms = std::chrono::duration_cast(t_end - t_start).count(); + LOG_INFO("########################################tokens_in_past_chunks cost: " << duration_ms << " ms"); // Copy part 2 KV results auto prefill_present_kv_chunk = @@ -650,6 +769,7 @@ void ov::npuw::LLMInferRequest::copy_kvcache() { }); LOG_DEBUG("Done."); } +*/ void ov::npuw::LLMInferRequest::update_kvcache_for( std::shared_ptr request, From 5eada49f8afdfaacb17e0e631ec7847a90c6caa5 Mon Sep 17 00:00:00 2001 From: DingZhangIntel Date: Tue, 16 Dec 2025 17:36:24 +0800 Subject: [PATCH 02/13] Optimize and Fix --- .../src/plugin/npuw/infer_request_utils.cpp | 199 ++++++++++++++---- .../src/plugin/npuw/infer_request_utils.hpp | 8 +- .../src/plugin/npuw/llm_infer_request.cpp | 3 +- 3 files changed, 170 insertions(+), 40 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp index df9805579fb2e3..da6069d28c87a2 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp @@ -140,35 +140,81 @@ void ov::npuw::util::copy_tensor_by_dim(ov::SoPtr src_tensor, } } -// In-place move along kv_dim when src/dst share the same buffer. 
-// Requirements: -// - kv_dim_src == kv_dim_dst, otherwise throws -// - src_tensor->data() == dst_tensor->data() -void ov::npuw::util::move_tensor_inplace_by_dim(ov::SoPtr src_tensor, - ov::SoPtr dst_tensor, - uint32_t kv_dim_src, - uint32_t kv_dim_dst) { - OPENVINO_ASSERT(src_tensor); - OPENVINO_ASSERT(dst_tensor); +void ov::npuw::util::copy_inplace_columns_by_row_chunks(ov::SoPtr src, ov::SoPtr& dst) { + const auto& src_shape = src->get_shape(); - if (kv_dim_src != kv_dim_dst) { - OPENVINO_THROW("move_tensor_inplace_by_dim currently supports only kv_dim_src == kv_dim_dst"); + OPENVINO_ASSERT(src_shape.size() == 4u); + OPENVINO_ASSERT(src_shape == dst->get_shape()); + OPENVINO_ASSERT(src->get_byte_size() == dst->get_byte_size()); + + const auto& src_strides = src->get_strides(); + const auto& dst_strides = dst->get_strides(); + const auto elem_size = src->get_byte_size() / src->get_size(); + + const auto C = src_shape[1]; + const auto H = src_shape[2]; + const auto W = src_shape[3]; + + const auto IS_H = src_strides[2]; + const auto OS_H = dst_strides[2]; + + const size_t chunk_byte_size = W * elem_size; + + const auto* src_p = static_cast(src->data()); + auto* dst_p = static_cast(dst->data()); + + const size_t num_chunks = C * H; + if (num_chunks == 0 || chunk_byte_size == 0) { + return; } - void* base_data = src_tensor->data(); - void* dst_data = dst_tensor->data(); - OPENVINO_ASSERT(base_data); - OPENVINO_ASSERT(dst_data); - OPENVINO_ASSERT(base_data == dst_data); + for (size_t i = num_chunks; i-- > 0;) { + const size_t src_offset = i * IS_H; + const size_t dst_offset = i * OS_H; + std::memmove(dst_p + dst_offset, src_p + src_offset, chunk_byte_size); + } +} + +void ov::npuw::util::copy_inplace_by_planes(ov::SoPtr src_tensor, ov::SoPtr dst_tensor) { + // [1, H, S1, E] -> [1, H, S2, E] + const int N = 0; + const int H = 1; + const int S = 2; + const int E = 3; + OPENVINO_ASSERT(src_tensor->get_shape()[N] == dst_tensor->get_shape()[N]); + OPENVINO_ASSERT(src_tensor->get_shape()[H] == dst_tensor->get_shape()[H]); + OPENVINO_ASSERT(src_tensor->get_shape()[E] == dst_tensor->get_shape()[E]); + OPENVINO_ASSERT(src_tensor->get_element_type() == dst_tensor->get_element_type()); + OPENVINO_ASSERT(src_tensor->get_shape()[N] == 1u); + OPENVINO_ASSERT(src_tensor->get_shape().size() == 4u); + + const auto* src_base = reinterpret_cast(src_tensor->data()); + auto* dst_base = reinterpret_cast(dst_tensor->data()); + + const auto num_planes = src_tensor->get_shape()[H]; + const auto src_plane_stride = src_tensor->get_strides()[H]; + const auto dst_plane_stride = dst_tensor->get_strides()[H]; + const auto plane_size_in_bytes = src_tensor->get_strides()[S] * src_tensor->get_shape()[S]; + + if (num_planes == 0 || plane_size_in_bytes == 0) { + return; + } + + for (size_t i = num_planes; i-- > 0;) { + const auto* src_ptr = src_base + i * src_plane_stride; + auto* dst_ptr = dst_base + i * dst_plane_stride; + std::memmove(dst_ptr, src_ptr, plane_size_in_bytes); + } +} + +void ov::npuw::util::copy_inplace(ov::SoPtr src_tensor, ov::SoPtr dst_tensor) { const auto& shape = src_tensor->get_shape(); - const auto& dst_shape = dst_tensor->get_shape(); - OPENVINO_ASSERT(shape.size() == dst_shape.size()); - OPENVINO_ASSERT(shape == dst_shape); - OPENVINO_ASSERT(kv_dim_src < shape.size()); - const auto& src_strides = src_tensor->get_strides(); - const auto& dst_strides = dst_tensor->get_strides(); + auto* base = static_cast(src_tensor->data()); + + auto src_strides = src_tensor->get_strides(); + auto dst_strides = 
dst_tensor->get_strides(); const size_t total_elems = src_tensor->get_size(); const size_t elem_size = src_tensor->get_byte_size() / total_elems; @@ -185,33 +231,74 @@ void ov::npuw::util::move_tensor_inplace_by_dim(ov::SoPtr src_tenso } } - auto* base = static_cast(base_data); - const size_t rank = shape.size(); + auto rank = shape.size(); + + ov::Shape cur_pos{0}; + ov::Shape max_pos{1}; - std::vector idx(rank); - for (size_t d = 0; d < rank; ++d) { - idx[d] = shape[d] - 1; + if (src_tensor->get_element_type().bitwidth() < 8 || (is_scalar(shape))) { + // Doesn't support strides for LP types + // or both tensors have default strides + // Strides and positions already initialized + } else { + ov::Strides src_str, dst_str; + // Calculate src and dst shapes + bool found_step = false; + for (size_t inverted_idx = rank - 1; inverted_idx < rank; --inverted_idx) { + if (!found_step) { + if (src_strides[inverted_idx] == dst_strides[inverted_idx]) { + continue; + } else { + found_step = true; + size_t strides_size = inverted_idx + 1; + // Set right size + src_str.resize(strides_size + 1); + dst_str.resize(strides_size + 1); + max_pos.resize(strides_size + 1); + cur_pos.resize(strides_size + 1); + // In case of default continuous strides we can copy several elements + // In other case only one element + size_t dim = 1; + size_t strides = elem_size; + + if (strides_size < src_strides.size()) { + strides = src_strides[strides_size]; + dim = shape[strides_size]; + } + src_str[strides_size] = strides; + dst_str[strides_size] = strides; + max_pos[strides_size] = dim; + cur_pos[strides_size] = max_pos[strides_size] - 1; + } + } + src_str[inverted_idx] = src_strides[inverted_idx]; + dst_str[inverted_idx] = dst_strides[inverted_idx]; + max_pos[inverted_idx] = shape[inverted_idx]; + cur_pos[inverted_idx] = max_pos[inverted_idx] - 1; + } + src_strides = std::move(src_str); + dst_strides = std::move(dst_str); } size_t src_off = 0; size_t dst_off = 0; - for (size_t d = 0; d < rank; ++d) { - src_off += idx[d] * src_strides[d]; - dst_off += idx[d] * dst_strides[d]; + for (size_t d = 0; d < max_pos.size(); ++d) { + src_off += cur_pos[d] * src_strides[d]; + dst_off += cur_pos[d] * dst_strides[d]; } auto dec_index_and_update_offsets = [&]() -> bool { - for (int d = static_cast(rank) - 1; d >= 0; --d) { - const size_t old = idx[static_cast(d)]; + for (int d = static_cast(max_pos.size()) - 1; d >= 0; --d) { + const size_t old = cur_pos[static_cast(d)]; if (old > 0) { - idx[static_cast(d)] = old - 1; + cur_pos[static_cast(d)] = old - 1; src_off -= src_strides[static_cast(d)]; dst_off -= dst_strides[static_cast(d)]; return true; } else { - idx[static_cast(d)] = shape[static_cast(d)] - 1; - src_off += src_strides[static_cast(d)] * (shape[static_cast(d)] - 1); - dst_off += dst_strides[static_cast(d)] * (shape[static_cast(d)] - 1); + cur_pos[static_cast(d)] = max_pos[static_cast(d)] - 1; + src_off += src_strides[static_cast(d)] * (max_pos[static_cast(d)] - 1); + dst_off += dst_strides[static_cast(d)] * (max_pos[static_cast(d)] - 1); } } return false; @@ -222,7 +309,7 @@ void ov::npuw::util::move_tensor_inplace_by_dim(ov::SoPtr src_tenso uint8_t* dst_ptr = base + dst_off; if (src_ptr != dst_ptr) { - std::memmove(dst_ptr, src_ptr, elem_size); + std::memmove(dst_ptr, src_ptr, src_strides[src_strides.size() - 1]); } if (!dec_index_and_update_offsets()) { @@ -231,6 +318,42 @@ void ov::npuw::util::move_tensor_inplace_by_dim(ov::SoPtr src_tenso } } +// In-place move along kv_dim when src/dst share the same buffer. 
+// Requirements: +// - kv_dim_src == kv_dim_dst, otherwise throws +// - src_tensor->data() == dst_tensor->data() +void ov::npuw::util::copy_tensor_inplace_by_dim(ov::SoPtr src_tensor, + ov::SoPtr dst_tensor, + uint32_t kv_dim_src, + uint32_t kv_dim_dst) { + OPENVINO_ASSERT(src_tensor); + OPENVINO_ASSERT(dst_tensor); + + if (kv_dim_src != kv_dim_dst) { + OPENVINO_THROW("move_tensor_inplace_by_dim currently supports only kv_dim_src == kv_dim_dst"); + } + + void* base_data = src_tensor->data(); + void* dst_data = dst_tensor->data(); + OPENVINO_ASSERT(base_data); + OPENVINO_ASSERT(dst_data); + OPENVINO_ASSERT(base_data == dst_data); + + const auto& src_shape = src_tensor->get_shape(); + const auto& dst_shape = dst_tensor->get_shape(); + OPENVINO_ASSERT(src_shape.size() == dst_shape.size()); + OPENVINO_ASSERT(src_shape == dst_shape); + OPENVINO_ASSERT(kv_dim_src < src_shape.size()); + + if (kv_dim_src == 3u) { + copy_inplace_columns_by_row_chunks(src_tensor, dst_tensor); + } else if (kv_dim_src == 2u) { + copy_inplace_by_planes(src_tensor, dst_tensor); + } else { + copy_inplace(src_tensor, dst_tensor); + } +} + std::optional> ov::npuw::util::find_port_by_name( const std::vector>& ports, const std::string& name) { diff --git a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.hpp b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.hpp index f526328cf12943..fa53959b1280b4 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.hpp @@ -31,7 +31,13 @@ void copy_tensor_by_dim(ov::SoPtr src_tensor, uint32_t kv_dim_src, uint32_t kv_dim_dst); -void move_tensor_inplace_by_dim(ov::SoPtr src_tensor, +void copy_inplace_columns_by_row_chunks(ov::SoPtr src, ov::SoPtr& dst); + +void copy_inplace_by_planes(ov::SoPtr src_tensor, ov::SoPtr dst_tensor); + +void copy_inplace(ov::SoPtr src_tensor, ov::SoPtr dst_tensor); + +void copy_tensor_inplace_by_dim(ov::SoPtr src_tensor, ov::SoPtr dst_tensor, uint32_t kv_dim_src, uint32_t kv_dim_dst); diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp index 9efaf3348b9e00..bffed9211b042a 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp @@ -582,6 +582,7 @@ void ov::npuw::LLMInferRequest::copy_kvcache() { const auto prefill_chunk_size = m_npuw_llm_compiled_model->m_prefill_chunk_size; const bool use_chunk_prefill = m_npuw_llm_compiled_model->m_use_chunk_prefill; + LOG_INFO("############pre_kv_dim and gen_kv_dim" << pre_kv_dim << " " << gen_kv_dim << ";"); if (use_chunk_prefill) { // The chunk prefilled KV results are divided into two parts: // Part 1: The KV results from loops 1 to n-1 have been copied into the 'past' KV input tensor @@ -609,7 +610,7 @@ void ov::npuw::LLMInferRequest::copy_kvcache() { 0u, static_cast(tokens_in_past_chunks)); - uu::move_tensor_inplace_by_dim(prefill_past_kv_chunks, + uu::copy_tensor_inplace_by_dim(prefill_past_kv_chunks, kvcache_past_kv_chunks, pre_kv_dim, gen_kv_dim); From 40d955ab8d19af7d61933c487c54063794261735 Mon Sep 17 00:00:00 2001 From: DingZhangIntel Date: Thu, 18 Dec 2025 16:24:22 +0800 Subject: [PATCH 03/13] Fix and optimize --- .../src/plugin/npuw/llm_infer_request.cpp | 118 +----------------- 1 file changed, 6 insertions(+), 112 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp 
b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp index bffed9211b042a..407c662abc338c 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp @@ -582,7 +582,6 @@ void ov::npuw::LLMInferRequest::copy_kvcache() { const auto prefill_chunk_size = m_npuw_llm_compiled_model->m_prefill_chunk_size; const bool use_chunk_prefill = m_npuw_llm_compiled_model->m_use_chunk_prefill; - LOG_INFO("############pre_kv_dim and gen_kv_dim" << pre_kv_dim << " " << gen_kv_dim << ";"); if (use_chunk_prefill) { // The chunk prefilled KV results are divided into two parts: // Part 1: The KV results from loops 1 to n-1 have been copied into the 'past' KV input tensor @@ -591,8 +590,6 @@ void ov::npuw::LLMInferRequest::copy_kvcache() { // Copy part 1 KV results // tokens_in_past_chunks may be 0 in case short prompts are prefilled in single chunk auto tokens_in_past_chunks = kvcache_desc.num_stored_tokens - m_tokens_in_present_chunk; - // Start counting time. - auto t_start = std::chrono::high_resolution_clock::now(); if (tokens_in_past_chunks > 0) { // Create backup of past KV tensor when buffer sharing is enabled to prevent data corruption // This is necessary because subsequent copy operations would overwrite the shared buffer @@ -634,114 +631,6 @@ void ov::npuw::LLMInferRequest::copy_kvcache() { uu::copy_tensor_by_dim(prefill_past_kv_chunks, kvcache_past_kv_chunks, pre_kv_dim, gen_kv_dim); } } - // End counting time. - auto t_end = std::chrono::high_resolution_clock::now(); - auto duration_ms = std::chrono::duration_cast(t_end - t_start).count(); - LOG_INFO("############tokens_in_past_chunks cost: " << duration_ms << " ms"); - // Copy part 2 KV results - auto prefill_present_kv_chunk = - uu::make_tensor_slice(prefill_out_tensor, - pre_kv_dim, - static_cast(prefill_chunk_size - m_tokens_in_present_chunk), - static_cast(prefill_chunk_size)); - - auto kvcache_last_kv_chunk = uu::make_tensor_slice(kvcache_in_tensor, - gen_kv_dim, - static_cast(tokens_in_past_chunks), - kvcache_desc.num_stored_tokens); - - uu::copy_tensor_by_dim(prefill_present_kv_chunk, kvcache_last_kv_chunk, pre_kv_dim, gen_kv_dim); - } else { - auto prefill_out_slice = - uu::make_tensor_slice(prefill_out_tensor, - pre_kv_dim, - kvcache_desc.max_prompt_size - kvcache_desc.num_stored_tokens, - kvcache_desc.max_prompt_size); - - auto kvcache_in_slice = - uu::make_tensor_slice(kvcache_in_tensor, gen_kv_dim, 0u, kvcache_desc.num_stored_tokens); - - uu::copy_tensor_by_dim(prefill_out_slice, kvcache_in_slice, pre_kv_dim, gen_kv_dim); - } - }); - LOG_DEBUG("Done."); -} - -/* -void ov::npuw::LLMInferRequest::copy_kvcache() { - namespace uu = ov::npuw::util; - LOG_DEBUG("Copying kv-cache from prefill to generate model."); - LOG_BLOCK(); - auto& kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc; - const auto& kvcache_compiled = m_kvcache_request->get_compiled_model(); - // FIXME: Find only matching by names outputs and copy them, having previously checked that such inputs exist - ov::parallel_for(kvcache_compiled->outputs().size() - layer_ids::kStartOutputKVCacheLayers, [&](size_t out_idx) { - const std::size_t i = layer_ids::kStartOutputKVCacheLayers + out_idx; - const auto& output_name = kvcache_compiled->outputs()[i].get_any_name(); - auto prefill_out_tensor = m_prefill_request->get_tensor(m_prefill_out_ports.at(output_name)); - - const auto& input_name = std::regex_replace(output_name, std::regex("present"), layer_names::past_key_values); - if 
(m_kvcache_in_ports.find(input_name) == m_kvcache_in_ports.end()) { - // FIXME: Totally wrong debug message. input_name is an invalid name of input layer. - LOG_DEBUG("Input name " << input_name << " doesn't contain kv cache. Skipping."); - return; - } - const auto is_value_tensor = output_name.find("value") != std::string::npos; - const auto kv_dim = [&](bool v_trans) -> uint32_t { - return (is_value_tensor && v_trans) ? 3u : kvcache_desc.dim; - }; - - const auto& pre_kv_dim = kv_dim(kvcache_desc.v_tensors_transposed_pre); - const auto& gen_kv_dim = kv_dim(kvcache_desc.v_tensors_transposed_gen); - auto kvcache_in_tensor = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(input_name)); - - const auto prefill_chunk_size = m_npuw_llm_compiled_model->m_prefill_chunk_size; - const bool use_chunk_prefill = m_npuw_llm_compiled_model->m_use_chunk_prefill; - if (use_chunk_prefill) { - // The chunk prefilled KV results are divided into two parts: - // Part 1: The KV results from loops 1 to n-1 have been copied into the 'past' KV input tensor - // Part 2: The kv results from the last loop remain in the 'present' KV output tensor - // The task is to copy both parts into the KV-cache input tensor for the decoding process - // Copy part 1 KV results - // tokens_in_past_chunks may be 0 in case short prompts are prefilled in single chunk - auto tokens_in_past_chunks = kvcache_desc.num_stored_tokens - m_tokens_in_present_chunk; - // Start counting time. - auto t_start = std::chrono::high_resolution_clock::now(); - if (tokens_in_past_chunks > 0) { - // Create backup of past KV tensor when buffer sharing is enabled to prevent data corruption - // This is necessary because subsequent copy operations would overwrite the shared buffer - auto prefill_past_kv = m_prefill_request->get_tensor(m_prefill_in_ports.at(input_name)); - ov::SoPtr tmp_dense_kv_tensor; - ov::SoPtr prefill_past_kv_chunks; - if (m_past_kv_bound) { - tmp_dense_kv_tensor = ov::npuw::util::allocMem(prefill_past_kv->get_element_type(), - prefill_past_kv->get_shape(), - m_pre_alloc_device, - m_npuw_llm_compiled_model->get_plugin()); - prefill_past_kv->copy_to(tmp_dense_kv_tensor._ptr); - prefill_past_kv_chunks = make_tensor_slice(tmp_dense_kv_tensor, - pre_kv_dim, - 0u, - static_cast(tokens_in_past_chunks)); - } else { - prefill_past_kv_chunks = make_tensor_slice(prefill_past_kv, - pre_kv_dim, - 0u, - static_cast(tokens_in_past_chunks)); - } - - auto kvcache_past_kv_chunks = uu::make_tensor_slice(kvcache_in_tensor, - gen_kv_dim, - 0u, - static_cast(tokens_in_past_chunks)); - - uu::copy_tensor_by_dim(prefill_past_kv_chunks, kvcache_past_kv_chunks, pre_kv_dim, gen_kv_dim); - } - // End counting time. - auto t_end = std::chrono::high_resolution_clock::now(); - auto duration_ms = std::chrono::duration_cast(t_end - t_start).count(); - LOG_INFO("########################################tokens_in_past_chunks cost: " << duration_ms << " ms"); - // Copy part 2 KV results auto prefill_present_kv_chunk = uu::make_tensor_slice(prefill_out_tensor, @@ -770,7 +659,6 @@ void ov::npuw::LLMInferRequest::copy_kvcache() { }); LOG_DEBUG("Done."); } -*/ void ov::npuw::LLMInferRequest::update_kvcache_for( std::shared_ptr request, @@ -1077,7 +965,13 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids, if (!m_generate_initialized) { LOG_DEBUG("Copy kv-cache from prefill to generate model."); if (kvcache_desc.num_stored_tokens > 0) { + // Start counting time. 
+ auto t_start = std::chrono::high_resolution_clock::now(); copy_kvcache(); + // End counting time. + auto t_end = std::chrono::high_resolution_clock::now(); + auto duration_ms = std::chrono::duration_cast(t_end - t_start).count(); + LOG_INFO("cost of copy_kvcache(): " << duration_ms << " ms"); } LOG_DEBUG("Prepare inputs."); From afd44182b76f543fabfdbe44216ea970a566d55c Mon Sep 17 00:00:00 2001 From: DingZhangIntel Date: Tue, 6 Jan 2026 16:35:07 +0800 Subject: [PATCH 04/13] Refactor --- .../src/plugin/npuw/infer_request_utils.cpp | 419 ++++++++++++------ .../src/plugin/npuw/infer_request_utils.hpp | 8 +- .../intel_npu/src/plugin/npuw/util.cpp | 3 +- 3 files changed, 282 insertions(+), 148 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp index ce81501ed56e96..131f093289f4e8 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp @@ -1,3 +1,165 @@ +// // Copyright (C) 2025 Intel Corporation +// // SPDX-License-Identifier: Apache-2.0 +// // + +// #include "infer_request_utils.hpp" + +// #include "logging.hpp" +// #include "openvino/runtime/make_tensor.hpp" // get_tensor_impl +// #include "util_xarch.hpp" + +// // FIXME: Use ov::npuw::util::view instead +// ov::SoPtr ov::npuw::util::make_tensor_slice(ov::SoPtr tensor, +// uint32_t dim, +// uint32_t start_pos, +// uint32_t end_pos) { +// ov::Shape start_shape(std::vector(tensor->get_shape().size(), 0u)); +// start_shape[dim] = start_pos; +// ov::Shape end_shape = tensor->get_shape(); +// end_shape[dim] = end_pos; +// return ov::get_tensor_impl(ov::Tensor(ov::make_tensor(tensor), start_shape, end_shape)); +// } + +// void ov::npuw::util::copy_to_right(const ov::SoPtr& src, const ov::SoPtr& dst) { +// OPENVINO_ASSERT(src->get_byte_size() <= dst->get_byte_size()); +// std::copy_n(reinterpret_cast(src->data()), +// src->get_byte_size(), +// reinterpret_cast(dst->data()) + dst->get_byte_size() - src->get_byte_size()); +// } + +// void ov::npuw::util::copy_by_planes(ov::SoPtr src_tensor, ov::SoPtr dst_tensor) { +// // [1, H, S1, E] -> [1, H, S2, E] +// const int N = 0; +// const int H = 1; +// const int S = 2; +// const int E = 3; + +// OPENVINO_ASSERT(src_tensor->get_shape()[N] == dst_tensor->get_shape()[N]); +// OPENVINO_ASSERT(src_tensor->get_shape()[H] == dst_tensor->get_shape()[H]); +// OPENVINO_ASSERT(src_tensor->get_shape()[E] == dst_tensor->get_shape()[E]); +// OPENVINO_ASSERT(src_tensor->get_element_type() == dst_tensor->get_element_type()); +// OPENVINO_ASSERT(src_tensor->get_shape()[N] == 1u); +// OPENVINO_ASSERT(src_tensor->get_shape().size() == 4u); + +// const auto* src_tensor_data = reinterpret_cast(src_tensor->data()); +// auto* dst_tensor_data = reinterpret_cast(dst_tensor->data()); + +// const auto num_planes = src_tensor->get_shape()[H]; +// const auto src_plane_stride = src_tensor->get_strides()[H]; +// const auto dst_plane_stride = dst_tensor->get_strides()[H]; +// const auto plane_size_in_bytes = src_tensor->get_strides()[S] * src_tensor->get_shape()[S]; + +// for (size_t i = 0; i < num_planes; ++i) { +// std::copy_n(src_tensor_data, plane_size_in_bytes, dst_tensor_data); +// dst_tensor_data += dst_plane_stride; +// src_tensor_data += src_plane_stride; +// } +// } + +// void ov::npuw::util::copy_columns_by_row_chunks(ov::SoPtr src, ov::SoPtr& dst) { +// /* +// src/dst layout: [1, heads, emb_size, seq_len] + +// X[*,i] - embedding for i-th 
token, +// Instead of copy columns, copy rows X[i,*] + +// [[X00 X01 ... X0n] [[X00 X01 ... X0n] +// [X10 X11 ... X1n] [X10 X11 ... X1n] +// [X20 X21 ... X2n] ... [X20 X21 ... X2n] +// ... ... +// [Xm0 Xm1 ... Xmn]] [Xm0 Xm1 ... Xmn]] +// */ + +// const auto& src_shape = src->get_shape(); + +// OPENVINO_ASSERT(src_shape.size() == 4u); +// OPENVINO_ASSERT(src_shape == dst->get_shape()); +// OPENVINO_ASSERT(src->get_byte_size() == dst->get_byte_size()); + +// const auto& src_strides = src->get_strides(); +// const auto& dst_strides = dst->get_strides(); +// const auto elem_size = src->get_byte_size() / src->get_size(); + +// const auto C = src_shape[1]; +// const auto H = src_shape[2]; +// const auto W = src_shape[3]; + +// const auto IS_H = src_strides[2]; +// const auto OS_H = dst_strides[2]; + +// const size_t chunk_byte_size = W * elem_size; + +// const auto* src_p = static_cast(src->data()); +// auto* dst_p = static_cast(dst->data()); + +// for (size_t i = 0; i < C * H; ++i) { +// const size_t src_offset = i * IS_H; +// const size_t dst_offset = i * OS_H; +// std::copy_n(src_p + src_offset, chunk_byte_size, dst_p + dst_offset); +// } +// } + +// void ov::npuw::util::copy_tensor_by_dim(ov::SoPtr src_tensor, +// ov::SoPtr dst_tensor, +// uint32_t kv_dim_src, +// uint32_t kv_dim_dst) { +// if (kv_dim_src != kv_dim_dst) { +// // new case - do a generic copy for now (in fact it is a permute) +// // Example: +// // kv_dim_src kv_dim_dst +// // v v +// // [1,8,256,128] --> [1,8,128,256] +// const auto& src_shape = src_tensor->get_shape(); +// const auto& dst_shape = dst_tensor->get_shape(); +// NPUW_ASSERT(src_shape.size() == 4); +// NPUW_ASSERT(dst_shape.size() == 4); +// NPUW_ASSERT(kv_dim_src < 4); +// NPUW_ASSERT(kv_dim_dst < 4); +// NPUW_ASSERT(src_shape[kv_dim_src] == dst_shape[kv_dim_dst]); + +// std::array axis = {0, 1, 2, 3}; +// // Remap like 0,1,2,3 => 0,1,3,2 (see example) +// std::swap(axis[kv_dim_src], axis[kv_dim_dst]); +// ov::npuw::util::permute_i4d(src_tensor, dst_tensor, axis); +// return; +// } +// // Old behavior +// NPUW_ASSERT(kv_dim_src == kv_dim_dst); +// if (kv_dim_src == 3u) { +// // Asserting that we work with last dimenston here: +// const auto& src_shape = src_tensor->get_shape(); +// OPENVINO_ASSERT(src_shape.size() == 4); +// // If last dimenstion of src_tensor is equal to 1, then we can squeeze +// // src_shape from [1, heads, d_v, seq_len=1] to [heads, d_v]. +// // We can then treat src_tensor as a continuous tensor of row value vectors +// // for multiple heads, while dst_tensor will still have [1, heads, d_v, seq_len!=1], +// // shape, awaiting updates at column dimension, as value vectors are columns now. 
+// if (src_shape[kv_dim_src] == 1 && src_tensor->is_continuous()) { +// // FIXME: ov::npuw::util::XARCH::copy_row_as_column(src_tensor, dst_tensor) throws when used here +// copy_columns_by_row_chunks(src_tensor, dst_tensor); +// } else { +// copy_columns_by_row_chunks(src_tensor, dst_tensor); +// } +// } else if (kv_dim_src == 2u) { +// copy_by_planes(src_tensor, dst_tensor); +// } else { +// src_tensor->copy_to(dst_tensor._ptr); +// } +// } + +// std::optional> ov::npuw::util::find_port_by_name( +// const std::vector>& ports, +// const std::string& name) { +// auto it = std::find_if(ports.begin(), ports.end(), [&](const auto& port) { +// return port.get_names().count(name) != 0; +// }); +// if (it == ports.end()) { +// return std::nullopt; +// } +// return std::make_optional(*it); +// } + +////////////////////////////////////////////////////////////////////// // Copyright (C) 2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -147,179 +309,158 @@ void ov::npuw::util::copy_tensor_by_dim(ov::SoPtr src_tensor, } } -void ov::npuw::util::copy_inplace_columns_by_row_chunks(ov::SoPtr src, ov::SoPtr& dst) { - const auto& src_shape = src->get_shape(); - - OPENVINO_ASSERT(src_shape.size() == 4u); - OPENVINO_ASSERT(src_shape == dst->get_shape()); - OPENVINO_ASSERT(src->get_byte_size() == dst->get_byte_size()); - - const auto& src_strides = src->get_strides(); - const auto& dst_strides = dst->get_strides(); - const auto elem_size = src->get_byte_size() / src->get_size(); - - const auto C = src_shape[1]; - const auto H = src_shape[2]; - const auto W = src_shape[3]; - - const auto IS_H = src_strides[2]; - const auto OS_H = dst_strides[2]; +void ov::npuw::util::copy_inplace_generic_rows(const ov::SoPtr src_tensor, + ov::SoPtr dst_tensor) { + OPENVINO_ASSERT(src_tensor); + OPENVINO_ASSERT(dst_tensor); - const size_t chunk_byte_size = W * elem_size; + void* base_data = src_tensor->data(); + void* dst_data = dst_tensor->data(); + OPENVINO_ASSERT(base_data && dst_data); + OPENVINO_ASSERT(base_data == dst_data); - const auto* src_p = static_cast(src->data()); - auto* dst_p = static_cast(dst->data()); + const auto& shape0 = src_tensor->get_shape(); + const auto& dst_shape0 = dst_tensor->get_shape(); + OPENVINO_ASSERT(shape0 == dst_shape0); - const size_t num_chunks = C * H; - if (num_chunks == 0 || chunk_byte_size == 0) { + const size_t rank0 = shape0.size(); + if (rank0 == 0) { return; } - for (size_t i = num_chunks; i-- > 0;) { - const size_t src_offset = i * IS_H; - const size_t dst_offset = i * OS_H; - std::memmove(dst_p + dst_offset, src_p + src_offset, chunk_byte_size); + for (size_t d = 0; d < rank0; ++d) { + if (shape0[d] == 0) { + return; + } } -} -void ov::npuw::util::copy_inplace_by_planes(ov::SoPtr src_tensor, ov::SoPtr dst_tensor) { - // [1, H, S1, E] -> [1, H, S2, E] - const int N = 0; - const int H = 1; - const int S = 2; - const int E = 3; + const size_t total_elems = src_tensor->get_size(); + OPENVINO_ASSERT(total_elems != 0); + const size_t elem_size = src_tensor->get_byte_size() / total_elems; - OPENVINO_ASSERT(src_tensor->get_shape()[N] == dst_tensor->get_shape()[N]); - OPENVINO_ASSERT(src_tensor->get_shape()[H] == dst_tensor->get_shape()[H]); - OPENVINO_ASSERT(src_tensor->get_shape()[E] == dst_tensor->get_shape()[E]); - OPENVINO_ASSERT(src_tensor->get_element_type() == dst_tensor->get_element_type()); - OPENVINO_ASSERT(src_tensor->get_shape()[N] == 1u); - OPENVINO_ASSERT(src_tensor->get_shape().size() == 4u); + ov::Strides src_strides0 = src_tensor->get_strides(); + 
ov::Strides dst_strides0 = dst_tensor->get_strides(); + OPENVINO_ASSERT(src_strides0.size() == rank0); + OPENVINO_ASSERT(dst_strides0.size() == rank0); - const auto* src_base = reinterpret_cast(src_tensor->data()); - auto* dst_base = reinterpret_cast(dst_tensor->data()); + // Build default byte strides for given shape (same as ov::ITensor::copy_to logic). + ov::Strides default_strides(rank0, 0); + default_strides[rank0 - 1] = elem_size; + for (size_t i = rank0 - 1; i > 0; --i) { + default_strides[i - 1] = default_strides[i] * shape0[i]; + } - const auto num_planes = src_tensor->get_shape()[H]; - const auto src_plane_stride = src_tensor->get_strides()[H]; - const auto dst_plane_stride = dst_tensor->get_strides()[H]; - const auto plane_size_in_bytes = src_tensor->get_strides()[S] * src_tensor->get_shape()[S]; + // Your explicit preconditions: + OPENVINO_ASSERT(src_strides0[rank0 - 1] == elem_size); + OPENVINO_ASSERT(dst_strides0[rank0 - 1] == elem_size); + OPENVINO_ASSERT(default_strides[rank0 - 1] == elem_size); - if (num_planes == 0 || plane_size_in_bytes == 0) { - return; + if (rank0 >= 2) { + const size_t packed = shape0[rank0 - 1] * elem_size; + OPENVINO_ASSERT(src_strides0[rank0 - 2] == packed); + OPENVINO_ASSERT(dst_strides0[rank0 - 2] == packed); + OPENVINO_ASSERT(default_strides[rank0 - 2] == packed); } - for (size_t i = num_planes; i-- > 0;) { - const auto* src_ptr = src_base + i * src_plane_stride; - auto* dst_ptr = dst_base + i * dst_plane_stride; - std::memmove(dst_ptr, src_ptr, plane_size_in_bytes); + // Find the COMMON trailing segment where src_stride == dst_stride == default_stride. + // This is the only part eligible for flattening. + size_t cut = rank0 - 1; // at worst, we can always copy along last dim + for (size_t inverted_idx = rank0 - 1; inverted_idx < rank0; --inverted_idx) { + const bool ok = (src_strides0[inverted_idx] == default_strides[inverted_idx]) && + (dst_strides0[inverted_idx] == default_strides[inverted_idx]) && + (src_strides0[inverted_idx] == dst_strides0[inverted_idx]); + if (ok) { + cut = inverted_idx; + if (inverted_idx == 0) { + break; + } + continue; + } + break; } -} -void ov::npuw::util::copy_inplace(ov::SoPtr src_tensor, ov::SoPtr dst_tensor) { - const auto& shape = src_tensor->get_shape(); + // Fold [cut..rank0-1] into a single last dimension. + ov::Shape shape; + ov::Strides src_strides; + ov::Strides dst_strides; - auto* base = static_cast(src_tensor->data()); + shape.reserve(cut + 1); + src_strides.reserve(cut + 1); + dst_strides.reserve(cut + 1); - auto src_strides = src_tensor->get_strides(); - auto dst_strides = dst_tensor->get_strides(); + for (size_t d = 0; d < cut; ++d) { + shape.push_back(shape0[d]); + src_strides.push_back(src_strides0[d]); + dst_strides.push_back(dst_strides0[d]); + } - const size_t total_elems = src_tensor->get_size(); - const size_t elem_size = src_tensor->get_byte_size() / total_elems; + size_t folded_last = 1; + for (size_t d = cut; d < rank0; ++d) { + folded_last *= shape0[d]; + } + shape.push_back(folded_last); + + // For the folded last dim, the step is element-size (bytes per element). + // (Since the whole folded tail is default-contiguous, this holds.) 
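+    // Illustration (hypothetical shape, for reasoning only): for {1, 8, 128, 64} with a
+    // dst view padded only along dim 1, dims 2..3 match the default strides in both
+    // views, so cut == 2 and folded_last == 128 * 64; each memmove below then transfers
+    // one contiguous 128 * 64 * elem_size-byte row instead of per-element pieces.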
+ src_strides.push_back(elem_size); + dst_strides.push_back(elem_size); + + const size_t rank = shape.size(); + OPENVINO_ASSERT(rank >= 1); - if (src_strides == dst_strides) { - LOG_INFO("identical strides, skip"); + const size_t row_elems = shape[rank - 1]; + const size_t row_bytes = row_elems * elem_size; + if (row_bytes == 0) { return; } - for (size_t d = 0; d < shape.size(); ++d) { - if (shape[d] == 0) { - LOG_INFO("zero-sized dimension, nothing to move"); - return; - } + // Iterate outer coordinates in reverse lexicographic order for overlap-safe memmove. + size_t num_rows = 1; + for (size_t d = 0; d + 1 < rank; ++d) { + num_rows *= shape[d]; + } + if (num_rows == 0) { + return; } - auto rank = shape.size(); - - ov::Shape cur_pos{0}; - ov::Shape max_pos{1}; + auto* base = static_cast(base_data); - if (src_tensor->get_element_type().bitwidth() < 8 || (is_scalar(shape))) { - // Doesn't support strides for LP types - // or both tensors have default strides - // Strides and positions already initialized - } else { - ov::Strides src_str, dst_str; - // Calculate src and dst shapes - bool found_step = false; - for (size_t inverted_idx = rank - 1; inverted_idx < rank; --inverted_idx) { - if (!found_step) { - if (src_strides[inverted_idx] == dst_strides[inverted_idx]) { - continue; - } else { - found_step = true; - size_t strides_size = inverted_idx + 1; - // Set right size - src_str.resize(strides_size + 1); - dst_str.resize(strides_size + 1); - max_pos.resize(strides_size + 1); - cur_pos.resize(strides_size + 1); - // In case of default continuous strides we can copy several elements - // In other case only one element - size_t dim = 1; - size_t strides = elem_size; - - if (strides_size < src_strides.size()) { - strides = src_strides[strides_size]; - dim = shape[strides_size]; - } - src_str[strides_size] = strides; - dst_str[strides_size] = strides; - max_pos[strides_size] = dim; - cur_pos[strides_size] = max_pos[strides_size] - 1; - } - } - src_str[inverted_idx] = src_strides[inverted_idx]; - dst_str[inverted_idx] = dst_strides[inverted_idx]; - max_pos[inverted_idx] = shape[inverted_idx]; - cur_pos[inverted_idx] = max_pos[inverted_idx] - 1; - } - src_strides = std::move(src_str); - dst_strides = std::move(dst_str); + ov::Shape idx(rank - 1, 0); + for (size_t d = 0; d + 1 < rank; ++d) { + idx[d] = shape[d] - 1; } - size_t src_off = 0; - size_t dst_off = 0; - for (size_t d = 0; d < max_pos.size(); ++d) { - src_off += cur_pos[d] * src_strides[d]; - dst_off += cur_pos[d] * dst_strides[d]; - } + auto compute_offset = [&](const ov::Shape& outer, const ov::Strides& strides_bytes) -> size_t { + size_t off = 0; + for (size_t d = 0; d < outer.size(); ++d) { + off += outer[d] * strides_bytes[d]; + } + return off; + }; - auto dec_index_and_update_offsets = [&]() -> bool { - for (int d = static_cast(max_pos.size()) - 1; d >= 0; --d) { - const size_t old = cur_pos[static_cast(d)]; - if (old > 0) { - cur_pos[static_cast(d)] = old - 1; - src_off -= src_strides[static_cast(d)]; - dst_off -= dst_strides[static_cast(d)]; + auto dec_outer = [&]() -> bool { + for (int d = static_cast(rank) - 2; d >= 0; --d) { + const size_t ud = static_cast(d); + if (idx[ud] > 0) { + --idx[ud]; return true; - } else { - cur_pos[static_cast(d)] = max_pos[static_cast(d)] - 1; - src_off += src_strides[static_cast(d)] * (max_pos[static_cast(d)] - 1); - dst_off += dst_strides[static_cast(d)] * (max_pos[static_cast(d)] - 1); } + idx[ud] = shape[ud] - 1; } return false; }; while (true) { + const size_t src_off = compute_offset(idx, 
src_strides); + const size_t dst_off = compute_offset(idx, dst_strides); + uint8_t* src_ptr = base + src_off; uint8_t* dst_ptr = base + dst_off; - if (src_ptr != dst_ptr) { - std::memmove(dst_ptr, src_ptr, src_strides[src_strides.size() - 1]); + std::memmove(dst_ptr, src_ptr, row_bytes); } - if (!dec_index_and_update_offsets()) { + if (!dec_outer()) { break; } } @@ -329,7 +470,7 @@ void ov::npuw::util::copy_inplace(ov::SoPtr src_tensor, ov::SoPtrdata() == dst_tensor->data() -void ov::npuw::util::copy_tensor_inplace_by_dim(ov::SoPtr src_tensor, +void ov::npuw::util::copy_tensor_inplace_by_dim(const ov::SoPtr src_tensor, ov::SoPtr dst_tensor, uint32_t kv_dim_src, uint32_t kv_dim_dst) { @@ -352,13 +493,9 @@ void ov::npuw::util::copy_tensor_inplace_by_dim(ov::SoPtr src_tenso OPENVINO_ASSERT(src_shape == dst_shape); OPENVINO_ASSERT(kv_dim_src < src_shape.size()); - if (kv_dim_src == 3u) { - copy_inplace_columns_by_row_chunks(src_tensor, dst_tensor); - } else if (kv_dim_src == 2u) { - copy_inplace_by_planes(src_tensor, dst_tensor); - } else { - copy_inplace(src_tensor, dst_tensor); - } + // One generic implementation for all kv_dim. + // We rely on row-wise memmove on the (possibly flattened) last dimension and stride-based addressing. + copy_inplace_generic_rows(src_tensor, dst_tensor); } std::optional> ov::npuw::util::find_port_by_name( @@ -371,4 +508,4 @@ std::optional> ov::npuw::util::find_port_by_name( return std::nullopt; } return std::make_optional(*it); -} +} \ No newline at end of file diff --git a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.hpp b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.hpp index b47bea1579bdec..d15c841b117ab7 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.hpp @@ -33,13 +33,9 @@ void copy_tensor_by_dim(ov::SoPtr src_tensor, uint32_t kv_dim_src, uint32_t kv_dim_dst); -void copy_inplace_columns_by_row_chunks(ov::SoPtr src, ov::SoPtr& dst); +void copy_inplace_generic_rows(const ov::SoPtr src_tensor, ov::SoPtr dst_tensor); -void copy_inplace_by_planes(ov::SoPtr src_tensor, ov::SoPtr dst_tensor); - -void copy_inplace(ov::SoPtr src_tensor, ov::SoPtr dst_tensor); - -void copy_tensor_inplace_by_dim(ov::SoPtr src_tensor, +void copy_tensor_inplace_by_dim(const ov::SoPtr src_tensor, ov::SoPtr dst_tensor, uint32_t kv_dim_src, uint32_t kv_dim_dst); diff --git a/src/plugins/intel_npu/src/plugin/npuw/util.cpp b/src/plugins/intel_npu/src/plugin/npuw/util.cpp index 80eb0aeeb590f0..56917dc8cc835c 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/util.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/util.cpp @@ -280,8 +280,9 @@ void ov::npuw::util::unpack(const ov::SoPtr& from, unpack_nf4f16(from, scale, to, unpack_options); } else if (type_from == ov::element::f8e4m3 || type_from == ov::element::f8e5m2 || type_from == ov::element::f8e8m0) { - // FIXME: Implement XARCH::unpack + LOG_INFO("######################## unpack_f8f16"); unpack_f8f16(from, scale, to, unpack_options); + //ov::npuw::util::XARCH::unpack_f8f16_scale(from, scale, to, unpack_options); } else if (type_from == ov::element::f16) { // FIXME: Implement XARCH::unpack unpack_f16f16(from, scale, to, unpack_options); From aa539a2330876b705d2174cc13decd2f2cb0b94d Mon Sep 17 00:00:00 2001 From: DingZhangIntel Date: Tue, 6 Jan 2026 20:56:00 +0800 Subject: [PATCH 05/13] Add tests --- .../tests/unit/npuw/copy_inplace.cpp | 208 ++++++++++++++++++ .../tests/unit/npuw/copy_inplace.hpp | 149 +++++++++++++ 2 files 
changed, 357 insertions(+) create mode 100644 src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp create mode 100644 src/plugins/intel_npu/tests/unit/npuw/copy_inplace.hpp diff --git a/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp new file mode 100644 index 00000000000000..fb3225b7c5d4aa --- /dev/null +++ b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp @@ -0,0 +1,208 @@ +// Copyright (C) 2026 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#ifdef HAVE_AVX2 +# include "copy_inplace.hpp" + +namespace { + +static ov::Strides make_padded_strides_keep_tail_default(const ov::Shape& shape, + const ov::element::Type& et, + size_t kv_dim, + size_t pad_elems) { + ov::Strides s = copy_inplace_details::default_byte_strides(shape, et); + + // Keep last 2 dims default contiguous explicitly. + if (shape.size() >= 1) { + s.back() = et.size(); + } + if (shape.size() >= 2) { + s[shape.size() - 2] = shape.back() * et.size(); + } + + const size_t rank = shape.size(); + if (rank <= 2) { + return s; + } + + const size_t last2_begin = rank - 2; + for (size_t d = 0; d < last2_begin; ++d) { + if (d <= kv_dim) { + s[d] += pad_elems * et.size(); + } + } + return s; +} + +static std::vector to_i8(const std::vector& v) { + std::vector out(v.size()); + std::memcpy(out.data(), v.data(), v.size()); + return out; +} + +void CopyInplaceTestsBase::make_input() { + const auto elem_bytes = copy_inplace_details::elem_size_bytes(type); + const auto total_elems = ov::shape_size(shape); + ASSERT_GT(total_elems, 0u); + + auto max_offset = [&](const ov::Strides& strides) -> size_t { + size_t off = 0; + for (size_t d = 0; d < shape.size(); ++d) { + off += (shape[d] - 1) * strides[d]; + } + return off; + }; + + const size_t src_max = max_offset(src_strides); + const size_t dst_max = max_offset(dst_strides); + const size_t byte_size = std::max(src_max, dst_max) + elem_bytes; + + base_bytes_initial.resize(byte_size); + ref_bytes.assign(byte_size, 0); + out_bytes.assign(byte_size, 0); + + std::mt19937 rng(42); + std::uniform_int_distribution dist(0, 255); + for (size_t i = 0; i < base_bytes_initial.size(); ++i) { + base_bytes_initial[i] = static_cast(dist(rng)); + } + + // External-memory tensor (safe for unit test lifetime). 
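+    // Note: this ov::Tensor constructor wraps base_bytes_initial.data() without copying,
+    // so the src/dst views built over baseTensor later alias one allocation, which is
+    // the shared-buffer precondition copy_tensor_inplace_by_dim asserts.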
+ baseTensor = ov::Tensor(ov::element::u8, ov::Shape{byte_size}, base_bytes_initial.data()); +} + +void CopyInplaceTestsBase::make_views() { + src_strides = copy_inplace_details::default_byte_strides(shape, type); + + const size_t pad_elems = 13; + dst_strides = make_padded_strides_keep_tail_default(shape, type, kv_dim, pad_elems); + + void* base_ptr = baseTensor.data(); + ASSERT_NE(base_ptr, nullptr); + + srcView = ov::Tensor(type, shape, base_ptr, src_strides); + dstView = ov::Tensor(type, shape, base_ptr, dst_strides); +} + +bool CopyInplaceTestsBase::isNegative() const { + if (shape.size() < 2) { + return true; + } + if (kv_dim >= shape.size()) { + return true; + } + if (type.bitwidth() < 8) { + return true; + } + return false; +} + +void CopyInplaceTestsBase::make_ref_output() { + ref_bytes = base_bytes_initial; + + const auto elem_bytes = copy_inplace_details::elem_size_bytes(type); + const uint8_t* base_in = base_bytes_initial.data(); + + std::vector tmp_out = base_bytes_initial; + + ov::Shape idx(shape.size(), 0); + std::vector elem(elem_bytes); + + for (;;) { + copy_inplace_details::read_elem_bytes(base_in, idx, src_strides, elem_bytes, elem.data()); + copy_inplace_details::write_elem_bytes(tmp_out.data(), idx, dst_strides, elem_bytes, elem.data()); + + if (!copy_inplace_details::next_index(idx, shape)) { + break; + } + } + + ref_bytes = std::move(tmp_out); +} + +void CopyInplaceTestsBase::SetUp(const CopyInplaceTestsParams& getParam) { + ShapesInitializer shapeInit; + ov::element::Type_t t; + std::tie(t, shapeInit, kv_dim) = getParam; + + type = ov::element::Type(t); + + std::vector dims; + shapeInit(dims); + shape = ov::Shape{dims.begin(), dims.end()}; + + // Precompute strides first (no base pointer needed) + src_strides = copy_inplace_details::default_byte_strides(shape, type); + const size_t pad_elems = 13; + dst_strides = make_padded_strides_keep_tail_default(shape, type, kv_dim, pad_elems); + + // Now allocate/fill buffer + make_input(); + + // Create views (needs baseTensor pointer) + void* base_ptr = baseTensor.data(); + ASSERT_NE(base_ptr, nullptr); + srcView = ov::Tensor(type, shape, base_ptr, src_strides); + dstView = ov::Tensor(type, shape, base_ptr, dst_strides); + + if (!isNegative()) { + make_ref_output(); + } +} + +std::string CopyInplaceTestsBase::ToString() const { + std::ostringstream oss; + oss << "["; + for (size_t i = 0; i < shape.size(); ++i) { + oss << shape[i] << ((i + 1 == shape.size()) ? 
"" : "x"); + } + oss << "]" + << "_type_" << type << "_kv_" << kv_dim; + return oss.str(); +} + +TEST_P(CopyInplaceTests, copy_tensor_inplace_by_dim_correctness) { + ASSERT_NO_THROW_IF(!isNegative(), { + auto src_it = ov::get_tensor_impl(srcView); + auto dst_it = ov::get_tensor_impl(dstView); + + ov::npuw::util::copy_tensor_inplace_by_dim(src_it, + dst_it, + static_cast(kv_dim), + static_cast(kv_dim)); + + uint8_t* base_ptr = baseTensor.data(); + ASSERT_NE(base_ptr, nullptr); + out_bytes.assign(base_ptr, base_ptr + out_bytes.size()); + + // test_utils.hpp defines details::ArraysMatch for vector + ASSERT_TRUE(details::ArraysMatch(to_i8(out_bytes), to_i8(ref_bytes))); + }); +} + +// Test cases +const auto TestCases = ::testing::Combine( + ::testing::ValuesIn({ov::element::Type_t::i8, ov::element::Type_t::f16, ov::element::Type_t::f32}), + details::ShapesIn({ + Tensors{ input = {1, 2, 3, 4}; +} // namespace +, Tensors { + input = {1, 8, 16, 32}; +} +, Tensors { + input = {1, 16, 33, 64}; +} +, Tensors { + input = {1, 4, 128, 16}; +} +, +}), + ::testing::Values(0, 1, 2, 3) +); + +INSTANTIATE_TEST_SUITE_P(CopyInplaceTests, CopyInplaceTests, TestCases, CopyInplaceTests::getTestCaseName); + +} // namespace + +#endif // HAVE_AVX2 \ No newline at end of file diff --git a/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.hpp b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.hpp new file mode 100644 index 00000000000000..b98d355396f333 --- /dev/null +++ b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.hpp @@ -0,0 +1,149 @@ +// Copyright (C) 2026 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "infer_request_utils.hpp" // copy_tensor_inplace_by_dim +#include "openvino/runtime/make_tensor.hpp" +#include "openvino/runtime/tensor.hpp" +#include "test_utils.hpp" + +namespace { + +// NOTE: do NOT redefine ASSERT_NO_THROW_* macros here. +// They already exist in test_utils.hpp and warnings are treated as errors. + +// (type, shape, kv_dim) +using CopyInplaceTestsParams = std::tuple; + +namespace copy_inplace_details { + +inline ov::Strides default_byte_strides(const ov::Shape& shape, const ov::element::Type& et) { + ov::Strides strides(shape.size(), 0); + if (!strides.empty()) { + strides.back() = et.size(); + for (size_t i = shape.size() - 1; i > 0; --i) { + strides[i - 1] = strides[i] * shape[i]; + } + } + return strides; +} + +inline size_t elem_size_bytes(const ov::element::Type& et) { + return et.size(); +} + +inline void read_elem_bytes(const uint8_t* base, + const ov::Shape& idx, + const ov::Strides& strides, + size_t elem_bytes, + uint8_t* out_elem) { + size_t off = 0; + for (size_t d = 0; d < idx.size(); ++d) { + off += idx[d] * strides[d]; + } + std::memcpy(out_elem, base + off, elem_bytes); +} + +inline void write_elem_bytes(uint8_t* base, + const ov::Shape& idx, + const ov::Strides& strides, + size_t elem_bytes, + const uint8_t* elem) { + size_t off = 0; + for (size_t d = 0; d < idx.size(); ++d) { + off += idx[d] * strides[d]; + } + std::memcpy(base + off, elem, elem_bytes); +} + +// Enumerate ND index in lexicographic order. 
+inline bool next_index(ov::Shape& idx, const ov::Shape& shape) { + // shape is assumed non-empty and all dims > 0 in this test suite + for (int d = static_cast(shape.size()) - 1; d >= 0; --d) { + const size_t ud = static_cast(d); + if (++idx[ud] < shape[ud]) { + return true; + } + idx[ud] = 0; + } + return false; +} + +} // namespace copy_inplace_details + +class CopyInplaceTestsBase { +protected: + ov::element::Type type; + ov::Tensor baseTensor; // shared buffer owner (u8) + ov::Tensor srcView; + ov::Tensor dstView; + ov::Shape shape; + + std::vector base_bytes_initial; + std::vector ref_bytes; + std::vector out_bytes; + + std::size_t kv_dim = 0; + + ov::Strides src_strides; + ov::Strides dst_strides; + + void make_input(); + void make_views(); + void make_ref_output(); + bool isNegative() const; + +public: + void SetUp(const CopyInplaceTestsParams& getParam); + std::string ToString() const; +}; + +template +class CopyInplaceTestsTmpl : public ::testing::Test, + public T, + public ::testing::WithParamInterface { +protected: + void SetUp() override { + T::SetUp(GetParam()); + } + +public: + static std::string getTestCaseName(const testing::TestParamInfo& obj) { + ov::element::Type_t t; + ShapesInitializer shapeInit; + std::size_t kv_dim = 0; + std::tie(t, shapeInit, kv_dim) = obj.param; + + std::vector dims; + shapeInit(dims); + + std::ostringstream oss; + oss << "S"; + for (size_t i = 0; i < dims.size(); ++i) { + oss << dims[i]; + if (i + 1 != dims.size()) + oss << "x"; + } + oss << "_T" << ov::element::Type(t) << "_KV" << kv_dim; + return oss.str(); + } +}; + +using CopyInplaceTests = CopyInplaceTestsTmpl; + +} // anonymous namespace \ No newline at end of file From c874ad422db73e3f0578b863687a2eeb7951352e Mon Sep 17 00:00:00 2001 From: DingZhangIntel Date: Thu, 15 Jan 2026 17:08:14 +0800 Subject: [PATCH 06/13] add unit tests and optimize --- .../src/plugin/npuw/infer_request_utils.cpp | 101 +++++++++++------- .../src/plugin/npuw/llm_infer_request.cpp | 40 +++---- .../intel_npu/src/plugin/npuw/util.cpp | 3 +- .../tests/unit/npuw/copy_inplace.cpp | 35 ++++-- 4 files changed, 112 insertions(+), 67 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp index 131f093289f4e8..9e734064b331cc 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp @@ -343,25 +343,66 @@ void ov::npuw::util::copy_inplace_generic_rows(const ov::SoPtr src_ OPENVINO_ASSERT(src_strides0.size() == rank0); OPENVINO_ASSERT(dst_strides0.size() == rank0); - // Build default byte strides for given shape (same as ov::ITensor::copy_to logic). ov::Strides default_strides(rank0, 0); default_strides[rank0 - 1] = elem_size; for (size_t i = rank0 - 1; i > 0; --i) { default_strides[i - 1] = default_strides[i] * shape0[i]; } - // Your explicit preconditions: + auto* base = static_cast(base_data); + + auto compute_offset = [&](const ov::Shape& ix, const ov::Strides& strides_bytes) -> size_t { + size_t off = 0; + for (size_t d = 0; d < ix.size(); ++d) { + off += ix[d] * strides_bytes[d]; + } + return off; + }; + + // --------------------------------------------------------------------- + // Last dimension not packed in either src or dst. + // We cannot memmove row_bytes as a contiguous block. Do element-wise memmove. + // Keep reverse lexicographic order to be overlap-safe for in-place move. 
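+    // (Rationale, under the intended KV-cache assumption that the destination view's strides are
+    //  equal to or larger than the source view's: every element's destination offset is then >= its
+    //  source offset, so visiting indices from last to first never overwrites a source element that
+    //  has not been read yet.)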
+ // --------------------------------------------------------------------- + if (src_strides0[rank0 - 1] != elem_size || dst_strides0[rank0 - 1] != elem_size) { + ov::Shape idx(shape0.size(), 0); + for (size_t d = 0; d < rank0; ++d) { + idx[d] = shape0[d] - 1; + } + + auto dec_idx = [&]() -> bool { + for (int d = static_cast(rank0) - 1; d >= 0; --d) { + const size_t ud = static_cast(d); + if (idx[ud] > 0) { + --idx[ud]; + return true; + } + idx[ud] = shape0[ud] - 1; + } + return false; + }; + + while (true) { + const size_t src_off = compute_offset(idx, src_strides0); + const size_t dst_off = compute_offset(idx, dst_strides0); + + uint8_t* src_ptr = base + src_off; + uint8_t* dst_ptr = base + dst_off; + if (src_ptr != dst_ptr) { + std::memmove(dst_ptr, src_ptr, elem_size); + } + + if (!dec_idx()) { + break; + } + } + return; + } + OPENVINO_ASSERT(src_strides0[rank0 - 1] == elem_size); OPENVINO_ASSERT(dst_strides0[rank0 - 1] == elem_size); OPENVINO_ASSERT(default_strides[rank0 - 1] == elem_size); - if (rank0 >= 2) { - const size_t packed = shape0[rank0 - 1] * elem_size; - OPENVINO_ASSERT(src_strides0[rank0 - 2] == packed); - OPENVINO_ASSERT(dst_strides0[rank0 - 2] == packed); - OPENVINO_ASSERT(default_strides[rank0 - 2] == packed); - } - // Find the COMMON trailing segment where src_stride == dst_stride == default_stride. // This is the only part eligible for flattening. size_t cut = rank0 - 1; // at worst, we can always copy along last dim @@ -401,7 +442,6 @@ void ov::npuw::util::copy_inplace_generic_rows(const ov::SoPtr src_ shape.push_back(folded_last); // For the folded last dim, the step is element-size (bytes per element). - // (Since the whole folded tail is default-contiguous, this holds.) src_strides.push_back(elem_size); dst_strides.push_back(elem_size); @@ -414,45 +454,34 @@ void ov::npuw::util::copy_inplace_generic_rows(const ov::SoPtr src_ return; } - // Iterate outer coordinates in reverse lexicographic order for overlap-safe memmove. 
- size_t num_rows = 1; - for (size_t d = 0; d + 1 < rank; ++d) { - num_rows *= shape[d]; - } - if (num_rows == 0) { - return; - } - - auto* base = static_cast(base_data); - - ov::Shape idx(rank - 1, 0); + ov::Shape outer(rank - 1, 0); for (size_t d = 0; d + 1 < rank; ++d) { - idx[d] = shape[d] - 1; + outer[d] = shape[d] - 1; } - auto compute_offset = [&](const ov::Shape& outer, const ov::Strides& strides_bytes) -> size_t { - size_t off = 0; - for (size_t d = 0; d < outer.size(); ++d) { - off += outer[d] * strides_bytes[d]; - } - return off; - }; - auto dec_outer = [&]() -> bool { for (int d = static_cast(rank) - 2; d >= 0; --d) { const size_t ud = static_cast(d); - if (idx[ud] > 0) { - --idx[ud]; + if (outer[ud] > 0) { + --outer[ud]; return true; } - idx[ud] = shape[ud] - 1; + outer[ud] = shape[ud] - 1; } return false; }; + auto compute_outer_offset = [&](const ov::Shape& o, const ov::Strides& strides_bytes) -> size_t { + size_t off = 0; + for (size_t d = 0; d < o.size(); ++d) { + off += o[d] * strides_bytes[d]; + } + return off; + }; + while (true) { - const size_t src_off = compute_offset(idx, src_strides); - const size_t dst_off = compute_offset(idx, dst_strides); + const size_t src_off = compute_outer_offset(outer, src_strides); + const size_t dst_off = compute_outer_offset(outer, dst_strides); uint8_t* src_ptr = base + src_off; uint8_t* dst_ptr = base + dst_off; diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp index 4bc9a28b72335b..21a82380d072af 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp @@ -12,6 +12,7 @@ #include "openvino/core/parallel.hpp" #include "openvino/runtime/iasync_infer_request.hpp" #include "util.hpp" +#include "perf.hpp" namespace { @@ -597,10 +598,10 @@ void ov::npuw::LLMInferRequest::copy_kvcache() { // move_tensor_inplace_by_dim currently supports only kv_dim_src == kv_dim_dst. if (m_past_kv_bound) { if (pre_kv_dim == gen_kv_dim) { - prefill_past_kv_chunks = make_tensor_slice(prefill_past_kv, - pre_kv_dim, - 0u, - static_cast(tokens_in_past_chunks)); + prefill_past_kv_chunks = uu::make_tensor_slice(prefill_past_kv, + pre_kv_dim, + 0u, + static_cast(tokens_in_past_chunks)); uu::copy_tensor_inplace_by_dim(prefill_past_kv_chunks, kvcache_past_kv_chunks, @@ -612,17 +613,17 @@ void ov::npuw::LLMInferRequest::copy_kvcache() { m_pre_alloc_device, m_npuw_llm_compiled_model->get_plugin()); prefill_past_kv->copy_to(tmp_dense_kv_tensor._ptr); - prefill_past_kv_chunks = make_tensor_slice(tmp_dense_kv_tensor, - pre_kv_dim, - 0u, - static_cast(tokens_in_past_chunks)); + prefill_past_kv_chunks = uu::make_tensor_slice(tmp_dense_kv_tensor, + pre_kv_dim, + 0u, + static_cast(tokens_in_past_chunks)); uu::copy_tensor_by_dim(prefill_past_kv_chunks, kvcache_past_kv_chunks, pre_kv_dim, gen_kv_dim); } } else { - prefill_past_kv_chunks = make_tensor_slice(prefill_past_kv, - pre_kv_dim, - 0u, - static_cast(tokens_in_past_chunks)); + prefill_past_kv_chunks = uu::make_tensor_slice(prefill_past_kv, + pre_kv_dim, + 0u, + static_cast(tokens_in_past_chunks)); uu::copy_tensor_by_dim(prefill_past_kv_chunks, kvcache_past_kv_chunks, pre_kv_dim, gen_kv_dim); } } @@ -975,13 +976,14 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids, if (!m_generate_initialized) { LOG_DEBUG("Copy kv-cache from prefill to generate model."); if (kvcache_desc.num_stored_tokens > 0) { - // Start counting time. 
- auto t_start = std::chrono::high_resolution_clock::now(); - copy_kvcache(); - // End counting time. - auto t_end = std::chrono::high_resolution_clock::now(); - auto duration_ms = std::chrono::duration_cast(t_end - t_start).count(); - LOG_INFO("cost of copy_kvcache(): " << duration_ms << " ms"); + using MS = ov::npuw::perf::metric; + MS m_ms_copy_kvcache("copy_kvcache", /*active*/ true); + + m_ms_copy_kvcache += ov::npuw::perf::ms_to_run([&]() { + copy_kvcache(); + }); + + LOG_INFO("cost of copy_kvcache(): " << m_ms_copy_kvcache.med() << " ms"); } LOG_DEBUG("Prepare inputs."); diff --git a/src/plugins/intel_npu/src/plugin/npuw/util.cpp b/src/plugins/intel_npu/src/plugin/npuw/util.cpp index 56917dc8cc835c..80eb0aeeb590f0 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/util.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/util.cpp @@ -280,9 +280,8 @@ void ov::npuw::util::unpack(const ov::SoPtr& from, unpack_nf4f16(from, scale, to, unpack_options); } else if (type_from == ov::element::f8e4m3 || type_from == ov::element::f8e5m2 || type_from == ov::element::f8e8m0) { - LOG_INFO("######################## unpack_f8f16"); + // FIXME: Implement XARCH::unpack unpack_f8f16(from, scale, to, unpack_options); - //ov::npuw::util::XARCH::unpack_f8f16_scale(from, scale, to, unpack_options); } else if (type_from == ov::element::f16) { // FIXME: Implement XARCH::unpack unpack_f16f16(from, scale, to, unpack_options); diff --git a/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp index fb3225b7c5d4aa..5f8c4f1de9c07e 100644 --- a/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp +++ b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp @@ -6,32 +6,47 @@ # include "copy_inplace.hpp" namespace { - static ov::Strides make_padded_strides_keep_tail_default(const ov::Shape& shape, const ov::element::Type& et, size_t kv_dim, size_t pad_elems) { ov::Strides s = copy_inplace_details::default_byte_strides(shape, et); - // Keep last 2 dims default contiguous explicitly. - if (shape.size() >= 1) { - s.back() = et.size(); + const size_t rank = shape.size(); + if (rank == 0) { + return s; } - if (shape.size() >= 2) { - s[shape.size() - 2] = shape.back() * et.size(); + + // Keep last 2 dims default contiguous explicitly. + s[rank - 1] = et.size(); + if (rank >= 2) { + s[rank - 2] = shape[rank - 1] * et.size(); } - const size_t rank = shape.size(); if (rank <= 2) { return s; } const size_t last2_begin = rank - 2; - for (size_t d = 0; d < last2_begin; ++d) { - if (d <= kv_dim) { - s[d] += pad_elems * et.size(); + + // If kv_dim is in the last 2 dims, "keep tail default" means we should not pad there. + if (kv_dim >= last2_begin) { + // Recompute outer strides consistently (no padding) + for (size_t d = last2_begin; d-- > 0;) { + s[d] = s[d + 1] * shape[d + 1]; + } + return s; + } + + // Recompute strides from inner to outer; at kv_dim insert a gap measured in *inner blocks*. 
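+    // (Illustrative numbers: shape {1, 8, 16, 32}, f32, kv_dim = 1, pad_elems = 13 yields default
+    //  byte strides {16384, 2048, 128, 4} and padded byte strides {29696, 3712, 128, 4}.)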
+ for (size_t d = last2_begin; d-- > 0;) { + s[d] = s[d + 1] * shape[d + 1]; + if (d == kv_dim) { + // pad_elems is number of extra "inner blocks" after each index-step in kv_dim + s[d] += pad_elems * s[d + 1]; } } + return s; } From 42f5cf5c8d5604256f11ae7fa8bb3cc848072cf4 Mon Sep 17 00:00:00 2001 From: DingZhangIntel Date: Thu, 15 Jan 2026 17:43:39 +0800 Subject: [PATCH 07/13] Fix --- .../src/plugin/npuw/infer_request_utils.cpp | 162 ------------------ 1 file changed, 162 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp index 9e734064b331cc..c51381b5c04eb1 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp @@ -1,165 +1,3 @@ -// // Copyright (C) 2025 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 -// // - -// #include "infer_request_utils.hpp" - -// #include "logging.hpp" -// #include "openvino/runtime/make_tensor.hpp" // get_tensor_impl -// #include "util_xarch.hpp" - -// // FIXME: Use ov::npuw::util::view instead -// ov::SoPtr ov::npuw::util::make_tensor_slice(ov::SoPtr tensor, -// uint32_t dim, -// uint32_t start_pos, -// uint32_t end_pos) { -// ov::Shape start_shape(std::vector(tensor->get_shape().size(), 0u)); -// start_shape[dim] = start_pos; -// ov::Shape end_shape = tensor->get_shape(); -// end_shape[dim] = end_pos; -// return ov::get_tensor_impl(ov::Tensor(ov::make_tensor(tensor), start_shape, end_shape)); -// } - -// void ov::npuw::util::copy_to_right(const ov::SoPtr& src, const ov::SoPtr& dst) { -// OPENVINO_ASSERT(src->get_byte_size() <= dst->get_byte_size()); -// std::copy_n(reinterpret_cast(src->data()), -// src->get_byte_size(), -// reinterpret_cast(dst->data()) + dst->get_byte_size() - src->get_byte_size()); -// } - -// void ov::npuw::util::copy_by_planes(ov::SoPtr src_tensor, ov::SoPtr dst_tensor) { -// // [1, H, S1, E] -> [1, H, S2, E] -// const int N = 0; -// const int H = 1; -// const int S = 2; -// const int E = 3; - -// OPENVINO_ASSERT(src_tensor->get_shape()[N] == dst_tensor->get_shape()[N]); -// OPENVINO_ASSERT(src_tensor->get_shape()[H] == dst_tensor->get_shape()[H]); -// OPENVINO_ASSERT(src_tensor->get_shape()[E] == dst_tensor->get_shape()[E]); -// OPENVINO_ASSERT(src_tensor->get_element_type() == dst_tensor->get_element_type()); -// OPENVINO_ASSERT(src_tensor->get_shape()[N] == 1u); -// OPENVINO_ASSERT(src_tensor->get_shape().size() == 4u); - -// const auto* src_tensor_data = reinterpret_cast(src_tensor->data()); -// auto* dst_tensor_data = reinterpret_cast(dst_tensor->data()); - -// const auto num_planes = src_tensor->get_shape()[H]; -// const auto src_plane_stride = src_tensor->get_strides()[H]; -// const auto dst_plane_stride = dst_tensor->get_strides()[H]; -// const auto plane_size_in_bytes = src_tensor->get_strides()[S] * src_tensor->get_shape()[S]; - -// for (size_t i = 0; i < num_planes; ++i) { -// std::copy_n(src_tensor_data, plane_size_in_bytes, dst_tensor_data); -// dst_tensor_data += dst_plane_stride; -// src_tensor_data += src_plane_stride; -// } -// } - -// void ov::npuw::util::copy_columns_by_row_chunks(ov::SoPtr src, ov::SoPtr& dst) { -// /* -// src/dst layout: [1, heads, emb_size, seq_len] - -// X[*,i] - embedding for i-th token, -// Instead of copy columns, copy rows X[i,*] - -// [[X00 X01 ... X0n] [[X00 X01 ... X0n] -// [X10 X11 ... X1n] [X10 X11 ... X1n] -// [X20 X21 ... X2n] ... [X20 X21 ... X2n] -// ... ... -// [Xm0 Xm1 ... 
Xmn]] [Xm0 Xm1 ... Xmn]] -// */ - -// const auto& src_shape = src->get_shape(); - -// OPENVINO_ASSERT(src_shape.size() == 4u); -// OPENVINO_ASSERT(src_shape == dst->get_shape()); -// OPENVINO_ASSERT(src->get_byte_size() == dst->get_byte_size()); - -// const auto& src_strides = src->get_strides(); -// const auto& dst_strides = dst->get_strides(); -// const auto elem_size = src->get_byte_size() / src->get_size(); - -// const auto C = src_shape[1]; -// const auto H = src_shape[2]; -// const auto W = src_shape[3]; - -// const auto IS_H = src_strides[2]; -// const auto OS_H = dst_strides[2]; - -// const size_t chunk_byte_size = W * elem_size; - -// const auto* src_p = static_cast(src->data()); -// auto* dst_p = static_cast(dst->data()); - -// for (size_t i = 0; i < C * H; ++i) { -// const size_t src_offset = i * IS_H; -// const size_t dst_offset = i * OS_H; -// std::copy_n(src_p + src_offset, chunk_byte_size, dst_p + dst_offset); -// } -// } - -// void ov::npuw::util::copy_tensor_by_dim(ov::SoPtr src_tensor, -// ov::SoPtr dst_tensor, -// uint32_t kv_dim_src, -// uint32_t kv_dim_dst) { -// if (kv_dim_src != kv_dim_dst) { -// // new case - do a generic copy for now (in fact it is a permute) -// // Example: -// // kv_dim_src kv_dim_dst -// // v v -// // [1,8,256,128] --> [1,8,128,256] -// const auto& src_shape = src_tensor->get_shape(); -// const auto& dst_shape = dst_tensor->get_shape(); -// NPUW_ASSERT(src_shape.size() == 4); -// NPUW_ASSERT(dst_shape.size() == 4); -// NPUW_ASSERT(kv_dim_src < 4); -// NPUW_ASSERT(kv_dim_dst < 4); -// NPUW_ASSERT(src_shape[kv_dim_src] == dst_shape[kv_dim_dst]); - -// std::array axis = {0, 1, 2, 3}; -// // Remap like 0,1,2,3 => 0,1,3,2 (see example) -// std::swap(axis[kv_dim_src], axis[kv_dim_dst]); -// ov::npuw::util::permute_i4d(src_tensor, dst_tensor, axis); -// return; -// } -// // Old behavior -// NPUW_ASSERT(kv_dim_src == kv_dim_dst); -// if (kv_dim_src == 3u) { -// // Asserting that we work with last dimenston here: -// const auto& src_shape = src_tensor->get_shape(); -// OPENVINO_ASSERT(src_shape.size() == 4); -// // If last dimenstion of src_tensor is equal to 1, then we can squeeze -// // src_shape from [1, heads, d_v, seq_len=1] to [heads, d_v]. -// // We can then treat src_tensor as a continuous tensor of row value vectors -// // for multiple heads, while dst_tensor will still have [1, heads, d_v, seq_len!=1], -// // shape, awaiting updates at column dimension, as value vectors are columns now. 
-// if (src_shape[kv_dim_src] == 1 && src_tensor->is_continuous()) { -// // FIXME: ov::npuw::util::XARCH::copy_row_as_column(src_tensor, dst_tensor) throws when used here -// copy_columns_by_row_chunks(src_tensor, dst_tensor); -// } else { -// copy_columns_by_row_chunks(src_tensor, dst_tensor); -// } -// } else if (kv_dim_src == 2u) { -// copy_by_planes(src_tensor, dst_tensor); -// } else { -// src_tensor->copy_to(dst_tensor._ptr); -// } -// } - -// std::optional> ov::npuw::util::find_port_by_name( -// const std::vector>& ports, -// const std::string& name) { -// auto it = std::find_if(ports.begin(), ports.end(), [&](const auto& port) { -// return port.get_names().count(name) != 0; -// }); -// if (it == ports.end()) { -// return std::nullopt; -// } -// return std::make_optional(*it); -// } - -////////////////////////////////////////////////////////////////////// // Copyright (C) 2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // From d807b919b6d914259fd215b7aa66d3b1e4afdc7d Mon Sep 17 00:00:00 2001 From: DingZhangIntel Date: Thu, 15 Jan 2026 17:57:13 +0800 Subject: [PATCH 08/13] Fix --- src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp | 2 +- src/plugins/intel_npu/tests/unit/npuw/copy_inplace.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp index 5f8c4f1de9c07e..f1ebe0197a2a8a 100644 --- a/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp +++ b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp @@ -220,4 +220,4 @@ INSTANTIATE_TEST_SUITE_P(CopyInplaceTests, CopyInplaceTests, TestCases, CopyInpl } // namespace -#endif // HAVE_AVX2 \ No newline at end of file +#endif // HAVE_AVX2 diff --git a/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.hpp b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.hpp index b98d355396f333..d3ec609ca51f68 100644 --- a/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.hpp +++ b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.hpp @@ -146,4 +146,4 @@ class CopyInplaceTestsTmpl : public ::testing::Test, using CopyInplaceTests = CopyInplaceTestsTmpl; -} // anonymous namespace \ No newline at end of file +} // anonymous namespace From e4e9fad36e86b2445136cba1127d630685e25526 Mon Sep 17 00:00:00 2001 From: DingZhangIntel Date: Fri, 16 Jan 2026 17:07:58 +0800 Subject: [PATCH 09/13] Format --- src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp index 775b57491b0815..1063d3768b4606 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp @@ -11,8 +11,8 @@ #include "logging.hpp" #include "openvino/core/parallel.hpp" #include "openvino/runtime/iasync_infer_request.hpp" -#include "util.hpp" #include "perf.hpp" +#include "util.hpp" namespace { From 03137612d0fbc2cc20adcf524b035a42fceb0bfc Mon Sep 17 00:00:00 2001 From: DingZhangIntel Date: Fri, 16 Jan 2026 17:44:26 +0800 Subject: [PATCH 10/13] Fix --- .../tests/unit/npuw/copy_inplace.cpp | 24 ------------------- .../tests/unit/npuw/copy_inplace.hpp | 2 -- 2 files changed, 26 deletions(-) diff --git a/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp index f1ebe0197a2a8a..59263206fde808 100644 --- 
a/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp +++ b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp @@ -87,19 +87,6 @@ void CopyInplaceTestsBase::make_input() { baseTensor = ov::Tensor(ov::element::u8, ov::Shape{byte_size}, base_bytes_initial.data()); } -void CopyInplaceTestsBase::make_views() { - src_strides = copy_inplace_details::default_byte_strides(shape, type); - - const size_t pad_elems = 13; - dst_strides = make_padded_strides_keep_tail_default(shape, type, kv_dim, pad_elems); - - void* base_ptr = baseTensor.data(); - ASSERT_NE(base_ptr, nullptr); - - srcView = ov::Tensor(type, shape, base_ptr, src_strides); - dstView = ov::Tensor(type, shape, base_ptr, dst_strides); -} - bool CopyInplaceTestsBase::isNegative() const { if (shape.size() < 2) { return true; @@ -166,17 +153,6 @@ void CopyInplaceTestsBase::SetUp(const CopyInplaceTestsParams& getParam) { } } -std::string CopyInplaceTestsBase::ToString() const { - std::ostringstream oss; - oss << "["; - for (size_t i = 0; i < shape.size(); ++i) { - oss << shape[i] << ((i + 1 == shape.size()) ? "" : "x"); - } - oss << "]" - << "_type_" << type << "_kv_" << kv_dim; - return oss.str(); -} - TEST_P(CopyInplaceTests, copy_tensor_inplace_by_dim_correctness) { ASSERT_NO_THROW_IF(!isNegative(), { auto src_it = ov::get_tensor_impl(srcView); diff --git a/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.hpp b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.hpp index d3ec609ca51f68..f9e556a72861a9 100644 --- a/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.hpp +++ b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.hpp @@ -104,13 +104,11 @@ class CopyInplaceTestsBase { ov::Strides dst_strides; void make_input(); - void make_views(); void make_ref_output(); bool isNegative() const; public: void SetUp(const CopyInplaceTestsParams& getParam); - std::string ToString() const; }; template From ce64e7ecf7f00810c4faaf520e9bfdf2d02544f4 Mon Sep 17 00:00:00 2001 From: DingZhangIntel Date: Mon, 19 Jan 2026 18:26:31 +0800 Subject: [PATCH 11/13] Optimize offset computing and clean up --- .../src/plugin/npuw/infer_request_utils.cpp | 88 +++++++++++++------ .../tests/unit/npuw/copy_inplace.cpp | 33 ++----- .../tests/unit/npuw/copy_inplace.hpp | 10 +-- 3 files changed, 70 insertions(+), 61 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp index 9a0baedeb032cf..be36cc535eb0b0 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp @@ -151,6 +151,7 @@ void ov::npuw::util::copy_inplace_generic_rows(const ov::SoPtr src_ ov::SoPtr dst_tensor) { OPENVINO_ASSERT(src_tensor); OPENVINO_ASSERT(dst_tensor); + OPENVINO_ASSERT(src_tensor->get_element_type() == dst_tensor->get_element_type()); void* base_data = src_tensor->data(); void* dst_data = dst_tensor->data(); @@ -198,39 +199,60 @@ void ov::npuw::util::copy_inplace_generic_rows(const ov::SoPtr src_ }; // --------------------------------------------------------------------- - // Last dimension not packed in either src or dst. + // Fallback: last dimension not packed in either src or dst. // We cannot memmove row_bytes as a contiguous block. Do element-wise memmove. - // Keep reverse lexicographic order to be overlap-safe for in-place move. 
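+    // (Note: the offsets below are maintained incrementally - stepping dimension d down subtracts
+    //  strides[d], and wrapping dimension d back to its maximum adds (shape[d] - 1) * strides[d] -
+    //  instead of recomputing the full index-times-stride sum for every element.)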
// --------------------------------------------------------------------- if (src_strides0[rank0 - 1] != elem_size || dst_strides0[rank0 - 1] != elem_size) { - ov::Shape idx(shape0.size(), 0); + ov::Shape idx(rank0, 0); for (size_t d = 0; d < rank0; ++d) { idx[d] = shape0[d] - 1; } - auto dec_idx = [&]() -> bool { + size_t src_off = compute_offset(idx, src_strides0); + size_t dst_off = compute_offset(idx, dst_strides0); + + auto step_prev = [&](size_t& off, const ov::Strides& strides_bytes, size_t dim) { + off -= strides_bytes[dim]; + }; + + auto wrap_dim = [&](size_t& off, const ov::Shape& shape, const ov::Strides& strides_bytes, size_t dim) { + off += (shape[dim] - 1) * strides_bytes[dim]; + }; + + auto dec_idx_and_offsets = [&]() -> bool { for (int d = static_cast(rank0) - 1; d >= 0; --d) { const size_t ud = static_cast(d); if (idx[ud] > 0) { --idx[ud]; + step_prev(src_off, src_strides0, ud); + step_prev(dst_off, dst_strides0, ud); return true; } idx[ud] = shape0[ud] - 1; + wrap_dim(src_off, shape0, src_strides0, ud); + wrap_dim(dst_off, shape0, dst_strides0, ud); } return false; }; while (true) { - const size_t src_off = compute_offset(idx, src_strides0); - const size_t dst_off = compute_offset(idx, dst_strides0); - uint8_t* src_ptr = base + src_off; uint8_t* dst_ptr = base + dst_off; if (src_ptr != dst_ptr) { - std::memmove(dst_ptr, src_ptr, elem_size); + // If no overlap, memcpy is enough (faster). Otherwise use memmove. + const uint8_t* s0 = src_ptr; + const uint8_t* s1 = src_ptr + elem_size; + uint8_t* d0 = dst_ptr; + uint8_t* d1 = dst_ptr + elem_size; + const bool overlap = !(d1 <= s0 || s1 <= d0); + if (!overlap) { + std::memcpy(dst_ptr, src_ptr, elem_size); + } else { + std::memmove(dst_ptr, src_ptr, elem_size); + } } - if (!dec_idx()) { + if (!dec_idx_and_offsets()) { break; } } @@ -279,7 +301,6 @@ void ov::npuw::util::copy_inplace_generic_rows(const ov::SoPtr src_ } shape.push_back(folded_last); - // For the folded last dim, the step is element-size (bytes per element). 
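+    // (After the fold, shape.back() * elem_size contiguous bytes form one "row" in both views, so
+    //  the copy below degenerates to a single memmove per outer index.)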
src_strides.push_back(elem_size); dst_strides.push_back(elem_size); @@ -292,42 +313,57 @@ void ov::npuw::util::copy_inplace_generic_rows(const ov::SoPtr src_ return; } - ov::Shape outer(rank - 1, 0); - for (size_t d = 0; d + 1 < rank; ++d) { + const size_t outer_rank = rank - 1; + + ov::Shape outer(outer_rank, 0); + for (size_t d = 0; d < outer_rank; ++d) { outer[d] = shape[d] - 1; } - auto dec_outer = [&]() -> bool { - for (int d = static_cast(rank) - 2; d >= 0; --d) { + auto compute_outer_offset = [&](const ov::Shape& o, const ov::Strides& strides_bytes) -> size_t { + size_t off = 0; + for (size_t d = 0; d < o.size(); ++d) { + off += o[d] * strides_bytes[d]; + } + return off; + }; + + size_t src_off = compute_outer_offset(outer, src_strides); + size_t dst_off = compute_outer_offset(outer, dst_strides); + + auto step_prev_outer = [&](size_t& off, const ov::Strides& strides_bytes, size_t dim) { + off -= strides_bytes[dim]; + }; + + auto wrap_outer_dim = + [&](size_t& off, const ov::Shape& shape_folded, const ov::Strides& strides_bytes, size_t dim) { + off += (shape_folded[dim] - 1) * strides_bytes[dim]; + }; + + auto dec_outer_and_offsets = [&]() -> bool { + for (int d = static_cast(outer_rank) - 1; d >= 0; --d) { const size_t ud = static_cast(d); if (outer[ud] > 0) { --outer[ud]; + step_prev_outer(src_off, src_strides, ud); + step_prev_outer(dst_off, dst_strides, ud); return true; } outer[ud] = shape[ud] - 1; + wrap_outer_dim(src_off, shape, src_strides, ud); + wrap_outer_dim(dst_off, shape, dst_strides, ud); } return false; }; - auto compute_outer_offset = [&](const ov::Shape& o, const ov::Strides& strides_bytes) -> size_t { - size_t off = 0; - for (size_t d = 0; d < o.size(); ++d) { - off += o[d] * strides_bytes[d]; - } - return off; - }; - while (true) { - const size_t src_off = compute_outer_offset(outer, src_strides); - const size_t dst_off = compute_outer_offset(outer, dst_strides); - uint8_t* src_ptr = base + src_off; uint8_t* dst_ptr = base + dst_off; if (src_ptr != dst_ptr) { std::memmove(dst_ptr, src_ptr, row_bytes); } - if (!dec_outer()) { + if (!dec_outer_and_offsets()) { break; } } diff --git a/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp index 59263206fde808..ae66c127b375d8 100644 --- a/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp +++ b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp @@ -16,33 +16,17 @@ static ov::Strides make_padded_strides_keep_tail_default(const ov::Shape& shape, if (rank == 0) { return s; } - - // Keep last 2 dims default contiguous explicitly. - s[rank - 1] = et.size(); - if (rank >= 2) { - s[rank - 2] = shape[rank - 1] * et.size(); - } - - if (rank <= 2) { - return s; - } - - const size_t last2_begin = rank - 2; - - // If kv_dim is in the last 2 dims, "keep tail default" means we should not pad there. - if (kv_dim >= last2_begin) { - // Recompute outer strides consistently (no padding) - for (size_t d = last2_begin; d-- > 0;) { - s[d] = s[d + 1] * shape[d + 1]; + if (rank == 1) { + if (kv_dim == 0) { + s[0] += pad_elems * et.size(); } return s; } - // Recompute strides from inner to outer; at kv_dim insert a gap measured in *inner blocks*. 
- for (size_t d = last2_begin; d-- > 0;) { + s[rank - 1] = et.size(); + for (size_t d = rank - 1; d-- > 0;) { s[d] = s[d + 1] * shape[d + 1]; if (d == kv_dim) { - // pad_elems is number of extra "inner blocks" after each index-step in kv_dim s[d] += pad_elems * s[d + 1]; } } @@ -83,7 +67,6 @@ void CopyInplaceTestsBase::make_input() { base_bytes_initial[i] = static_cast(dist(rng)); } - // External-memory tensor (safe for unit test lifetime). baseTensor = ov::Tensor(ov::element::u8, ov::Shape{byte_size}, base_bytes_initial.data()); } @@ -134,15 +117,12 @@ void CopyInplaceTestsBase::SetUp(const CopyInplaceTestsParams& getParam) { shapeInit(dims); shape = ov::Shape{dims.begin(), dims.end()}; - // Precompute strides first (no base pointer needed) src_strides = copy_inplace_details::default_byte_strides(shape, type); const size_t pad_elems = 13; dst_strides = make_padded_strides_keep_tail_default(shape, type, kv_dim, pad_elems); - // Now allocate/fill buffer make_input(); - // Create views (needs baseTensor pointer) void* base_ptr = baseTensor.data(); ASSERT_NE(base_ptr, nullptr); srcView = ov::Tensor(type, shape, base_ptr, src_strides); @@ -167,7 +147,6 @@ TEST_P(CopyInplaceTests, copy_tensor_inplace_by_dim_correctness) { ASSERT_NE(base_ptr, nullptr); out_bytes.assign(base_ptr, base_ptr + out_bytes.size()); - // test_utils.hpp defines details::ArraysMatch for vector ASSERT_TRUE(details::ArraysMatch(to_i8(out_bytes), to_i8(ref_bytes))); }); } @@ -177,7 +156,7 @@ const auto TestCases = ::testing::Combine( ::testing::ValuesIn({ov::element::Type_t::i8, ov::element::Type_t::f16, ov::element::Type_t::f32}), details::ShapesIn({ Tensors{ input = {1, 2, 3, 4}; -} // namespace +} , Tensors { input = {1, 8, 16, 32}; } diff --git a/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.hpp b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.hpp index f9e556a72861a9..96ee02f961a4bd 100644 --- a/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.hpp +++ b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.hpp @@ -17,17 +17,13 @@ #include #include -#include "infer_request_utils.hpp" // copy_tensor_inplace_by_dim +#include "infer_request_utils.hpp" #include "openvino/runtime/make_tensor.hpp" #include "openvino/runtime/tensor.hpp" #include "test_utils.hpp" namespace { -// NOTE: do NOT redefine ASSERT_NO_THROW_* macros here. -// They already exist in test_utils.hpp and warnings are treated as errors. - -// (type, shape, kv_dim) using CopyInplaceTestsParams = std::tuple; namespace copy_inplace_details { @@ -71,9 +67,7 @@ inline void write_elem_bytes(uint8_t* base, std::memcpy(base + off, elem, elem_bytes); } -// Enumerate ND index in lexicographic order. 
inline bool next_index(ov::Shape& idx, const ov::Shape& shape) { - // shape is assumed non-empty and all dims > 0 in this test suite for (int d = static_cast(shape.size()) - 1; d >= 0; --d) { const size_t ud = static_cast(d); if (++idx[ud] < shape[ud]) { @@ -89,7 +83,7 @@ inline bool next_index(ov::Shape& idx, const ov::Shape& shape) { class CopyInplaceTestsBase { protected: ov::element::Type type; - ov::Tensor baseTensor; // shared buffer owner (u8) + ov::Tensor baseTensor; ov::Tensor srcView; ov::Tensor dstView; ov::Shape shape; From f58df15bea7aefd63ab241940373846f788d7803 Mon Sep 17 00:00:00 2001 From: DingZhangIntel Date: Wed, 28 Jan 2026 14:48:19 +0800 Subject: [PATCH 12/13] Add necessary comments --- src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp index b78a23e441406c..1eb131fe733eda 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp @@ -518,7 +518,11 @@ void ov::npuw::LLMInferRequest::copy_kvcache() { 0u, static_cast(tokens_in_past_chunks)); ov::SoPtr prefill_past_kv_chunks; - // move_tensor_inplace_by_dim currently supports only kv_dim_src == kv_dim_dst. + // In-place KV copy is only safe/possible when the source and destination KV layouts match. + // When we have mixed v-transpose settings across models (prefill vs generate: v-transpose OFF/ON), + // the effective KV "token" dimension differs (pre_kv_dim != gen_kv_dim), so an in-place move/copy + // would corrupt data. Therefore, we only use in-place copy when pre_kv_dim == gen_kv_dim; + // otherwise we must copy via a temporary tensor. if (m_past_kv_bound) { if (pre_kv_dim == gen_kv_dim) { prefill_past_kv_chunks = uu::make_tensor_slice(prefill_past_kv, From 9067950afdea03c0e66905f60394d1c34d057d1e Mon Sep 17 00:00:00 2001 From: DingZhangIntel Date: Thu, 29 Jan 2026 15:46:16 +0800 Subject: [PATCH 13/13] Fix and refatore --- .../src/plugin/npuw/infer_request_utils.cpp | 220 +++++++----------- 1 file changed, 83 insertions(+), 137 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp index be36cc535eb0b0..bd13391029b1f7 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp @@ -8,6 +8,66 @@ #include "openvino/runtime/make_tensor.hpp" // get_tensor_impl #include "util_xarch.hpp" +namespace { + +struct FoldedTrailingLayout { + ov::Shape shape; + ov::Strides src_strides; + ov::Strides dst_strides; +}; + +// Folds the maximal COMMON trailing segment where: +// src_stride == dst_stride == default_stride (packed / contiguous-by-bytes) +// into a single last dimension. +// This is the only segment eligible for flattening to speed up row-wise memmove. 
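+// (Illustrative example: for shape {1, 8, 256, 128} where only dim 1 is padded in the destination
+//  view, dims 2 and 3 are packed in both views and fold into one trailing dimension of 256 * 128
+//  elements, so each row can later be moved with a single memmove.)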
+FoldedTrailingLayout fold_common_trailing_packed_segment(const ov::Shape& shape0, + const ov::Strides& src_strides0, + const ov::Strides& dst_strides0, + size_t elem_size) { + const size_t rank0 = shape0.size(); + OPENVINO_ASSERT(rank0 > 0); + + ov::Strides default_strides(rank0, 0); + default_strides[rank0 - 1] = elem_size; + for (size_t i = rank0 - 1; i > 0; --i) { + default_strides[i - 1] = default_strides[i] * shape0[i]; + } + + size_t cut = rank0 - 1; + for (size_t inverted_idx = rank0; inverted_idx-- > 0;) { + const bool ok = (src_strides0[inverted_idx] == default_strides[inverted_idx]) && + (dst_strides0[inverted_idx] == default_strides[inverted_idx]) && + (src_strides0[inverted_idx] == dst_strides0[inverted_idx]); + if (!ok) { + break; + } + cut = inverted_idx; + } + + FoldedTrailingLayout out; + out.shape.reserve(cut + 1); + out.src_strides.reserve(cut + 1); + out.dst_strides.reserve(cut + 1); + + for (size_t d = 0; d < cut; ++d) { + out.shape.push_back(shape0[d]); + out.src_strides.push_back(src_strides0[d]); + out.dst_strides.push_back(dst_strides0[d]); + } + + size_t folded_last = 1; + for (size_t d = cut; d < rank0; ++d) { + folded_last *= shape0[d]; + } + out.shape.push_back(folded_last); + out.src_strides.push_back(elem_size); + out.dst_strides.push_back(elem_size); + + return out; +} + +} // namespace + // FIXME: Use ov::npuw::util::view instead ov::SoPtr ov::npuw::util::make_tensor_slice(ov::SoPtr tensor, uint32_t dim, @@ -153,6 +213,10 @@ void ov::npuw::util::copy_inplace_generic_rows(const ov::SoPtr src_ OPENVINO_ASSERT(dst_tensor); OPENVINO_ASSERT(src_tensor->get_element_type() == dst_tensor->get_element_type()); + // KV-cache values are byte-addressable in the current flow. Sub-byte element types (int4/uint4) are unsupported. + const auto et = src_tensor->get_element_type(); + OPENVINO_ASSERT(et.bitwidth() % 8u == 0u, "sub-byte element types (e.g. int4/uint4) are not supported"); + void* base_data = src_tensor->data(); void* dst_data = dst_tensor->data(); OPENVINO_ASSERT(base_data && dst_data); @@ -175,139 +239,26 @@ void ov::npuw::util::copy_inplace_generic_rows(const ov::SoPtr src_ const size_t total_elems = src_tensor->get_size(); OPENVINO_ASSERT(total_elems != 0); + const size_t elem_size = src_tensor->get_byte_size() / total_elems; - ov::Strides src_strides0 = src_tensor->get_strides(); - ov::Strides dst_strides0 = dst_tensor->get_strides(); + const ov::Strides src_strides0 = src_tensor->get_strides(); + const ov::Strides dst_strides0 = dst_tensor->get_strides(); OPENVINO_ASSERT(src_strides0.size() == rank0); OPENVINO_ASSERT(dst_strides0.size() == rank0); - ov::Strides default_strides(rank0, 0); - default_strides[rank0 - 1] = elem_size; - for (size_t i = rank0 - 1; i > 0; --i) { - default_strides[i - 1] = default_strides[i] * shape0[i]; - } + // The last dimension is packed in both src and dst. + OPENVINO_ASSERT(src_strides0[rank0 - 1] == elem_size && dst_strides0[rank0 - 1] == elem_size, + "src/dst last dimension is not packed"); auto* base = static_cast(base_data); - auto compute_offset = [&](const ov::Shape& ix, const ov::Strides& strides_bytes) -> size_t { - size_t off = 0; - for (size_t d = 0; d < ix.size(); ++d) { - off += ix[d] * strides_bytes[d]; - } - return off; - }; + const auto folded = fold_common_trailing_packed_segment(shape0, src_strides0, dst_strides0, elem_size); - // --------------------------------------------------------------------- - // Fallback: last dimension not packed in either src or dst. 
- // We cannot memmove row_bytes as a contiguous block. Do element-wise memmove. - // --------------------------------------------------------------------- - if (src_strides0[rank0 - 1] != elem_size || dst_strides0[rank0 - 1] != elem_size) { - ov::Shape idx(rank0, 0); - for (size_t d = 0; d < rank0; ++d) { - idx[d] = shape0[d] - 1; - } - - size_t src_off = compute_offset(idx, src_strides0); - size_t dst_off = compute_offset(idx, dst_strides0); - - auto step_prev = [&](size_t& off, const ov::Strides& strides_bytes, size_t dim) { - off -= strides_bytes[dim]; - }; - - auto wrap_dim = [&](size_t& off, const ov::Shape& shape, const ov::Strides& strides_bytes, size_t dim) { - off += (shape[dim] - 1) * strides_bytes[dim]; - }; - - auto dec_idx_and_offsets = [&]() -> bool { - for (int d = static_cast(rank0) - 1; d >= 0; --d) { - const size_t ud = static_cast(d); - if (idx[ud] > 0) { - --idx[ud]; - step_prev(src_off, src_strides0, ud); - step_prev(dst_off, dst_strides0, ud); - return true; - } - idx[ud] = shape0[ud] - 1; - wrap_dim(src_off, shape0, src_strides0, ud); - wrap_dim(dst_off, shape0, dst_strides0, ud); - } - return false; - }; - - while (true) { - uint8_t* src_ptr = base + src_off; - uint8_t* dst_ptr = base + dst_off; - if (src_ptr != dst_ptr) { - // If no overlap, memcpy is enough (faster). Otherwise use memmove. - const uint8_t* s0 = src_ptr; - const uint8_t* s1 = src_ptr + elem_size; - uint8_t* d0 = dst_ptr; - uint8_t* d1 = dst_ptr + elem_size; - const bool overlap = !(d1 <= s0 || s1 <= d0); - if (!overlap) { - std::memcpy(dst_ptr, src_ptr, elem_size); - } else { - std::memmove(dst_ptr, src_ptr, elem_size); - } - } - - if (!dec_idx_and_offsets()) { - break; - } - } - return; - } - - OPENVINO_ASSERT(src_strides0[rank0 - 1] == elem_size); - OPENVINO_ASSERT(dst_strides0[rank0 - 1] == elem_size); - OPENVINO_ASSERT(default_strides[rank0 - 1] == elem_size); - - // Find the COMMON trailing segment where src_stride == dst_stride == default_stride. - // This is the only part eligible for flattening. - size_t cut = rank0 - 1; // at worst, we can always copy along last dim - for (size_t inverted_idx = rank0 - 1; inverted_idx < rank0; --inverted_idx) { - const bool ok = (src_strides0[inverted_idx] == default_strides[inverted_idx]) && - (dst_strides0[inverted_idx] == default_strides[inverted_idx]) && - (src_strides0[inverted_idx] == dst_strides0[inverted_idx]); - if (ok) { - cut = inverted_idx; - if (inverted_idx == 0) { - break; - } - continue; - } - break; - } - - // Fold [cut..rank0-1] into a single last dimension. 
- ov::Shape shape; - ov::Strides src_strides; - ov::Strides dst_strides; - - shape.reserve(cut + 1); - src_strides.reserve(cut + 1); - dst_strides.reserve(cut + 1); - - for (size_t d = 0; d < cut; ++d) { - shape.push_back(shape0[d]); - src_strides.push_back(src_strides0[d]); - dst_strides.push_back(dst_strides0[d]); - } - - size_t folded_last = 1; - for (size_t d = cut; d < rank0; ++d) { - folded_last *= shape0[d]; - } - shape.push_back(folded_last); - - src_strides.push_back(elem_size); - dst_strides.push_back(elem_size); - - const size_t rank = shape.size(); + const size_t rank = folded.shape.size(); OPENVINO_ASSERT(rank >= 1); - const size_t row_elems = shape[rank - 1]; + const size_t row_elems = folded.shape[rank - 1]; const size_t row_bytes = row_elems * elem_size; if (row_bytes == 0) { return; @@ -317,7 +268,7 @@ void ov::npuw::util::copy_inplace_generic_rows(const ov::SoPtr src_ ov::Shape outer(outer_rank, 0); for (size_t d = 0; d < outer_rank; ++d) { - outer[d] = shape[d] - 1; + outer[d] = folded.shape[d] - 1; } auto compute_outer_offset = [&](const ov::Shape& o, const ov::Strides& strides_bytes) -> size_t { @@ -328,8 +279,8 @@ void ov::npuw::util::copy_inplace_generic_rows(const ov::SoPtr src_ return off; }; - size_t src_off = compute_outer_offset(outer, src_strides); - size_t dst_off = compute_outer_offset(outer, dst_strides); + size_t src_off = compute_outer_offset(outer, folded.src_strides); + size_t dst_off = compute_outer_offset(outer, folded.dst_strides); auto step_prev_outer = [&](size_t& off, const ov::Strides& strides_bytes, size_t dim) { off -= strides_bytes[dim]; @@ -345,13 +296,13 @@ void ov::npuw::util::copy_inplace_generic_rows(const ov::SoPtr src_ const size_t ud = static_cast(d); if (outer[ud] > 0) { --outer[ud]; - step_prev_outer(src_off, src_strides, ud); - step_prev_outer(dst_off, dst_strides, ud); + step_prev_outer(src_off, folded.src_strides, ud); + step_prev_outer(dst_off, folded.dst_strides, ud); return true; } - outer[ud] = shape[ud] - 1; - wrap_outer_dim(src_off, shape, src_strides, ud); - wrap_outer_dim(dst_off, shape, dst_strides, ud); + outer[ud] = folded.shape[ud] - 1; + wrap_outer_dim(src_off, folded.shape, folded.src_strides, ud); + wrap_outer_dim(dst_off, folded.shape, folded.dst_strides, ud); } return false; }; @@ -371,7 +322,7 @@ void ov::npuw::util::copy_inplace_generic_rows(const ov::SoPtr src_ // In-place move along kv_dim when src/dst share the same buffer. // Requirements: -// - kv_dim_src == kv_dim_dst, otherwise throws +// - kv_dim_src == kv_dim_dst // - src_tensor->data() == dst_tensor->data() void ov::npuw::util::copy_tensor_inplace_by_dim(const ov::SoPtr src_tensor, ov::SoPtr dst_tensor, @@ -380,9 +331,7 @@ void ov::npuw::util::copy_tensor_inplace_by_dim(const ov::SoPtr src OPENVINO_ASSERT(src_tensor); OPENVINO_ASSERT(dst_tensor); - if (kv_dim_src != kv_dim_dst) { - OPENVINO_THROW("move_tensor_inplace_by_dim currently supports only kv_dim_src == kv_dim_dst"); - } + OPENVINO_ASSERT(kv_dim_src == kv_dim_dst, "copy_tensor_inplace_by_dim supports only kv_dim_src == kv_dim_dst"); void* base_data = src_tensor->data(); void* dst_data = dst_tensor->data(); @@ -392,12 +341,9 @@ void ov::npuw::util::copy_tensor_inplace_by_dim(const ov::SoPtr src const auto& src_shape = src_tensor->get_shape(); const auto& dst_shape = dst_tensor->get_shape(); - OPENVINO_ASSERT(src_shape.size() == dst_shape.size()); OPENVINO_ASSERT(src_shape == dst_shape); OPENVINO_ASSERT(kv_dim_src < src_shape.size()); - // One generic implementation for all kv_dim. 
- // We rely on row-wise memmove on the (possibly flattened) last dimension and stride-based addressing. copy_inplace_generic_rows(src_tensor, dst_tensor); }
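
For reference, below is a minimal usage sketch of the helper introduced by this series. It is not part of the patch: it drives ov::npuw::util::copy_tensor_inplace_by_dim the same way the unit test does, with two strided views sharing one buffer. The shape, stride and padding values, the buffer sizing, and the standalone main() are illustrative assumptions.

#include <cstdint>
#include <vector>

#include "infer_request_utils.hpp"           // ov::npuw::util::copy_tensor_inplace_by_dim (this series)
#include "openvino/runtime/make_tensor.hpp"  // ov::get_tensor_impl
#include "openvino/runtime/tensor.hpp"

int main() {
    const ov::Shape shape{1, 8, 16, 32};
    const ov::element::Type et = ov::element::f32;  // 4 bytes per element

    // Source view: default (packed) byte strides.
    const ov::Strides src_strides{8 * 16 * 32 * 4, 16 * 32 * 4, 32 * 4, 4};
    // Destination view: same shape, but dim 1 carries a gap of 13 extra inner blocks (padded layout).
    const ov::Strides dst_strides{8 * (16 + 13) * 32 * 4, (16 + 13) * 32 * 4, 32 * 4, 4};

    // One shared buffer, sized for the larger (padded) view; both views alias it.
    std::vector<uint8_t> buffer(8 * (16 + 13) * 32 * 4, 0);

    ov::Tensor src_view(et, shape, buffer.data(), src_strides);
    ov::Tensor dst_view(et, shape, buffer.data(), dst_strides);

    // The function requires identical shapes and element types, a packed last dimension,
    // the same underlying data pointer, and the same kv dimension on both sides.
    const uint32_t kv_dim = 1;
    ov::npuw::util::copy_tensor_inplace_by_dim(ov::get_tensor_impl(src_view),
                                               ov::get_tensor_impl(dst_view),
                                               kv_dim,
                                               kv_dim);
    return 0;
}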