diff --git a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp
index 7a484df9acf0e8..bd13391029b1f7 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp
@@ -8,6 +8,66 @@
 #include "openvino/runtime/make_tensor.hpp"  // get_tensor_impl
 #include "util_xarch.hpp"
 
+namespace {
+
+struct FoldedTrailingLayout {
+    ov::Shape shape;
+    ov::Strides src_strides;
+    ov::Strides dst_strides;
+};
+
+// Folds the maximal COMMON trailing segment where:
+//   src_stride == dst_stride == default_stride (packed / contiguous-by-bytes)
+// into a single last dimension.
+// This is the only segment eligible for flattening to speed up row-wise memmove.
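+// Illustrative example (assuming f16 data, elem_size == 2): for shape {1, 8, 16, 32} with a fully
+// packed src layout {8192, 1024, 64, 2} and a dst layout with padding between dim-1 slices,
+// e.g. {16384, 2048, 64, 2}, only the trailing {16, 32} segment matches the default strides in
+// both layouts, so it folds into a single dimension of 512 elements: folded shape {1, 8, 512},
+// i.e. 1024-byte rows for memmove. If dst were fully packed as well, the whole tensor would fold
+// into a single {4096} row.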
int4/uint4) are not supported"); + + void* base_data = src_tensor->data(); + void* dst_data = dst_tensor->data(); + OPENVINO_ASSERT(base_data && dst_data); + OPENVINO_ASSERT(base_data == dst_data); + + const auto& shape0 = src_tensor->get_shape(); + const auto& dst_shape0 = dst_tensor->get_shape(); + OPENVINO_ASSERT(shape0 == dst_shape0); + + const size_t rank0 = shape0.size(); + if (rank0 == 0) { + return; + } + + for (size_t d = 0; d < rank0; ++d) { + if (shape0[d] == 0) { + return; + } + } + + const size_t total_elems = src_tensor->get_size(); + OPENVINO_ASSERT(total_elems != 0); + + const size_t elem_size = src_tensor->get_byte_size() / total_elems; + + const ov::Strides src_strides0 = src_tensor->get_strides(); + const ov::Strides dst_strides0 = dst_tensor->get_strides(); + OPENVINO_ASSERT(src_strides0.size() == rank0); + OPENVINO_ASSERT(dst_strides0.size() == rank0); + + // The last dimension is packed in both src and dst. + OPENVINO_ASSERT(src_strides0[rank0 - 1] == elem_size && dst_strides0[rank0 - 1] == elem_size, + "src/dst last dimension is not packed"); + + auto* base = static_cast(base_data); + + const auto folded = fold_common_trailing_packed_segment(shape0, src_strides0, dst_strides0, elem_size); + + const size_t rank = folded.shape.size(); + OPENVINO_ASSERT(rank >= 1); + + const size_t row_elems = folded.shape[rank - 1]; + const size_t row_bytes = row_elems * elem_size; + if (row_bytes == 0) { + return; + } + + const size_t outer_rank = rank - 1; + + ov::Shape outer(outer_rank, 0); + for (size_t d = 0; d < outer_rank; ++d) { + outer[d] = folded.shape[d] - 1; + } + + auto compute_outer_offset = [&](const ov::Shape& o, const ov::Strides& strides_bytes) -> size_t { + size_t off = 0; + for (size_t d = 0; d < o.size(); ++d) { + off += o[d] * strides_bytes[d]; + } + return off; + }; + + size_t src_off = compute_outer_offset(outer, folded.src_strides); + size_t dst_off = compute_outer_offset(outer, folded.dst_strides); + + auto step_prev_outer = [&](size_t& off, const ov::Strides& strides_bytes, size_t dim) { + off -= strides_bytes[dim]; + }; + + auto wrap_outer_dim = + [&](size_t& off, const ov::Shape& shape_folded, const ov::Strides& strides_bytes, size_t dim) { + off += (shape_folded[dim] - 1) * strides_bytes[dim]; + }; + + auto dec_outer_and_offsets = [&]() -> bool { + for (int d = static_cast(outer_rank) - 1; d >= 0; --d) { + const size_t ud = static_cast(d); + if (outer[ud] > 0) { + --outer[ud]; + step_prev_outer(src_off, folded.src_strides, ud); + step_prev_outer(dst_off, folded.dst_strides, ud); + return true; + } + outer[ud] = folded.shape[ud] - 1; + wrap_outer_dim(src_off, folded.shape, folded.src_strides, ud); + wrap_outer_dim(dst_off, folded.shape, folded.dst_strides, ud); + } + return false; + }; + + while (true) { + uint8_t* src_ptr = base + src_off; + uint8_t* dst_ptr = base + dst_off; + if (src_ptr != dst_ptr) { + std::memmove(dst_ptr, src_ptr, row_bytes); + } + + if (!dec_outer_and_offsets()) { + break; + } + } +} + +// In-place move along kv_dim when src/dst share the same buffer. 
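+    // Rows are visited from the last outer index (offsets computed above) down to the first.
+    // In the intended in-place widening case dst strides are >= src strides, so every destination
+    // row starts at or beyond its source row; moving rows in decreasing index order therefore never
+    // overwrites a source row that is still to be read (overlap within a single row is handled by
+    // memmove itself).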
+    while (true) {
+        uint8_t* src_ptr = base + src_off;
+        uint8_t* dst_ptr = base + dst_off;
+        if (src_ptr != dst_ptr) {
+            std::memmove(dst_ptr, src_ptr, row_bytes);
+        }
+
+        if (!dec_outer_and_offsets()) {
+            break;
+        }
+    }
+}
+
+// In-place move along kv_dim when src/dst share the same buffer.
+// Requirements:
+//   - kv_dim_src == kv_dim_dst
+//   - src_tensor->data() == dst_tensor->data()
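+// Illustrative usage (hypothetical names; both views alias one shared KV buffer and differ only
+// in strides):
+//   auto src = ov::npuw::util::make_tensor_slice(shared_kv_packed_view, kv_dim, 0u, n_tokens);
+//   auto dst = ov::npuw::util::make_tensor_slice(shared_kv_strided_view, kv_dim, 0u, n_tokens);
+//   ov::npuw::util::copy_tensor_inplace_by_dim(src, dst, kv_dim, kv_dim);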
+void ov::npuw::util::copy_tensor_inplace_by_dim(const ov::SoPtr<ov::ITensor> src_tensor,
+                                                ov::SoPtr<ov::ITensor> dst_tensor,
+                                                uint32_t kv_dim_src,
+                                                uint32_t kv_dim_dst) {
+    OPENVINO_ASSERT(src_tensor);
+    OPENVINO_ASSERT(dst_tensor);
+
+    OPENVINO_ASSERT(kv_dim_src == kv_dim_dst, "copy_tensor_inplace_by_dim supports only kv_dim_src == kv_dim_dst");
+
+    void* base_data = src_tensor->data();
+    void* dst_data = dst_tensor->data();
+    OPENVINO_ASSERT(base_data);
+    OPENVINO_ASSERT(dst_data);
+    OPENVINO_ASSERT(base_data == dst_data);
+
+    const auto& src_shape = src_tensor->get_shape();
+    const auto& dst_shape = dst_tensor->get_shape();
+    OPENVINO_ASSERT(src_shape == dst_shape);
+    OPENVINO_ASSERT(kv_dim_src < src_shape.size());
+
+    copy_inplace_generic_rows(src_tensor, dst_tensor);
+}
+
 std::optional<size_t> ov::npuw::util::find_port_by_name(
     const std::vector<ov::Output<const ov::Node>>& ports,
     const std::string& name) {
diff --git a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.hpp b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.hpp
index bd1d89ae8444c7..84538f3b322a1c 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.hpp
@@ -33,6 +33,13 @@ void copy_tensor_by_dim(ov::SoPtr<ov::ITensor> src_tensor,
                         uint32_t kv_dim_src,
                         uint32_t kv_dim_dst);
 
+void copy_inplace_generic_rows(const ov::SoPtr<ov::ITensor> src_tensor, ov::SoPtr<ov::ITensor> dst_tensor);
+
+void copy_tensor_inplace_by_dim(const ov::SoPtr<ov::ITensor> src_tensor,
+                                ov::SoPtr<ov::ITensor> dst_tensor,
+                                uint32_t kv_dim_src,
+                                uint32_t kv_dim_dst);
+
 std::optional<size_t> find_port_by_name(const std::vector<ov::Output<const ov::Node>>& ports, const std::string& name);
 
 /**
diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
index a82bd878a11ffe..1eb131fe733eda 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
@@ -11,6 +11,7 @@
 #include "logging.hpp"
 #include "openvino/core/parallel.hpp"
 #include "openvino/runtime/iasync_infer_request.hpp"
+#include "perf.hpp"
 #include "util.hpp"
 
 namespace {
@@ -505,7 +506,6 @@ void ov::npuw::LLMInferRequest::copy_kvcache() {
     // Part 1: The KV results from loops 1 to n-1 have been copied into the 'past' KV input tensor
     // Part 2: The kv results from the last loop remain in the 'present' KV output tensor
    // The task is to copy both parts into the KV-cache input tensor for the decoding process
-
     // Copy part 1 KV results
     // tokens_in_past_chunks may be 0 in case short prompts are prefilled in single chunk
     auto tokens_in_past_chunks = kvcache_desc.num_stored_tokens - m_tokens_in_present_chunk;
@@ -513,33 +513,47 @@ void ov::npuw::LLMInferRequest::copy_kvcache() {
         // Create backup of past KV tensor when buffer sharing is enabled to prevent data corruption
         // This is necessary because subsequent copy operations would overwrite the shared buffer
         auto prefill_past_kv = m_prefill_request->get_tensor(m_prefill_in_ports.at(input_name));
-        ov::SoPtr<ov::ITensor> tmp_dense_kv_tensor;
-        ov::SoPtr<ov::ITensor> prefill_past_kv_chunks;
-        if (m_past_kv_bound) {
-            tmp_dense_kv_tensor = ov::npuw::util::allocMem(prefill_past_kv->get_element_type(),
-                                                           prefill_past_kv->get_shape(),
-                                                           m_pre_alloc_device,
-                                                           m_npuw_llm_compiled_model->get_plugin());
-            prefill_past_kv->copy_to(tmp_dense_kv_tensor._ptr);
-            prefill_past_kv_chunks = make_tensor_slice(tmp_dense_kv_tensor,
-                                                       pre_kv_dim,
-                                                       0u,
-                                                       static_cast<uint32_t>(tokens_in_past_chunks));
-        } else {
-            prefill_past_kv_chunks = make_tensor_slice(prefill_past_kv,
-                                                       pre_kv_dim,
-                                                       0u,
-                                                       static_cast<uint32_t>(tokens_in_past_chunks));
-        }
-
         auto kvcache_past_kv_chunks = uu::make_tensor_slice(kvcache_in_tensor,
                                                             gen_kv_dim,
                                                             0u,
                                                             static_cast<uint32_t>(tokens_in_past_chunks));
-
-        uu::copy_tensor_by_dim(prefill_past_kv_chunks, kvcache_past_kv_chunks, pre_kv_dim, gen_kv_dim);
+
+        ov::SoPtr<ov::ITensor> prefill_past_kv_chunks;
+        // In-place KV copy is only safe/possible when the source and destination KV layouts match.
+        // When we have mixed v-transpose settings across models (prefill vs generate: v-transpose OFF/ON),
+        // the effective KV "token" dimension differs (pre_kv_dim != gen_kv_dim), so an in-place move/copy
+        // would corrupt data. Therefore, we only use in-place copy when pre_kv_dim == gen_kv_dim;
+        // otherwise we must copy via a temporary tensor.
+        if (m_past_kv_bound) {
+            if (pre_kv_dim == gen_kv_dim) {
+                prefill_past_kv_chunks = uu::make_tensor_slice(prefill_past_kv,
+                                                               pre_kv_dim,
+                                                               0u,
+                                                               static_cast<uint32_t>(tokens_in_past_chunks));
+
+                uu::copy_tensor_inplace_by_dim(prefill_past_kv_chunks,
+                                               kvcache_past_kv_chunks,
+                                               pre_kv_dim,
+                                               gen_kv_dim);
+            } else {
+                auto tmp_dense_kv_tensor = ov::npuw::util::allocMem(prefill_past_kv->get_element_type(),
+                                                                    prefill_past_kv->get_shape(),
+                                                                    m_pre_alloc_device,
+                                                                    m_npuw_llm_compiled_model->get_plugin());
+                prefill_past_kv->copy_to(tmp_dense_kv_tensor._ptr);
+                prefill_past_kv_chunks = uu::make_tensor_slice(tmp_dense_kv_tensor,
+                                                               pre_kv_dim,
+                                                               0u,
+                                                               static_cast<uint32_t>(tokens_in_past_chunks));
+                uu::copy_tensor_by_dim(prefill_past_kv_chunks, kvcache_past_kv_chunks, pre_kv_dim, gen_kv_dim);
+            }
+        } else {
+            prefill_past_kv_chunks = uu::make_tensor_slice(prefill_past_kv,
+                                                           pre_kv_dim,
+                                                           0u,
+                                                           static_cast<uint32_t>(tokens_in_past_chunks));
+            uu::copy_tensor_by_dim(prefill_past_kv_chunks, kvcache_past_kv_chunks, pre_kv_dim, gen_kv_dim);
+        }
     }
-
     // Copy part 2 KV results
     auto prefill_present_kv_chunk = uu::make_tensor_slice(prefill_out_tensor,
@@ -846,7 +860,14 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
     if (!m_generate_initialized) {
         LOG_DEBUG("Copy kv-cache from prefill to generate model.");
         if (kvcache_desc.num_stored_tokens > 0) {
-            copy_kvcache();
+            using MS = ov::npuw::perf::metric<float>;
+            MS m_ms_copy_kvcache("copy_kvcache", /*active*/ true);
+
+            m_ms_copy_kvcache += ov::npuw::perf::ms_to_run([&]() {
+                copy_kvcache();
+            });
+
+            LOG_INFO("cost of copy_kvcache(): " << m_ms_copy_kvcache.med() << " ms");
         }
 
         LOG_DEBUG("Prepare inputs.");
diff --git a/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp
new file mode 100644
index 00000000000000..ae66c127b375d8
--- /dev/null
+++ b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.cpp
@@ -0,0 +1,178 @@
+// Copyright (C) 2026 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#ifdef HAVE_AVX2
+#    include "copy_inplace.hpp"
+
+namespace {
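+// Builds byte strides that keep the innermost dimension packed but add `pad_elems` elements of
+// padding along `kv_dim`; the padding propagates into every outer stride. For example, with f32,
+// shape {1, 8, 16, 32}, kv_dim == 1 and pad_elems == 13, the default strides {16384, 2048, 128, 4}
+// become {29696, 3712, 128, 4}.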
+static ov::Strides make_padded_strides_keep_tail_default(const ov::Shape& shape,
+                                                         const ov::element::Type& et,
+                                                         size_t kv_dim,
+                                                         size_t pad_elems) {
+    ov::Strides s = copy_inplace_details::default_byte_strides(shape, et);
+
+    const size_t rank = shape.size();
+    if (rank == 0) {
+        return s;
+    }
+    if (rank == 1) {
+        if (kv_dim == 0) {
+            s[0] += pad_elems * et.size();
+        }
+        return s;
+    }
+
+    s[rank - 1] = et.size();
+    for (size_t d = rank - 1; d-- > 0;) {
+        s[d] = s[d + 1] * shape[d + 1];
+        if (d == kv_dim) {
+            s[d] += pad_elems * s[d + 1];
+        }
+    }
+
+    return s;
+}
+
+static std::vector<int8_t> to_i8(const std::vector<uint8_t>& v) {
+    std::vector<int8_t> out(v.size());
+    std::memcpy(out.data(), v.data(), v.size());
+    return out;
+}
+
+void CopyInplaceTestsBase::make_input() {
+    const auto elem_bytes = copy_inplace_details::elem_size_bytes(type);
+    const auto total_elems = ov::shape_size(shape);
+    ASSERT_GT(total_elems, 0u);
+
+    auto max_offset = [&](const ov::Strides& strides) -> size_t {
+        size_t off = 0;
+        for (size_t d = 0; d < shape.size(); ++d) {
+            off += (shape[d] - 1) * strides[d];
+        }
+        return off;
+    };
+
+    const size_t src_max = max_offset(src_strides);
+    const size_t dst_max = max_offset(dst_strides);
+    const size_t byte_size = std::max(src_max, dst_max) + elem_bytes;
+
+    base_bytes_initial.resize(byte_size);
+    ref_bytes.assign(byte_size, 0);
+    out_bytes.assign(byte_size, 0);
+
+    std::mt19937 rng(42);
+    std::uniform_int_distribution<int> dist(0, 255);
+    for (size_t i = 0; i < base_bytes_initial.size(); ++i) {
+        base_bytes_initial[i] = static_cast<uint8_t>(dist(rng));
+    }
+
+    baseTensor = ov::Tensor(ov::element::u8, ov::Shape{byte_size}, base_bytes_initial.data());
+}
+
+bool CopyInplaceTestsBase::isNegative() const {
+    if (shape.size() < 2) {
+        return true;
+    }
+    if (kv_dim >= shape.size()) {
+        return true;
+    }
+    if (type.bitwidth() < 8) {
+        return true;
+    }
+    return false;
+}
+
+void CopyInplaceTestsBase::make_ref_output() {
+    ref_bytes = base_bytes_initial;
+
+    const auto elem_bytes = copy_inplace_details::elem_size_bytes(type);
+    const uint8_t* base_in = base_bytes_initial.data();
+
+    std::vector<uint8_t> tmp_out = base_bytes_initial;
+
+    ov::Shape idx(shape.size(), 0);
+    std::vector<uint8_t> elem(elem_bytes);
+
+    for (;;) {
+        copy_inplace_details::read_elem_bytes(base_in, idx, src_strides, elem_bytes, elem.data());
+        copy_inplace_details::write_elem_bytes(tmp_out.data(), idx, dst_strides, elem_bytes, elem.data());
+
+        if (!copy_inplace_details::next_index(idx, shape)) {
+            break;
+        }
+    }
+
+    ref_bytes = std::move(tmp_out);
+}
+
+void CopyInplaceTestsBase::SetUp(const CopyInplaceTestsParams& getParam) {
+    ShapesInitializer shapeInit;
+    ov::element::Type_t t;
+    std::tie(t, shapeInit, kv_dim) = getParam;
+
+    type = ov::element::Type(t);
+
+    std::vector<int> dims;
+    shapeInit(dims);
+    shape = ov::Shape{dims.begin(), dims.end()};
+
+    src_strides = copy_inplace_details::default_byte_strides(shape, type);
+    const size_t pad_elems = 13;
+    dst_strides = make_padded_strides_keep_tail_default(shape, type, kv_dim, pad_elems);
+
+    make_input();
+
+    void* base_ptr = baseTensor.data();
+    ASSERT_NE(base_ptr, nullptr);
+    srcView = ov::Tensor(type, shape, base_ptr, src_strides);
+    dstView = ov::Tensor(type, shape, base_ptr, dst_strides);
+
+    if (!isNegative()) {
+        make_ref_output();
+    }
+}
+
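+// The test aliases one raw buffer with two views: srcView uses default (packed) strides, dstView
+// uses strides padded along kv_dim. make_ref_output() derives the expected bytes with a scalar
+// element-by-element gather/scatter over the pristine buffer, and the check below compares the
+// whole buffer after the in-place call against that reference.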
+TEST_P(CopyInplaceTests, copy_tensor_inplace_by_dim_correctness) {
+    ASSERT_NO_THROW_IF(!isNegative(), {
+        auto src_it = ov::get_tensor_impl(srcView);
+        auto dst_it = ov::get_tensor_impl(dstView);
+
+        ov::npuw::util::copy_tensor_inplace_by_dim(src_it,
+                                                   dst_it,
+                                                   static_cast<uint32_t>(kv_dim),
+                                                   static_cast<uint32_t>(kv_dim));
+
+        uint8_t* base_ptr = baseTensor.data<uint8_t>();
+        ASSERT_NE(base_ptr, nullptr);
+        out_bytes.assign(base_ptr, base_ptr + out_bytes.size());
+
+        ASSERT_TRUE(details::ArraysMatch(to_i8(out_bytes), to_i8(ref_bytes)));
+    });
+}
+
+// Test cases
+const auto TestCases =
+    ::testing::Combine(::testing::ValuesIn({ov::element::Type_t::i8, ov::element::Type_t::f16, ov::element::Type_t::f32}),
+                       details::ShapesIn({Tensors{ input = {1, 2, 3, 4}; },
+                                          Tensors{ input = {1, 8, 16, 32}; },
+                                          Tensors{ input = {1, 16, 33, 64}; },
+                                          Tensors{ input = {1, 4, 128, 16}; }}),
+                       ::testing::Values(0, 1, 2, 3));
+
+INSTANTIATE_TEST_SUITE_P(CopyInplaceTests, CopyInplaceTests, TestCases, CopyInplaceTests::getTestCaseName);
+
+}  // namespace
+
+#endif  // HAVE_AVX2
diff --git a/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.hpp b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.hpp
new file mode 100644
index 00000000000000..96ee02f961a4bd
--- /dev/null
+++ b/src/plugins/intel_npu/tests/unit/npuw/copy_inplace.hpp
@@ -0,0 +1,141 @@
+// Copyright (C) 2026 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <random>
+#include <sstream>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include "infer_request_utils.hpp"
+#include "openvino/runtime/make_tensor.hpp"
+#include "openvino/runtime/tensor.hpp"
+#include "test_utils.hpp"
+
+namespace {
+
+using CopyInplaceTestsParams = std::tuple<ov::element::Type_t, ShapesInitializer, std::size_t>;
+
+namespace copy_inplace_details {
+
+inline ov::Strides default_byte_strides(const ov::Shape& shape, const ov::element::Type& et) {
+    ov::Strides strides(shape.size(), 0);
+    if (!strides.empty()) {
+        strides.back() = et.size();
+        for (size_t i = shape.size() - 1; i > 0; --i) {
+            strides[i - 1] = strides[i] * shape[i];
+        }
+    }
+    return strides;
+}
+
+inline size_t elem_size_bytes(const ov::element::Type& et) {
+    return et.size();
+}
+
+inline void read_elem_bytes(const uint8_t* base,
+                            const ov::Shape& idx,
+                            const ov::Strides& strides,
+                            size_t elem_bytes,
+                            uint8_t* out_elem) {
+    size_t off = 0;
+    for (size_t d = 0; d < idx.size(); ++d) {
+        off += idx[d] * strides[d];
+    }
+    std::memcpy(out_elem, base + off, elem_bytes);
+}
+
+inline void write_elem_bytes(uint8_t* base,
+                             const ov::Shape& idx,
+                             const ov::Strides& strides,
+                             size_t elem_bytes,
+                             const uint8_t* elem) {
+    size_t off = 0;
+    for (size_t d = 0; d < idx.size(); ++d) {
+        off += idx[d] * strides[d];
+    }
+    std::memcpy(base + off, elem, elem_bytes);
+}
+
+inline bool next_index(ov::Shape& idx, const ov::Shape& shape) {
+    for (int d = static_cast<int>(shape.size()) - 1; d >= 0; --d) {
+        const size_t ud = static_cast<size_t>(d);
+        if (++idx[ud] < shape[ud]) {
+            return true;
+        }
+        idx[ud] = 0;
+    }
+    return false;
+}
+
+}  // namespace copy_inplace_details
+
+class CopyInplaceTestsBase {
+protected:
+    ov::element::Type type;
+    ov::Tensor baseTensor;
+    ov::Tensor srcView;
+    ov::Tensor dstView;
+    ov::Shape shape;
+
+    std::vector<uint8_t> base_bytes_initial;
+    std::vector<uint8_t> ref_bytes;
+    std::vector<uint8_t> out_bytes;
+
+    std::size_t kv_dim = 0;
+
+    ov::Strides src_strides;
+    ov::Strides dst_strides;
+
+    void make_input();
+    void make_ref_output();
+    bool isNegative() const;
+
+public:
+    void SetUp(const CopyInplaceTestsParams& getParam);
+};
+
+template <typename T>
+class CopyInplaceTestsTmpl : public ::testing::Test,
+                             public T,
+                             public ::testing::WithParamInterface<CopyInplaceTestsParams> {
+protected:
+    void SetUp() override {
+        T::SetUp(GetParam());
+    }
+
+public:
+    static std::string getTestCaseName(const testing::TestParamInfo<CopyInplaceTestsParams>& obj) {
+        ov::element::Type_t t;
+        ShapesInitializer shapeInit;
+        std::size_t kv_dim = 0;
+        std::tie(t, shapeInit, kv_dim) = obj.param;
+
+        std::vector<int> dims;
+        shapeInit(dims);
+
+        std::ostringstream oss;
+        oss << "S";
+        for (size_t i = 0; i < dims.size(); ++i) {
+            oss << dims[i];
+            if (i + 1 != dims.size())
+                oss << "x";
+        }
+        oss << "_T" << ov::element::Type(t) << "_KV" << kv_dim;
+        return oss.str();
+    }
+};
+
+using CopyInplaceTests = CopyInplaceTestsTmpl<CopyInplaceTestsBase>;
+
+}  // anonymous namespace