200 changes: 200 additions & 0 deletions src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp
@@ -8,6 +8,66 @@
#include "openvino/runtime/make_tensor.hpp" // get_tensor_impl
#include "util_xarch.hpp"

namespace {

struct FoldedTrailingLayout {
ov::Shape shape;
ov::Strides src_strides;
ov::Strides dst_strides;
};

// Folds the maximal COMMON trailing segment where:
// src_stride == dst_stride == default_stride (packed / contiguous-by-bytes)
// into a single last dimension.
// This is the only segment eligible for flattening to speed up row-wise memmove.
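// Illustrative example (hypothetical numbers): shape = {2, 3, 4, 5}, elem_size = 2,
// src_strides = {512, 128, 10, 2}, dst_strides = {768, 256, 10, 2}.
// The default (packed) strides are {120, 40, 10, 2}; the two trailing dims match them
// in both src and dst, so they fold into a single last dim of 4 * 5 = 20 elements:
// shape -> {2, 3, 20}, src_strides -> {512, 128, 2}, dst_strides -> {768, 256, 2}.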
FoldedTrailingLayout fold_common_trailing_packed_segment(const ov::Shape& shape0,
const ov::Strides& src_strides0,
const ov::Strides& dst_strides0,
size_t elem_size) {
const size_t rank0 = shape0.size();
OPENVINO_ASSERT(rank0 > 0);

ov::Strides default_strides(rank0, 0);
default_strides[rank0 - 1] = elem_size;
for (size_t i = rank0 - 1; i > 0; --i) {
default_strides[i - 1] = default_strides[i] * shape0[i];
}

size_t cut = rank0 - 1;
for (size_t inverted_idx = rank0; inverted_idx-- > 0;) {
const bool ok = (src_strides0[inverted_idx] == default_strides[inverted_idx]) &&
(dst_strides0[inverted_idx] == default_strides[inverted_idx]) &&
(src_strides0[inverted_idx] == dst_strides0[inverted_idx]);
if (!ok) {
break;
}
cut = inverted_idx;
}

FoldedTrailingLayout out;
out.shape.reserve(cut + 1);
out.src_strides.reserve(cut + 1);
out.dst_strides.reserve(cut + 1);

for (size_t d = 0; d < cut; ++d) {
out.shape.push_back(shape0[d]);
out.src_strides.push_back(src_strides0[d]);
out.dst_strides.push_back(dst_strides0[d]);
}

size_t folded_last = 1;
for (size_t d = cut; d < rank0; ++d) {
folded_last *= shape0[d];
}
out.shape.push_back(folded_last);
out.src_strides.push_back(elem_size);
out.dst_strides.push_back(elem_size);

return out;
}

} // namespace

// FIXME: Use ov::npuw::util::view instead
ov::SoPtr<ov::ITensor> ov::npuw::util::make_tensor_slice(ov::SoPtr<ov::ITensor> tensor,
uint32_t dim,
@@ -147,6 +207,146 @@ void ov::npuw::util::copy_tensor_by_dim(ov::SoPtr<ov::ITensor> src_tensor,
}
}

void ov::npuw::util::copy_inplace_generic_rows(const ov::SoPtr<ov::ITensor> src_tensor,
ov::SoPtr<ov::ITensor> dst_tensor) {
OPENVINO_ASSERT(src_tensor);
OPENVINO_ASSERT(dst_tensor);
OPENVINO_ASSERT(src_tensor->get_element_type() == dst_tensor->get_element_type());

// KV-cache values are byte-addressable in the current flow. Sub-byte element types (int4/uint4) are unsupported.
const auto et = src_tensor->get_element_type();
OPENVINO_ASSERT(et.bitwidth() % 8u == 0u, "sub-byte element types (e.g. int4/uint4) are not supported");

void* base_data = src_tensor->data();
void* dst_data = dst_tensor->data();
OPENVINO_ASSERT(base_data && dst_data);
OPENVINO_ASSERT(base_data == dst_data);

const auto& shape0 = src_tensor->get_shape();
const auto& dst_shape0 = dst_tensor->get_shape();
OPENVINO_ASSERT(shape0 == dst_shape0);

const size_t rank0 = shape0.size();
if (rank0 == 0) {
return;
}

for (size_t d = 0; d < rank0; ++d) {
if (shape0[d] == 0) {
return;
}
}

const size_t total_elems = src_tensor->get_size();
OPENVINO_ASSERT(total_elems != 0);

const size_t elem_size = src_tensor->get_byte_size() / total_elems;

const ov::Strides src_strides0 = src_tensor->get_strides();
const ov::Strides dst_strides0 = dst_tensor->get_strides();
OPENVINO_ASSERT(src_strides0.size() == rank0);
OPENVINO_ASSERT(dst_strides0.size() == rank0);

// The last dimension is packed in both src and dst.
OPENVINO_ASSERT(src_strides0[rank0 - 1] == elem_size && dst_strides0[rank0 - 1] == elem_size,
"src/dst last dimension is not packed");

auto* base = static_cast<uint8_t*>(base_data);

const auto folded = fold_common_trailing_packed_segment(shape0, src_strides0, dst_strides0, elem_size);

const size_t rank = folded.shape.size();
OPENVINO_ASSERT(rank >= 1);

const size_t row_elems = folded.shape[rank - 1];
const size_t row_bytes = row_elems * elem_size;
if (row_bytes == 0) {
return;
}

const size_t outer_rank = rank - 1;

ov::Shape outer(outer_rank, 0);
for (size_t d = 0; d < outer_rank; ++d) {
outer[d] = folded.shape[d] - 1;
}

auto compute_outer_offset = [&](const ov::Shape& o, const ov::Strides& strides_bytes) -> size_t {
size_t off = 0;
for (size_t d = 0; d < o.size(); ++d) {
off += o[d] * strides_bytes[d];
}
return off;
};

size_t src_off = compute_outer_offset(outer, folded.src_strides);
size_t dst_off = compute_outer_offset(outer, folded.dst_strides);

auto step_prev_outer = [&](size_t& off, const ov::Strides& strides_bytes, size_t dim) {
off -= strides_bytes[dim];
};

auto wrap_outer_dim =
[&](size_t& off, const ov::Shape& shape_folded, const ov::Strides& strides_bytes, size_t dim) {
off += (shape_folded[dim] - 1) * strides_bytes[dim];
};

auto dec_outer_and_offsets = [&]() -> bool {
for (int d = static_cast<int>(outer_rank) - 1; d >= 0; --d) {
const size_t ud = static_cast<size_t>(d);
if (outer[ud] > 0) {
--outer[ud];
step_prev_outer(src_off, folded.src_strides, ud);
step_prev_outer(dst_off, folded.dst_strides, ud);
return true;
}
outer[ud] = folded.shape[ud] - 1;
wrap_outer_dim(src_off, folded.shape, folded.src_strides, ud);
wrap_outer_dim(dst_off, folded.shape, folded.dst_strides, ud);
}
return false;
};

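// Rows are visited from the highest outer multi-index down to {0, ..., 0}. In the
// intended in-place KV-cache case the destination row offsets are expected to be
// greater than or equal to the corresponding source offsets, so walking back-to-front
// avoids overwriting rows that have not been read yet (each single row copy is
// overlap-safe on its own thanks to std::memmove).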
while (true) {
uint8_t* src_ptr = base + src_off;
uint8_t* dst_ptr = base + dst_off;
if (src_ptr != dst_ptr) {
std::memmove(dst_ptr, src_ptr, row_bytes);
Contributor:
So it is not an AVX version but rather uses memmove? OK, if that works we need exact perf data, and I think tests as well for a bunch of actual cases found in LLM workloads.

Contributor Author:
In the earlier ticket about optimizing copy, I tried replacing std::memcpy with an AVX2 implementation, but it resulted in almost no performance improvement. The std::memmove used here relies on essentially the same highly optimized underlying implementation as std::memcpy, so I didn't pursue an additional AVX2 optimization in this case. I'm still running further measurements, and I'll share more details once those tests are complete.
}

if (!dec_outer_and_offsets()) {
break;
}
}
}

// In-place move along kv_dim when src/dst share the same buffer.
// Requirements:
// - kv_dim_src == kv_dim_dst
// - src_tensor->data() == dst_tensor->data()
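// Usage sketch (illustrative; variable names are assumptions, mirroring the
// copy_kvcache() flow in llm_infer_request.cpp):
//   auto src = make_tensor_slice(prefill_past_kv,   kv_dim, 0u, n_tokens);  // compact rows
//   auto dst = make_tensor_slice(kvcache_in_tensor, kv_dim, 0u, n_tokens);  // strided rows, same buffer
//   copy_tensor_inplace_by_dim(src, dst, kv_dim, kv_dim);
// No temporary tensor is allocated; rows are relocated row-by-row with std::memmove.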
void ov::npuw::util::copy_tensor_inplace_by_dim(const ov::SoPtr<ov::ITensor> src_tensor,
ov::SoPtr<ov::ITensor> dst_tensor,
uint32_t kv_dim_src,
uint32_t kv_dim_dst) {
OPENVINO_ASSERT(src_tensor);
OPENVINO_ASSERT(dst_tensor);

OPENVINO_ASSERT(kv_dim_src == kv_dim_dst, "copy_tensor_inplace_by_dim supports only kv_dim_src == kv_dim_dst");

void* base_data = src_tensor->data();
void* dst_data = dst_tensor->data();
OPENVINO_ASSERT(base_data);
OPENVINO_ASSERT(dst_data);
OPENVINO_ASSERT(base_data == dst_data);

const auto& src_shape = src_tensor->get_shape();
const auto& dst_shape = dst_tensor->get_shape();
OPENVINO_ASSERT(src_shape == dst_shape);
OPENVINO_ASSERT(kv_dim_src < src_shape.size());

copy_inplace_generic_rows(src_tensor, dst_tensor);
}

std::optional<ov::Output<const ov::Node>> ov::npuw::util::find_port_by_name(
const std::vector<ov::Output<const ov::Node>>& ports,
const std::string& name) {
@@ -33,6 +33,13 @@ void copy_tensor_by_dim(ov::SoPtr<ov::ITensor> src_tensor,
uint32_t kv_dim_src,
uint32_t kv_dim_dst);

// Row-wise in-place copy between two equally-shaped views that share the same underlying buffer.
void copy_inplace_generic_rows(const ov::SoPtr<ov::ITensor> src_tensor, ov::SoPtr<ov::ITensor> dst_tensor);

// In-place move along kv_dim when src and dst share the same buffer; requires kv_dim_src == kv_dim_dst.
void copy_tensor_inplace_by_dim(const ov::SoPtr<ov::ITensor> src_tensor,
                                ov::SoPtr<ov::ITensor> dst_tensor,
                                uint32_t kv_dim_src,
                                uint32_t kv_dim_dst);

std::optional<ov::Output<const ov::Node>> find_port_by_name(const std::vector<ov::Output<const ov::Node>>& ports,
const std::string& name);
/**
69 changes: 45 additions & 24 deletions src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
@@ -11,6 +11,7 @@
#include "logging.hpp"
#include "openvino/core/parallel.hpp"
#include "openvino/runtime/iasync_infer_request.hpp"
#include "perf.hpp"
#include "util.hpp"

namespace {
@@ -505,41 +506,54 @@ void ov::npuw::LLMInferRequest::copy_kvcache() {
// Part 1: The KV results from loops 1 to n-1 have been copied into the 'past' KV input tensor
// Part 2: The kv results from the last loop remain in the 'present' KV output tensor
// The task is to copy both parts into the KV-cache input tensor for the decoding process

// Copy part 1 KV results
// tokens_in_past_chunks may be 0 when a short prompt is prefilled in a single chunk
auto tokens_in_past_chunks = kvcache_desc.num_stored_tokens - m_tokens_in_present_chunk;
if (tokens_in_past_chunks > 0) {
// Create backup of past KV tensor when buffer sharing is enabled to prevent data corruption
// This is necessary because subsequent copy operations would overwrite the shared buffer
auto prefill_past_kv = m_prefill_request->get_tensor(m_prefill_in_ports.at(input_name));
ov::SoPtr<ov::ITensor> tmp_dense_kv_tensor;
ov::SoPtr<ov::ITensor> prefill_past_kv_chunks;
if (m_past_kv_bound) {
tmp_dense_kv_tensor = ov::npuw::util::allocMem(prefill_past_kv->get_element_type(),
prefill_past_kv->get_shape(),
m_pre_alloc_device,
m_npuw_llm_compiled_model->get_plugin());
prefill_past_kv->copy_to(tmp_dense_kv_tensor._ptr);
prefill_past_kv_chunks = make_tensor_slice(tmp_dense_kv_tensor,
pre_kv_dim,
0u,
static_cast<uint32_t>(tokens_in_past_chunks));
} else {
prefill_past_kv_chunks = make_tensor_slice(prefill_past_kv,
pre_kv_dim,
0u,
static_cast<uint32_t>(tokens_in_past_chunks));
}

auto kvcache_past_kv_chunks = uu::make_tensor_slice(kvcache_in_tensor,
Contributor (@esmirno, Jan 7, 2026):
BTW, sometimes make_tensor_slice gets called without the uu namespace, but I don't see any using-namespace declarations, so I would suggest aligning all usages. Also, as I see you've commented out the implementation of make_tensor_slice - is this temporary?

Contributor:
Also, try to switch to the utils::view helper, as it looks like it fully covers the functionality of make_tensor_slice.

Contributor Author:
I standardized the usage pattern and updated the code to consistently use uu::make_tensor_slice.
Also, there's a small difference between uu::make_tensor_slice and utils::view: the last parameter of uu::make_tensor_slice represents the end position, while the last parameter of utils::view represents the slice length. This difference wouldn't prevent us from switching to utils::view, but to stay consistent with the other functions in llm_infer_request.cpp, I'm keeping uu::make_tensor_slice for now.

gen_kv_dim,
0u,
static_cast<uint32_t>(tokens_in_past_chunks));

uu::copy_tensor_by_dim(prefill_past_kv_chunks, kvcache_past_kv_chunks, pre_kv_dim, gen_kv_dim);
ov::SoPtr<ov::ITensor> prefill_past_kv_chunks;
// In-place KV copy is only safe/possible when the source and destination KV layouts match.
// When we have mixed v-transpose settings across models (prefill vs generate: v-transpose OFF/ON),
// the effective KV "token" dimension differs (pre_kv_dim != gen_kv_dim), so an in-place move/copy
// would corrupt data. Therefore, we only use in-place copy when pre_kv_dim == gen_kv_dim;
// otherwise we must copy via a temporary tensor.
if (m_past_kv_bound) {
if (pre_kv_dim == gen_kv_dim) {
prefill_past_kv_chunks = uu::make_tensor_slice(prefill_past_kv,
pre_kv_dim,
0u,
static_cast<uint32_t>(tokens_in_past_chunks));

uu::copy_tensor_inplace_by_dim(prefill_past_kv_chunks,
kvcache_past_kv_chunks,
pre_kv_dim,
gen_kv_dim);
} else {
Contributor:
For future readers, we need to add a comment here that in-place copy is not possible when we have v-transpose OFF/ON across models.

Contributor Author:
Added comments in the latest commit.

auto tmp_dense_kv_tensor = ov::npuw::util::allocMem(prefill_past_kv->get_element_type(),
prefill_past_kv->get_shape(),
m_pre_alloc_device,
m_npuw_llm_compiled_model->get_plugin());
prefill_past_kv->copy_to(tmp_dense_kv_tensor._ptr);
prefill_past_kv_chunks = uu::make_tensor_slice(tmp_dense_kv_tensor,
pre_kv_dim,
0u,
static_cast<uint32_t>(tokens_in_past_chunks));
uu::copy_tensor_by_dim(prefill_past_kv_chunks, kvcache_past_kv_chunks, pre_kv_dim, gen_kv_dim);
}
} else {
prefill_past_kv_chunks = uu::make_tensor_slice(prefill_past_kv,
pre_kv_dim,
0u,
static_cast<uint32_t>(tokens_in_past_chunks));
uu::copy_tensor_by_dim(prefill_past_kv_chunks, kvcache_past_kv_chunks, pre_kv_dim, gen_kv_dim);
}
}

// Copy part 2 KV results
auto prefill_present_kv_chunk =
uu::make_tensor_slice(prefill_out_tensor,
@@ -846,7 +860,14 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
if (!m_generate_initialized) {
LOG_DEBUG("Copy kv-cache from prefill to generate model.");
if (kvcache_desc.num_stored_tokens > 0) {
copy_kvcache();
using MS = ov::npuw::perf::metric<ov::npuw::perf::MSec>;
MS m_ms_copy_kvcache("copy_kvcache", /*active*/ true);

m_ms_copy_kvcache += ov::npuw::perf::ms_to_run([&]() {
copy_kvcache();
});

LOG_INFO("cost of copy_kvcache(): " << m_ms_copy_kvcache.med() << " ms");
}

LOG_DEBUG("Prepare inputs.");