Skip to content
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
214 changes: 214 additions & 0 deletions src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,220 @@ void ov::npuw::util::copy_tensor_by_dim(ov::SoPtr<ov::ITensor> src_tensor,
}
}

// In-place, row-chunk-wise relocation for a 4D tensor pair that aliases the
// same underlying buffer. Rows of W elements (the innermost dim) are moved one
// by one from the src layout to the dst layout, honoring the differing
// H-strides.
//
// Preconditions:
//  - both tensors are 4D with identical shapes and byte sizes
//  - dims 1 and 2 are assumed jointly strided (chunk i lives at i * H-stride)
//    — TODO(review): confirm for non-dense src layouts
//  - dst row stride >= src row stride, since chunks are relocated
//    back-to-front (valid only for "expanding" moves)
void ov::npuw::util::copy_inplace_columns_by_row_chunks(ov::SoPtr<ov::ITensor> src, ov::SoPtr<ov::ITensor>& dst) {
    const auto& src_shape = src->get_shape();

    OPENVINO_ASSERT(src_shape.size() == 4u);
    OPENVINO_ASSERT(src_shape == dst->get_shape());
    OPENVINO_ASSERT(src->get_byte_size() == dst->get_byte_size());

    // Nothing to relocate for an empty tensor; this also prevents the
    // division by zero in the elem_size computation below.
    if (src->get_size() == 0) {
        return;
    }

    const auto& src_strides = src->get_strides();
    const auto& dst_strides = dst->get_strides();
    const auto elem_size = src->get_byte_size() / src->get_size();

    const auto C = src_shape[1];
    const auto H = src_shape[2];
    const auto W = src_shape[3];

    const auto IS_H = src_strides[2];  // src distance between consecutive rows, bytes
    const auto OS_H = dst_strides[2];  // dst distance between consecutive rows, bytes

    // Back-to-front relocation is only safe when every row moves towards a
    // higher (or equal) address; otherwise earlier rows would be clobbered
    // before they are read.
    OPENVINO_ASSERT(OS_H >= IS_H);

    const size_t chunk_byte_size = W * elem_size;

    const auto* src_p = static_cast<uint8_t*>(src->data());
    auto* dst_p = static_cast<uint8_t*>(dst->data());

    const size_t num_chunks = C * H;
    if (num_chunks == 0 || chunk_byte_size == 0) {
        return;
    }

    // Iterate from the last chunk down to the first: src and dst regions
    // overlap, and the destination of chunk i never overlaps the source of
    // any chunk j < i.
    for (size_t i = num_chunks; i-- > 0;) {
        const size_t src_offset = i * IS_H;
        const size_t dst_offset = i * OS_H;
        std::memmove(dst_p + dst_offset, src_p + src_offset, chunk_byte_size);
    }
}

// In-place plane-wise move: [1, H, S1, E] -> [1, H, S2, E], where both
// tensors alias the same buffer. Each of the H planes (an S x E slab) is
// relocated with a single memmove, honoring the differing per-plane strides.
void ov::npuw::util::copy_inplace_by_planes(ov::SoPtr<ov::ITensor> src_tensor, ov::SoPtr<ov::ITensor> dst_tensor) {
    // [1, H, S1, E] -> [1, H, S2, E]
    const int N = 0;
    const int H = 1;
    const int S = 2;
    const int E = 3;

    OPENVINO_ASSERT(src_tensor->get_shape()[N] == dst_tensor->get_shape()[N]);
    OPENVINO_ASSERT(src_tensor->get_shape()[H] == dst_tensor->get_shape()[H]);
    OPENVINO_ASSERT(src_tensor->get_shape()[E] == dst_tensor->get_shape()[E]);
    OPENVINO_ASSERT(src_tensor->get_element_type() == dst_tensor->get_element_type());
    OPENVINO_ASSERT(src_tensor->get_shape()[N] == 1u);
    OPENVINO_ASSERT(src_tensor->get_shape().size() == 4u);

    // data() returns void*, so static_cast is the correct named cast here
    // (reinterpret_cast is unnecessary and less safe).
    const auto* src_base = static_cast<uint8_t*>(src_tensor->data());
    auto* dst_base = static_cast<uint8_t*>(dst_tensor->data());

    const auto num_planes = src_tensor->get_shape()[H];
    const auto src_plane_stride = src_tensor->get_strides()[H];
    const auto dst_plane_stride = dst_tensor->get_strides()[H];
    // Plane payload = S-stride * S-extent; assumes S and E are laid out
    // contiguously within a plane — TODO(review): confirm for non-dense src.
    const auto plane_size_in_bytes = src_tensor->get_strides()[S] * src_tensor->get_shape()[S];

    if (num_planes == 0 || plane_size_in_bytes == 0) {
        return;
    }

    // Planes are relocated back-to-front since the regions overlap; this is
    // only safe when each plane moves towards a higher (or equal) address.
    OPENVINO_ASSERT(dst_plane_stride >= src_plane_stride);

    for (size_t i = num_planes; i-- > 0;) {
        const auto* src_ptr = src_base + i * src_plane_stride;
        auto* dst_ptr = dst_base + i * dst_plane_stride;
        std::memmove(dst_ptr, src_ptr, plane_size_in_bytes);
    }
}

// Generic in-place relocation of a strided tensor within a single shared
// buffer. src_tensor and dst_tensor are expected to alias the same memory:
// only src_tensor->data() is used as the base pointer for BOTH reads and
// writes, while the two stride sets describe the source and destination
// layouts. Elements are visited from the highest multi-index down to zero so
// overlapping regions are not clobbered when data moves towards higher
// addresses.
//
// NOTE(review): correctness relies on every element's dst offset being >= its
// src offset (an "expanding" move) — confirm callers guarantee this.
void ov::npuw::util::copy_inplace(ov::SoPtr<ov::ITensor> src_tensor, ov::SoPtr<ov::ITensor> dst_tensor) {
    const auto& shape = src_tensor->get_shape();

    // Single base pointer: both layouts live in the same allocation.
    auto* base = static_cast<uint8_t*>(src_tensor->data());

    auto src_strides = src_tensor->get_strides();
    auto dst_strides = dst_tensor->get_strides();

    const size_t total_elems = src_tensor->get_size();
    // Bail out before the division below: a zero-sized tensor would otherwise
    // trigger a division by zero when computing elem_size.
    if (total_elems == 0) {
        LOG_INFO("empty tensor, nothing to move");
        return;
    }
    const size_t elem_size = src_tensor->get_byte_size() / total_elems;

    // Identical layouts within the same buffer: data is already in place.
    if (src_strides == dst_strides) {
        LOG_INFO("identical strides, skip");
        return;
    }

    auto rank = shape.size();

    // cur_pos/max_pos form the iteration space. They start as a single
    // "scalar" step and are expanded below for strided multi-dim moves.
    ov::Shape cur_pos{0};
    ov::Shape max_pos{1};

    if (src_tensor->get_element_type().bitwidth() < 8 || (is_scalar(shape))) {
        // Doesn't support strides for LP types
        // or both tensors have default strides
        // Strides and positions already initialized
    } else {
        ov::Strides src_str, dst_str;
        // Calculate src and dst shapes: find the innermost dimension where the
        // two layouts diverge; everything inside it can be moved as one block.
        bool found_step = false;
        // Walk dims from innermost to outermost (unsigned wrap-around past 0
        // terminates the loop).
        for (size_t inverted_idx = rank - 1; inverted_idx < rank; --inverted_idx) {
            if (!found_step) {
                if (src_strides[inverted_idx] == dst_strides[inverted_idx]) {
                    continue;
                } else {
                    found_step = true;
                    size_t strides_size = inverted_idx + 1;
                    // Set right size
                    src_str.resize(strides_size + 1);
                    dst_str.resize(strides_size + 1);
                    max_pos.resize(strides_size + 1);
                    cur_pos.resize(strides_size + 1);
                    // In case of default continuous strides we can copy several elements
                    // In other case only one element
                    size_t dim = 1;
                    size_t strides = elem_size;

                    if (strides_size < src_strides.size()) {
                        strides = src_strides[strides_size];
                        dim = shape[strides_size];
                    }
                    src_str[strides_size] = strides;
                    dst_str[strides_size] = strides;
                    max_pos[strides_size] = dim;
                    cur_pos[strides_size] = max_pos[strides_size] - 1;
                }
            }
            src_str[inverted_idx] = src_strides[inverted_idx];
            dst_str[inverted_idx] = dst_strides[inverted_idx];
            max_pos[inverted_idx] = shape[inverted_idx];
            cur_pos[inverted_idx] = max_pos[inverted_idx] - 1;
        }
        src_strides = std::move(src_str);
        dst_strides = std::move(dst_str);
    }

    // Start at the very last element of the iteration space.
    size_t src_off = 0;
    size_t dst_off = 0;
    for (size_t d = 0; d < max_pos.size(); ++d) {
        src_off += cur_pos[d] * src_strides[d];
        dst_off += cur_pos[d] * dst_strides[d];
    }

    // Decrement the multi-index (odometer-style, rightmost digit first) and
    // keep both byte offsets in sync. Returns false once index underflows.
    auto dec_index_and_update_offsets = [&]() -> bool {
        for (int d = static_cast<int>(max_pos.size()) - 1; d >= 0; --d) {
            const size_t old = cur_pos[static_cast<size_t>(d)];
            if (old > 0) {
                cur_pos[static_cast<size_t>(d)] = old - 1;
                src_off -= src_strides[static_cast<size_t>(d)];
                dst_off -= dst_strides[static_cast<size_t>(d)];
                return true;
            } else {
                // Wrap this digit back to its max and borrow from the next.
                cur_pos[static_cast<size_t>(d)] = max_pos[static_cast<size_t>(d)] - 1;
                src_off += src_strides[static_cast<size_t>(d)] * (max_pos[static_cast<size_t>(d)] - 1);
                dst_off += dst_strides[static_cast<size_t>(d)] * (max_pos[static_cast<size_t>(d)] - 1);
            }
        }
        return false;
    };

    while (true) {
        uint8_t* src_ptr = base + src_off;
        uint8_t* dst_ptr = base + dst_off;

        // memmove (not memcpy): the block size is the innermost stride and
        // the two spans may overlap within the shared buffer.
        if (src_ptr != dst_ptr) {
            std::memmove(dst_ptr, src_ptr, src_strides[src_strides.size() - 1]);
        }

        if (!dec_index_and_update_offsets()) {
            break;
        }
    }
}

// In-place move along kv_dim when src/dst share the same buffer.
// Requirements:
// - kv_dim_src == kv_dim_dst, otherwise throws
// - src_tensor->data() == dst_tensor->data()
void ov::npuw::util::copy_tensor_inplace_by_dim(ov::SoPtr<ov::ITensor> src_tensor,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

void ov::npuw::util::copy_tensor_inplace_by_dim(const ov::SoPtr<ov::ITensor>& src_tensor, ov::SoPtr<ov::ITensor>& dst_tensor, ...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

ov::SoPtr<ov::ITensor> dst_tensor,
uint32_t kv_dim_src,
uint32_t kv_dim_dst) {
OPENVINO_ASSERT(src_tensor);
OPENVINO_ASSERT(dst_tensor);

if (kv_dim_src != kv_dim_dst) {
OPENVINO_THROW("move_tensor_inplace_by_dim currently supports only kv_dim_src == kv_dim_dst");
}

void* base_data = src_tensor->data();
void* dst_data = dst_tensor->data();
OPENVINO_ASSERT(base_data);
OPENVINO_ASSERT(dst_data);
OPENVINO_ASSERT(base_data == dst_data);

const auto& src_shape = src_tensor->get_shape();
const auto& dst_shape = dst_tensor->get_shape();
OPENVINO_ASSERT(src_shape.size() == dst_shape.size());
OPENVINO_ASSERT(src_shape == dst_shape);
OPENVINO_ASSERT(kv_dim_src < src_shape.size());

if (kv_dim_src == 3u) {
copy_inplace_columns_by_row_chunks(src_tensor, dst_tensor);
} else if (kv_dim_src == 2u) {
copy_inplace_by_planes(src_tensor, dst_tensor);
} else {
copy_inplace(src_tensor, dst_tensor);
}
}

std::optional<ov::Output<const ov::Node>> ov::npuw::util::find_port_by_name(
const std::vector<ov::Output<const ov::Node>>& ports,
const std::string& name) {
Expand Down
11 changes: 11 additions & 0 deletions src/plugins/intel_npu/src/plugin/npuw/infer_request_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,17 @@ void copy_tensor_by_dim(ov::SoPtr<ov::ITensor> src_tensor,
uint32_t kv_dim_src,
uint32_t kv_dim_dst);

// In-place variants: src and dst alias the same underlying buffer, and data
// is relocated between two strided layouts within that buffer.

// Moves W-sized row chunks of a 4D tensor between two strided layouts.
void copy_inplace_columns_by_row_chunks(ov::SoPtr<ov::ITensor> src, ov::SoPtr<ov::ITensor>& dst);

// Moves a [1, H, S, E] tensor plane-by-plane (one memmove per H-plane).
void copy_inplace_by_planes(ov::SoPtr<ov::ITensor> src_tensor, ov::SoPtr<ov::ITensor> dst_tensor);

// Generic strided in-place relocation (element/block-wise, back-to-front).
void copy_inplace(ov::SoPtr<ov::ITensor> src_tensor, ov::SoPtr<ov::ITensor> dst_tensor);

// Dispatches to one of the helpers above based on the kv dimension.
// Requires kv_dim_src == kv_dim_dst and aliasing src/dst data pointers.
void copy_tensor_inplace_by_dim(ov::SoPtr<ov::ITensor> src_tensor,
                                ov::SoPtr<ov::ITensor> dst_tensor,
                                uint32_t kv_dim_src,
                                uint32_t kv_dim_dst);

std::optional<ov::Output<const ov::Node>> find_port_by_name(const std::vector<ov::Output<const ov::Node>>& ports,
const std::string& name);

Expand Down
53 changes: 34 additions & 19 deletions src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -582,41 +582,50 @@ void ov::npuw::LLMInferRequest::copy_kvcache() {
// Part 1: The KV results from loops 1 to n-1 have been copied into the 'past' KV input tensor
// Part 2: The kv results from the last loop remain in the 'present' KV output tensor
// The task is to copy both parts into the KV-cache input tensor for the decoding process

// Copy part 1 KV results
// tokens_in_past_chunks may be 0 in case short prompts are prefilled in single chunk
auto tokens_in_past_chunks = kvcache_desc.num_stored_tokens - m_tokens_in_present_chunk;
if (tokens_in_past_chunks > 0) {
// Create backup of past KV tensor when buffer sharing is enabled to prevent data corruption
// This is necessary because subsequent copy operations would overwrite the shared buffer
auto prefill_past_kv = m_prefill_request->get_tensor(m_prefill_in_ports.at(input_name));
ov::SoPtr<ov::ITensor> tmp_dense_kv_tensor;
auto kvcache_past_kv_chunks = uu::make_tensor_slice(kvcache_in_tensor,
Copy link
Contributor

@esmirno esmirno Jan 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By the way, `make_tensor_slice` is sometimes called without the `uu` namespace qualifier, and I don't see any using-namespace declarations, so I would suggest aligning all usages. Also, I see you've commented out the implementation of `make_tensor_slice` — is this temporary?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

also try to switch to utils::view helper as it looks fully covered functionality of make_tensor_slice

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I standardized the usage pattern and updated the code to consistently use uu::make_tensor_slice.
Also, there’s a small difference between uu::make_tensor_slice and utils::view:
The last parameter of uu::make_tensor_slice represents the end position, while the last parameter of utils::view represents the slice length. This difference wouldn’t prevent us from switching to utils::view, but to stay consistent with the other functions in llm_infer_request.cpp, I’m keeping uu::make_tensor_slice for now.

gen_kv_dim,
0u,
static_cast<uint32_t>(tokens_in_past_chunks));
ov::SoPtr<ov::ITensor> prefill_past_kv_chunks;
// move_tensor_inplace_by_dim currently supports only kv_dim_src == kv_dim_dst.
if (m_past_kv_bound) {
tmp_dense_kv_tensor = ov::npuw::util::allocMem(prefill_past_kv->get_element_type(),
prefill_past_kv->get_shape(),
m_pre_alloc_device,
m_npuw_llm_compiled_model->get_plugin());
prefill_past_kv->copy_to(tmp_dense_kv_tensor._ptr);
prefill_past_kv_chunks = make_tensor_slice(tmp_dense_kv_tensor,
pre_kv_dim,
0u,
static_cast<uint32_t>(tokens_in_past_chunks));
if (pre_kv_dim == gen_kv_dim) {
prefill_past_kv_chunks = make_tensor_slice(prefill_past_kv,
pre_kv_dim,
0u,
static_cast<uint32_t>(tokens_in_past_chunks));

uu::copy_tensor_inplace_by_dim(prefill_past_kv_chunks,
kvcache_past_kv_chunks,
pre_kv_dim,
gen_kv_dim);
} else {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For future readers, a comment should be added here explaining that in-place copy is not possible when mixing models with v-transpose ON and OFF.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added comments in the latest commit.

auto tmp_dense_kv_tensor = ov::npuw::util::allocMem(prefill_past_kv->get_element_type(),
prefill_past_kv->get_shape(),
m_pre_alloc_device,
m_npuw_llm_compiled_model->get_plugin());
prefill_past_kv->copy_to(tmp_dense_kv_tensor._ptr);
prefill_past_kv_chunks = make_tensor_slice(tmp_dense_kv_tensor,
pre_kv_dim,
0u,
static_cast<uint32_t>(tokens_in_past_chunks));
uu::copy_tensor_by_dim(prefill_past_kv_chunks, kvcache_past_kv_chunks, pre_kv_dim, gen_kv_dim);
}
} else {
prefill_past_kv_chunks = make_tensor_slice(prefill_past_kv,
pre_kv_dim,
0u,
static_cast<uint32_t>(tokens_in_past_chunks));
uu::copy_tensor_by_dim(prefill_past_kv_chunks, kvcache_past_kv_chunks, pre_kv_dim, gen_kv_dim);
}

auto kvcache_past_kv_chunks = uu::make_tensor_slice(kvcache_in_tensor,
gen_kv_dim,
0u,
static_cast<uint32_t>(tokens_in_past_chunks));

uu::copy_tensor_by_dim(prefill_past_kv_chunks, kvcache_past_kv_chunks, pre_kv_dim, gen_kv_dim);
}

// Copy part 2 KV results
auto prefill_present_kv_chunk =
uu::make_tensor_slice(prefill_out_tensor,
Expand Down Expand Up @@ -966,7 +975,13 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
if (!m_generate_initialized) {
LOG_DEBUG("Copy kv-cache from prefill to generate model.");
if (kvcache_desc.num_stored_tokens > 0) {
// Start counting time.
auto t_start = std::chrono::high_resolution_clock::now();
copy_kvcache();
// End counting time.
auto t_end = std::chrono::high_resolution_clock::now();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please use utils like: profiler and ms_to_run


    // Quick-and-dirty profiling
    using MS = ov::npuw::perf::metric<ov::npuw::perf::MSec>;
    using B = ov::npuw::perf::counter<ov::npuw::perf::Bytes>;

    MS m_ms_unpack;
    ov::npuw::perf::Profile<MS> m_profile;
    mutable ov::npuw::perf::Profile<B> m_footprint; 

   m_ms_unpack += ov::npuw::perf::ms_to_run([&](){
    ov::parallel_for(closure_copy_required.size(), [&](std::size_t j) {
        auto cidx = closure_copy_required[j];
        auto& closure = desc_closure[cidx];
        const auto closure_param_id = comp_model_desc.param_base + cidx;
        auto& iport = func_desc.compiled_model->inputs()[closure_param_id];
        auto clparam = request->get_tensor(iport);
        ov::get_tensor_impl(closure)->copy_to(clparam._ptr);
    });
   }); // ms_to_run

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed at latest commit .

auto duration_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count();
LOG_INFO("cost of copy_kvcache(): " << duration_ms << " ms");
}

LOG_DEBUG("Prepare inputs.");
Expand Down
Loading