-
Notifications
You must be signed in to change notification settings - Fork 3k
[NPUW] Implement in-place KV cache copy when it's shared #33201
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 5 commits
35789cc
5eada49
40d955a
5f81709
fb7e815
b33cc18
afd4418
aa539a2
c874ad4
42f5cf5
12d0d7a
d807b91
e4e9fad
0313761
ce64e7e
4e32de4
897fd5c
f58df15
9067950
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -147,6 +147,220 @@ void ov::npuw::util::copy_tensor_by_dim(ov::SoPtr<ov::ITensor> src_tensor, | |
| } | ||
| } | ||
|
|
||
| void ov::npuw::util::copy_inplace_columns_by_row_chunks(ov::SoPtr<ov::ITensor> src, ov::SoPtr<ov::ITensor>& dst) { | ||
| const auto& src_shape = src->get_shape(); | ||
|
|
||
| OPENVINO_ASSERT(src_shape.size() == 4u); | ||
| OPENVINO_ASSERT(src_shape == dst->get_shape()); | ||
| OPENVINO_ASSERT(src->get_byte_size() == dst->get_byte_size()); | ||
|
|
||
| const auto& src_strides = src->get_strides(); | ||
| const auto& dst_strides = dst->get_strides(); | ||
| const auto elem_size = src->get_byte_size() / src->get_size(); | ||
|
|
||
| const auto C = src_shape[1]; | ||
| const auto H = src_shape[2]; | ||
| const auto W = src_shape[3]; | ||
|
|
||
| const auto IS_H = src_strides[2]; | ||
| const auto OS_H = dst_strides[2]; | ||
|
|
||
| const size_t chunk_byte_size = W * elem_size; | ||
|
|
||
| const auto* src_p = static_cast<uint8_t*>(src->data()); | ||
| auto* dst_p = static_cast<uint8_t*>(dst->data()); | ||
|
|
||
| const size_t num_chunks = C * H; | ||
| if (num_chunks == 0 || chunk_byte_size == 0) { | ||
| return; | ||
| } | ||
|
|
||
| for (size_t i = num_chunks; i-- > 0;) { | ||
| const size_t src_offset = i * IS_H; | ||
| const size_t dst_offset = i * OS_H; | ||
| std::memmove(dst_p + dst_offset, src_p + src_offset, chunk_byte_size); | ||
| } | ||
| } | ||
|
|
||
| void ov::npuw::util::copy_inplace_by_planes(ov::SoPtr<ov::ITensor> src_tensor, ov::SoPtr<ov::ITensor> dst_tensor) { | ||
|
||
| // [1, H, S1, E] -> [1, H, S2, E] | ||
| const int N = 0; | ||
| const int H = 1; | ||
| const int S = 2; | ||
| const int E = 3; | ||
|
|
||
| OPENVINO_ASSERT(src_tensor->get_shape()[N] == dst_tensor->get_shape()[N]); | ||
| OPENVINO_ASSERT(src_tensor->get_shape()[H] == dst_tensor->get_shape()[H]); | ||
| OPENVINO_ASSERT(src_tensor->get_shape()[E] == dst_tensor->get_shape()[E]); | ||
| OPENVINO_ASSERT(src_tensor->get_element_type() == dst_tensor->get_element_type()); | ||
| OPENVINO_ASSERT(src_tensor->get_shape()[N] == 1u); | ||
| OPENVINO_ASSERT(src_tensor->get_shape().size() == 4u); | ||
|
|
||
| const auto* src_base = reinterpret_cast<uint8_t*>(src_tensor->data()); | ||
| auto* dst_base = reinterpret_cast<uint8_t*>(dst_tensor->data()); | ||
|
|
||
| const auto num_planes = src_tensor->get_shape()[H]; | ||
| const auto src_plane_stride = src_tensor->get_strides()[H]; | ||
| const auto dst_plane_stride = dst_tensor->get_strides()[H]; | ||
| const auto plane_size_in_bytes = src_tensor->get_strides()[S] * src_tensor->get_shape()[S]; | ||
|
|
||
| if (num_planes == 0 || plane_size_in_bytes == 0) { | ||
| return; | ||
| } | ||
|
|
||
| for (size_t i = num_planes; i-- > 0;) { | ||
| const auto* src_ptr = src_base + i * src_plane_stride; | ||
| auto* dst_ptr = dst_base + i * dst_plane_stride; | ||
| std::memmove(dst_ptr, src_ptr, plane_size_in_bytes); | ||
| } | ||
| } | ||
|
|
||
// Generic strided in-place copy between two equally-shaped views that alias
// the same buffer (note: both src and dst offsets are applied to `base`,
// which is taken from src_tensor->data() -- the shared-buffer assumption is
// implicit here and is asserted by the caller, copy_tensor_inplace_by_dim).
// Elements are visited from the LAST multi-index down to the first so that
// higher addresses are written before their source bytes can be clobbered.
void ov::npuw::util::copy_inplace(ov::SoPtr<ov::ITensor> src_tensor, ov::SoPtr<ov::ITensor> dst_tensor) {
    const auto& shape = src_tensor->get_shape();

    auto* base = static_cast<uint8_t*>(src_tensor->data());

    auto src_strides = src_tensor->get_strides();
    auto dst_strides = dst_tensor->get_strides();

    // NOTE(review): if any dimension is 0 then total_elems == 0 and this
    // division is undefined; the zero-dim early-return below comes too late
    // to guard it -- consider reordering.
    const size_t total_elems = src_tensor->get_size();
    const size_t elem_size = src_tensor->get_byte_size() / total_elems;

    // Same layout on both sides: the data is already where it should be.
    if (src_strides == dst_strides) {
        LOG_INFO("identical strides, skip");
        return;
    }

    // Empty tensor: nothing to move.
    for (size_t d = 0; d < shape.size(); ++d) {
        if (shape[d] == 0) {
            LOG_INFO("zero-sized dimension, nothing to move");
            return;
        }
    }

    auto rank = shape.size();

    // Iteration state: cur_pos is the current multi-index, max_pos the extents.
    // Defaults describe a single scalar move and are kept as-is for the
    // low-precision / scalar fast path below.
    ov::Shape cur_pos{0};
    ov::Shape max_pos{1};

    if (src_tensor->get_element_type().bitwidth() < 8 || (is_scalar(shape))) {
        // Doesn't support strides for LP types
        // or both tensors have default strides
        // Strides and positions already initialized
    } else {
        ov::Strides src_str, dst_str;
        // Calculate src and dst shapes
        bool found_step = false;
        // Reverse scan over dims; the `inverted_idx < rank` condition relies on
        // unsigned wrap-around of size_t to terminate after index 0.
        for (size_t inverted_idx = rank - 1; inverted_idx < rank; --inverted_idx) {
            if (!found_step) {
                // Innermost dims with equal strides need no per-element walk --
                // skip them until the first stride mismatch is found.
                if (src_strides[inverted_idx] == dst_strides[inverted_idx]) {
                    continue;
                } else {
                    found_step = true;
                    size_t strides_size = inverted_idx + 1;
                    // Set right size
                    src_str.resize(strides_size + 1);
                    dst_str.resize(strides_size + 1);
                    max_pos.resize(strides_size + 1);
                    cur_pos.resize(strides_size + 1);
                    // In case of default continuous strides we can copy several elements
                    // In other case only one element
                    size_t dim = 1;
                    size_t strides = elem_size;

                    if (strides_size < src_strides.size()) {
                        strides = src_strides[strides_size];
                        dim = shape[strides_size];
                    }
                    // The trailing pseudo-dimension describes the contiguous
                    // run moved by each memmove in the main loop below.
                    src_str[strides_size] = strides;
                    dst_str[strides_size] = strides;
                    max_pos[strides_size] = dim;
                    cur_pos[strides_size] = max_pos[strides_size] - 1;
                }
            }
            src_str[inverted_idx] = src_strides[inverted_idx];
            dst_str[inverted_idx] = dst_strides[inverted_idx];
            max_pos[inverted_idx] = shape[inverted_idx];
            // Start at the last position of every dim (backward traversal).
            cur_pos[inverted_idx] = max_pos[inverted_idx] - 1;
        }
        src_strides = std::move(src_str);
        dst_strides = std::move(dst_str);
    }

    // Byte offsets of the starting (last) multi-index in src/dst layouts.
    size_t src_off = 0;
    size_t dst_off = 0;
    for (size_t d = 0; d < max_pos.size(); ++d) {
        src_off += cur_pos[d] * src_strides[d];
        dst_off += cur_pos[d] * dst_strides[d];
    }

    // Decrement the multi-index (odometer-style, rightmost dim fastest) and
    // keep both byte offsets in sync incrementally. Returns false once the
    // index has wrapped past all-zero, i.e. traversal is complete.
    auto dec_index_and_update_offsets = [&]() -> bool {
        for (int d = static_cast<int>(max_pos.size()) - 1; d >= 0; --d) {
            const size_t old = cur_pos[static_cast<size_t>(d)];
            if (old > 0) {
                cur_pos[static_cast<size_t>(d)] = old - 1;
                src_off -= src_strides[static_cast<size_t>(d)];
                dst_off -= dst_strides[static_cast<size_t>(d)];
                return true;
            } else {
                // Borrow: reset this dim to its max and continue to the next.
                cur_pos[static_cast<size_t>(d)] = max_pos[static_cast<size_t>(d)] - 1;
                src_off += src_strides[static_cast<size_t>(d)] * (max_pos[static_cast<size_t>(d)] - 1);
                dst_off += dst_strides[static_cast<size_t>(d)] * (max_pos[static_cast<size_t>(d)] - 1);
            }
        }
        return false;
    };

    while (true) {
        uint8_t* src_ptr = base + src_off;
        uint8_t* dst_ptr = base + dst_off;

        // Trailing stride == size of one contiguous run; memmove because the
        // two ranges live in the same buffer and may overlap.
        if (src_ptr != dst_ptr) {
            std::memmove(dst_ptr, src_ptr, src_strides[src_strides.size() - 1]);
        }

        if (!dec_index_and_update_offsets()) {
            break;
        }
    }
}
|
|
||
| // In-place move along kv_dim when src/dst share the same buffer. | ||
| // Requirements: | ||
| // - kv_dim_src == kv_dim_dst, otherwise throws | ||
| // - src_tensor->data() == dst_tensor->data() | ||
| void ov::npuw::util::copy_tensor_inplace_by_dim(ov::SoPtr<ov::ITensor> src_tensor, | ||
|
||
| ov::SoPtr<ov::ITensor> dst_tensor, | ||
| uint32_t kv_dim_src, | ||
| uint32_t kv_dim_dst) { | ||
| OPENVINO_ASSERT(src_tensor); | ||
| OPENVINO_ASSERT(dst_tensor); | ||
|
|
||
| if (kv_dim_src != kv_dim_dst) { | ||
| OPENVINO_THROW("move_tensor_inplace_by_dim currently supports only kv_dim_src == kv_dim_dst"); | ||
| } | ||
|
|
||
| void* base_data = src_tensor->data(); | ||
| void* dst_data = dst_tensor->data(); | ||
| OPENVINO_ASSERT(base_data); | ||
| OPENVINO_ASSERT(dst_data); | ||
| OPENVINO_ASSERT(base_data == dst_data); | ||
|
|
||
| const auto& src_shape = src_tensor->get_shape(); | ||
| const auto& dst_shape = dst_tensor->get_shape(); | ||
| OPENVINO_ASSERT(src_shape.size() == dst_shape.size()); | ||
| OPENVINO_ASSERT(src_shape == dst_shape); | ||
| OPENVINO_ASSERT(kv_dim_src < src_shape.size()); | ||
|
|
||
| if (kv_dim_src == 3u) { | ||
| copy_inplace_columns_by_row_chunks(src_tensor, dst_tensor); | ||
| } else if (kv_dim_src == 2u) { | ||
| copy_inplace_by_planes(src_tensor, dst_tensor); | ||
| } else { | ||
| copy_inplace(src_tensor, dst_tensor); | ||
| } | ||
| } | ||
|
|
||
| std::optional<ov::Output<const ov::Node>> ov::npuw::util::find_port_by_name( | ||
| const std::vector<ov::Output<const ov::Node>>& ports, | ||
| const std::string& name) { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -582,41 +582,50 @@ void ov::npuw::LLMInferRequest::copy_kvcache() { | |
| // Part 1: The KV results from loops 1 to n-1 have been copied into the 'past' KV input tensor | ||
| // Part 2: The kv results from the last loop remain in the 'present' KV output tensor | ||
| // The task is to copy both parts into the KV-cache input tensor for the decoding process | ||
|
|
||
| // Copy part 1 KV results | ||
| // tokens_in_past_chunks may be 0 in case short prompts are prefilled in single chunk | ||
| auto tokens_in_past_chunks = kvcache_desc.num_stored_tokens - m_tokens_in_present_chunk; | ||
| if (tokens_in_past_chunks > 0) { | ||
| // Create backup of past KV tensor when buffer sharing is enabled to prevent data corruption | ||
| // This is necessary because subsequent copy operations would overwrite the shared buffer | ||
| auto prefill_past_kv = m_prefill_request->get_tensor(m_prefill_in_ports.at(input_name)); | ||
| ov::SoPtr<ov::ITensor> tmp_dense_kv_tensor; | ||
| auto kvcache_past_kv_chunks = uu::make_tensor_slice(kvcache_in_tensor, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. By the way, sometimes this make_tensor gets called without the namespace uu, but I don't see any using-namespace declarations, so I would suggest aligning all usages. Also, as I see, you've commented out the implementation of make_tensor_slice — is this temporary?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. also try to switch to utils::view helper as it looks fully covered functionality of make_tensor_slice
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I standardized the usage pattern and updated the code to consistently use |
||
| gen_kv_dim, | ||
| 0u, | ||
| static_cast<uint32_t>(tokens_in_past_chunks)); | ||
| ov::SoPtr<ov::ITensor> prefill_past_kv_chunks; | ||
| // move_tensor_inplace_by_dim currently supports only kv_dim_src == kv_dim_dst. | ||
| if (m_past_kv_bound) { | ||
| tmp_dense_kv_tensor = ov::npuw::util::allocMem(prefill_past_kv->get_element_type(), | ||
| prefill_past_kv->get_shape(), | ||
| m_pre_alloc_device, | ||
| m_npuw_llm_compiled_model->get_plugin()); | ||
| prefill_past_kv->copy_to(tmp_dense_kv_tensor._ptr); | ||
| prefill_past_kv_chunks = make_tensor_slice(tmp_dense_kv_tensor, | ||
| pre_kv_dim, | ||
| 0u, | ||
| static_cast<uint32_t>(tokens_in_past_chunks)); | ||
| if (pre_kv_dim == gen_kv_dim) { | ||
| prefill_past_kv_chunks = make_tensor_slice(prefill_past_kv, | ||
| pre_kv_dim, | ||
| 0u, | ||
| static_cast<uint32_t>(tokens_in_past_chunks)); | ||
|
|
||
| uu::copy_tensor_inplace_by_dim(prefill_past_kv_chunks, | ||
| kvcache_past_kv_chunks, | ||
| pre_kv_dim, | ||
| gen_kv_dim); | ||
| } else { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. For future readers, a comment needs to be added here explaining that in-place copy is not possible when we have v-transpose OFF/ON x-models.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added comments in the latest commit. |
||
| auto tmp_dense_kv_tensor = ov::npuw::util::allocMem(prefill_past_kv->get_element_type(), | ||
| prefill_past_kv->get_shape(), | ||
| m_pre_alloc_device, | ||
| m_npuw_llm_compiled_model->get_plugin()); | ||
| prefill_past_kv->copy_to(tmp_dense_kv_tensor._ptr); | ||
| prefill_past_kv_chunks = make_tensor_slice(tmp_dense_kv_tensor, | ||
| pre_kv_dim, | ||
| 0u, | ||
| static_cast<uint32_t>(tokens_in_past_chunks)); | ||
| uu::copy_tensor_by_dim(prefill_past_kv_chunks, kvcache_past_kv_chunks, pre_kv_dim, gen_kv_dim); | ||
| } | ||
| } else { | ||
| prefill_past_kv_chunks = make_tensor_slice(prefill_past_kv, | ||
| pre_kv_dim, | ||
| 0u, | ||
| static_cast<uint32_t>(tokens_in_past_chunks)); | ||
| uu::copy_tensor_by_dim(prefill_past_kv_chunks, kvcache_past_kv_chunks, pre_kv_dim, gen_kv_dim); | ||
| } | ||
|
|
||
| auto kvcache_past_kv_chunks = uu::make_tensor_slice(kvcache_in_tensor, | ||
| gen_kv_dim, | ||
| 0u, | ||
| static_cast<uint32_t>(tokens_in_past_chunks)); | ||
|
|
||
| uu::copy_tensor_by_dim(prefill_past_kv_chunks, kvcache_past_kv_chunks, pre_kv_dim, gen_kv_dim); | ||
| } | ||
|
|
||
| // Copy part 2 KV results | ||
| auto prefill_present_kv_chunk = | ||
| uu::make_tensor_slice(prefill_out_tensor, | ||
|
|
@@ -966,7 +975,13 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids, | |
| if (!m_generate_initialized) { | ||
| LOG_DEBUG("Copy kv-cache from prefill to generate model."); | ||
| if (kvcache_desc.num_stored_tokens > 0) { | ||
| // Start counting time. | ||
| auto t_start = std::chrono::high_resolution_clock::now(); | ||
| copy_kvcache(); | ||
| // End counting time. | ||
| auto t_end = std::chrono::high_resolution_clock::now(); | ||
|
||
| auto duration_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count(); | ||
| LOG_INFO("cost of copy_kvcache(): " << duration_ms << " ms"); | ||
| } | ||
|
|
||
| LOG_DEBUG("Prepare inputs."); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed at latest commit.