[NPUW] Implement inplace KV cache copy when it's shared #33201
@@ -11,6 +11,7 @@
#include "logging.hpp"
#include "openvino/core/parallel.hpp"
#include "openvino/runtime/iasync_infer_request.hpp"
#include "perf.hpp"
#include "util.hpp"

namespace {
@@ -505,41 +506,54 @@ void ov::npuw::LLMInferRequest::copy_kvcache() {
    // Part 1: The KV results from loops 1 to n-1 have been copied into the 'past' KV input tensor
    // Part 2: The kv results from the last loop remain in the 'present' KV output tensor
    // The task is to copy both parts into the KV-cache input tensor for the decoding process

    // Copy part 1 KV results
    // tokens_in_past_chunks may be 0 in case short prompts are prefilled in single chunk
    auto tokens_in_past_chunks = kvcache_desc.num_stored_tokens - m_tokens_in_present_chunk;
    if (tokens_in_past_chunks > 0) {
        // Create backup of past KV tensor when buffer sharing is enabled to prevent data corruption
        // This is necessary because subsequent copy operations would overwrite the shared buffer
        auto prefill_past_kv = m_prefill_request->get_tensor(m_prefill_in_ports.at(input_name));
        ov::SoPtr<ov::ITensor> tmp_dense_kv_tensor;
        ov::SoPtr<ov::ITensor> prefill_past_kv_chunks;
        if (m_past_kv_bound) {
            tmp_dense_kv_tensor = ov::npuw::util::allocMem(prefill_past_kv->get_element_type(),
                                                           prefill_past_kv->get_shape(),
                                                           m_pre_alloc_device,
                                                           m_npuw_llm_compiled_model->get_plugin());
            prefill_past_kv->copy_to(tmp_dense_kv_tensor._ptr);
            prefill_past_kv_chunks = make_tensor_slice(tmp_dense_kv_tensor,
                                                       pre_kv_dim,
                                                       0u,
                                                       static_cast<uint32_t>(tokens_in_past_chunks));
        } else {
            prefill_past_kv_chunks = make_tensor_slice(prefill_past_kv,
                                                       pre_kv_dim,
                                                       0u,
                                                       static_cast<uint32_t>(tokens_in_past_chunks));
        }

        auto kvcache_past_kv_chunks = uu::make_tensor_slice(kvcache_in_tensor,
|
Contributor: btw, sometimes this make_tensor_slice gets called without the uu namespace, but I don't see any using-namespace declarations, so I would suggest aligning all usages. Also, I see you've commented out the implementation of make_tensor_slice - is this temporary?

Contributor: Also, try switching to the utils::view helper, as it looks like it fully covers the functionality of make_tensor_slice.

Author: I standardized the usage pattern and updated the code to consistently use the uu:: namespace.
                                                            gen_kv_dim,
                                                            0u,
                                                            static_cast<uint32_t>(tokens_in_past_chunks));

        uu::copy_tensor_by_dim(prefill_past_kv_chunks, kvcache_past_kv_chunks, pre_kv_dim, gen_kv_dim);
        ov::SoPtr<ov::ITensor> prefill_past_kv_chunks;
        // In-place KV copy is only safe/possible when the source and destination KV layouts match.
        // When we have mixed v-transpose settings across models (prefill vs generate: v-transpose OFF/ON),
        // the effective KV "token" dimension differs (pre_kv_dim != gen_kv_dim), so an in-place move/copy
        // would corrupt data. Therefore, we only use in-place copy when pre_kv_dim == gen_kv_dim;
        // otherwise we must copy via a temporary tensor.
        if (m_past_kv_bound) {
            if (pre_kv_dim == gen_kv_dim) {
                prefill_past_kv_chunks = uu::make_tensor_slice(prefill_past_kv,
                                                               pre_kv_dim,
                                                               0u,
                                                               static_cast<uint32_t>(tokens_in_past_chunks));

                uu::copy_tensor_inplace_by_dim(prefill_past_kv_chunks,
                                               kvcache_past_kv_chunks,
                                               pre_kv_dim,
                                               gen_kv_dim);
            } else {
Contributor: For future readers, we need to add a comment here noting that in-place copy is not possible when we have v-transpose OFF/ON cross-models.

Author: Added comments in the latest commit.
                auto tmp_dense_kv_tensor = ov::npuw::util::allocMem(prefill_past_kv->get_element_type(),
                                                                    prefill_past_kv->get_shape(),
                                                                    m_pre_alloc_device,
                                                                    m_npuw_llm_compiled_model->get_plugin());
                prefill_past_kv->copy_to(tmp_dense_kv_tensor._ptr);
                prefill_past_kv_chunks = uu::make_tensor_slice(tmp_dense_kv_tensor,
                                                               pre_kv_dim,
                                                               0u,
                                                               static_cast<uint32_t>(tokens_in_past_chunks));
                uu::copy_tensor_by_dim(prefill_past_kv_chunks, kvcache_past_kv_chunks, pre_kv_dim, gen_kv_dim);
            }
        } else {
            prefill_past_kv_chunks = uu::make_tensor_slice(prefill_past_kv,
                                                           pre_kv_dim,
                                                           0u,
                                                           static_cast<uint32_t>(tokens_in_past_chunks));
            uu::copy_tensor_by_dim(prefill_past_kv_chunks, kvcache_past_kv_chunks, pre_kv_dim, gen_kv_dim);
        }
    }

    // Copy part 2 KV results
    auto prefill_present_kv_chunk =
        uu::make_tensor_slice(prefill_out_tensor,
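Note for readers of the hunk above: the new in-place path (uu::copy_tensor_inplace_by_dim, taken when pre_kv_dim == gen_kv_dim) boils down to moving a chunk of tokens within a shared buffer, which is what std::memmove is specified to handle safely even when source and destination ranges overlap, while a mismatched layout still has to stage through a temporary tensor. A minimal, self-contained sketch of that idea, with a flat float buffer standing in for the shared KV tensor and invented sizes (this is not the NPUW implementation):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstring>
#include <iostream>
#include <vector>

int main() {
    // A flat buffer stands in for a KV tensor shared between prefill and generate.
    constexpr std::size_t tokens = 8, head_dim = 4;
    std::vector<float> shared(tokens * head_dim);
    for (std::size_t i = 0; i < shared.size(); ++i) shared[i] = static_cast<float>(i);

    const std::size_t tokens_in_past_chunks = 6;
    // When the token layouts match, copying the first N tokens into the KV-cache
    // slice is a move between overlapping ranges of the same buffer: memmove is safe.
    float* src = shared.data();
    float* dst = shared.data() + head_dim;  // destination slice starts one token later
    std::memmove(dst, src, tokens_in_past_chunks * head_dim * sizeof(float));

    // When the layouts differ, the safe path copies the source into a temporary first,
    // mirroring the tmp_dense_kv_tensor branch above.
    std::vector<float> tmp(tokens_in_past_chunks * head_dim);
    std::copy(src, src + tmp.size(), tmp.begin());

    std::cout << "moved " << tokens_in_past_chunks << " tokens in place\n";
}
```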
@@ -846,7 +860,14 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
    if (!m_generate_initialized) {
        LOG_DEBUG("Copy kv-cache from prefill to generate model.");
        if (kvcache_desc.num_stored_tokens > 0) {
            copy_kvcache();
            using MS = ov::npuw::perf::metric<ov::npuw::perf::MSec>;
            MS m_ms_copy_kvcache("copy_kvcache", /*active*/ true);

            m_ms_copy_kvcache += ov::npuw::perf::ms_to_run([&]() {
                copy_kvcache();
            });

            LOG_INFO("cost of copy_kvcache(): " << m_ms_copy_kvcache.med() << " ms");
        }

        LOG_DEBUG("Prepare inputs.");
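The hunk above routes copy_kvcache() through the NPUW perf helpers (ov::npuw::perf::metric and ms_to_run). As a rough illustration of what that measurement amounts to, here is a hypothetical std::chrono stand-in; it is not the actual perf.hpp API, and ms_to_run_sketch is an invented name:

```cpp
#include <chrono>
#include <functional>
#include <iostream>

// Hypothetical stand-in for the timing done via ov::npuw::perf::ms_to_run():
// run a callable once and report its wall-clock duration in milliseconds.
static double ms_to_run_sketch(const std::function<void()>& fn) {
    const auto start = std::chrono::steady_clock::now();
    fn();
    const auto stop = std::chrono::steady_clock::now();
    return std::chrono::duration<double, std::milli>(stop - start).count();
}

int main() {
    const double ms = ms_to_run_sketch([] {
        // copy_kvcache() would go here in the real code; busy work as a placeholder.
        volatile long sink = 0;
        for (long i = 0; i < 1'000'000; ++i) sink = sink + i;
    });
    std::cout << "cost of copy_kvcache(): " << ms << " ms\n";
}
```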
Contributor: So it is not an AVX version but rather uses memmove? OK, if that works we need exact perf data, and I think tests as well for a bunch of actual cases found in LLM workloads.

Author: In the earlier ticket about optimizing copy, I tried replacing std::memcpy with an AVX2 implementation, but it resulted in almost no performance improvement. The std::memmove used here relies on essentially the same highly optimized underlying implementation as std::memcpy, so I didn't pursue an additional AVX2 optimization in this case. I'm still running further measurements, and I'll share more details once those tests are complete.
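To make the requested perf data concrete, a micro-benchmark along the following lines could compare std::memcpy and std::memmove on a block roughly the size of a prefill KV chunk. This is only a sketch with placeholder sizes, not measurements from this PR:

```cpp
#include <chrono>
#include <cstring>
#include <iostream>
#include <vector>

// Times std::memcpy (disjoint buffers) and std::memmove (overlapping ranges
// within one buffer). Sizes are placeholders; real numbers must come from
// actual LLM workloads on the target hardware.
int main() {
    constexpr std::size_t bytes = 64u * 1024u * 1024u;  // 64 MiB placeholder
    std::vector<char> a(bytes, 1), b(bytes);

    const auto t0 = std::chrono::steady_clock::now();
    std::memcpy(b.data(), a.data(), bytes);
    const auto t1 = std::chrono::steady_clock::now();
    std::memmove(a.data() + 4096, a.data(), bytes - 4096);  // overlapping move
    const auto t2 = std::chrono::steady_clock::now();

    using ms = std::chrono::duration<double, std::milli>;
    std::cout << "memcpy:  " << ms(t1 - t0).count() << " ms\n"
              << "memmove: " << ms(t2 - t1).count() << " ms\n";
}
```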