Skip to content

Commit d8580e5

Browse files
committed
Improved CutLMHead to not interfere with other outputs
1 parent bdb7742 commit d8580e5

File tree

3 files changed

+45
-33
lines changed

3 files changed

+45
-33
lines changed

src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -895,11 +895,20 @@ class CutLMHead : public ov::pass::MatcherPass {
895895
auto matched_matmul = std::static_pointer_cast<ov::op::v0::MatMul>(matched_node_matmul);
896896
auto matched_result = std::static_pointer_cast<ov::op::v0::Result>(matched_node_result);
897897

898-
// Some LLMs add intermediate hidden state outputs that can interfere with LM head detection.
899-
// Skip Result nodes that were manually added (marked with "manually_added_output" in RT_INFO).
900-
// For example, Eagle-3 target/draft models add "last_hidden_state" output which should be skipped.
901-
const auto& rt_info = matched_result->get_rt_info();
902-
if (rt_info.count("manually_added_output")) {
898+
// Skip Result nodes that are not logits.
899+
// Note: We can check that Result's output name is "logits" and it will be a
900+
// sufficiently reliable check for finding exactly the logits output, because:
901+
// 1. LLMInferRequest always relies on the "logits" name to get logits from
902+
//    prefill/kvcache models.
903+
// 2. - Following Exporter configs: OnnxConfig, OnnxConfigWithPast,
904+
// TextDecoderOnnxConfig and TextDecoderWithPositionIdsOnnxConfig
905+
// from optimum-onnx name LLM output with "logits".
906+
// - Most of optimum-intel OpenVINO Exporter configs are derived
907+
// from the configs above.
908+
// - optimum-intel `export()` function set names for output tensors
909+
// from Exporter config: https://github.com/huggingface/optimum-intel/blob/main/optimum/exporters/openvino/convert.py#L442-L445
910+
if (matched_result->output(0).get_names().count(
911+
ov::npuw::LLMCompiledModel::layer_names::logits) == 0) {
903912
return false;
904913
}
905914

@@ -912,14 +921,14 @@ class CutLMHead : public ov::pass::MatcherPass {
912921
// ICompiledModel::ICompiledModel().
913922
// As a WA, setting the same name to output from MatMul
914923
// avoids the issue.
915-
matmul_first_source.set_names({ov::npuw::LLMCompiledModel::output_embeds});
916-
matched_result->output(0).set_names({ov::npuw::LLMCompiledModel::output_embeds});
924+
matmul_first_source.set_names({ov::npuw::LLMCompiledModel::layer_names::output_embeds});
925+
matched_result->output(0).set_names({ov::npuw::LLMCompiledModel::layer_names::output_embeds});
917926
matched_result->validate_and_infer_types();
918927

919928
// Create an additional model after cut point:
920929
auto new_param = std::make_shared<ov::op::v0::Parameter>(matmul_first_source.get_element_type(),
921930
matmul_first_source.get_partial_shape());
922-
new_param->output(0).add_names({ov::npuw::LLMCompiledModel::output_embeds});
931+
new_param->output(0).add_names({ov::npuw::LLMCompiledModel::layer_names::output_embeds});
923932
matched_matmul->input(0).replace_source_output(new_param);
924933
auto new_result = std::make_shared<ov::op::v0::Result>(matched_node_last_op);
925934
lm_head_model =
@@ -1041,7 +1050,7 @@ void slice_out_embeds(std::shared_ptr<ov::Model> model,
10411050
std::size_t max_generation_token_len) {
10421051
std::shared_ptr<ov::Node> embed_result;
10431052
for (auto&& output : model->outputs()) {
1044-
if (output.get_any_name() == ov::npuw::LLMCompiledModel::output_embeds) {
1053+
if (output.get_any_name() == ov::npuw::LLMCompiledModel::layer_names::output_embeds) {
10451054
embed_result = output.get_node_shared_ptr();
10461055
}
10471056
}

src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,10 @@ class LLMCompiledModel : public ov::npuw::ICompiledModel {
2626
std::map<std::string, std::tuple<ov::PropertyMutability, std::function<ov::Any(const ::intel_npu::Config&)>>>;
2727

2828
public:
29-
static constexpr const char* output_embeds = "npuw_output_embed";
29+
struct layer_names {
30+
static constexpr const char* output_embeds = "npuw_output_embed";
31+
static constexpr const char* logits = "logits";
32+
};
3033

3134
static constexpr uint32_t whisper_batch_dim = 0u;
3235
static constexpr uint32_t whisper_seq_len_dim = 2u;

src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp

Lines changed: 23 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -626,9 +626,12 @@ void Partitioner::identifySubgraphs() {
626626
// Exception: param is registered via slice or convert
627627
const auto& links_from = result_cache.at(src_node);
628628
NPUW_ASSERT(links_from.size() > 0);
629+
// Note: It may happen that one output layer has more than one
630+
// Result nodes that are the same!
631+
// Please see the `results_cache` filling below.
629632
if (links_from.size() > 1) {
630633
LOG_INFO("Parameter " << this_param->get_friendly_name()
631-
<< " has more than one possible Result nodes to connect!"
634+
<< " has more than one possible similar Result nodes to connect!"
632635
<< " Will pick the first one: " << (*links_from.begin()).second);
633636
}
634637
const auto link_to = LinkPtrTo{this_group_idx, this_param};
@@ -656,9 +659,6 @@ void Partitioner::identifySubgraphs() {
656659
// set as part of kvcache conversion routine.
657660
LOG_BLOCK();
658661
std::set<std::string> output_layers_cache(group.output_layers.begin(), group.output_layers.end());
659-
for (auto&& ol_name : group.output_layers) {
660-
LOG_VERB("Initially registered output layer: " << ol_name);
661-
}
662662

663663
// Have to switch clang-format here to make cpplint happy
664664
// clang-format off
@@ -709,9 +709,14 @@ void Partitioner::identifySubgraphs() {
709709
// v
710710
// op102
711711
bool has_external_readers = false;
712-
// NB: It turns out sometime we may end up with multiple
713-
// readers from the output node and more than 1 of them
714-
// will be the Result node!
712+
// NB: It turns out that sometimes we may end up with multiple
713+
// Result nodes from one output layer, but they should be equal.
714+
// Ex.: OmniThinker multi-outputs case with cut LM head.
715+
// Output embeddings (not logits) became a Result from the
716+
// prefill/kvcache model when LM head is cut.
717+
// However, last operation before LM head has had already
718+
// a connected Result node corresponding to the second
719+
// output of the original model.
715720
std::vector<NodeSPtr> maybe_results;
716721
auto readers = output_desc.get_target_inputs();
717722
// This is possible when some of the layer's outputs are not used in the model.
@@ -732,13 +737,20 @@ void Partitioner::identifySubgraphs() {
732737
}
733738
}
734739
if (!maybe_results.empty()) {
735-
for (auto&& mr: maybe_results) {
736-
LOG_VERB(mr);
737-
}
738740
// This layer's output was connected to Result already.
739741
// It happens when this layer is the original model's output
740742
// Keep it to make the ugly top-level I/O matching procedure work.
741743
// FIXME: This needs to be refactored
744+
745+
// Sanity check that if layer output connects with multiple Result nodes,
746+
// then all Result nodes share the same shape.
747+
if (maybe_results.size() > 1) {
748+
const auto shape = (*maybe_results.begin())->get_shape();
749+
for (int i = 1; i < maybe_results.size(); ++i) {
750+
OPENVINO_ASSERT(shape == maybe_results[i]->get_shape(),
751+
"Multiple results from one output layer should be similar!");
752+
}
753+
}
742754
for (auto&& mr : maybe_results) {
743755
group.sg._results.push_back(ov::as_type_ptr<ov::op::v0::Result>(mr));
744756
result_cache[output_desc].push_back(
@@ -772,6 +784,7 @@ void Partitioner::identifySubgraphs() {
772784
}
773785
auto new_result = std::make_shared<ov::op::v0::Result>(result_src);
774786
result_cache[output_desc].push_back(LinkPtrFrom{this_group_idx, new_result});
787+
775788
ov::copy_runtime_info(output_desc.get_node_shared_ptr(), new_result);
776789
group.sg._results.push_back(std::move(new_result));
777790
}
@@ -2446,28 +2459,18 @@ void Partitioner::finalizeLinks() {
24462459
// result order.. but how? But how... see above, the complexity
24472460
// is there.
24482461

2449-
LOG_VERB("before get_idx_param");
24502462
std::size_t subgraph_idx_to;
24512463
PPtr subgraph_param_to;
24522464
std::tie(subgraph_idx_to, subgraph_param_to) = ptr_link.first;
2453-
LOG_VERB("subgraph_idx_to: " << subgraph_idx_to);
2454-
LOG_VERB("subgraph_param_to: " << subgraph_param_to);
24552465
auto param_idx = get_idx_param(subgraph_idx_to, subgraph_param_to);
2456-
LOG_VERB("after get_idx_param");
24572466

2458-
LOG_VERB("before get_idx_result");
24592467
std::size_t subgraph_idx_from;
24602468
RPtr subgraph_result_from;
24612469
std::tie(subgraph_idx_from, subgraph_result_from) = ptr_link.second;
2462-
LOG_VERB("subgraph_idx_from: " << subgraph_idx_from);
2463-
LOG_VERB("subgraph_result_from: " << subgraph_result_from);
24642470
auto result_idx = get_idx_result(subgraph_idx_from, subgraph_result_from);
2465-
LOG_VERB("after get_idx_result");
24662471

2467-
LOG_VERB("before subgraph_links");
24682472
subgraph_links[ov::npuw::LinkTo{subgraph_idx_to, param_idx}] =
24692473
ov::npuw::LinkFrom{subgraph_idx_from, result_idx};
2470-
LOG_VERB("after subgraph_links");
24712474

24722475
LOG_BLOCK();
24732476
LOG_DEBUG("Record link [" << subgraph_idx_to << "]:" << param_idx << " <--- [" << subgraph_idx_from << "]/"
@@ -2560,11 +2563,8 @@ ov::npuw::Partitioning ov::npuw::getPartitioning(const std::shared_ptr<ov::Model
25602563
p.saveRepeatedConstants(func_group);
25612564
p.saveTailDictConstants(func_group);
25622565
p.matchParameters(func_group);
2563-
std::cout << "here" << std::endl;
25642566
p.matchResults(func_group);
2565-
std::cout << "here 2" << std::endl;
25662567
p.matchRepeatedSubgraphs(func_group);
2567-
25682568
p.spatial(func_group);
25692569
p.attention(func_group);
25702570
p.optimize(func_group);

0 commit comments

Comments
 (0)