30 changes: 21 additions & 9 deletions src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
@@ -907,11 +907,20 @@ class CutLMHead : public ov::pass::MatcherPass {
auto matched_matmul = std::static_pointer_cast<ov::op::v0::MatMul>(matched_node_matmul);
auto matched_result = std::static_pointer_cast<ov::op::v0::Result>(matched_node_result);

// Some LLMs add intermediate hidden state outputs that can interfere with LM head detection.
// Skip Result nodes that were manually added (marked with "manually_added_output" in RT_INFO).
// For example, Eagle-3 target/draft models add "last_hidden_state" output which should be skipped.
const auto& rt_info = matched_result->get_rt_info();
if (rt_info.count("manually_added_output")) {
// Skip Result nodes that are not logits.
// Note: We can check that the Result's output name is "logits", and this is a
// sufficiently reliable check for finding exactly the logits output, because:
// 1. LLMInferRequest always relies on the "logits" name to get logits from
// the prefill/kvcache models.
// 2. - The following exporter configs: OnnxConfig, OnnxConfigWithPast,
// TextDecoderOnnxConfig and TextDecoderWithPositionIdsOnnxConfig
// from optimum-onnx name the LLM output "logits".
// - Most of the optimum-intel OpenVINO exporter configs are derived
// from the configs above.
// - The optimum-intel `export()` function sets the output tensor names
// from the exporter config:
// https://github.com/huggingface/optimum-intel/blob/main/optimum/exporters/openvino/convert.py#L442-L445
if (matched_result->output(0).get_names().count(ov::npuw::LLMCompiledModel::layer_names::logits) == 0) {
return false;
}
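For reference, here is a minimal standalone sketch (not part of this diff; the model path is hypothetical) of querying the "logits" tensor name on a loaded ov::Model — the same name-based identification the check above relies on:

#include <openvino/openvino.hpp>
#include <iostream>

int main() {
    ov::Core core;
    // Hypothetical model path; optimum-exported LLMs attach the "logits"
    // name to the LM head output tensor.
    auto model = core.read_model("openvino_model.xml");
    for (const auto& output : model->outputs()) {
        // get_names() returns every tensor name registered on this output.
        if (output.get_names().count("logits") > 0) {
            std::cout << "LM head output found: " << output.get_any_name() << std::endl;
        }
    }
    return 0;
}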

@@ -924,14 +933,14 @@ class CutLMHead : public ov::pass::MatcherPass {
// ICompiledModel::ICompiledModel().
// As a WA, setting the same name to output from MatMul
// avoids the issue.
matmul_first_source.set_names({ov::npuw::LLMCompiledModel::output_embeds});
matched_result->output(0).set_names({ov::npuw::LLMCompiledModel::output_embeds});
matmul_first_source.set_names({ov::npuw::LLMCompiledModel::layer_names::output_embeds});
matched_result->output(0).set_names({ov::npuw::LLMCompiledModel::layer_names::output_embeds});
matched_result->validate_and_infer_types();

// Create an additional model after cut point:
auto new_param = std::make_shared<ov::op::v0::Parameter>(matmul_first_source.get_element_type(),
matmul_first_source.get_partial_shape());
new_param->output(0).add_names({ov::npuw::LLMCompiledModel::output_embeds});
new_param->output(0).add_names({ov::npuw::LLMCompiledModel::layer_names::output_embeds});
matched_matmul->input(0).replace_source_output(new_param);
auto new_result = std::make_shared<ov::op::v0::Result>(matched_node_last_op);
lm_head_model =
@@ -1053,7 +1062,7 @@ void slice_out_embeds(std::shared_ptr<ov::Model> model,
std::size_t max_generation_token_len) {
std::shared_ptr<ov::Node> embed_result;
for (auto&& output : model->outputs()) {
if (output.get_any_name() == ov::npuw::LLMCompiledModel::output_embeds) {
if (output.get_any_name() == ov::npuw::LLMCompiledModel::layer_names::output_embeds) {
embed_result = output.get_node_shared_ptr();
}
}
@@ -1759,6 +1768,9 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
merge_config_with(prefill_config, other_props);
merge_config_with(generate_config, other_props);
merge_config_with(prefill_config, prefill_config_addition_value);
if (lm_head_model) {
prefill_config.erase("NPUW_SLICE_OUT");
}
merge_config_with(generate_config, generate_config_addition_value);

// Convert LLM-specific attention hints to NPUW_ATTN
@@ -26,7 +26,10 @@ class LLMCompiledModel : public ov::npuw::ICompiledModel {
std::map<std::string, std::tuple<ov::PropertyMutability, std::function<ov::Any(const ::intel_npu::Config&)>>>;

public:
static constexpr const char* output_embeds = "npuw_output_embed";
struct layer_names {
static constexpr const char* output_embeds = "npuw_output_embed";
static constexpr const char* logits = "logits";
};

static constexpr uint32_t whisper_batch_dim = 0u;
static constexpr uint32_t whisper_seq_len_dim = 2u;
12 changes: 0 additions & 12 deletions src/plugins/intel_npu/src/plugin/npuw/llm_eagle3_extension.cpp
@@ -147,18 +147,6 @@ void Eagle3Extension::prepare_inputs(const std::shared_ptr<ov::IAsyncInferReques
pad_hidden_state_input(m_hidden_states, padded_hidden_states);
}

void Eagle3Extension::update_last_hidden_state(
const std::shared_ptr<ov::IAsyncInferRequest>& request,
const std::unordered_map<std::string, ov::Output<const ov::Node>>& out_ports) {
auto last_hidden_state_it = out_ports.find(Eagle3LayerNames::last_hidden_state);
OPENVINO_ASSERT(last_hidden_state_it != out_ports.end(), "Eagle3 model must have last_hidden_state output port");

m_last_hidden_state = request->get_tensor(last_hidden_state_it->second);

LOG_VERB("Eagle3 " << (m_role == Eagle3ModelRole::Draft ? "Draft" : "Target")
<< ": Retrieved last_hidden_state output tensor");
}

void Eagle3Extension::prepare_inputs_for_chunk(
const std::shared_ptr<ov::IAsyncInferRequest>& request,
const std::unordered_map<std::string, ov::Output<const ov::Node>>& in_ports,
25 changes: 15 additions & 10 deletions src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
@@ -832,8 +832,11 @@ void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
m_logits = m_prefill_request->get_tensor(m_prefill_out_ports.at(layer_names::logits));
}

if (m_eagle3_ext.is_eagle3_model()) {
m_eagle3_ext.update_last_hidden_state(m_prefill_request, m_prefill_out_ports);
for (auto&& [name, port] : m_prefill_out_ports) {
if (name == layer_names::logits) {
continue;
}
m_other_outputs[name] = m_prefill_request->get_tensor(port);
}

m_generate_initialized = false;
@@ -957,8 +960,11 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
m_logits = m_kvcache_request->get_tensor(m_kvcache_out_ports.at(layer_names::logits));
}

if (m_eagle3_ext.is_eagle3_model()) {
m_eagle3_ext.update_last_hidden_state(m_kvcache_request, m_kvcache_out_ports);
for (auto&& [name, port] : m_kvcache_out_ports) {
if (name == layer_names::logits) {
continue;
}
m_other_outputs[name] = m_kvcache_request->get_tensor(port);
}

LOG_DEBUG("Done");
@@ -1045,13 +1051,12 @@ ov::SoPtr<ov::ITensor> ov::npuw::LLMInferRequest::get_tensor(const ov::Output<co
return m_logits;
}

if (m_eagle3_ext.is_eagle3_model()) {
if (port_names.count(Eagle3LayerNames::last_hidden_state) > 0) {
auto last_hidden_state = m_eagle3_ext.get_last_hidden_state();
if (!last_hidden_state) {
OPENVINO_THROW("Last hidden state tensor is not available. Please run inference first.");
for (auto&& [name, tensor] : m_other_outputs) {
if (port_names.count(name) > 0) {
if (!tensor) {
OPENVINO_THROW("Output tensor \"", name, "\" is not available. Please run inference first.");
}
return last_hidden_state;
return tensor;
}
}

@@ -77,6 +77,7 @@ class LLMInferRequest : public ov::npuw::LLMInferBaseRequest {
// This infer request is optional, so can be null.
std::shared_ptr<ov::IAsyncInferRequest> m_lm_head_request;
ov::SoPtr<ov::ITensor> m_logits;
std::unordered_map<std::string, ov::SoPtr<ov::ITensor>> m_other_outputs;

std::unordered_map<std::string, ov::Output<const ov::Node>> m_prefill_in_ports;
std::unordered_map<std::string, ov::Output<const ov::Node>> m_prefill_out_ports;
Contributor
I believe this change fixes the issue where a single Node has two Results connected to it. And this happens when you introduce one more Result to feed the 3rd model. Can this be avoided?

If our partitioning doesn't work for that case, I'd probably fix it later.

Contributor Author
@AsyaPronina Jan 17, 2026
I understand the concern; however, the CutLMHead transformation happens before the model is split into prefill and generate. When we work with the prefill model, we also add a Slice, but at that point there is only one Result node. So the two Result nodes get separated: one now sits after the Slice, while the other is still connected to the layer before the LM head. However, we don't need a Slice for the generate model, as it already outputs 1 token. And only here do we face the issue of two Results coming from the one layer before the LM head.
If we merged the two Results into one in advance, the Slice would be added to both of them.
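To make that concrete, here is a minimal standalone sketch (an illustration only, not code from this PR) of a model in which two Result nodes consume the very same node output — the situation the partitioner changes now tolerate:

#include <openvino/core/model.hpp>
#include <openvino/op/parameter.hpp>
#include <openvino/op/relu.hpp>
#include <openvino/op/result.hpp>
#include <memory>

std::shared_ptr<ov::Model> make_two_result_model() {
    auto param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape{1, 8});
    auto relu = std::make_shared<ov::op::v0::Relu>(param);
    // Both Result nodes read the same output of `relu`, just like the layer
    // before the LM head keeps its original Result while CutLMHead adds
    // another one for the prefill/kvcache model.
    auto result_a = std::make_shared<ov::op::v0::Result>(relu);
    auto result_b = std::make_shared<ov::op::v0::Result>(relu);
    return std::make_shared<ov::Model>(ov::ResultVector{result_a, result_b},
                                       ov::ParameterVector{param});
}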

Contributor Author

After testing, it turns out that preserving the changes in partitioning is the safer approach. Merging multiple Result nodes per output layer for the generate model works; however, that single Result node would then carry multiple names (collected from the merged Result nodes to keep their meanings). But in LLMInferRequest, on the contrary, we use output_port->get_any_name(), which returns only one tensor name, to build the mapping of names to outputs. This may cause issues. So, keeping the partitioning changes for now.
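A minimal sketch (again an illustration, not PR code) of the naming concern described above: one output can carry several tensor names, while get_any_name() reports only one of them, so a name-keyed output map built from it can lose the second meaning. The two names below are just examples taken from this PR:

#include <openvino/op/parameter.hpp>
#include <iostream>
#include <memory>

int main() {
    auto param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape{1, 8});
    // A merged Result would carry both meanings as names of a single output:
    param->output(0).set_names({"npuw_output_embed", "last_hidden_state"});
    // get_any_name() returns just one of the registered names, so a mapping
    // of names to outputs keyed by it would miss the other name.
    std::cout << "any name: " << param->output(0).get_any_name() << std::endl;
    return 0;
}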

@@ -387,7 +387,7 @@ void Partitioner::identifySubgraphs() {
// Apply partitioning changes to the original model
// but first cache all nodes to identify by name
using NodeSPtr = std::shared_ptr<ov::Node>;
std::unordered_map<ov::Output<ov::Node>, LinkPtrFrom> result_cache;
std::unordered_map<ov::Output<ov::Node>, std::vector<LinkPtrFrom>> result_cache;
std::unordered_map<std::string, NodeSPtr> node_id_cache;
for (auto&& node_ptr : model->get_ordered_ops()) {
node_id_cache[node_ptr->get_friendly_name()] = node_ptr;
@@ -624,9 +624,18 @@ void Partitioner::identifySubgraphs() {
// so it is a cut-off point (see above, parameter_from()):
// - record connectivity between subgraphs.
// Exception: param is registered via slice or convert
const auto& link_from = result_cache.at(src_node);
const auto& links_from = result_cache.at(src_node);
NPUW_ASSERT(links_from.size() > 0);
// Note: It may happen that one output layer has more than one
// Result node, and they are all the same!
// Please see how `result_cache` is filled below.
if (links_from.size() > 1) {
LOG_INFO("Parameter " << this_param->get_friendly_name()
<< " has more than one possible similar Result nodes to connect!"
<< " Will pick the first one: " << (*links_from.begin()).second);
}
const auto link_to = LinkPtrTo{this_group_idx, this_param};
subgraph_ptr_links[link_to] = link_from;
subgraph_ptr_links[link_to] = *links_from.begin();
}
} else {
// assert is_constant(), there's no other way
Expand Down Expand Up @@ -700,7 +709,15 @@ void Partitioner::identifySubgraphs() {
// v
// op102
bool has_external_readers = false;
NodeSPtr maybe_result = nullptr;
// NB: It turns out that sometimes we may end up with multiple
// Result nodes from one output layer, but they should be equal.
// Ex.: OmniThinker multi-output case with a cut LM head.
// The output embeddings (not logits) become a Result of the
// prefill/kvcache model when the LM head is cut.
// However, the last operation before the LM head already had
// a connected Result node corresponding to the second
// output of the original model.
std::vector<NodeSPtr> maybe_results;
auto readers = output_desc.get_target_inputs();
// This is possible when some of the layer's outputs are not used in the model.
if (readers.empty()) {
@@ -714,19 +731,31 @@
// at the npuw::CompiledModel level)
auto reader_node_ptr = r.get_node()->shared_from_this();
if (ov::op::util::is_output(reader_node_ptr)) {
maybe_result = std::move(reader_node_ptr);
maybe_results.push_back(std::move(reader_node_ptr));
} else if (group_nodes.find(reader_node_ptr) == group_nodes.end()) {
has_external_readers = true;
}
}
if (maybe_result) {
if (!maybe_results.empty()) {
// This layer's output was connected to Result already.
// It happens when this layer is the original model's output
// Keep it to make the ugly top-level I/O matching procedure work.
// FIXME: This needs to be refactored
group.sg._results.push_back(ov::as_type_ptr<ov::op::v0::Result>(maybe_result));
result_cache[output_desc] =
LinkPtrFrom{this_group_idx, ov::as_type_ptr<ov::op::v0::Result>(maybe_result)};

// Sanity check: if a layer output connects to multiple Result nodes,
// then all of them must share the same shape.
if (maybe_results.size() > 1) {
const auto shape = (*maybe_results.begin())->get_shape();
for (std::size_t i = 1; i < maybe_results.size(); ++i) {
OPENVINO_ASSERT(shape == maybe_results[i]->get_shape(),
"Multiple results from one output layer should be similar!");
}
}
for (auto&& mr : maybe_results) {
group.sg._results.push_back(ov::as_type_ptr<ov::op::v0::Result>(mr));
result_cache[output_desc].push_back(
LinkPtrFrom{this_group_idx, ov::as_type_ptr<ov::op::v0::Result>(mr)});
}
} else if (has_external_readers) {
// Introduce and record a new Result
// As the graph is processed in the topological order,
@@ -754,7 +783,7 @@
}
}
auto new_result = std::make_shared<ov::op::v0::Result>(result_src);
result_cache[output_desc] = LinkPtrFrom{this_group_idx, new_result};
result_cache[output_desc].push_back(LinkPtrFrom{this_group_idx, new_result});

ov::copy_runtime_info(output_desc.get_node_shared_ptr(), new_result);
group.sg._results.push_back(std::move(new_result));