30 changes: 21 additions & 9 deletions src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
@@ -907,11 +907,20 @@ class CutLMHead : public ov::pass::MatcherPass {
auto matched_matmul = std::static_pointer_cast<ov::op::v0::MatMul>(matched_node_matmul);
auto matched_result = std::static_pointer_cast<ov::op::v0::Result>(matched_node_result);

// Some LLMs add intermediate hidden state outputs that can interfere with LM head detection.
// Skip Result nodes that were manually added (marked with "manually_added_output" in RT_INFO).
// For example, Eagle-3 target/draft models add "last_hidden_state" output which should be skipped.
const auto& rt_info = matched_result->get_rt_info();
if (rt_info.count("manually_added_output")) {
// Skip Result nodes that are not logits.
// Note: We can check that the Result's output name is "logits", and this is a
// sufficiently reliable check for finding exactly the logits output, because:
// 1. LLMInferRequest always relies on the "logits" name to get logits from
// the prefill/kvcache models.
// 2. - The following exporter configs: OnnxConfig, OnnxConfigWithPast,
// TextDecoderOnnxConfig and TextDecoderWithPositionIdsOnnxConfig
// from optimum-onnx name the LLM output "logits".
// - Most of the optimum-intel OpenVINO exporter configs are derived
// from the configs above.
// - The optimum-intel `export()` function sets the output tensor names
// from the exporter config:
// https://github.com/huggingface/optimum-intel/blob/main/optimum/exporters/openvino/convert.py#L442-L445
if (matched_result->output(0).get_names().count(ov::npuw::LLMCompiledModel::layer_names::logits) == 0) {
return false;
}
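For reference, here is a minimal standalone sketch (not part of this diff; the model path is hypothetical) of querying the "logits" tensor name on a loaded ov::Model — the same name-based identification the check above relies on:

#include <openvino/openvino.hpp>
#include <iostream>

int main() {
    ov::Core core;
    // Hypothetical model path; optimum-exported LLMs attach the "logits"
    // name to the LM head output tensor.
    auto model = core.read_model("openvino_model.xml");
    for (const auto& output : model->outputs()) {
        // get_names() returns every tensor name registered on this output.
        if (output.get_names().count("logits") > 0) {
            std::cout << "LM head output found: " << output.get_any_name() << std::endl;
        }
    }
    return 0;
}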

@@ -924,14 +933,14 @@ class CutLMHead : public ov::pass::MatcherPass {
// ICompiledModel::ICompiledModel().
// As a WA, setting the same name to output from MatMul
// avoids the issue.
matmul_first_source.set_names({ov::npuw::LLMCompiledModel::output_embeds});
matched_result->output(0).set_names({ov::npuw::LLMCompiledModel::output_embeds});
matmul_first_source.set_names({ov::npuw::LLMCompiledModel::layer_names::output_embeds});
matched_result->output(0).set_names({ov::npuw::LLMCompiledModel::layer_names::output_embeds});
matched_result->validate_and_infer_types();

// Create an additional model after cut point:
auto new_param = std::make_shared<ov::op::v0::Parameter>(matmul_first_source.get_element_type(),
matmul_first_source.get_partial_shape());
new_param->output(0).add_names({ov::npuw::LLMCompiledModel::output_embeds});
new_param->output(0).add_names({ov::npuw::LLMCompiledModel::layer_names::output_embeds});
matched_matmul->input(0).replace_source_output(new_param);
auto new_result = std::make_shared<ov::op::v0::Result>(matched_node_last_op);
lm_head_model =
@@ -1053,7 +1062,7 @@ void slice_out_embeds(std::shared_ptr<ov::Model> model,
std::size_t max_generation_token_len) {
std::shared_ptr<ov::Node> embed_result;
for (auto&& output : model->outputs()) {
if (output.get_any_name() == ov::npuw::LLMCompiledModel::output_embeds) {
if (output.get_any_name() == ov::npuw::LLMCompiledModel::layer_names::output_embeds) {
embed_result = output.get_node_shared_ptr();
}
}
@@ -1759,6 +1768,9 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
merge_config_with(prefill_config, other_props);
merge_config_with(generate_config, other_props);
merge_config_with(prefill_config, prefill_config_addition_value);
if (lm_head_model) {
prefill_config.erase("NPUW_SLICE_OUT");
}
merge_config_with(generate_config, generate_config_addition_value);

// Convert LLM-specific attention hints to NPUW_ATTN
@@ -26,7 +26,10 @@ class LLMCompiledModel : public ov::npuw::ICompiledModel {
std::map<std::string, std::tuple<ov::PropertyMutability, std::function<ov::Any(const ::intel_npu::Config&)>>>;

public:
static constexpr const char* output_embeds = "npuw_output_embed";
struct layer_names {
static constexpr const char* output_embeds = "npuw_output_embed";
static constexpr const char* logits = "logits";
};

static constexpr uint32_t whisper_batch_dim = 0u;
static constexpr uint32_t whisper_seq_len_dim = 2u;
12 changes: 0 additions & 12 deletions src/plugins/intel_npu/src/plugin/npuw/llm_eagle3_extension.cpp
@@ -147,18 +147,6 @@ void Eagle3Extension::prepare_inputs(const std::shared_ptr<ov::IAsyncInferReques
pad_hidden_state_input(m_hidden_states, padded_hidden_states);
}

void Eagle3Extension::update_last_hidden_state(
const std::shared_ptr<ov::IAsyncInferRequest>& request,
const std::unordered_map<std::string, ov::Output<const ov::Node>>& out_ports) {
auto last_hidden_state_it = out_ports.find(Eagle3LayerNames::last_hidden_state);
OPENVINO_ASSERT(last_hidden_state_it != out_ports.end(), "Eagle3 model must have last_hidden_state output port");

m_last_hidden_state = request->get_tensor(last_hidden_state_it->second);

LOG_VERB("Eagle3 " << (m_role == Eagle3ModelRole::Draft ? "Draft" : "Target")
<< ": Retrieved last_hidden_state output tensor");
}

void Eagle3Extension::prepare_inputs_for_chunk(
const std::shared_ptr<ov::IAsyncInferRequest>& request,
const std::unordered_map<std::string, ov::Output<const ov::Node>>& in_ports,
25 changes: 15 additions & 10 deletions src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
@@ -832,8 +832,11 @@ void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
m_logits = m_prefill_request->get_tensor(m_prefill_out_ports.at(layer_names::logits));
}

if (m_eagle3_ext.is_eagle3_model()) {
m_eagle3_ext.update_last_hidden_state(m_prefill_request, m_prefill_out_ports);
for (auto&& [name, port] : m_prefill_out_ports) {
if (name == layer_names::logits) {
continue;
}
m_other_outputs[name] = m_prefill_request->get_tensor(port);
}

m_generate_initialized = false;
@@ -957,8 +960,11 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
m_logits = m_kvcache_request->get_tensor(m_kvcache_out_ports.at(layer_names::logits));
}

if (m_eagle3_ext.is_eagle3_model()) {
m_eagle3_ext.update_last_hidden_state(m_kvcache_request, m_kvcache_out_ports);
for (auto&& [name, port] : m_kvcache_out_ports) {
if (name == layer_names::logits) {
continue;
}
m_other_outputs[name] = m_kvcache_request->get_tensor(port);
}

LOG_DEBUG("Done");
@@ -1045,13 +1051,12 @@ ov::SoPtr<ov::ITensor> ov::npuw::LLMInferRequest::get_tensor(const ov::Output<co
return m_logits;
}

if (m_eagle3_ext.is_eagle3_model()) {
if (port_names.count(Eagle3LayerNames::last_hidden_state) > 0) {
auto last_hidden_state = m_eagle3_ext.get_last_hidden_state();
if (!last_hidden_state) {
OPENVINO_THROW("Last hidden state tensor is not available. Please run inference first.");
for (auto&& [name, tensor] : m_other_outputs) {
if (port_names.count(name) > 0) {
if (!tensor) {
OPENVINO_THROW("Output tensor \"", name, "\" is not available. Please run inference first.");
}
return last_hidden_state;
return tensor;
}
}

@@ -77,6 +77,7 @@ class LLMInferRequest : public ov::npuw::LLMInferBaseRequest {
// This infer request is optional, so can be null.
std::shared_ptr<ov::IAsyncInferRequest> m_lm_head_request;
ov::SoPtr<ov::ITensor> m_logits;
std::unordered_map<std::string, ov::SoPtr<ov::ITensor>> m_other_outputs;

std::unordered_map<std::string, ov::Output<const ov::Node>> m_prefill_in_ports;
std::unordered_map<std::string, ov::Output<const ov::Node>> m_prefill_out_ports;
Contributor
I believe this change fixes the issue where a single Node has two Results connected to it. And this happens when you introduce one more Result to feed the 3rd model. Can this be avoided?

If our partitioning doesn't work for that case, I'd probably fix it later.

Contributor Author
@AsyaPronina Jan 17, 2026
I understand the concern; however, the CutLMHead transformation happens before the model is split into prefill and generate. When we work with the prefill model, we also add a Slice, but at that point there is only one Result node. So the two Result nodes get separated: one now sits after the Slice, while the other is still connected to the layer before the LM head. However, we don't need a Slice for the generate model, as it already outputs 1 token. And only here do we face the issue of two Results coming from the one layer before the LM head.
If we merged the two Results into one in advance, the Slice would be added to both of them.
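To make that concrete, here is a minimal standalone sketch (an illustration only, not code from this PR) of a model in which two Result nodes consume the very same node output — the situation the partitioner changes now tolerate:

#include <openvino/core/model.hpp>
#include <openvino/op/parameter.hpp>
#include <openvino/op/relu.hpp>
#include <openvino/op/result.hpp>
#include <memory>

std::shared_ptr<ov::Model> make_two_result_model() {
    auto param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape{1, 8});
    auto relu = std::make_shared<ov::op::v0::Relu>(param);
    // Both Result nodes read the same output of `relu`, just like the layer
    // before the LM head keeps its original Result while CutLMHead adds
    // another one for the prefill/kvcache model.
    auto result_a = std::make_shared<ov::op::v0::Result>(relu);
    auto result_b = std::make_shared<ov::op::v0::Result>(relu);
    return std::make_shared<ov::Model>(ov::ResultVector{result_a, result_b},
                                       ov::ParameterVector{param});
}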

Contributor Author

After testing, it turns out that preserving the changes in partitioning is the safer approach. Merging multiple Result nodes per output layer for the generate model works; however, that single Result node would then carry multiple names (collected from the merged Result nodes to keep their meanings). But in LLMInferRequest, on the contrary, we use output_port->get_any_name(), which returns only one tensor name, to build the mapping of names to outputs. This may cause issues. So, keeping the partitioning changes for now.
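A minimal sketch (again an illustration, not PR code) of the naming concern described above: one output can carry several tensor names, while get_any_name() reports only one of them, so a name-keyed output map built from it can lose the second meaning. The two names below are just examples taken from this PR:

#include <openvino/op/parameter.hpp>
#include <iostream>
#include <memory>

int main() {
    auto param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape{1, 8});
    // A merged Result would carry both meanings as names of a single output:
    param->output(0).set_names({"npuw_output_embed", "last_hidden_state"});
    // get_any_name() returns just one of the registered names, so a mapping
    // of names to outputs keyed by it would miss the other name.
    std::cout << "any name: " << param->output(0).get_any_name() << std::endl;
    return 0;
}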

@@ -387,7 +387,7 @@ void Partitioner::identifySubgraphs() {
// Apply partitioning changes to the original model
// but first cache all nodes to identify by name
using NodeSPtr = std::shared_ptr<ov::Node>;
std::unordered_map<ov::Output<ov::Node>, LinkPtrFrom> result_cache;
std::unordered_map<ov::Output<ov::Node>, std::vector<LinkPtrFrom>> result_cache;
std::unordered_map<std::string, NodeSPtr> node_id_cache;
for (auto&& node_ptr : model->get_ordered_ops()) {
node_id_cache[node_ptr->get_friendly_name()] = node_ptr;
@@ -624,9 +624,18 @@ void Partitioner::identifySubgraphs() {
// so it is a cut-off point (see above, parameter_from()):
// - record connectivity between subgraphs.
// Exception: param is registered via slice or convert
const auto& link_from = result_cache.at(src_node);
const auto& links_from = result_cache.at(src_node);
NPUW_ASSERT(links_from.size() > 0);
// Note: It may happen that one output layer has more than one
// Result node, and they are all the same!
// Please see how `result_cache` is filled below.
if (links_from.size() > 1) {
LOG_INFO("Parameter " << this_param->get_friendly_name()
<< " has more than one possible similar Result nodes to connect!"
<< " Will pick the first one: " << (*links_from.begin()).second);
}
const auto link_to = LinkPtrTo{this_group_idx, this_param};
subgraph_ptr_links[link_to] = link_from;
subgraph_ptr_links[link_to] = *links_from.begin();
}
} else {
// assert is_constant(), there's no other way
Expand Down Expand Up @@ -700,7 +709,15 @@ void Partitioner::identifySubgraphs() {
// v
// op102
bool has_external_readers = false;
NodeSPtr maybe_result = nullptr;
// NB: It turns out that sometimes we may end up with multiple
// Result nodes from one output layer, but they should be equal.
// Ex.: OmniThinker multi-output case with a cut LM head.
// The output embeddings (not logits) become a Result of the
// prefill/kvcache model when the LM head is cut.
// However, the last operation before the LM head already had
// a connected Result node corresponding to the second
// output of the original model.
std::vector<NodeSPtr> maybe_results;
auto readers = output_desc.get_target_inputs();
// This is possible when some of the layer's outputs are not used in the model.
if (readers.empty()) {
@@ -714,19 +731,31 @@
// at the npuw::CompiledModel level)
auto reader_node_ptr = r.get_node()->shared_from_this();
if (ov::op::util::is_output(reader_node_ptr)) {
maybe_result = std::move(reader_node_ptr);
maybe_results.push_back(std::move(reader_node_ptr));
} else if (group_nodes.find(reader_node_ptr) == group_nodes.end()) {
has_external_readers = true;
}
}
if (maybe_result) {
if (!maybe_results.empty()) {
// This layer's output was connected to Result already.
// It happens when this layer is the original model's output
// Keep it to make the ugly top-level I/O matching procedure work.
// FIXME: This needs to be refactored
group.sg._results.push_back(ov::as_type_ptr<ov::op::v0::Result>(maybe_result));
result_cache[output_desc] =
LinkPtrFrom{this_group_idx, ov::as_type_ptr<ov::op::v0::Result>(maybe_result)};

// Sanity check: if a layer output connects to multiple Result nodes,
// then all of them must share the same shape.
if (maybe_results.size() > 1) {
const auto shape = (*maybe_results.begin())->get_shape();
for (std::size_t i = 1; i < maybe_results.size(); ++i) {
OPENVINO_ASSERT(shape == maybe_results[i]->get_shape(),
"Multiple results from one output layer should be similar!");
}
}
for (auto&& mr : maybe_results) {
group.sg._results.push_back(ov::as_type_ptr<ov::op::v0::Result>(mr));
result_cache[output_desc].push_back(
LinkPtrFrom{this_group_idx, ov::as_type_ptr<ov::op::v0::Result>(mr)});
}
} else if (has_external_readers) {
// Introduce and record a new Result
// As the graph is processed in the topological order,
@@ -754,7 +783,7 @@
}
}
auto new_result = std::make_shared<ov::op::v0::Result>(result_src);
result_cache[output_desc] = LinkPtrFrom{this_group_idx, new_result};
result_cache[output_desc].push_back(LinkPtrFrom{this_group_idx, new_result});

ov::copy_runtime_info(output_desc.get_node_shared_ptr(), new_result);
group.sg._results.push_back(std::move(new_result));