From 4e8a10eedc6bef21d0b2bc5a786748658eaa47a9 Mon Sep 17 00:00:00 2001
From: Arseniy Obolenskiy <arseniy.obolenskiy@intel.com>
Date: Wed, 4 Feb 2026 11:10:23 +0100
Subject: [PATCH 1/5] [Snippets][CPU] Introduce EliminateInplaceOps pass

---
 .../lowered/pass/eliminate_inplace_ops.hpp    | 64 +++++++++++++++
 .../lowered/pass/eliminate_inplace_ops.cpp    | 80 +++++++++++++++++++
 src/common/snippets/src/op/subgraph.cpp       | 22 ++++-
 .../snippets/aarch64/jit_fill_emitter.cpp     | 18 ++---
 .../snippets/x64/jit_fill_emitter.cpp         | 16 ++--
 5 files changed, 178 insertions(+), 22 deletions(-)
 create mode 100644 src/common/snippets/include/snippets/lowered/pass/eliminate_inplace_ops.hpp
 create mode 100644 src/common/snippets/src/lowered/pass/eliminate_inplace_ops.cpp
diff --git a/src/common/snippets/include/snippets/lowered/pass/eliminate_inplace_ops.hpp b/src/common/snippets/include/snippets/lowered/pass/eliminate_inplace_ops.hpp
new file mode 100644
index 00000000000000..75b9aef23a55c9
--- /dev/null
+++ b/src/common/snippets/include/snippets/lowered/pass/eliminate_inplace_ops.hpp
@@ -0,0 +1,64 @@
+// Copyright (C) 2018-2026 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <functional>
+
+#include "openvino/core/rtti.hpp"
+#include "snippets/lowered/linear_ir.hpp"
+#include "snippets/lowered/pass/pass.hpp"
+
+namespace ov::snippets::lowered::pass {
+
+/**
+ * @interface EliminateInplaceOps
+ * @brief Eliminates operations that are effectively inplace (input == output).
+ *        Currently handles Fill operations where offset equals register capacity,
+ *        which means the operation doesn't actually fill any new data.
+ *        This pass should run after InsertSpecificIterations and before InitRegisters.
+ * @ingroup snippets
+ */
+class EliminateInplaceOps : public Pass {
+public:
+    OPENVINO_RTTI("EliminateInplaceOps", "", Pass);
+
+    /**
+     * @brief Callback type for determining if a Fill operation is inplace.
+     *        Takes offset and element size, returns true if the Fill is inplace.
+     */
+    using IsInplaceFillCallback = std::function<bool(size_t offset, size_t element_size)>;
+
+    /**
+     * @brief Constructor with callback for inplace detection
+     * @param is_inplace_fill_callback Function to determine if a Fill is inplace based on offset and element size
+     */
+    explicit EliminateInplaceOps(IsInplaceFillCallback is_inplace_fill_callback);
+
+    /**
+     * @brief Apply the pass to the Linear IR
+     * @param linear_ir the target Linear IR
+     * @return status of the pass (true if any changes were made)
+     */
+    bool run(LinearIR& linear_ir) override;
+
+private:
+    /**
+     * @brief Check if a Fill operation is inplace using the configured callback
+     * @param fill_expr expression containing Fill operation
+     * @return true if the Fill operation is inplace and can be eliminated
+     */
+    bool is_inplace_fill(const ExpressionPtr& fill_expr) const;
+
+    /**
+     * @brief Remove inplace Fill operation from the linear IR
+     * @param linear_ir the target Linear IR
+     * @param fill_expr expression containing inplace Fill operation
+     */
+    static void eliminate_fill(LinearIR& linear_ir, const ExpressionPtr& fill_expr);
+
+    IsInplaceFillCallback m_is_inplace_fill_callback;
+};
+
+}  // namespace ov::snippets::lowered::pass
diff --git a/src/common/snippets/src/lowered/pass/eliminate_inplace_ops.cpp b/src/common/snippets/src/lowered/pass/eliminate_inplace_ops.cpp
new file mode 100644
index 00000000000000..bd5639b2a8ce69
--- /dev/null
+++ b/src/common/snippets/src/lowered/pass/eliminate_inplace_ops.cpp
@@ -0,0 +1,80 @@
+// Copyright (C) 2018-2026 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "snippets/lowered/pass/eliminate_inplace_ops.hpp"
+
+#include <utility>
+
+#include "openvino/core/except.hpp"
+#include "openvino/core/type.hpp"
+#include "snippets/itt.hpp"
+#include "snippets/lowered/expression.hpp"
+#include "snippets/lowered/expression_port.hpp"
+#include "snippets/lowered/linear_ir.hpp"
+#include "snippets/op/fill.hpp"
+
+namespace ov::snippets::lowered::pass {
+
+EliminateInplaceOps::EliminateInplaceOps(IsInplaceFillCallback is_inplace_fill_callback)
+    : m_is_inplace_fill_callback(std::move(is_inplace_fill_callback)) {
+    OPENVINO_ASSERT(m_is_inplace_fill_callback, "Callback for inplace Fill detection must be provided");
+}
+
+bool EliminateInplaceOps::is_inplace_fill(const ExpressionPtr& fill_expr) const {
+    const auto fill = ov::as_type_ptr<snippets::op::Fill>(fill_expr->get_node());
+    if (!fill) {
+        return false;
+    }
+
+    const auto offset = fill->get_offset();
+    const auto element_size = fill->get_output_element_type(0).size();
+
+    return m_is_inplace_fill_callback(offset, element_size);
+}
+
+void EliminateInplaceOps::eliminate_fill(LinearIR& linear_ir, const ExpressionPtr& fill_expr) {
+    // Inplace Fill has one input and one output
+    // We need to redirect all consumers of the Fill's output to use the Fill's input instead
+
+    OPENVINO_ASSERT(fill_expr->get_input_count() == 1, "Fill should have exactly one input");
+    OPENVINO_ASSERT(fill_expr->get_output_count() == 1, "Fill should have exactly one output");
+
+    const auto& fill_input_connector = fill_expr->get_input_port_connector(0);
+    const auto& fill_output_connector = fill_expr->get_output_port_connector(0);
+
+    // Get all consumers of this Fill operation
+    const auto consumers = fill_output_connector->get_consumers();
+
+    // Redirect all consumers to use the input of Fill directly
+    lowered::replace_input_port_connectors(consumers, fill_input_connector);
+
+    // Remove Fill from the linear IR
+    linear_ir.erase(linear_ir.find(fill_expr));
+}
+
+bool EliminateInplaceOps::run(LinearIR& linear_ir) {
+    OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::EliminateInplaceOps");
+
+    bool modified = false;
+
+    // Iterate through all expressions and eliminate inplace Fill operations
+    // We iterate from the end to avoid iterator invalidation issues when erasing
+    auto expr_it = linear_ir.begin();
+    while (expr_it != linear_ir.end()) {
+        const auto& expr = *expr_it;
+
+        if (is_inplace_fill(expr)) {
+            eliminate_fill(linear_ir, expr);
+            modified = true;
+            // After erasing, we need to reset the iterator
+            expr_it = linear_ir.begin();
+        } else {
+            ++expr_it;
+        }
+    }
+
+    return modified;
+}
+
+}  // namespace ov::snippets::lowered::pass
diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp
index 1c0fb0386e5da6..cd89639b8520e4 100644
--- a/src/common/snippets/src/op/subgraph.cpp
+++ b/src/common/snippets/src/op/subgraph.cpp
@@ -48,6 +48,7 @@
 #include "snippets/lowered/pass/allocate_buffers.hpp"
 #include "snippets/lowered/pass/clean_repeated_ptr_shifts.hpp"
 #include "snippets/lowered/pass/cleanup_loop_offsets.hpp"
+#include "snippets/lowered/pass/eliminate_inplace_ops.hpp"
 #include "snippets/lowered/pass/extract_loop_invariants.hpp"
 #include "snippets/lowered/pass/fuse_loops.hpp"
 #include "snippets/lowered/pass/init_loops.hpp"
@@ -571,12 +572,29 @@ void Subgraph::control_flow_transformations(
     //    1. AssignRegisters must be called after InsertSpecificIterations since specific loops maybe have
     //       different expressions and connections each other. AssignRegisters should be performed on the expanded
     //       loops.
-    //    2. CleanupLoopOffsets must be called after InsertSpecificIterations to avoid violating the proportionality of
+    //    2. EliminateInplaceOps must be called after InsertSpecificIterations to eliminate inplace Fill operations
+    //       that appear with offset == register_capacity after loop decomposition. This resolves ticket 126270.
+    //    3. CleanupLoopOffsets must be called after InsertSpecificIterations to avoid violating the proportionality of
     //    the pointer increments
     //       (this might happen if tail loop and main loop have different increments)
-    //    3. OptimizeLoopSingleEvaluation must be called after CleanupLoopOffsets
+    //    4. OptimizeLoopSingleEvaluation must be called after CleanupLoopOffsets
     //       since CleanupLoopOffsets can't handle loops with evaluate_once = true
     gen_pipeline.register_pass<lowered::pass::InsertSpecificIterations>();
+    // Callback to determine if Fill operation is inplace based on actual register capacity
+    // get_lanes() returns the number of float32 (4-byte) elements that fit in a vector register
+    const size_t lanes_for_float32 = get_generator()->get_target_machine()->get_lanes();
+    auto is_inplace_fill_callback = [lanes_for_float32](size_t offset, size_t element_size) -> bool {
+        // When offset is 0, Fill fills the entire register (not inplace)
+        if (offset == 0) {
+            return false;
+        }
+        // Calculate register capacity for the given element size
+        // Scale from float32 capacity: capacity(T) = capacity(float32) * sizeof(float32) / sizeof(T)
+        const size_t register_capacity = (lanes_for_float32 * sizeof(float)) / element_size;
+        // Fill is inplace when offset equals the register capacity
+        return offset == register_capacity;
+    };
+    gen_pipeline.register_pass<lowered::pass::EliminateInplaceOps>(is_inplace_fill_callback);
     gen_pipeline.register_pass<lowered::pass::InitRegisters>(get_generator(), lowered_pass_config);
     gen_pipeline.register_pass<lowered::pass::NormalizeLoopIDs>();
     gen_pipeline.register_pass<lowered::pass::ValidateExpandedLoops>();
diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_fill_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_fill_emitter.cpp
index e299636a6c4981..d4936e2600a8b0 100644
--- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_fill_emitter.cpp
+++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_fill_emitter.cpp
@@ -14,6 +14,7 @@
 
 #include "emitters/plugin/aarch64/jit_emitter.hpp"
 #include "emitters/utils.hpp"
+#include "openvino/core/except.hpp"
 #include "openvino/core/type.hpp"
 #include "openvino/core/type/element_type.hpp"
 #include "snippets/lowered/expression.hpp"
@@ -63,17 +64,12 @@ void jit_fill_emitter::emit_impl(const std::vector<size_t>& in, const std::vecto
 template <cpu_isa_t isa>
 void jit_fill_emitter::emit_isa(const std::vector<size_t>& in, const std::vector<size_t>& out) const {
     const size_t supported_et_size = dnnl::impl::cpu::aarch64::cpu_isa_traits<isa>::vlen / exec_prc_.size();
-    if (offset == supported_et_size) {
-        // WA: since AssignRegisters doesn't support inplace logic, Fill ops with offset = register_capacity can't be
-        // removed from the LIR
-        // TODO: when inplace is supported, remove such Fill ops from the LIR and remove this logic.
-        // Ticket: 126270
-        auto src = in[0];
-        auto dst = out[0];
-        if (src != dst) {
-            h->mov(Xbyak_aarch64::VReg16B(dst), Xbyak_aarch64::VReg16B(src));
-        }
-    } else if (is_full_reg()) {
+    OPENVINO_ASSERT(offset <= supported_et_size,
+                    "Fill emitter offset ",
+                    offset,
+                    " exceeds register capacity ",
+                    supported_et_size);
+    if (is_full_reg()) {
         fill_full<isa>(out);
     } else {
         fill_tail<isa>(in, out);
diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_fill_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_fill_emitter.cpp
index 2013cc0ca770da..1114fd8a54e7c0 100644
--- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_fill_emitter.cpp
+++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_fill_emitter.cpp
@@ -15,6 +15,7 @@
 
 #include "emitters/plugin/x64/jit_emitter.hpp"
 #include "emitters/utils.hpp"
+#include "openvino/core/except.hpp"
 #include "openvino/core/type.hpp"
 #include "openvino/core/type/element_type.hpp"
 #include "snippets/lowered/expression.hpp"
@@ -79,15 +80,12 @@ void jit_fill_emitter::emit_isa(const std::vector<size_t>& in, const std::vector
 
     const size_t supported_et_size = 4;
     const auto register_capacity = (src_vmm.getBit() / 8) / supported_et_size;
-    if (offset == register_capacity) {
-        // WA: since AssignRegisters doesn't support inplace logic, Fill ops with offset = register_capacity can't be
-        // removed from the LIR
-        // TODO: when inplace is supported, remove such Fill ops from the LIR and remove this logic.
-        // Ticket: 126270
-        if (src_vmm.getIdx() != dst_vmm.getIdx()) {
-            h->uni_vmovups(dst_vmm, src_vmm);
-        }
-    } else if (is_full_reg()) {
+    OPENVINO_ASSERT(offset <= register_capacity,
+                    "Fill emitter offset ",
+                    offset,
+                    " exceeds register capacity ",
+                    register_capacity);
+    if (is_full_reg()) {
         fill_full<Vmm>(dst_vmm);
     } else {
         fill_tail<Vmm>(src_vmm, dst_vmm);

From 3aaed2cee4469fb60f71dbc523254034d1ae2a30 Mon Sep 17 00:00:00 2001
From: Arseniy Obolenskiy <arseniy.obolenskiy@intel.com>
Date: Wed, 4 Feb 2026 11:16:30 +0100
Subject: [PATCH 2/5] Fix assert condition

---
 .../src/emitters/snippets/aarch64/jit_fill_emitter.cpp          | 2 +-
 .../intel_cpu/src/emitters/snippets/x64/jit_fill_emitter.cpp    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_fill_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_fill_emitter.cpp
index d4936e2600a8b0..e40c78f416c437 100644
--- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_fill_emitter.cpp
+++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_fill_emitter.cpp
@@ -64,7 +64,7 @@ void jit_fill_emitter::emit_impl(const std::vector<size_t>& in, const std::vecto
 template <cpu_isa_t isa>
 void jit_fill_emitter::emit_isa(const std::vector<size_t>& in, const std::vector<size_t>& out) const {
     const size_t supported_et_size = dnnl::impl::cpu::aarch64::cpu_isa_traits<isa>::vlen / exec_prc_.size();
-    OPENVINO_ASSERT(offset <= supported_et_size,
+    OPENVINO_ASSERT(offset < supported_et_size,
                     "Fill emitter offset ",
                     offset,
                     " exceeds register capacity ",
diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_fill_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_fill_emitter.cpp
index 1114fd8a54e7c0..2c323d0275347c 100644
--- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_fill_emitter.cpp
+++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_fill_emitter.cpp
@@ -80,7 +80,7 @@ void jit_fill_emitter::emit_isa(const std::vector<size_t>& in, const std::vector
 
     const size_t supported_et_size = 4;
     const auto register_capacity = (src_vmm.getBit() / 8) / supported_et_size;
-    OPENVINO_ASSERT(offset <= register_capacity,
+    OPENVINO_ASSERT(offset < register_capacity,
                     "Fill emitter offset ",
                     offset,
                     " exceeds register capacity ",

From 66ed10db620cad3f1462f6fd7c44db509297c58e Mon Sep 17 00:00:00 2001
From: Arseniy Obolenskiy <arseniy.obolenskiy@intel.com>
Date: Mon, 9 Feb 2026 16:17:52 +0100
Subject: [PATCH 3/5] Replace EliminateInplaceOps with InsertTailFill in ReduceDecomposition

---
 .../src/lowered/pass/reduce_decomposition.cpp | 145 +++++++++++++++---
 src/common/snippets/src/op/subgraph.cpp       |  19 +--
 .../snippets/reduce.cpp                       |   3 +
 3 files changed, 129 insertions(+), 38 deletions(-)

diff --git a/src/common/snippets/src/lowered/pass/reduce_decomposition.cpp b/src/common/snippets/src/lowered/pass/reduce_decomposition.cpp
index 03e23b6cc1b794..bb95aa6d3e7b0c 100644
--- a/src/common/snippets/src/lowered/pass/reduce_decomposition.cpp
+++ b/src/common/snippets/src/lowered/pass/reduce_decomposition.cpp
@@ -4,6 +4,7 @@
 
 #include "snippets/lowered/pass/reduce_decomposition.hpp"
 
+#include <algorithm>
 #include <cstddef>
 #include <cstdint>
 #include <map>
@@ -23,12 +24,12 @@
 #include "snippets/lowered/loop_info.hpp"
 #include "snippets/lowered/loop_manager.hpp"
 #include "snippets/lowered/loop_port.hpp"
-#include "snippets/lowered/pass/iter_handler.hpp"
 #include "snippets/lowered/pass/pass.hpp"
 #include "snippets/lowered/specific_loop_iter_types.hpp"
 #include "snippets/op/fill.hpp"
 #include "snippets/op/horizon_max.hpp"
 #include "snippets/op/horizon_sum.hpp"
+#include "snippets/op/loop.hpp"
 #include "snippets/op/memory_access.hpp"
 #include "snippets/op/reduce.hpp"
 #include "snippets/op/vector_buffer.hpp"
@@ -36,20 +37,107 @@
 
 namespace ov::snippets::lowered::pass {
 
+namespace {
+uint32_t get_initial_value(const ov::DiscreteTypeInfo& type_info) {
+    static const std::map<ov::DiscreteTypeInfo, uint32_t> reduce_initial_values{
+        {op::ReduceMax::get_type_info_static(), static_cast<uint32_t>(0xff7fffff)},
+        {op::ReduceSum::get_type_info_static(), static_cast<uint32_t>(0x00000000)},
+    };
+    OPENVINO_ASSERT(reduce_initial_values.count(type_info), "Unexpected ReduceType");
+    return reduce_initial_values.at(type_info);
+}
+
+uint32_t get_fill_value_for_accumulation(const std::shared_ptr<ov::Node>& accumulation) {
+    if (ov::is_type<ov::op::v1::Maximum>(accumulation)) {
+        return get_initial_value(op::ReduceMax::get_type_info_static());
+    }
+    if (ov::is_type<ov::op::v1::Add>(accumulation)) {
+        return get_initial_value(op::ReduceSum::get_type_info_static());
+    }
+    OPENVINO_THROW("InsertTailFill supports only Maximum/Add accumulation but got: ", accumulation->get_type_info());
+}
+}  // namespace
+
+class InsertTailFill : public RangedPass {
+public:
+    explicit InsertTailFill(size_t offset) : RangedPass(), m_offset(offset) {}
+    OPENVINO_RTTI("InsertTailFill", "", RangedPass);
+
+    bool run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) override {
+        OPENVINO_ASSERT(begin != end, "InsertTailFill expects non-empty range.");
+        const auto& loop_end = ov::as_type_ptr<op::LoopEnd>(end->get()->get_node());
+        OPENVINO_ASSERT(loop_end, "InsertTailFill expected LoopEnd node in iterator `end`.");
+        const auto& loop_info = linear_ir.get_loop_manager()->get_loop_info<ExpandedLoopInfo>(loop_end->get_id());
+        const auto& output_ports = loop_info->get_output_ports();
+        const auto accumulation_output_it =
+            std::find_if(output_ports.begin(), output_ports.end(), [](const LoopPort& output_loop_port) {
+                const auto& output_expr = output_loop_port.get_expr_port()->get_expr();
+                const auto& output_node = output_expr->get_node();
+                return ov::is_type_any_of<ov::op::v1::Maximum, ov::op::v1::Add>(output_node);
+            });
+        OPENVINO_ASSERT(accumulation_output_it != output_ports.end(),
+                        "InsertTailFill failed to find accumulation output port.");
+        const auto& accumulation_expr = accumulation_output_it->get_expr_port()->get_expr();
+        OPENVINO_ASSERT(accumulation_expr, "InsertTailFill failed to get accumulation expression.");
+
+        size_t recurrent_input_port_idx = utils::get_dynamic_value<size_t>();
+        for (const auto& input_loop_port : loop_info->get_input_ports()) {
+            const auto& input_port = input_loop_port.get_expr_port();
+            if (input_port->get_type() == ExpressionPort::Input && input_port->get_expr() == accumulation_expr) {
+                recurrent_input_port_idx = input_port->get_index();
+                break;
+            }
+        }
+        OPENVINO_ASSERT(!utils::is_dynamic_value(recurrent_input_port_idx),
+                        "InsertTailFill failed to find recurrent accumulation input port.");
+
+        size_t data_input_port_idx = utils::get_dynamic_value<size_t>();
+        for (size_t i = 0; i < accumulation_expr->get_input_count(); ++i) {
+            if (i != recurrent_input_port_idx) {
+                data_input_port_idx = i;
+                break;
+            }
+        }
+        OPENVINO_ASSERT(!utils::is_dynamic_value(data_input_port_idx),
+                        "InsertTailFill failed to find data accumulation input port.");
+
+        const auto accumulation_input_port = accumulation_expr->get_input_port(data_input_port_idx);
+        const auto accumulation_it = linear_ir.find(begin, end, accumulation_expr);
+
+        const auto source = accumulation_expr->get_input_port_connector(data_input_port_idx)->get_source();
+        const auto source_output = source.get_expr()->get_node()->output(source.get_index());
+        const auto fill_value = get_fill_value_for_accumulation(accumulation_expr->get_node());
+        const auto fill = linear_ir.insert_node<op::Fill>(accumulation_it, source_output, m_offset, fill_value);
+
+        fill.first->get()->set_loop_ids(accumulation_expr->get_loop_ids());
+        replace_input_port_connectors({accumulation_input_port}, fill.first->get()->get_output_port_connector(0));
+        linear_ir.get_loop_manager()->update_loop_ports(*fill.first);
+        accumulation_expr->updateShapes();
+
+        return true;
+    }
+
+    std::shared_ptr<PassBase> merge(const std::shared_ptr<PassBase>& other) override {
+        if (!other) {
+            return shared_from_this();
+        }
+        const auto casted_pass = ov::as_type_ptr<InsertTailFill>(other);
+        size_t merged_offset = 0;
+        if (!casted_pass || !ov::snippets::utils::merge_dynamic_dim(merged_offset, m_offset, casted_pass->m_offset)) {
+            return nullptr;
+        }
+        return std::make_shared<InsertTailFill>(merged_offset);
+    }
+
+private:
+    size_t m_offset = 0;
+};
+
 ReduceDecomposition::ReduceDecomposition(size_t vector_size) : RangedPass(), m_vector_size{vector_size} {}
 
 bool ReduceDecomposition::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) {
     OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::ReduceMaxDecompositionLowered")
 
-    auto get_initial_value = [](const ov::DiscreteTypeInfo& type_info) {
-        static const std::map<ov::DiscreteTypeInfo, uint32_t> reduce_initial_values{
-            {op::ReduceMax::get_type_info_static(), static_cast<uint32_t>(0xff7fffff)},
-            {op::ReduceSum::get_type_info_static(), static_cast<uint32_t>(0x00000000)},
-        };
-        OPENVINO_ASSERT(reduce_initial_values.count(type_info), "Unexpected ReduceType");
-        return reduce_initial_values.at(type_info);
-    };
-
     auto insert_accumulation_node =
         [&linear_ir](
             const LinearIR::constExprIt& expr_it,
@@ -98,33 +186,48 @@ bool ReduceDecomposition::run(LinearIR& linear_ir, LinearIR::constExprIt begin,
 
         // Float constant values in byte representation
         const auto fill_value = get_initial_value(reduce_type_info);
+        const auto is_single_iteration = !utils::is_dynamic_value(work_amount) && work_amount == increment;
+        const auto tail_size = utils::is_dynamic_value(work_amount) ? 1LU : work_amount % increment;
+        const bool insert_fill_in_loop = is_single_iteration;
+        const bool insert_fill_in_last_iter = !is_single_iteration && tail_size != 0;
         // Note: VectorBuffer is a special case, since it should go before the initial Load.
         // The buffer must be initialized with fill_value before reduction
         const auto vector_buffer = linear_ir.insert_node<op::VectorBuffer>(expr_it);
         const auto initial_fill = linear_ir.insert_node<op::Fill>(expr_it, vector_buffer.second, 0, fill_value);
 
-        // Reduce loop
-        const auto fill =
-            linear_ir.insert_node<op::Fill>(expr_it, reduce->get_input_source_output(0), increment, fill_value);
-        const auto accumulation = insert_accumulation_node(expr_it, fill.second, initial_fill.second, reduce_type_info);
+        ov::Output<ov::Node> accumulation_input = reduce->get_input_source_output(0);
+        LinearIR::constExprIt reduce_loop_begin = expr_it;
+        ExpressionPort reduce_loop_input_port;
+        if (insert_fill_in_loop) {
+            const auto fill = linear_ir.insert_node<op::Fill>(expr_it, accumulation_input, increment, fill_value);
+            accumulation_input = fill.second;
+            reduce_loop_begin = fill.first;
+            reduce_loop_input_port = (*fill.first)->get_input_port(0);
+        }
+
+        const auto accumulation =
+            insert_accumulation_node(expr_it, accumulation_input, initial_fill.second, reduce_type_info);
+        if (!insert_fill_in_loop) {
+            reduce_loop_begin = accumulation.first;
+            reduce_loop_input_port = (*accumulation.first)->get_input_port(0);
+        }
 
         const auto reduce_loop_id = loop_manager->mark_loop(
-            fill.first,
+            reduce_loop_begin,
             expr_it,
             work_amount,
             increment,
-            {LoopPort::create<LoopPort::Type::Incremented>((*fill.first)->get_input_port(0), 0),
+            {LoopPort::create<LoopPort::Type::Incremented>(reduce_loop_input_port, 0),
              LoopPort::create<LoopPort::Type::Incremented>((*accumulation.first)->get_input_port(1), 0)},
             {LoopPort::create<LoopPort::Type::Incremented>((*accumulation.first)->get_output_port(0), 0)});
-        const auto tail_size = utils::is_dynamic_value(work_amount) ? 1LU : work_amount % increment;
-        if (tail_size != 0) {
+        if (insert_fill_in_last_iter) {
             const auto loop_info = loop_manager->get_loop_info<UnifiedLoopInfo>(reduce_loop_id);
-            loop_info->register_pass_to_handler<SpecificLoopIterType::LAST_ITER, SetFillOffset>(tail_size);
+            loop_info->register_pass_to_handler<SpecificLoopIterType::LAST_ITER, InsertTailFill>(tail_size);
         }
         const auto horizon = insert_horizon_node(expr_it, accumulation.second, reduce_type_info);
 
         // Transfer original ExpressionPorts
-        replace_input_port_connectors({fill.first->get()->get_input_port(0)}, reduce_expr->get_input_port_connector(0));
+        replace_input_port_connectors({reduce_loop_input_port}, reduce_expr->get_input_port_connector(0));
         const auto reduce_consumers = reduce_expr->get_output_port_connector(0)->get_consumers();
         replace_input_port_connectors(reduce_consumers, horizon.first->get()->get_output_port_connector(0));
 
@@ -134,7 +237,7 @@ bool ReduceDecomposition::run(LinearIR& linear_ir, LinearIR::constExprIt begin,
         }
 
         // Update Loop info for outer loops
-        const std::vector<ExpressionPort> input_ports{(*fill.first)->get_input_port(0)};
+        const std::vector<ExpressionPort> input_ports{reduce_loop_input_port};
         const std::vector<ExpressionPort> output_ports{(*horizon.first)->get_output_port(0)};
         for (auto loop_id : reduce_expr->get_loop_ids()) {
             loop_manager
diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp
index cd89639b8520e4..115f1bd814c9fc 100644
--- a/src/common/snippets/src/op/subgraph.cpp
+++ b/src/common/snippets/src/op/subgraph.cpp
@@ -572,29 +572,14 @@ void Subgraph::control_flow_transformations(
     //    1. AssignRegisters must be called after InsertSpecificIterations since specific loops maybe have
     //       different expressions and connections each other. AssignRegisters should be performed on the expanded
     //       loops.
-    //    2. EliminateInplaceOps must be called after InsertSpecificIterations to eliminate inplace Fill operations
-    //       that appear with offset == register_capacity after loop decomposition. This resolves ticket 126270.
-    //    3. CleanupLoopOffsets must be called after InsertSpecificIterations to avoid violating the proportionality of
+    //    2. CleanupLoopOffsets must be called after InsertSpecificIterations to avoid violating the proportionality of
     //    the pointer increments
     //       (this might happen if tail loop and main loop have different increments)
-    //    4. OptimizeLoopSingleEvaluation must be called after CleanupLoopOffsets
+    //    3. OptimizeLoopSingleEvaluation must be called after CleanupLoopOffsets
     //       since CleanupLoopOffsets can't handle loops with evaluate_once = true
     gen_pipeline.register_pass<lowered::pass::InsertSpecificIterations>();
     // Callback to determine if Fill operation is inplace based on actual register capacity
     // get_lanes() returns the number of float32 (4-byte) elements that fit in a vector register
-    const size_t lanes_for_float32 = get_generator()->get_target_machine()->get_lanes();
-    auto is_inplace_fill_callback = [lanes_for_float32](size_t offset, size_t element_size) -> bool {
-        // When offset is 0, Fill fills the entire register (not inplace)
-        if (offset == 0) {
-            return false;
-        }
-        // Calculate register capacity for the given element size
-        // Scale from float32 capacity: capacity(T) = capacity(float32) * sizeof(float32) / sizeof(T)
-        const size_t register_capacity = (lanes_for_float32 * sizeof(float)) / element_size;
-        // Fill is inplace when offset equals the register capacity
-        return offset == register_capacity;
-    };
-    gen_pipeline.register_pass<lowered::pass::EliminateInplaceOps>(is_inplace_fill_callback);
     gen_pipeline.register_pass<lowered::pass::InitRegisters>(get_generator(), lowered_pass_config);
     gen_pipeline.register_pass<lowered::pass::NormalizeLoopIDs>();
     gen_pipeline.register_pass<lowered::pass::ValidateExpandedLoops>();
diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/reduce.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/reduce.cpp
index 12b58ebeeb1937..3b2926e2356bee 100644
--- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/reduce.cpp
+++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/reduce.cpp
@@ -13,6 +13,9 @@ namespace snippets {
 namespace {
 
 const std::vector<InputShape> inputShape = {
+    {{}, {{1, 3, 128, 1}}},
+    {{}, {{1, 3, 128, 7}}},
+    {{}, {{1, 3, 128, 9}}},
     {{}, {{1, 3, 128, 128}}},
     {{}, {{1, 3, 128, 15}}},
     {{}, {{1, 3, 15, 16}}},

From a03abcdb8910ffe7f55bbdeccdce5c49f12571fd Mon Sep 17 00:00:00 2001
From: Arseniy Obolenskiy <arseniy.obolenskiy@intel.com>
Date: Mon, 9 Feb 2026 19:09:14 +0100
Subject: [PATCH 4/5] Remove obsolete EliminateInplaceOps pass and apply follow-up fixes

---
 .../lowered/pass/eliminate_inplace_ops.hpp    | 64 ---------------
 .../lowered/pass/eliminate_inplace_ops.cpp    | 80 -------------------
 .../src/lowered/pass/reduce_decomposition.cpp |  9 ++-
 src/common/snippets/src/op/subgraph.cpp       |  1 -
 4 files changed, 5 insertions(+), 149 deletions(-)
 delete mode 100644 src/common/snippets/include/snippets/lowered/pass/eliminate_inplace_ops.hpp
 delete mode 100644 src/common/snippets/src/lowered/pass/eliminate_inplace_ops.cpp

diff --git a/src/common/snippets/include/snippets/lowered/pass/eliminate_inplace_ops.hpp b/src/common/snippets/include/snippets/lowered/pass/eliminate_inplace_ops.hpp
deleted file mode 100644
index 75b9aef23a55c9..00000000000000
--- a/src/common/snippets/include/snippets/lowered/pass/eliminate_inplace_ops.hpp
+++ /dev/null
@@ -1,64 +0,0 @@
-// Copyright (C) 2018-2026 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma once
-
-#include <functional>
-
-#include "openvino/core/rtti.hpp"
-#include "snippets/lowered/linear_ir.hpp"
-#include "snippets/lowered/pass/pass.hpp"
-
-namespace ov::snippets::lowered::pass {
-
-/**
- * @interface EliminateInplaceOps
- * @brief Eliminates operations that are effectively inplace (input == output).
- *        Currently handles Fill operations where offset equals register capacity,
- *        which means the operation doesn't actually fill any new data.
- *        This pass should run after InsertSpecificIterations and before InitRegisters.
- * @ingroup snippets
- */
-class EliminateInplaceOps : public Pass {
-public:
-    OPENVINO_RTTI("EliminateInplaceOps", "", Pass);
-
-    /**
-     * @brief Callback type for determining if a Fill operation is inplace.
-     *        Takes offset and element size, returns true if the Fill is inplace.
-     */
-    using IsInplaceFillCallback = std::function<bool(size_t offset, size_t element_size)>;
-
-    /**
-     * @brief Constructor with callback for inplace detection
-     * @param is_inplace_fill_callback Function to determine if a Fill is inplace based on offset and element size
-     */
-    explicit EliminateInplaceOps(IsInplaceFillCallback is_inplace_fill_callback);
-
-    /**
-     * @brief Apply the pass to the Linear IR
-     * @param linear_ir the target Linear IR
-     * @return status of the pass (true if any changes were made)
-     */
-    bool run(LinearIR& linear_ir) override;
-
-private:
-    /**
-     * @brief Check if a Fill operation is inplace using the configured callback
-     * @param fill_expr expression containing Fill operation
-     * @return true if the Fill operation is inplace and can be eliminated
-     */
-    bool is_inplace_fill(const ExpressionPtr& fill_expr) const;
-
-    /**
-     * @brief Remove inplace Fill operation from the linear IR
-     * @param linear_ir the target Linear IR
-     * @param fill_expr expression containing inplace Fill operation
-     */
-    static void eliminate_fill(LinearIR& linear_ir, const ExpressionPtr& fill_expr);
-
-    IsInplaceFillCallback m_is_inplace_fill_callback;
-};
-
-}  // namespace ov::snippets::lowered::pass
diff --git a/src/common/snippets/src/lowered/pass/eliminate_inplace_ops.cpp b/src/common/snippets/src/lowered/pass/eliminate_inplace_ops.cpp
deleted file mode 100644
index bd5639b2a8ce69..00000000000000
--- a/src/common/snippets/src/lowered/pass/eliminate_inplace_ops.cpp
+++ /dev/null
@@ -1,80 +0,0 @@
-// Copyright (C) 2018-2026 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#include "snippets/lowered/pass/eliminate_inplace_ops.hpp"
-
-#include <utility>
-
-#include "openvino/core/except.hpp"
-#include "openvino/core/type.hpp"
-#include "snippets/itt.hpp"
-#include "snippets/lowered/expression.hpp"
-#include "snippets/lowered/expression_port.hpp"
-#include "snippets/lowered/linear_ir.hpp"
-#include "snippets/op/fill.hpp"
-
-namespace ov::snippets::lowered::pass {
-
-EliminateInplaceOps::EliminateInplaceOps(IsInplaceFillCallback is_inplace_fill_callback)
-    : m_is_inplace_fill_callback(std::move(is_inplace_fill_callback)) {
-    OPENVINO_ASSERT(m_is_inplace_fill_callback, "Callback for inplace Fill detection must be provided");
-}
-
-bool EliminateInplaceOps::is_inplace_fill(const ExpressionPtr& fill_expr) const {
-    const auto fill = ov::as_type_ptr<snippets::op::Fill>(fill_expr->get_node());
-    if (!fill) {
-        return false;
-    }
-
-    const auto offset = fill->get_offset();
-    const auto element_size = fill->get_output_element_type(0).size();
-
-    return m_is_inplace_fill_callback(offset, element_size);
-}
-
-void EliminateInplaceOps::eliminate_fill(LinearIR& linear_ir, const ExpressionPtr& fill_expr) {
-    // Inplace Fill has one input and one output
-    // We need to redirect all consumers of the Fill's output to use the Fill's input instead
-
-    OPENVINO_ASSERT(fill_expr->get_input_count() == 1, "Fill should have exactly one input");
-    OPENVINO_ASSERT(fill_expr->get_output_count() == 1, "Fill should have exactly one output");
-
-    const auto& fill_input_connector = fill_expr->get_input_port_connector(0);
-    const auto& fill_output_connector = fill_expr->get_output_port_connector(0);
-
-    // Get all consumers of this Fill operation
-    const auto consumers = fill_output_connector->get_consumers();
-
-    // Redirect all consumers to use the input of Fill directly
-    lowered::replace_input_port_connectors(consumers, fill_input_connector);
-
-    // Remove Fill from the linear IR
-    linear_ir.erase(linear_ir.find(fill_expr));
-}
-
-bool EliminateInplaceOps::run(LinearIR& linear_ir) {
-    OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::EliminateInplaceOps");
-
-    bool modified = false;
-
-    // Iterate through all expressions and eliminate inplace Fill operations
-    // We iterate from the end to avoid iterator invalidation issues when erasing
-    auto expr_it = linear_ir.begin();
-    while (expr_it != linear_ir.end()) {
-        const auto& expr = *expr_it;
-
-        if (is_inplace_fill(expr)) {
-            eliminate_fill(linear_ir, expr);
-            modified = true;
-            // After erasing, we need to reset the iterator
-            expr_it = linear_ir.begin();
-        } else {
-            ++expr_it;
-        }
-    }
-
-    return modified;
-}
-
-}  // namespace ov::snippets::lowered::pass
diff --git a/src/common/snippets/src/lowered/pass/reduce_decomposition.cpp b/src/common/snippets/src/lowered/pass/reduce_decomposition.cpp
index bb95aa6d3e7b0c..9aeb5ed2438aaf 100644
--- a/src/common/snippets/src/lowered/pass/reduce_decomposition.cpp
+++ b/src/common/snippets/src/lowered/pass/reduce_decomposition.cpp
@@ -15,6 +15,7 @@
 #include "openvino/core/except.hpp"
 #include "openvino/core/node.hpp"
 #include "openvino/core/node_output.hpp"
+#include "openvino/core/rtti.hpp"
 #include "openvino/core/type.hpp"
 #include "openvino/op/add.hpp"
 #include "openvino/op/maximum.hpp"
@@ -80,7 +81,7 @@ class InsertTailFill : public RangedPass {
         const auto& accumulation_expr = accumulation_output_it->get_expr_port()->get_expr();
         OPENVINO_ASSERT(accumulation_expr, "InsertTailFill failed to get accumulation expression.");
 
-        size_t recurrent_input_port_idx = utils::get_dynamic_value<size_t>();
+        auto recurrent_input_port_idx = utils::get_dynamic_value<size_t>();
         for (const auto& input_loop_port : loop_info->get_input_ports()) {
             const auto& input_port = input_loop_port.get_expr_port();
             if (input_port->get_type() == ExpressionPort::Input && input_port->get_expr() == accumulation_expr) {
@@ -91,7 +92,7 @@ class InsertTailFill : public RangedPass {
         OPENVINO_ASSERT(!utils::is_dynamic_value(recurrent_input_port_idx),
                         "InsertTailFill failed to find recurrent accumulation input port.");
 
-        size_t data_input_port_idx = utils::get_dynamic_value<size_t>();
+        auto data_input_port_idx = utils::get_dynamic_value<size_t>();
         for (size_t i = 0; i < accumulation_expr->get_input_count(); ++i) {
             if (i != recurrent_input_port_idx) {
                 data_input_port_idx = i;
@@ -188,7 +189,7 @@ bool ReduceDecomposition::run(LinearIR& linear_ir, LinearIR::constExprIt begin,
         const auto fill_value = get_initial_value(reduce_type_info);
         const auto is_single_iteration = !utils::is_dynamic_value(work_amount) && work_amount == increment;
         const auto tail_size = utils::is_dynamic_value(work_amount) ? 1LU : work_amount % increment;
-        const bool insert_fill_in_loop = is_single_iteration;
+        const bool insert_fill_in_loop = is_single_iteration && increment < m_vector_size;
         const bool insert_fill_in_last_iter = !is_single_iteration && tail_size != 0;
         // Note: VectorBuffer is a special case, since it should go before the initial Load.
         // The buffer must be initialized with fill_value before reduction
@@ -196,7 +197,7 @@ bool ReduceDecomposition::run(LinearIR& linear_ir, LinearIR::constExprIt begin,
         const auto initial_fill = linear_ir.insert_node<op::Fill>(expr_it, vector_buffer.second, 0, fill_value);
 
         ov::Output<ov::Node> accumulation_input = reduce->get_input_source_output(0);
-        LinearIR::constExprIt reduce_loop_begin = expr_it;
+        auto reduce_loop_begin = expr_it;
         ExpressionPort reduce_loop_input_port;
         if (insert_fill_in_loop) {
             const auto fill = linear_ir.insert_node<op::Fill>(expr_it, accumulation_input, increment, fill_value);
diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp
index 115f1bd814c9fc..d437fe6bdbcd8d 100644
--- a/src/common/snippets/src/op/subgraph.cpp
+++ b/src/common/snippets/src/op/subgraph.cpp
@@ -48,7 +48,6 @@
 #include "snippets/lowered/pass/allocate_buffers.hpp"
 #include "snippets/lowered/pass/clean_repeated_ptr_shifts.hpp"
 #include "snippets/lowered/pass/cleanup_loop_offsets.hpp"
-#include "snippets/lowered/pass/eliminate_inplace_ops.hpp"
 #include "snippets/lowered/pass/extract_loop_invariants.hpp"
 #include "snippets/lowered/pass/fuse_loops.hpp"
 #include "snippets/lowered/pass/init_loops.hpp"

From 544e5432e0a446ac19adb3dd7684b840d8dcb414 Mon Sep 17 00:00:00 2001
From: Arseniy Obolenskiy <arseniy.obolenskiy@intel.com>
Date: Tue, 10 Feb 2026 19:48:39 +0100
Subject: [PATCH 5/5] Address review comments: remove SetFillOffset, find data input via Fill(VectorBuffer)

---
 .../snippets/lowered/pass/iter_handler.hpp    | 17 ----
 .../src/lowered/pass/iter_handler.cpp         | 25 ------
 .../src/lowered/pass/reduce_decomposition.cpp | 82 +++++++++++--------
 src/common/snippets/src/op/subgraph.cpp       |  2 -
 4 files changed, 49 insertions(+), 77 deletions(-)

diff --git a/src/common/snippets/include/snippets/lowered/pass/iter_handler.hpp b/src/common/snippets/include/snippets/lowered/pass/iter_handler.hpp
index 450edd324dc1f0..ffdd45376d8061 100644
--- a/src/common/snippets/include/snippets/lowered/pass/iter_handler.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/iter_handler.hpp
@@ -31,23 +31,6 @@ class UpdateMemoryAccessCounts : public pass::RangedPass {
     size_t m_count;
 };
 
-/**
- * @interface SetFillOffset
- * @brief The pass changes offset of all Fill ops
- * @param m_offset - offset which must be set
- * @ingroup snippets
- */
-class SetFillOffset : public pass::RangedPass {
-public:
-    explicit SetFillOffset(size_t offset);
-    OPENVINO_RTTI("SetFillOffset", "", RangedPass);
-    bool run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) override;
-    std::shared_ptr<pass::PassBase> merge(const std::shared_ptr<pass::PassBase>& other) override;
-
-private:
-    size_t m_offset;
-};
-
 /**
  * @interface SetLoopIncrementOne
  * @brief The pass set `increment = 1` to ExpandedLoopInfo which is mapped on LoopEnd in the passed iterator `end` and
diff --git a/src/common/snippets/src/lowered/pass/iter_handler.cpp b/src/common/snippets/src/lowered/pass/iter_handler.cpp
index b69aba2fed588d..f59cb1b5123394 100644
--- a/src/common/snippets/src/lowered/pass/iter_handler.cpp
+++ b/src/common/snippets/src/lowered/pass/iter_handler.cpp
@@ -14,7 +14,6 @@
 #include "snippets/lowered/loop_info.hpp"
 #include "snippets/lowered/loop_manager.hpp"
 #include "snippets/lowered/pass/pass.hpp"
-#include "snippets/op/fill.hpp"
 #include "snippets/op/loop.hpp"
 #include "snippets/op/memory_access.hpp"
 #include "snippets/utils/utils.hpp"
@@ -67,30 +66,6 @@ std::shared_ptr<pass::PassBase> UpdateMemoryAccessCounts::merge(const std::share
     return std::make_shared<UpdateMemoryAccessCounts>(merged_count);
 }
 
-SetFillOffset::SetFillOffset(size_t offset) : RangedPass(), m_offset(offset) {}
-
-bool SetFillOffset::run([[maybe_unused]] LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) {
-    for (auto expr_it = begin; expr_it != end; expr_it++) {
-        const auto& node = expr_it->get()->get_node();
-        if (const auto fill = ov::as_type_ptr<ov::snippets::op::Fill>(node)) {
-            fill->set_offset(m_offset);
-        }
-    }
-    return true;
-}
-
-std::shared_ptr<pass::PassBase> SetFillOffset::merge(const std::shared_ptr<pass::PassBase>& other) {
-    if (!other) {
-        return shared_from_this();
-    }
-    const auto casted_pass = ov::as_type_ptr<SetFillOffset>(other);
-    size_t merged_offset = 0;
-    if (!casted_pass || !ov::snippets::utils::merge_dynamic_dim(merged_offset, m_offset, casted_pass->m_offset)) {
-        return nullptr;
-    }
-    return std::make_shared<SetFillOffset>(merged_offset);
-}
-
 bool SetLoopIncrementOne::run(LinearIR& linear_ir,
                               [[maybe_unused]] LinearIR::constExprIt begin,
                               LinearIR::constExprIt end) {
diff --git a/src/common/snippets/src/lowered/pass/reduce_decomposition.cpp b/src/common/snippets/src/lowered/pass/reduce_decomposition.cpp
index 9aeb5ed2438aaf..239c8c8237be94 100644
--- a/src/common/snippets/src/lowered/pass/reduce_decomposition.cpp
+++ b/src/common/snippets/src/lowered/pass/reduce_decomposition.cpp
@@ -9,6 +9,8 @@
 #include <cstdint>
 #include <map>
 #include <memory>
+#include <optional>
+#include <set>
 #include <utility>
 #include <vector>
 
@@ -20,6 +22,7 @@
 #include "openvino/op/add.hpp"
 #include "openvino/op/maximum.hpp"
 #include "snippets/itt.hpp"
+#include "snippets/lowered/expression.hpp"
 #include "snippets/lowered/expression_port.hpp"
 #include "snippets/lowered/linear_ir.hpp"
 #include "snippets/lowered/loop_info.hpp"
@@ -57,6 +60,39 @@ uint32_t get_fill_value_for_accumulation(const std::shared_ptr<ov::Node>& accumu
     }
     OPENVINO_THROW("InsertTailFill supports only Maximum/Add accumulation but got: ", accumulation->get_type_info());
 }
+
+bool is_fill_from_vector_buffer(const ExpressionPtr& expr) {
+    if (!expr || !ov::is_type<op::Fill>(expr->get_node())) {
+        return false;
+    }
+    const auto& parent_expr = expr->get_input_expr_ptr(0);
+    return parent_expr && ov::is_type<op::VectorBuffer>(parent_expr->get_node());
+}
+
+bool is_supported_accumulation(const ExpressionPtr& accumulation_expr) {
+    return accumulation_expr && ov::is_type_any_of<ov::op::v1::Maximum, ov::op::v1::Add>(accumulation_expr->get_node());
+}
+
+std::optional<size_t> find_data_input_port_idx(const ExpressionPtr& accumulation_expr) {
+    if (!accumulation_expr || accumulation_expr->get_input_count() != 2) {
+        return std::nullopt;
+    }
+    const auto input0_is_initial_fill = is_fill_from_vector_buffer(accumulation_expr->get_input_expr_ptr(0));
+    const auto input1_is_initial_fill = is_fill_from_vector_buffer(accumulation_expr->get_input_expr_ptr(1));
+    if (input0_is_initial_fill == input1_is_initial_fill) {
+        return std::nullopt;
+    }
+    return input0_is_initial_fill ? 1 : 0;
+}
+
+size_t get_data_input_port_idx(const ExpressionPtr& accumulation_expr) {
+    OPENVINO_ASSERT(is_supported_accumulation(accumulation_expr),
+                    "InsertTailFill expected Maximum/Add accumulation expression.");
+    const auto data_input_port_idx = find_data_input_port_idx(accumulation_expr);
+    OPENVINO_ASSERT(data_input_port_idx.has_value(),
+                    "InsertTailFill failed to detect unique Fill(VectorBuffer) accumulation input.");
+    return *data_input_port_idx;
+}
 }  // namespace
 
 class InsertTailFill : public RangedPass {
@@ -72,47 +108,27 @@ class InsertTailFill : public RangedPass {
         const auto& output_ports = loop_info->get_output_ports();
         const auto accumulation_output_it =
             std::find_if(output_ports.begin(), output_ports.end(), [](const LoopPort& output_loop_port) {
-                const auto& output_expr = output_loop_port.get_expr_port()->get_expr();
-                const auto& output_node = output_expr->get_node();
-                return ov::is_type_any_of<ov::op::v1::Maximum, ov::op::v1::Add>(output_node);
+                const auto& accumulation_expr = output_loop_port.get_expr_port()->get_expr();
+                return is_supported_accumulation(accumulation_expr) &&
+                       find_data_input_port_idx(accumulation_expr).has_value();
             });
         OPENVINO_ASSERT(accumulation_output_it != output_ports.end(),
-                        "InsertTailFill failed to find accumulation output port.");
+                        "InsertTailFill failed to find accumulation output port with Fill(VectorBuffer) input.");
         const auto& accumulation_expr = accumulation_output_it->get_expr_port()->get_expr();
-        OPENVINO_ASSERT(accumulation_expr, "InsertTailFill failed to get accumulation expression.");
-
-        auto recurrent_input_port_idx = utils::get_dynamic_value<size_t>();
-        for (const auto& input_loop_port : loop_info->get_input_ports()) {
-            const auto& input_port = input_loop_port.get_expr_port();
-            if (input_port->get_type() == ExpressionPort::Input && input_port->get_expr() == accumulation_expr) {
-                recurrent_input_port_idx = input_port->get_index();
-                break;
-            }
-        }
-        OPENVINO_ASSERT(!utils::is_dynamic_value(recurrent_input_port_idx),
-                        "InsertTailFill failed to find recurrent accumulation input port.");
-
-        auto data_input_port_idx = utils::get_dynamic_value<size_t>();
-        for (size_t i = 0; i < accumulation_expr->get_input_count(); ++i) {
-            if (i != recurrent_input_port_idx) {
-                data_input_port_idx = i;
-                break;
-            }
-        }
-        OPENVINO_ASSERT(!utils::is_dynamic_value(data_input_port_idx),
-                        "InsertTailFill failed to find data accumulation input port.");
-
+        const auto data_input_port_idx = get_data_input_port_idx(accumulation_expr);
         const auto accumulation_input_port = accumulation_expr->get_input_port(data_input_port_idx);
         const auto accumulation_it = linear_ir.find(begin, end, accumulation_expr);
 
         const auto source = accumulation_expr->get_input_port_connector(data_input_port_idx)->get_source();
         const auto source_output = source.get_expr()->get_node()->output(source.get_index());
         const auto fill_value = get_fill_value_for_accumulation(accumulation_expr->get_node());
-        const auto fill = linear_ir.insert_node<op::Fill>(accumulation_it, source_output, m_offset, fill_value);
-
-        fill.first->get()->set_loop_ids(accumulation_expr->get_loop_ids());
-        replace_input_port_connectors({accumulation_input_port}, fill.first->get()->get_output_port_connector(0));
-        linear_ir.get_loop_manager()->update_loop_ports(*fill.first);
+        const auto fill_node = std::make_shared<op::Fill>(source_output, m_offset, fill_value);
+        linear_ir.insert_node(fill_node,
+                              std::vector<ExpressionPort>{source},
+                              accumulation_expr->get_loop_ids(),
+                              true,
+                              accumulation_it,
+                              std::set<ExpressionPort>{accumulation_input_port});
         accumulation_expr->updateShapes();
 
         return true;
@@ -187,7 +203,7 @@ bool ReduceDecomposition::run(LinearIR& linear_ir, LinearIR::constExprIt begin,
 
         // Float constant values in byte representation
         const auto fill_value = get_initial_value(reduce_type_info);
-        const auto is_single_iteration = !utils::is_dynamic_value(work_amount) && work_amount == increment;
+        const auto is_single_iteration = work_amount == increment;
         const auto tail_size = utils::is_dynamic_value(work_amount) ? 1LU : work_amount % increment;
         const bool insert_fill_in_loop = is_single_iteration && increment < m_vector_size;
         const bool insert_fill_in_last_iter = !is_single_iteration && tail_size != 0;
diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp
index d437fe6bdbcd8d..1c0fb0386e5da6 100644
--- a/src/common/snippets/src/op/subgraph.cpp
+++ b/src/common/snippets/src/op/subgraph.cpp
@@ -577,8 +577,6 @@ void Subgraph::control_flow_transformations(
     //    3. OptimizeLoopSingleEvaluation must be called after CleanupLoopOffsets
     //       since CleanupLoopOffsets can't handle loops with evaluate_once = true
     gen_pipeline.register_pass<lowered::pass::InsertSpecificIterations>();
-    // Callback to determine if Fill operation is inplace based on actual register capacity
-    // get_lanes() returns the number of float32 (4-byte) elements that fit in a vector register
     gen_pipeline.register_pass<lowered::pass::InitRegisters>(get_generator(), lowered_pass_config);
     gen_pipeline.register_pass<lowered::pass::NormalizeLoopIDs>();
     gen_pipeline.register_pass<lowered::pass::ValidateExpandedLoops>();