diff --git a/src/common/snippets/include/snippets/lowered/pass/iter_handler.hpp b/src/common/snippets/include/snippets/lowered/pass/iter_handler.hpp
index 450edd324dc1f0..ffdd45376d8061 100644
--- a/src/common/snippets/include/snippets/lowered/pass/iter_handler.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/iter_handler.hpp
@@ -31,23 +31,6 @@ class UpdateMemoryAccessCounts : public pass::RangedPass {
     size_t m_count;
 };
 
-/**
- * @interface SetFillOffset
- * @brief The pass changes offset of all Fill ops
- * @param m_offset - offset which must be set
- * @ingroup snippets
- */
-class SetFillOffset : public pass::RangedPass {
-public:
-    explicit SetFillOffset(size_t offset);
-    OPENVINO_RTTI("SetFillOffset", "", RangedPass);
-    bool run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) override;
-    std::shared_ptr<pass::PassBase> merge(const std::shared_ptr<pass::PassBase>& other) override;
-
-private:
-    size_t m_offset;
-};
-
 /**
  * @interface SetLoopIncrementOne
  * @brief The pass set `increment = 1` to ExpandedLoopInfo which is mapped on LoopEnd in the passed iterator `end` and
diff --git a/src/common/snippets/src/lowered/pass/iter_handler.cpp b/src/common/snippets/src/lowered/pass/iter_handler.cpp
index b69aba2fed588d..f59cb1b5123394 100644
--- a/src/common/snippets/src/lowered/pass/iter_handler.cpp
+++ b/src/common/snippets/src/lowered/pass/iter_handler.cpp
@@ -14,7 +14,6 @@
 #include "snippets/lowered/loop_info.hpp"
 #include "snippets/lowered/loop_manager.hpp"
 #include "snippets/lowered/pass/pass.hpp"
-#include "snippets/op/fill.hpp"
 #include "snippets/op/loop.hpp"
 #include "snippets/op/memory_access.hpp"
 #include "snippets/utils/utils.hpp"
@@ -67,30 +66,6 @@ std::shared_ptr<pass::PassBase> UpdateMemoryAccessCounts::merge(const std::share
     return std::make_shared<UpdateMemoryAccessCounts>(merged_count);
 }
 
-SetFillOffset::SetFillOffset(size_t offset) : RangedPass(), m_offset(offset) {}
-
-bool SetFillOffset::run([[maybe_unused]] LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) {
-    for (auto expr_it = begin; expr_it != end; expr_it++) {
-        const auto& node = expr_it->get()->get_node();
-        if (const auto fill = ov::as_type_ptr<op::Fill>(node)) {
-            fill->set_offset(m_offset);
-        }
-    }
-    return true;
-}
-
-std::shared_ptr<pass::PassBase> SetFillOffset::merge(const std::shared_ptr<pass::PassBase>& other) {
-    if (!other) {
-        return shared_from_this();
-    }
-    const auto casted_pass = ov::as_type_ptr<SetFillOffset>(other);
-    size_t merged_offset = 0;
-    if (!casted_pass || !ov::snippets::utils::merge_dynamic_dim(merged_offset, m_offset, casted_pass->m_offset)) {
-        return nullptr;
-    }
-    return std::make_shared<SetFillOffset>(merged_offset);
-}
-
 bool SetLoopIncrementOne::run(LinearIR& linear_ir,
                               [[maybe_unused]] LinearIR::constExprIt begin,
                               LinearIR::constExprIt end) {
diff --git a/src/common/snippets/src/lowered/pass/reduce_decomposition.cpp b/src/common/snippets/src/lowered/pass/reduce_decomposition.cpp
index 03e23b6cc1b794..239c8c8237be94 100644
--- a/src/common/snippets/src/lowered/pass/reduce_decomposition.cpp
+++ b/src/common/snippets/src/lowered/pass/reduce_decomposition.cpp
@@ -4,31 +4,36 @@
 
 #include "snippets/lowered/pass/reduce_decomposition.hpp"
 
+#include <algorithm>
 #include <cstddef>
 #include <cstdint>
 #include <map>
 #include <memory>
+#include <optional>
+#include <set>
 #include <vector>
 
 #include "openvino/core/except.hpp"
 #include "openvino/core/node.hpp"
 #include "openvino/core/node_output.hpp"
+#include "openvino/core/rtti.hpp"
 #include "openvino/core/type.hpp"
 #include "openvino/op/add.hpp"
 #include "openvino/op/maximum.hpp"
 #include "snippets/itt.hpp"
+#include "snippets/lowered/expression.hpp"
 #include "snippets/lowered/expression_port.hpp"
 #include "snippets/lowered/linear_ir.hpp"
 #include "snippets/lowered/loop_info.hpp"
 #include "snippets/lowered/loop_manager.hpp"
 #include "snippets/lowered/loop_port.hpp"
-#include "snippets/lowered/pass/iter_handler.hpp"
 #include "snippets/lowered/pass/pass.hpp"
 #include "snippets/lowered/specific_loop_iter_types.hpp"
 #include "snippets/op/fill.hpp"
 #include "snippets/op/horizon_max.hpp"
 #include "snippets/op/horizon_sum.hpp"
+#include "snippets/op/loop.hpp"
 #include "snippets/op/memory_access.hpp"
 #include "snippets/op/reduce.hpp"
 #include "snippets/op/vector_buffer.hpp"
@@ -36,20 +41,120 @@
 
 namespace ov::snippets::lowered::pass {
 
+namespace {
+uint32_t get_initial_value(const ov::DiscreteTypeInfo& type_info) {
+    static const std::map<ov::DiscreteTypeInfo, uint32_t> reduce_initial_values{
+        {op::ReduceMax::get_type_info_static(), static_cast<uint32_t>(0xff7fffff)},
+        {op::ReduceSum::get_type_info_static(), static_cast<uint32_t>(0x00000000)},
+    };
+    OPENVINO_ASSERT(reduce_initial_values.count(type_info), "Unexpected ReduceType");
+    return reduce_initial_values.at(type_info);
+}
+
+uint32_t get_fill_value_for_accumulation(const std::shared_ptr<ov::Node>& accumulation) {
+    if (ov::is_type<ov::op::v1::Maximum>(accumulation)) {
+        return get_initial_value(op::ReduceMax::get_type_info_static());
+    }
+    if (ov::is_type<ov::op::v1::Add>(accumulation)) {
+        return get_initial_value(op::ReduceSum::get_type_info_static());
+    }
+    OPENVINO_THROW("InsertTailFill supports only Maximum/Add accumulation but got: ", accumulation->get_type_info());
+}
+
+bool is_fill_from_vector_buffer(const ExpressionPtr& expr) {
+    if (!expr || !ov::is_type<op::Fill>(expr->get_node())) {
+        return false;
+    }
+    const auto& parent_expr = expr->get_input_expr_ptr(0);
+    return parent_expr && ov::is_type<op::VectorBuffer>(parent_expr->get_node());
+}
+
+bool is_supported_accumulation(const ExpressionPtr& accumulation_expr) {
+    return accumulation_expr && ov::is_type_any_of<ov::op::v1::Maximum, ov::op::v1::Add>(accumulation_expr->get_node());
+}
+
+std::optional<size_t> find_data_input_port_idx(const ExpressionPtr& accumulation_expr) {
+    if (!accumulation_expr || accumulation_expr->get_input_count() != 2) {
+        return std::nullopt;
+    }
+    const auto input0_is_initial_fill = is_fill_from_vector_buffer(accumulation_expr->get_input_expr_ptr(0));
+    const auto input1_is_initial_fill = is_fill_from_vector_buffer(accumulation_expr->get_input_expr_ptr(1));
+    if (input0_is_initial_fill == input1_is_initial_fill) {
+        return std::nullopt;
+    }
+    return input0_is_initial_fill ? 1 : 0;
1 : 0; +} + +size_t get_data_input_port_idx(const ExpressionPtr& accumulation_expr) { + OPENVINO_ASSERT(is_supported_accumulation(accumulation_expr), + "InsertTailFill expected Maximum/Add accumulation expression."); + const auto data_input_port_idx = find_data_input_port_idx(accumulation_expr); + OPENVINO_ASSERT(data_input_port_idx.has_value(), + "InsertTailFill failed to detect unique Fill(VectorBuffer) accumulation input."); + return *data_input_port_idx; +} +} // namespace + +class InsertTailFill : public RangedPass { +public: + explicit InsertTailFill(size_t offset) : RangedPass(), m_offset(offset) {} + OPENVINO_RTTI("InsertTailFill", "", RangedPass); + + bool run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) override { + OPENVINO_ASSERT(begin != end, "InsertTailFill expects non-empty range."); + const auto& loop_end = ov::as_type_ptr(end->get()->get_node()); + OPENVINO_ASSERT(loop_end, "InsertTailFill expected LoopEnd node in iterator `end`."); + const auto& loop_info = linear_ir.get_loop_manager()->get_loop_info(loop_end->get_id()); + const auto& output_ports = loop_info->get_output_ports(); + const auto accumulation_output_it = + std::find_if(output_ports.begin(), output_ports.end(), [](const LoopPort& output_loop_port) { + const auto& accumulation_expr = output_loop_port.get_expr_port()->get_expr(); + return is_supported_accumulation(accumulation_expr) && + find_data_input_port_idx(accumulation_expr).has_value(); + }); + OPENVINO_ASSERT(accumulation_output_it != output_ports.end(), + "InsertTailFill failed to find accumulation output port with Fill(VectorBuffer) input."); + const auto& accumulation_expr = accumulation_output_it->get_expr_port()->get_expr(); + const auto data_input_port_idx = get_data_input_port_idx(accumulation_expr); + const auto accumulation_input_port = accumulation_expr->get_input_port(data_input_port_idx); + const auto accumulation_it = linear_ir.find(begin, end, accumulation_expr); + + const auto source = accumulation_expr->get_input_port_connector(data_input_port_idx)->get_source(); + const auto source_output = source.get_expr()->get_node()->output(source.get_index()); + const auto fill_value = get_fill_value_for_accumulation(accumulation_expr->get_node()); + const auto fill_node = std::make_shared(source_output, m_offset, fill_value); + linear_ir.insert_node(fill_node, + std::vector{source}, + accumulation_expr->get_loop_ids(), + true, + accumulation_it, + std::set{accumulation_input_port}); + accumulation_expr->updateShapes(); + + return true; + } + + std::shared_ptr merge(const std::shared_ptr& other) override { + if (!other) { + return shared_from_this(); + } + const auto casted_pass = ov::as_type_ptr(other); + size_t merged_offset = 0; + if (!casted_pass || !ov::snippets::utils::merge_dynamic_dim(merged_offset, m_offset, casted_pass->m_offset)) { + return nullptr; + } + return std::make_shared(merged_offset); + } + +private: + size_t m_offset = 0; +}; + ReduceDecomposition::ReduceDecomposition(size_t vector_size) : RangedPass(), m_vector_size{vector_size} {} bool ReduceDecomposition::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::ReduceMaxDecompositionLowered") - auto get_initial_value = [](const ov::DiscreteTypeInfo& type_info) { - static const std::map reduce_initial_values{ - {op::ReduceMax::get_type_info_static(), static_cast(0xff7fffff)}, - {op::ReduceSum::get_type_info_static(), static_cast(0x00000000)}, - }; 
-        OPENVINO_ASSERT(reduce_initial_values.count(type_info), "Unexpected ReduceType");
-        return reduce_initial_values.at(type_info);
-    };
-
     auto insert_accumulation_node = [&linear_ir](
                                         const LinearIR::constExprIt& expr_it,
@@ -98,33 +203,48 @@ bool ReduceDecomposition::run(LinearIR& linear_ir, LinearIR::constExprIt begin,
 
         // Float constant values in byte representation
         const auto fill_value = get_initial_value(reduce_type_info);
+        const auto is_single_iteration = work_amount == increment;
+        const auto tail_size = utils::is_dynamic_value(work_amount) ? 1LU : work_amount % increment;
+        const bool insert_fill_in_loop = is_single_iteration && increment < m_vector_size;
+        const bool insert_fill_in_last_iter = !is_single_iteration && tail_size != 0;
 
         // Note: VectorBuffer is a special case, since it should go before the initial Load.
         // The buffer must be initialized with fill_value before reduction
        const auto vector_buffer = linear_ir.insert_node<op::VectorBuffer>(expr_it);
        const auto initial_fill = linear_ir.insert_node<op::Fill>(expr_it, vector_buffer.second, 0, fill_value);
 
-        // Reduce loop
-        const auto fill =
-            linear_ir.insert_node<op::Fill>(expr_it, reduce->get_input_source_output(0), increment, fill_value);
-        const auto accumulation = insert_accumulation_node(expr_it, fill.second, initial_fill.second, reduce_type_info);
+        ov::Output<ov::Node> accumulation_input = reduce->get_input_source_output(0);
+        auto reduce_loop_begin = expr_it;
+        ExpressionPort reduce_loop_input_port;
+        if (insert_fill_in_loop) {
+            const auto fill = linear_ir.insert_node<op::Fill>(expr_it, accumulation_input, increment, fill_value);
+            accumulation_input = fill.second;
+            reduce_loop_begin = fill.first;
+            reduce_loop_input_port = (*fill.first)->get_input_port(0);
+        }
+
+        const auto accumulation =
+            insert_accumulation_node(expr_it, accumulation_input, initial_fill.second, reduce_type_info);
+        if (!insert_fill_in_loop) {
+            reduce_loop_begin = accumulation.first;
+            reduce_loop_input_port = (*accumulation.first)->get_input_port(0);
+        }
 
         const auto reduce_loop_id = loop_manager->mark_loop(
-            fill.first,
+            reduce_loop_begin,
             expr_it,
             work_amount,
             increment,
-            {LoopPort::create((*fill.first)->get_input_port(0), 0),
+            {LoopPort::create(reduce_loop_input_port, 0),
             LoopPort::create((*accumulation.first)->get_input_port(1), 0)},
            {LoopPort::create((*accumulation.first)->get_output_port(0), 0)});
 
-        const auto tail_size = utils::is_dynamic_value(work_amount) ? 1LU : work_amount % increment;
-        if (tail_size != 0) {
+        if (insert_fill_in_last_iter) {
             const auto loop_info = loop_manager->get_loop_info(reduce_loop_id);
-            loop_info->register_pass_to_handler<SpecificLoopIterType::LAST_ITER, SetFillOffset>(tail_size);
+            loop_info->register_pass_to_handler<SpecificLoopIterType::LAST_ITER, InsertTailFill>(tail_size);
         }
 
         const auto horizon = insert_horizon_node(expr_it, accumulation.second, reduce_type_info);
 
         // Transfer original ExpressionPorts
-        replace_input_port_connectors({fill.first->get()->get_input_port(0)}, reduce_expr->get_input_port_connector(0));
+        replace_input_port_connectors({reduce_loop_input_port}, reduce_expr->get_input_port_connector(0));
         const auto reduce_consumers = reduce_expr->get_output_port_connector(0)->get_consumers();
         replace_input_port_connectors(reduce_consumers, horizon.first->get()->get_output_port_connector(0));
 
@@ -134,7 +254,7 @@ bool ReduceDecomposition::run(LinearIR& linear_ir, LinearIR::constExprIt begin,
         }
 
         // Update Loop info for outer loops
-        const std::vector<ExpressionPort> input_ports{(*fill.first)->get_input_port(0)};
+        const std::vector<ExpressionPort> input_ports{reduce_loop_input_port};
         const std::vector<ExpressionPort> output_ports{(*horizon.first)->get_output_port(0)};
         for (auto loop_id : reduce_expr->get_loop_ids()) {
             loop_manager
diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_fill_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_fill_emitter.cpp
index e299636a6c4981..e40c78f416c437 100644
--- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_fill_emitter.cpp
+++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_fill_emitter.cpp
@@ -14,6 +14,7 @@
 
 #include "emitters/plugin/aarch64/jit_emitter.hpp"
 #include "emitters/utils.hpp"
+#include "openvino/core/except.hpp"
 #include "openvino/core/type.hpp"
 #include "openvino/core/type/element_type.hpp"
 #include "snippets/lowered/expression.hpp"
@@ -63,17 +64,12 @@ void jit_fill_emitter::emit_impl(const std::vector<size_t>& in, const std::vecto
 template <cpu_isa_t isa>
 void jit_fill_emitter::emit_isa(const std::vector<size_t>& in, const std::vector<size_t>& out) const {
     const size_t supported_et_size = dnnl::impl::cpu::aarch64::cpu_isa_traits<isa>::vlen / exec_prc_.size();
-    if (offset == supported_et_size) {
-        // WA: since AssignRegisters doesn't support inplace logic, Fill ops with offset = register_capacity can't be
-        // removed from the LIR
-        // TODO: when inplace is supported, remove such Fill ops from the LIR and remove this logic.
-        // Ticket: 126270
-        auto src = in[0];
-        auto dst = out[0];
-        if (src != dst) {
-            h->mov(Xbyak_aarch64::VReg16B(dst), Xbyak_aarch64::VReg16B(src));
-        }
-    } else if (is_full_reg()) {
+    OPENVINO_ASSERT(offset < supported_et_size,
+                    "Fill emitter offset ",
+                    offset,
+                    " exceeds register capacity ",
+                    supported_et_size);
+    if (is_full_reg()) {
         fill_full(out);
     } else {
         fill_tail(in, out);
diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_fill_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_fill_emitter.cpp
index 2013cc0ca770da..2c323d0275347c 100644
--- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_fill_emitter.cpp
+++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_fill_emitter.cpp
@@ -15,6 +15,7 @@
 
 #include "emitters/plugin/x64/jit_emitter.hpp"
 #include "emitters/utils.hpp"
+#include "openvino/core/except.hpp"
 #include "openvino/core/type.hpp"
 #include "openvino/core/type/element_type.hpp"
 #include "snippets/lowered/expression.hpp"
@@ -79,15 +80,12 @@ void jit_fill_emitter::emit_isa(const std::vector<size_t>& in, const std::vector
 
     const size_t supported_et_size = 4;
     const auto register_capacity = (src_vmm.getBit() / 8) / supported_et_size;
-    if (offset == register_capacity) {
-        // WA: since AssignRegisters doesn't support inplace logic, Fill ops with offset = register_capacity can't be
-        // removed from the LIR
-        // TODO: when inplace is supported, remove such Fill ops from the LIR and remove this logic.
-        // Ticket: 126270
-        if (src_vmm.getIdx() != dst_vmm.getIdx()) {
-            h->uni_vmovups(dst_vmm, src_vmm);
-        }
-    } else if (is_full_reg()) {
+    OPENVINO_ASSERT(offset < register_capacity,
+                    "Fill emitter offset ",
+                    offset,
+                    " exceeds register capacity ",
+                    register_capacity);
+    if (is_full_reg()) {
         fill_full(dst_vmm);
     } else {
         fill_tail(src_vmm, dst_vmm);
diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/reduce.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/reduce.cpp
index 12b58ebeeb1937..3b2926e2356bee 100644
--- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/reduce.cpp
+++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/reduce.cpp
@@ -13,6 +13,9 @@ namespace snippets {
 
 namespace {
 const std::vector<InputShape> inputShape = {
+    {{}, {{1, 3, 128, 1}}},
+    {{}, {{1, 3, 128, 7}}},
+    {{}, {{1, 3, 128, 9}}},
     {{}, {{1, 3, 128, 128}}},
     {{}, {{1, 3, 128, 15}}},
     {{}, {{1, 3, 15, 16}}},