@@ -6,7 +6,6 @@

Contributor: (random spot) This change itself looks good, but I think it is necessary to check for potential side effects. Could you trigger the onepunch accuracy test?

#define OUTPUT_TYPE_BLOCK MAKE_VECTOR_TYPE(OUTPUT_TYPE, VEC_SIZE)
#define TO_TYPE(type, val) CAT(convert_, type)(val)
#define TO_TYPE_SAT(type, val) CAT(CAT(convert_, type), _sat)(val)

#if ELTWISE_BROADCAST
#define GET_INDEX(prefix, num, idx_order) CAT(CAT(prefix, num), _GET_INDEX_SAFE)(idx_order)
@@ -75,13 +74,9 @@ KERNEL(eltwise_blocked_opt)(INPUTS_DECLS
#if HAS_FUSED_OPS
FUSED_OPS;
OUTPUT_TYPE_BLOCK out = TO_TYPE(MAKE_VECTOR_TYPE(OUTPUT_TYPE, VEC_SIZE), FUSED_OPS_RESULT);
#else
#if QUANTIZATION_TERM && !OUTPUT_IS_FP
OUTPUT_TYPE_BLOCK out = ACTIVATION_TYPED(TO_TYPE_SAT(MAKE_VECTOR_TYPE(OUTPUT_TYPE, VEC_SIZE), res), ACTIVATION_PARAMS_TYPED);
#else
OUTPUT_TYPE_BLOCK out = ACTIVATION_TYPED(TO_TYPE(MAKE_VECTOR_TYPE(OUTPUT_TYPE, VEC_SIZE), res), ACTIVATION_PARAMS_TYPED);
#endif
#endif

#ifdef LEFTOVERS
// Overwrite
@@ -105,4 +100,3 @@ KERNEL(eltwise_blocked_opt)(INPUTS_DECLS

#undef OUTPUT_TYPE_BLOCK
#undef TO_TYPE
#undef TO_TYPE_SAT
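
Note: the hunks above appear to drop the saturating conversion path (TO_TYPE_SAT, i.e. convert_<type>_sat) from the blocked eltwise kernel, so an out-of-range integer result is narrowed with plain two's-complement wraparound instead of being clamped to the output range. A minimal host-side C++ sketch of the two behaviors (the helper names to_i8_sat/to_i8_wrap are illustrative only, not part of the kernel code):

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Analog of OpenCL convert_char_sat: clamp to the int8 range before narrowing.
static int8_t to_i8_sat(int16_t v) {
    return static_cast<int8_t>(std::clamp<int16_t>(v, -128, 127));
}

// Analog of plain convert_char: the narrowing cast keeps only the low 8 bits
// on two's-complement targets, i.e. the value wraps around.
static int8_t to_i8_wrap(int16_t v) {
    return static_cast<int8_t>(v);
}

int main() {
    int16_t r = 100 + 100;  // 200, outside the int8 range
    std::printf("saturate: %d\n", to_i8_sat(r));   // 127
    std::printf("wrap:     %d\n", to_i8_wrap(r));  // -56
    return 0;
}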
@@ -190,7 +190,7 @@ KERNEL(eltwise)(
#endif

#if QUANTIZATION_TERM && !OUTPUT_IS_FP
output[output_offset] = TO_OUTPUT_TYPE_SAT(ACTIVATION(out, ACTIVATION_PARAMS));
output[output_offset] = TO_OUTPUT_TYPE(ACTIVATION(out, ACTIVATION_PARAMS));
#else
output[output_offset] = TO_OUTPUT_TYPE(ACTIVATION_TYPED(out, ACTIVATION_PARAMS_TYPED));
#endif
@@ -2634,6 +2634,100 @@ TEST(eltwise_gpu_int, basic_in4x4x4x4) {
}
}

TEST(eltwise_gpu_int, i8_overflow_wraparound) {
// Test that int8 eltwise operations correctly wrap around on overflow
// instead of saturating. This tests values that overflow the [-128, 127] range.
// Subtraction examples: -100 - 100 = -200 -> wraps to 56
// Addition examples: 100 + 100 = 200 -> wraps to -56

auto& engine = get_test_engine();

auto input1 = engine.allocate_memory({ data_types::i8, format::bfyx, { 1, 1, 4, 1 } });
auto input2 = engine.allocate_memory({ data_types::i8, format::bfyx, { 1, 1, 4, 1 } });

std::vector<int8_t> input1_data = { -100, 100, 127, -128 };
std::vector<int8_t> input2_data = { 100, -100, -1, 1 };

set_values(input1, input1_data);
set_values(input2, input2_data);

for (auto mode : { eltwise_mode::sub, eltwise_mode::sum }) {
topology topology;
topology.add(input_layout("input1", input1->get_layout()));
topology.add(input_layout("input2", input2->get_layout()));
topology.add(eltwise("eltwise", { input_info("input1"), input_info("input2") }, mode));

network network(engine, topology, get_test_default_config(engine));
network.set_input_data("input1", input1);
network.set_input_data("input2", input2);
auto outputs = network.execute();

auto output = outputs.at("eltwise").get_memory();
cldnn::mem_lock<int8_t> output_ptr(output, get_test_stream());

for (size_t i = 0; i < input1_data.size(); ++i) {
int16_t wide_result = (mode == eltwise_mode::sub)
? static_cast<int16_t>(input1_data[i]) - static_cast<int16_t>(input2_data[i])
: static_cast<int16_t>(input1_data[i]) + static_cast<int16_t>(input2_data[i]);
int8_t expected = static_cast<int8_t>(wide_result);
ASSERT_EQ(expected, output_ptr[i])
<< "Mode: " << (mode == eltwise_mode::sub ? "sub" : "sum")
<< ", index " << i << ": " << static_cast<int>(input1_data[i])
<< (mode == eltwise_mode::sub ? " - " : " + ") << static_cast<int>(input2_data[i]);
}
}
}
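
For reference, a small sketch of what the four test vectors give under the new wraparound behavior versus the previous saturating behavior for the subtraction case (worked out with the same int16 widening the test uses; the saturating column is what the old TO_OUTPUT_TYPE_SAT path would have produced):

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    const int16_t a[] = { -100, 100, 127, -128 };
    const int16_t b[] = {  100, -100,  -1,    1 };
    for (int i = 0; i < 4; ++i) {
        int16_t diff = static_cast<int16_t>(a[i] - b[i]);
        int8_t wrapped   = static_cast<int8_t>(diff);                                 // new kernel behavior
        int8_t saturated = static_cast<int8_t>(std::clamp<int16_t>(diff, -128, 127)); // previous behavior
        std::printf("%4d - %4d = %4d -> wrap %4d, sat %4d\n", a[i], b[i], diff, wrapped, saturated);
    }
    // -200 -> wrap 56 (sat -128), 200 -> wrap -56 (sat 127),
    //  128 -> wrap -128 (sat 127), -129 -> wrap 127 (sat -128)
    return 0;
}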

TEST(eltwise_gpu_int, i8_overflow_wraparound_blocked_format) {
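// Same wraparound check as above, but the inputs are reordered to b_fs_yx_fsv16 so the
// blocked-layout eltwise kernel path is exercised rather than the plain bfyx path.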
auto& engine = get_test_engine();

const int batch = 1, features = 32, height = 1, width = 4;
tensor input_tensor(batch, features, width, height);

auto input1 = engine.allocate_memory({ data_types::i8, format::bfyx, input_tensor });
auto input2 = engine.allocate_memory({ data_types::i8, format::bfyx, input_tensor });

std::vector<int8_t> input1_data(batch * features * height * width);
std::vector<int8_t> input2_data(batch * features * height * width);

for (size_t i = 0; i < input1_data.size(); ++i) {
input1_data[i] = (i % 4 == 0) ? -100 : ((i % 4 == 1) ? 100 : ((i % 4 == 2) ? 127 : -128));
input2_data[i] = (i % 4 == 0) ? 100 : ((i % 4 == 1) ? -100 : ((i % 4 == 2) ? -1 : 1));
}

set_values(input1, input1_data);
set_values(input2, input2_data);

for (auto mode : { eltwise_mode::sub, eltwise_mode::sum }) {
topology topology;
topology.add(input_layout("input1", input1->get_layout()));
topology.add(input_layout("input2", input2->get_layout()));
topology.add(reorder("reorder1", input_info("input1"), layout(data_types::i8, format::b_fs_yx_fsv16, input_tensor)));
topology.add(reorder("reorder2", input_info("input2"), layout(data_types::i8, format::b_fs_yx_fsv16, input_tensor)));
topology.add(eltwise("eltwise", { input_info("reorder1"), input_info("reorder2") }, mode));
topology.add(reorder("output_reorder", input_info("eltwise"), layout(data_types::i8, format::bfyx, input_tensor)));

network network(engine, topology, get_test_default_config(engine));
network.set_input_data("input1", input1);
network.set_input_data("input2", input2);
auto outputs = network.execute();

auto output = outputs.at("output_reorder").get_memory();
cldnn::mem_lock<int8_t> output_ptr(output, get_test_stream());

for (size_t i = 0; i < input1_data.size(); ++i) {
int16_t wide_result = (mode == eltwise_mode::sub)
? static_cast<int16_t>(input1_data[i]) - static_cast<int16_t>(input2_data[i])
: static_cast<int16_t>(input1_data[i]) + static_cast<int16_t>(input2_data[i]);
int8_t expected = static_cast<int8_t>(wide_result);
ASSERT_EQ(expected, output_ptr[i])
<< "Mode: " << (mode == eltwise_mode::sub ? "sub" : "sum")
<< ", index " << i << ": " << static_cast<int>(input1_data[i])
<< (mode == eltwise_mode::sub ? " - " : " + ") << static_cast<int>(input2_data[i]);
}
}
}

TEST(eltwise_gpu_int, div_gather_fusing) {
auto& engine = get_test_engine();
