From 24cb45134c40b4c3a7079e1c675201d391dd3b5b Mon Sep 17 00:00:00 2001
From: Przemyslaw Wysocki
Date: Thu, 22 Jan 2026 11:00:10 +0000
Subject: [PATCH 1/3] Fix GPU overflow

---
Review note: the store under QUANTIZATION_TERM && !OUTPUT_IS_FP now goes
through TO_OUTPUT_TYPE instead of TO_OUTPUT_TYPE_SAT, i.e. the same
conversion macro the #else branch already uses.
NOTE(review): presumably TO_OUTPUT_TYPE expands to the non-saturating
convert_<type>; confirm that `out` is never a floating-point value on this
path, since out-of-range float-to-integer conversion without _sat looks to be
implementation-defined in OpenCL C - verify against the spec.

 .../src/kernel_selector/cl_kernels/generic_eltwise_ref.cl       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/generic_eltwise_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/generic_eltwise_ref.cl
index 22c16c7ddf60ee..19a3cc83a8ad68 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/generic_eltwise_ref.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/generic_eltwise_ref.cl
@@ -190,7 +190,7 @@ KERNEL(eltwise)(
 #endif
 
 #if QUANTIZATION_TERM && !OUTPUT_IS_FP
-    output[output_offset] = TO_OUTPUT_TYPE_SAT(ACTIVATION(out, ACTIVATION_PARAMS));
+    output[output_offset] = TO_OUTPUT_TYPE(ACTIVATION(out, ACTIVATION_PARAMS));
 #else
     output[output_offset] = TO_OUTPUT_TYPE(ACTIVATION_TYPED(out, ACTIVATION_PARAMS_TYPED));
 #endif

From 6a7d2e6fcde244ee9f2b3fae9043a1a8adc031f6 Mon Sep 17 00:00:00 2001
From: Przemyslaw Wysocki
Date: Thu, 22 Jan 2026 12:08:26 +0000
Subject: [PATCH 2/3] Add test

---
Review note: adds i8_overflow_wraparound, which runs eltwise sub and sum on
two four-element i8 bfyx inputs and compares every output element with the
int16_t result truncated to int8_t.

 .../unit/test_cases/eltwise_gpu_test.cpp      | 44 +++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/eltwise_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/eltwise_gpu_test.cpp
index 92dc429cf89f6b..41d0e40c84d3ee 100644
--- a/src/plugins/intel_gpu/tests/unit/test_cases/eltwise_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/eltwise_gpu_test.cpp
@@ -2634,6 +2634,50 @@ TEST(eltwise_gpu_int, basic_in4x4x4x4) {
     }
 }
 
+TEST(eltwise_gpu_int, i8_overflow_wraparound) {
+    // Test that int8 eltwise operations correctly wrap around on overflow
+    // instead of saturating. This tests values that overflow the [-128, 127] range.
+    // Subtraction examples: -100 - 100 = -200 -> wraps to 56
+    // Addition examples: 100 + 100 = 200 -> wraps to -56
+
+    auto& engine = get_test_engine();
+
+    auto input1 = engine.allocate_memory({ data_types::i8, format::bfyx, { 1, 1, 4, 1 } });
+    auto input2 = engine.allocate_memory({ data_types::i8, format::bfyx, { 1, 1, 4, 1 } });
+
+    std::vector<int8_t> input1_data = { -100, 100, 127, -128 };
+    std::vector<int8_t> input2_data = { 100, -100, -1, 1 };
+
+    set_values(input1, input1_data);
+    set_values(input2, input2_data);
+
+    for (auto mode : { eltwise_mode::sub, eltwise_mode::sum }) {
+        topology topology;
+        topology.add(input_layout("input1", input1->get_layout()));
+        topology.add(input_layout("input2", input2->get_layout()));
+        topology.add(eltwise("eltwise", { input_info("input1"), input_info("input2") }, mode));
+
+        network network(engine, topology, get_test_default_config(engine));
+        network.set_input_data("input1", input1);
+        network.set_input_data("input2", input2);
+        auto outputs = network.execute();
+
+        auto output = outputs.at("eltwise").get_memory();
+        cldnn::mem_lock<int8_t> output_ptr(output, get_test_stream());
+
+        for (size_t i = 0; i < input1_data.size(); ++i) {
+            int16_t wide_result = (mode == eltwise_mode::sub)
+                ? static_cast<int16_t>(input1_data[i]) - static_cast<int16_t>(input2_data[i])
+                : static_cast<int16_t>(input1_data[i]) + static_cast<int16_t>(input2_data[i]);
+            int8_t expected = static_cast<int8_t>(wide_result);
+            ASSERT_EQ(expected, output_ptr[i])
+                << "Mode: " << (mode == eltwise_mode::sub ? "sub" : "sum")
+                << ", index " << i << ": " << static_cast<int>(input1_data[i])
+                << (mode == eltwise_mode::sub ? " - " : " + ") << static_cast<int>(input2_data[i]);
+        }
+    }
+}
+
 TEST(eltwise_gpu_int, div_gather_fusing) {
     auto& engine = get_test_engine();
 

From bdeab9bc4f5e957b056c24c332e7cd7328419942 Mon Sep 17 00:00:00 2001
From: Przemyslaw Wysocki
Date: Tue, 27 Jan 2026 13:23:07 +0000
Subject: [PATCH 3/3] CR

---
Review note: removes the TO_TYPE_SAT macro (and its #undef) together with the
QUANTIZATION_TERM && !OUTPUT_IS_FP branch in eltwise_blocked_opt.cl, so the
non-fused path always converts with TO_TYPE. Adds
i8_overflow_wraparound_blocked_format, which reorders both inputs to
b_fs_yx_fsv16 before the eltwise and reorders the result back to bfyx.
NOTE(review): presumably the b_fs_yx_fsv16 layout is what routes execution to
eltwise_blocked_opt; the test does not assert which kernel ran - confirm.

 .../cl_kernels/eltwise_blocked_opt.cl         |  6 ---
 .../unit/test_cases/eltwise_gpu_test.cpp      | 50 +++++++++++++++++++
 2 files changed, 50 insertions(+), 6 deletions(-)

diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/eltwise_blocked_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/eltwise_blocked_opt.cl
index 4ef60f026b4155..e3edd8f4c3a8d6 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/eltwise_blocked_opt.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/eltwise_blocked_opt.cl
@@ -6,7 +6,6 @@
 
 #define OUTPUT_TYPE_BLOCK MAKE_VECTOR_TYPE(OUTPUT_TYPE, VEC_SIZE)
 #define TO_TYPE(type, val) CAT(convert_, type)(val)
-#define TO_TYPE_SAT(type, val) CAT(CAT(convert_, type), _sat)(val)
 
 #if ELTWISE_BROADCAST
 #define GET_INDEX(prefix, num, idx_order) CAT(CAT(prefix, num), _GET_INDEX_SAFE)(idx_order)
@@ -75,13 +74,9 @@ KERNEL(eltwise_blocked_opt)(INPUTS_DECLS
 #if HAS_FUSED_OPS
     FUSED_OPS;
     OUTPUT_TYPE_BLOCK out = TO_TYPE(MAKE_VECTOR_TYPE(OUTPUT_TYPE, VEC_SIZE), FUSED_OPS_RESULT);
-#else
-#if QUANTIZATION_TERM && !OUTPUT_IS_FP
-    OUTPUT_TYPE_BLOCK out = ACTIVATION_TYPED(TO_TYPE_SAT(MAKE_VECTOR_TYPE(OUTPUT_TYPE, VEC_SIZE), res), ACTIVATION_PARAMS_TYPED);
 #else
     OUTPUT_TYPE_BLOCK out = ACTIVATION_TYPED(TO_TYPE(MAKE_VECTOR_TYPE(OUTPUT_TYPE, VEC_SIZE), res), ACTIVATION_PARAMS_TYPED);
 #endif
-#endif
 
 #ifdef LEFTOVERS
     // Overwrite
@@ -105,4 +100,3 @@ KERNEL(eltwise_blocked_opt)(INPUTS_DECLS
 
 #undef OUTPUT_TYPE_BLOCK
 #undef TO_TYPE
-#undef TO_TYPE_SAT
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/eltwise_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/eltwise_gpu_test.cpp
index 41d0e40c84d3ee..78dc018c4d43df 100644
--- a/src/plugins/intel_gpu/tests/unit/test_cases/eltwise_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/eltwise_gpu_test.cpp
@@ -2678,6 +2678,56 @@ TEST(eltwise_gpu_int, i8_overflow_wraparound) {
     }
 }
 
+TEST(eltwise_gpu_int, i8_overflow_wraparound_blocked_format) {
+    auto& engine = get_test_engine();
+
+    const int batch = 1, features = 32, height = 1, width = 4;
+    tensor input_tensor(batch, features, width, height);
+
+    auto input1 = engine.allocate_memory({ data_types::i8, format::bfyx, input_tensor });
+    auto input2 = engine.allocate_memory({ data_types::i8, format::bfyx, input_tensor });
+
+    std::vector<int8_t> input1_data(batch * features * height * width);
+    std::vector<int8_t> input2_data(batch * features * height * width);
+
+    for (size_t i = 0; i < input1_data.size(); ++i) {
+        input1_data[i] = (i % 4 == 0) ? -100 : ((i % 4 == 1) ? 100 : ((i % 4 == 2) ? 127 : -128));
+        input2_data[i] = (i % 4 == 0) ? 100 : ((i % 4 == 1) ? -100 : ((i % 4 == 2) ? -1 : 1));
+    }
+
+    set_values(input1, input1_data);
+    set_values(input2, input2_data);
+
+    for (auto mode : { eltwise_mode::sub, eltwise_mode::sum }) {
+        topology topology;
+        topology.add(input_layout("input1", input1->get_layout()));
+        topology.add(input_layout("input2", input2->get_layout()));
+        topology.add(reorder("reorder1", input_info("input1"), layout(data_types::i8, format::b_fs_yx_fsv16, input_tensor)));
+        topology.add(reorder("reorder2", input_info("input2"), layout(data_types::i8, format::b_fs_yx_fsv16, input_tensor)));
+        topology.add(eltwise("eltwise", { input_info("reorder1"), input_info("reorder2") }, mode));
+        topology.add(reorder("output_reorder", input_info("eltwise"), layout(data_types::i8, format::bfyx, input_tensor)));
+
+        network network(engine, topology, get_test_default_config(engine));
+        network.set_input_data("input1", input1);
+        network.set_input_data("input2", input2);
+        auto outputs = network.execute();
+
+        auto output = outputs.at("output_reorder").get_memory();
+        cldnn::mem_lock<int8_t> output_ptr(output, get_test_stream());
+
+        for (size_t i = 0; i < input1_data.size(); ++i) {
+            int16_t wide_result = (mode == eltwise_mode::sub)
+                ? static_cast<int16_t>(input1_data[i]) - static_cast<int16_t>(input2_data[i])
+                : static_cast<int16_t>(input1_data[i]) + static_cast<int16_t>(input2_data[i]);
+            int8_t expected = static_cast<int8_t>(wide_result);
+            ASSERT_EQ(expected, output_ptr[i])
+                << "Mode: " << (mode == eltwise_mode::sub ? "sub" : "sum")
+                << ", index " << i << ": " << static_cast<int>(input1_data[i])
+                << (mode == eltwise_mode::sub ? " - " : " + ") << static_cast<int>(input2_data[i]);
+        }
+    }
+}
+
 TEST(eltwise_gpu_int, div_gather_fusing) {
     auto& engine = get_test_engine();
 