From 6585c607bfeab661b0a12d7aa5a59530beeff987 Mon Sep 17 00:00:00 2001 From: Alex Reinking Date: Tue, 27 Jan 2026 10:36:39 -0500 Subject: [PATCH 1/2] Convert scalar arguments to CreateMasked{Load,Store} to 1-element vectors. Fixes #8922 --- src/CodeGen_LLVM.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 4a5b45475533..b5c576773591 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -2325,6 +2325,12 @@ void CodeGen_LLVM::codegen_predicated_store(const Store *op) { {VPArg(slice_val, 0), VPArg(vec_ptr, 1, alignment)})) { store = dyn_cast(value); } else { + if (!slice_val->getType()->isVectorTy()) { + slice_val = create_broadcast(slice_val, 1); + } + if (!slice_mask->getType()->isVectorTy()) { + slice_mask = create_broadcast(slice_mask, 1); + } store = builder->CreateMaskedStore(slice_val, vec_ptr, llvm::Align(alignment), slice_mask); } add_tbaa_metadata(store, op->name, slice_index); @@ -2444,6 +2450,9 @@ llvm::Value *CodeGen_LLVM::codegen_vector_load(const Type &type, const std::stri load_inst = dyn_cast(value); } else { if (slice_mask != nullptr) { + if (!slice_mask->getType()->isVectorTy()) { + slice_mask = create_broadcast(slice_mask, 1); + } load_inst = builder->CreateMaskedLoad(slice_type, vec_ptr, llvm::Align(align_bytes), slice_mask); } else { load_inst = builder->CreateAlignedLoad(slice_type, vec_ptr, llvm::Align(align_bytes)); From 65b7db9981dd51ca451504758f6bb86cd8761948 Mon Sep 17 00:00:00 2001 From: Alex Reinking Date: Tue, 27 Jan 2026 10:46:20 -0500 Subject: [PATCH 2/2] Add a test for single-element predicated loads --- test/correctness/CMakeLists.txt | 1 + .../predicated_store_load_single_lane.cpp | 35 +++++++++++++++++++ 2 files changed, 36 insertions(+) create mode 100644 test/correctness/predicated_store_load_single_lane.cpp diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index 4bce8789875e..690081f5ce4b 100644 --- 
a/test/correctness/CMakeLists.txt
+++ b/test/correctness/CMakeLists.txt
@@ -254,6 +254,7 @@ tests(GROUPS correctness
       plain_c_includes.c
       popc_clz_ctz_bounds.cpp
       predicated_store_load.cpp
+      predicated_store_load_single_lane.cpp
       prefetch.cpp
       print.cpp
       print_loop_nest.cpp
diff --git a/test/correctness/predicated_store_load_single_lane.cpp b/test/correctness/predicated_store_load_single_lane.cpp
new file mode 100644
index 000000000000..3e1f3b3b4ca0
--- /dev/null
+++ b/test/correctness/predicated_store_load_single_lane.cpp
@@ -0,0 +1,35 @@
+#include "Halide.h"
+#include <stdio.h>
+
+using namespace Halide;
+
+int main(int argc, char **argv) {
+    // Exercise predicated vector loads and stores that have a single lane.
+    // These need special handling because Halide's IR does not distinguish
+    // scalars from single-element vectors, while LLVM does.
+
+    int w = get_jit_target_from_environment().natural_vector_size<float>();
+
+    Func f1{"f1"}, f2{"f2"};
+    Var x{"x"}, xo{"xo"}, xi{"xi"};
+
+    ImageParam input(Float(32), 1);
+
+    f1(x) = input(x) * 2;
+    f2(x) = select(x < w, 0, f1(x) + f1(x + 1));
+
+    // This schedule creates a situation where f1 is computed with a
+    // vectorized loop that requires predicated loads/stores for the
+    // final single element.
+    f2.split(x, xo, xi, w);
+    f1.compute_at(f2, xo).vectorize(x);  // effective vector width = w + 1
+
+    // Compile to check that codegen succeeds. This would crash before the fix
+    // with "Call parameter type does not match function signature" because
+    // the masked load/store intrinsics received scalar masks instead of
+    // vector masks.
+    f2.compile_jit();
+
+    printf("Success!\n");
+    return 0;
+}