From 5b840b94a5501cb943b163ccd1662e09197f2c32 Mon Sep 17 00:00:00 2001 From: Syrus Akbary Date: Sun, 1 Feb 2026 20:16:28 +0100 Subject: [PATCH 01/18] Implemented Relaxed SIMD in Cranelift --- build.rs | 5 + .../src/translator/code_translator.rs | 141 +++++++++-- tests/compilers/wast.rs | 4 + tests/ignores.txt | 3 + tests/lib/wast/src/wast.rs | 3 + .../relaxed-simd/i16x8_relaxed_q15mulr_s.wast | 28 +++ .../relaxed-simd/i32x4_relaxed_trunc.wast | 124 ++++++++++ .../relaxed-simd/i8x16_relaxed_swizzle.wast | 45 ++++ .../relaxed-simd/relaxed_dot_product.wast | 107 +++++++++ .../relaxed-simd/relaxed_laneselect.wast | 103 ++++++++ .../relaxed-simd/relaxed_madd_nmadd.wast | 224 ++++++++++++++++++ .../relaxed-simd/relaxed_min_max.wast | 184 ++++++++++++++ 12 files changed, 949 insertions(+), 22 deletions(-) create mode 100644 tests/wast/spec/proposals/relaxed-simd/i16x8_relaxed_q15mulr_s.wast create mode 100644 tests/wast/spec/proposals/relaxed-simd/i32x4_relaxed_trunc.wast create mode 100644 tests/wast/spec/proposals/relaxed-simd/i8x16_relaxed_swizzle.wast create mode 100644 tests/wast/spec/proposals/relaxed-simd/relaxed_dot_product.wast create mode 100644 tests/wast/spec/proposals/relaxed-simd/relaxed_laneselect.wast create mode 100644 tests/wast/spec/proposals/relaxed-simd/relaxed_madd_nmadd.wast create mode 100644 tests/wast/spec/proposals/relaxed-simd/relaxed_min_max.wast diff --git a/build.rs b/build.rs index 07de79eaad7..cb0a5c11b96 100644 --- a/build.rs +++ b/build.rs @@ -59,6 +59,11 @@ fn main() -> anyhow::Result<()> { wast_processor, )?; test_directory_module(spectests, "tests/wast/spec/proposals/simd", wast_processor)?; + test_directory_module( + spectests, + "tests/wast/spec/proposals/relaxed-simd", + wast_processor, + )?; test_directory_module( spectests, "tests/wast/spec/proposals/exception-handling", diff --git a/lib/compiler-cranelift/src/translator/code_translator.rs b/lib/compiler-cranelift/src/translator/code_translator.rs index 54a80a608a9..2bab41c7810 
100644 --- a/lib/compiler-cranelift/src/translator/code_translator.rs +++ b/lib/compiler-cranelift/src/translator/code_translator.rs @@ -1740,6 +1740,10 @@ pub fn translate_operator( let (a, b) = pop2_with_bitcast(state, I8X16, builder); state.push1(builder.ins().swizzle(a, b)) } + Operator::I8x16RelaxedSwizzle => { + let (a, b) = pop2_with_bitcast(state, I8X16, builder); + state.push1(builder.ins().swizzle(a, b)) + } Operator::I8x16Add | Operator::I16x8Add | Operator::I32x4Add | Operator::I64x2Add => { let (a, b) = pop2_with_bitcast(state, type_of(op), builder); state.push1(builder.ins().iadd(a, b)) @@ -1852,6 +1856,19 @@ pub fn translate_operator( // operands must match (hence the bitcast). state.push1(builder.ins().bitselect(bitcast_c, bitcast_a, bitcast_b)) } + Operator::I8x16RelaxedLaneselect + | Operator::I16x8RelaxedLaneselect + | Operator::I32x4RelaxedLaneselect + | Operator::I64x2RelaxedLaneselect => { + let (a, b, c) = state.pop3(); + let ty = type_of(op); + let bitcast_a = optionally_bitcast_vector(a, ty, builder); + let bitcast_b = optionally_bitcast_vector(b, ty, builder); + let bitcast_c = optionally_bitcast_vector(c, ty, builder); + // The CLIF operand ordering is slightly different and the types of all three + // operands must match (hence the bitcast). 
+ state.push1(builder.ins().bitselect(bitcast_c, bitcast_a, bitcast_b)) + } Operator::V128AnyTrue => { let a = pop1_with_bitcast(state, type_of(op), builder); let bool_result = builder.ins().vany_true(a); @@ -1935,6 +1952,25 @@ pub fn translate_operator( let (a, b) = pop2_with_bitcast(state, type_of(op), builder); state.push1(builder.ins().fmul(a, b)) } + Operator::F32x4RelaxedMadd | Operator::F64x2RelaxedMadd => { + let ty = type_of(op); + let (a, b, c) = state.pop3(); + let a = optionally_bitcast_vector(a, ty, builder); + let b = optionally_bitcast_vector(b, ty, builder); + let c = optionally_bitcast_vector(c, ty, builder); + let mul = builder.ins().fmul(a, b); + state.push1(builder.ins().fadd(mul, c)) + } + Operator::F32x4RelaxedNmadd | Operator::F64x2RelaxedNmadd => { + let ty = type_of(op); + let (a, b, c) = state.pop3(); + let a = optionally_bitcast_vector(a, ty, builder); + let b = optionally_bitcast_vector(b, ty, builder); + let c = optionally_bitcast_vector(c, ty, builder); + let a = builder.ins().fneg(a); + let mul = builder.ins().fmul(a, b); + state.push1(builder.ins().fadd(mul, c)) + } Operator::F32x4Div | Operator::F64x2Div => { let (a, b) = pop2_with_bitcast(state, type_of(op), builder); state.push1(builder.ins().fdiv(a, b)) @@ -1943,10 +1979,18 @@ pub fn translate_operator( let (a, b) = pop2_with_bitcast(state, type_of(op), builder); state.push1(builder.ins().fmax(a, b)) } + Operator::F32x4RelaxedMax | Operator::F64x2RelaxedMax => { + let (a, b) = pop2_with_bitcast(state, type_of(op), builder); + state.push1(builder.ins().fmax(a, b)) + } Operator::F32x4Min | Operator::F64x2Min => { let (a, b) = pop2_with_bitcast(state, type_of(op), builder); state.push1(builder.ins().fmin(a, b)) } + Operator::F32x4RelaxedMin | Operator::F64x2RelaxedMin => { + let (a, b) = pop2_with_bitcast(state, type_of(op), builder); + state.push1(builder.ins().fmin(a, b)) + } Operator::F32x4PMax | Operator::F64x2PMax => { // Note the careful ordering here with respect to `fcmp` 
and // `bitselect`. This matches the spec definition of: @@ -2014,6 +2058,10 @@ pub fn translate_operator( let a = pop1_with_bitcast(state, F32X4, builder); state.push1(builder.ins().fcvt_to_sint_sat(I32X4, a)) } + Operator::I32x4RelaxedTruncF32x4S => { + let a = pop1_with_bitcast(state, F32X4, builder); + state.push1(builder.ins().fcvt_to_sint_sat(I32X4, a)) + } Operator::I32x4TruncSatF64x2SZero => { let a = pop1_with_bitcast(state, F64X2, builder); let converted_a = builder.ins().fcvt_to_sint_sat(I64X2, a); @@ -2022,10 +2070,22 @@ pub fn translate_operator( state.push1(builder.ins().snarrow(converted_a, zero)); } + Operator::I32x4RelaxedTruncF64x2SZero => { + let a = pop1_with_bitcast(state, F64X2, builder); + let converted_a = builder.ins().fcvt_to_sint_sat(I64X2, a); + let handle = builder.func.dfg.constants.insert(vec![0u8; 16].into()); + let zero = builder.ins().vconst(I64X2, handle); + + state.push1(builder.ins().snarrow(converted_a, zero)); + } Operator::I32x4TruncSatF32x4U => { let a = pop1_with_bitcast(state, F32X4, builder); state.push1(builder.ins().fcvt_to_uint_sat(I32X4, a)) } + Operator::I32x4RelaxedTruncF32x4U => { + let a = pop1_with_bitcast(state, F32X4, builder); + state.push1(builder.ins().fcvt_to_uint_sat(I32X4, a)) + } Operator::I32x4TruncSatF64x2UZero => { let a = pop1_with_bitcast(state, F64X2, builder); let converted_a = builder.ins().fcvt_to_uint_sat(I64X2, a); @@ -2034,6 +2094,14 @@ pub fn translate_operator( state.push1(builder.ins().uunarrow(converted_a, zero)); } + Operator::I32x4RelaxedTruncF64x2UZero => { + let a = pop1_with_bitcast(state, F64X2, builder); + let converted_a = builder.ins().fcvt_to_uint_sat(I64X2, a); + let handle = builder.func.dfg.constants.insert(vec![0u8; 16].into()); + let zero = builder.ins().vconst(I64X2, handle); + + state.push1(builder.ins().uunarrow(converted_a, zero)); + } Operator::I8x16NarrowI16x8S => { let (a, b) = pop2_with_bitcast(state, I16X8, builder); state.push1(builder.ins().snarrow(a, b)) @@ 
-2152,6 +2220,16 @@ pub fn translate_operator( let high = builder.ins().imul(ahigh, bhigh); state.push1(builder.ins().iadd_pairwise(low, high)); } + Operator::I16x8RelaxedDotI8x16I7x16S => { + let (a, b) = pop2_with_bitcast(state, I8X16, builder); + let alow = builder.ins().swiden_low(a); + let blow = builder.ins().swiden_low(b); + let low = builder.ins().imul(alow, blow); + let ahigh = builder.ins().swiden_high(a); + let bhigh = builder.ins().swiden_high(b); + let high = builder.ins().imul(ahigh, bhigh); + state.push1(builder.ins().iadd_pairwise(low, high)); + } Operator::I8x16Popcnt => { let arg = pop1_with_bitcast(state, type_of(op), builder); state.push1(builder.ins().popcnt(arg)); @@ -2160,6 +2238,27 @@ pub fn translate_operator( let (a, b) = pop2_with_bitcast(state, I16X8, builder); state.push1(builder.ins().sqmul_round_sat(a, b)) } + Operator::I16x8RelaxedQ15mulrS => { + let (a, b) = pop2_with_bitcast(state, I16X8, builder); + state.push1(builder.ins().sqmul_round_sat(a, b)) + } + Operator::I32x4RelaxedDotI8x16I7x16AddS => { + let (a, b, c) = state.pop3(); + let a = optionally_bitcast_vector(a, I8X16, builder); + let b = optionally_bitcast_vector(b, I8X16, builder); + let c = optionally_bitcast_vector(c, I32X4, builder); + let alow = builder.ins().swiden_low(a); + let blow = builder.ins().swiden_low(b); + let low = builder.ins().imul(alow, blow); + let ahigh = builder.ins().swiden_high(a); + let bhigh = builder.ins().swiden_high(b); + let high = builder.ins().imul(ahigh, bhigh); + let dot = builder.ins().iadd_pairwise(low, high); + let dotlo = builder.ins().swiden_low(dot); + let dothi = builder.ins().swiden_high(dot); + let dot32 = builder.ins().iadd_pairwise(dotlo, dothi); + state.push1(builder.ins().iadd(dot32, c)); + } Operator::I16x8ExtMulLowI8x16S => { let (a, b) = pop2_with_bitcast(state, I8X16, builder); let a_low = builder.ins().swiden_low(a); @@ -2235,28 +2334,6 @@ pub fn translate_operator( Operator::ReturnCall { .. 
} | Operator::ReturnCallIndirect { .. } => { return Err(wasm_unsupported!("proposed tail-call operator {:?}", op)); } - Operator::I8x16RelaxedSwizzle - | Operator::I32x4RelaxedTruncF32x4S - | Operator::I32x4RelaxedTruncF32x4U - | Operator::I32x4RelaxedTruncF64x2SZero - | Operator::I32x4RelaxedTruncF64x2UZero - | Operator::F32x4RelaxedNmadd - | Operator::F32x4RelaxedMadd - | Operator::I8x16RelaxedLaneselect - | Operator::I16x8RelaxedLaneselect - | Operator::I32x4RelaxedLaneselect - | Operator::I64x2RelaxedLaneselect - | Operator::F32x4RelaxedMin - | Operator::F32x4RelaxedMax - | Operator::F64x2RelaxedMin - | Operator::F64x2RelaxedMax - | Operator::F64x2RelaxedMadd - | Operator::F64x2RelaxedNmadd - | Operator::I16x8RelaxedDotI8x16I7x16S - | Operator::I32x4RelaxedDotI8x16I7x16AddS - | Operator::I16x8RelaxedQ15mulrS => { - return Err(wasm_unsupported!("proposed relaxed-simd operator {:?}", op)); - } Operator::RefEq | Operator::StructNew { .. } | Operator::StructNewDefault { .. } @@ -3113,6 +3190,8 @@ fn type_of(operator: &Operator) -> Type { | Operator::I8x16ExtractLaneS { .. } | Operator::I8x16ExtractLaneU { .. } | Operator::I8x16ReplaceLane { .. } + | Operator::I8x16RelaxedSwizzle + | Operator::I8x16RelaxedLaneselect | Operator::I8x16Eq | Operator::I8x16Ne | Operator::I8x16LtS @@ -3150,6 +3229,7 @@ fn type_of(operator: &Operator) -> Type { | Operator::I16x8ExtractLaneS { .. } | Operator::I16x8ExtractLaneU { .. } | Operator::I16x8ReplaceLane { .. } + | Operator::I16x8RelaxedLaneselect | Operator::I16x8Eq | Operator::I16x8Ne | Operator::I16x8LtS @@ -3178,6 +3258,8 @@ fn type_of(operator: &Operator) -> Type { | Operator::I16x8MaxU | Operator::I16x8AvgrU | Operator::I16x8Mul + | Operator::I16x8RelaxedQ15mulrS + | Operator::I16x8RelaxedDotI8x16I7x16S | Operator::I16x8Bitmask => I16X8, Operator::I32x4Splat @@ -3186,6 +3268,7 @@ fn type_of(operator: &Operator) -> Type { | Operator::V128Store32Lane { .. } | Operator::I32x4ExtractLane { .. } | Operator::I32x4ReplaceLane { .. 
} + | Operator::I32x4RelaxedLaneselect | Operator::I32x4Eq | Operator::I32x4Ne | Operator::I32x4LtS @@ -3212,6 +3295,11 @@ fn type_of(operator: &Operator) -> Type { | Operator::I32x4Bitmask | Operator::I32x4TruncSatF32x4S | Operator::I32x4TruncSatF32x4U + | Operator::I32x4RelaxedTruncF32x4S + | Operator::I32x4RelaxedTruncF32x4U + | Operator::I32x4RelaxedTruncF64x2SZero + | Operator::I32x4RelaxedTruncF64x2UZero + | Operator::I32x4RelaxedDotI8x16I7x16AddS | Operator::V128Load32Zero { .. } => I32X4, Operator::I64x2Splat @@ -3220,6 +3308,7 @@ fn type_of(operator: &Operator) -> Type { | Operator::V128Store64Lane { .. } | Operator::I64x2ExtractLane { .. } | Operator::I64x2ReplaceLane { .. } + | Operator::I64x2RelaxedLaneselect | Operator::I64x2Eq | Operator::I64x2Ne | Operator::I64x2LtS @@ -3258,6 +3347,10 @@ fn type_of(operator: &Operator) -> Type { | Operator::F32x4Max | Operator::F32x4PMin | Operator::F32x4PMax + | Operator::F32x4RelaxedMin + | Operator::F32x4RelaxedMax + | Operator::F32x4RelaxedMadd + | Operator::F32x4RelaxedNmadd | Operator::F32x4ConvertI32x4S | Operator::F32x4ConvertI32x4U | Operator::F32x4Ceil @@ -3285,6 +3378,10 @@ fn type_of(operator: &Operator) -> Type { | Operator::F64x2Max | Operator::F64x2PMin | Operator::F64x2PMax + | Operator::F64x2RelaxedMin + | Operator::F64x2RelaxedMax + | Operator::F64x2RelaxedMadd + | Operator::F64x2RelaxedNmadd | Operator::F64x2Ceil | Operator::F64x2Floor | Operator::F64x2Trunc diff --git a/tests/compilers/wast.rs b/tests/compilers/wast.rs index c637d8a32dc..eee48e7db74 100644 --- a/tests/compilers/wast.rs +++ b/tests/compilers/wast.rs @@ -22,6 +22,7 @@ pub fn run_wast(mut config: crate::Config, wast_path: &str) -> anyhow::Result<() let mut features = Features::default(); let is_bulkmemory = wast_path.contains("bulk-memory"); let is_simd = wast_path.contains("simd"); + let is_relaxed_simd = wast_path.contains("relaxed-simd"); let is_threads = wast_path.contains("threads"); let is_exception_handling = 
wast_path.contains("exception-handling"); if is_bulkmemory { @@ -30,6 +31,9 @@ pub fn run_wast(mut config: crate::Config, wast_path: &str) -> anyhow::Result<() if is_simd { features.simd(true); } + if is_relaxed_simd { + features.relaxed_simd(true); + } if is_threads { features.threads(true); } diff --git a/tests/ignores.txt b/tests/ignores.txt index c671963d48e..3769315d50a 100644 --- a/tests/ignores.txt +++ b/tests/ignores.txt @@ -1,7 +1,9 @@ # Compilers singlepass spec::simd # Singlepass doesn't support yet SIMD (no one asked for this feature) +singlepass spec::relaxed_simd # Singlepass doesn't support relaxed SIMD yet singlepass wasmer::simd_generated_ext_ops singlepass wasmer::simd +llvm spec::relaxed_simd # LLVM compiler doesn't support relaxed SIMD yet singlepass spec::exception_handling # Singlepass doesn't support EH yet (no one asked for this feature) singlepass wasmer::exception_handling windows spec::exception_handling # No EH support on Windows yet @@ -69,6 +71,7 @@ cranelift+riscv64 spec::r#if::cranelift # no SIMD on riscv, Cranelift will not handle them cranelift+riscv64 spec::simd +cranelift+riscv64 spec::relaxed_simd # 6078 cranelift+riscv64 wasmer::simd_generated_ext_ops cranelift+riscv64 wasmer::simd diff --git a/tests/lib/wast/src/wast.rs b/tests/lib/wast/src/wast.rs index 5368e89cf85..c1fff494629 100644 --- a/tests/lib/wast/src/wast.rs +++ b/tests/lib/wast/src/wast.rs @@ -557,6 +557,9 @@ impl Wast { (Value::F32(a), WastRetCore::F32(b)) => f32_matches(*a, b), (Value::F64(a), WastRetCore::F64(b)) => f64_matches(*a, b), (Value::V128(a), WastRetCore::V128(b)) => v128_matches(*a, b), + (actual, WastRetCore::Either(cases)) => cases + .iter() + .any(|case| self.val_matches(actual, case).unwrap_or(false)), ( Value::FuncRef(None), WastRetCore::RefNull(Some(wast::core::HeapType::Abstract { diff --git a/tests/wast/spec/proposals/relaxed-simd/i16x8_relaxed_q15mulr_s.wast b/tests/wast/spec/proposals/relaxed-simd/i16x8_relaxed_q15mulr_s.wast new file mode 
100644 index 00000000000..00f901cbc2a --- /dev/null +++ b/tests/wast/spec/proposals/relaxed-simd/i16x8_relaxed_q15mulr_s.wast @@ -0,0 +1,28 @@ +;; Tests for i16x8.relaxed_q15mulr_s. +;; `either` comes from https://github.com/WebAssembly/threads. + +(module + (func (export "i16x8.relaxed_q15mulr_s") (param v128 v128) (result v128) (i16x8.relaxed_q15mulr_s (local.get 0) (local.get 1))) + + (func (export "i16x8.relaxed_q15mulr_s_cmp") (param v128 v128) (result v128) + (i16x8.eq + (i16x8.relaxed_q15mulr_s (local.get 0) (local.get 1)) + (i16x8.relaxed_q15mulr_s (local.get 0) (local.get 1)))) +) + +;; INT16_MIN = -32768 +(assert_return (invoke "i16x8.relaxed_q15mulr_s" + (v128.const i16x8 -32768 -32767 32767 0 0 0 0 0) + (v128.const i16x8 -32768 -32768 32767 0 0 0 0 0)) + ;; overflows, return either INT16_MIN or INT16_MAX + (either (v128.const i16x8 -32768 32767 32766 0 0 0 0 0) + (v128.const i16x8 32767 32767 32766 0 0 0 0 0))) + +;; Check that multiple calls to the relaxed instruction with same inputs returns same results. + +(assert_return (invoke "i16x8.relaxed_q15mulr_s_cmp" + (v128.const i16x8 -32768 -32767 32767 0 0 0 0 0) + (v128.const i16x8 -32768 -32768 32767 0 0 0 0 0)) + ;; overflows, return either INT16_MIN or INT16_MAX + (v128.const i16x8 -1 -1 -1 -1 -1 -1 -1 -1)) + diff --git a/tests/wast/spec/proposals/relaxed-simd/i32x4_relaxed_trunc.wast b/tests/wast/spec/proposals/relaxed-simd/i32x4_relaxed_trunc.wast new file mode 100644 index 00000000000..cca3ecb958a --- /dev/null +++ b/tests/wast/spec/proposals/relaxed-simd/i32x4_relaxed_trunc.wast @@ -0,0 +1,124 @@ +;; Tests for i32x4.relaxed_trunc_f32x4_s, i32x4.relaxed_trunc_f32x4_u, i32x4.relaxed_trunc_f64x2_s_zero, and i32x4.relaxed_trunc_f64x2_u_zero. +;; `either` comes from https://github.com/WebAssembly/threads. 
+ +(module + (func (export "i32x4.relaxed_trunc_f32x4_s") (param v128) (result v128) (i32x4.relaxed_trunc_f32x4_s (local.get 0))) + (func (export "i32x4.relaxed_trunc_f32x4_u") (param v128) (result v128) (i32x4.relaxed_trunc_f32x4_u (local.get 0))) + (func (export "i32x4.relaxed_trunc_f64x2_s_zero") (param v128) (result v128) (i32x4.relaxed_trunc_f64x2_s_zero (local.get 0))) + (func (export "i32x4.relaxed_trunc_f64x2_u_zero") (param v128) (result v128) (i32x4.relaxed_trunc_f64x2_u_zero (local.get 0))) + + (func (export "i32x4.relaxed_trunc_f32x4_s_cmp") (param v128) (result v128) + (i32x4.eq + (i32x4.relaxed_trunc_f32x4_s (local.get 0)) + (i32x4.relaxed_trunc_f32x4_s (local.get 0)))) + (func (export "i32x4.relaxed_trunc_f32x4_u_cmp") (param v128) (result v128) + (i32x4.eq + (i32x4.relaxed_trunc_f32x4_u (local.get 0)) + (i32x4.relaxed_trunc_f32x4_u (local.get 0)))) + (func (export "i32x4.relaxed_trunc_f64x2_s_zero_cmp") (param v128) (result v128) + (i32x4.eq + (i32x4.relaxed_trunc_f64x2_s_zero (local.get 0)) + (i32x4.relaxed_trunc_f64x2_s_zero (local.get 0)))) + (func (export "i32x4.relaxed_trunc_f64x2_u_zero_cmp") (param v128) (result v128) + (i32x4.eq + (i32x4.relaxed_trunc_f64x2_u_zero (local.get 0)) + (i32x4.relaxed_trunc_f64x2_u_zero (local.get 0)))) +) + +;; Test some edge cases around min/max to ensure that the instruction either +;; saturates correctly or returns INT_MIN. +;; +;; Note, though, that INT_MAX itself is not tested. The value for INT_MAX is +;; 2147483647 but that is not representable in a `f32` since it requires 31 bits +;; when a f32 has only 24 bits available. This means that the closest integers +;; to INT_MAX which can be represented are 2147483520 and 2147483648, meaning +;; that the INT_MAX test case cannot be tested. 
+(assert_return (invoke "i32x4.relaxed_trunc_f32x4_s" + ;; INT32_MIN INT32_MAX + (v128.const f32x4 -2147483648.0 -2147483904.0 2.0 2147483904.0)) + ;; out of range -> saturate or INT32_MIN + (either (v128.const i32x4 -2147483648 -2147483648 2 2147483647) + (v128.const i32x4 -2147483648 -2147483648 2 -2147483648))) + +(assert_return (invoke "i32x4.relaxed_trunc_f32x4_s" + (v128.const f32x4 nan -nan nan:0x444444 -nan:0x444444)) + ;; nans -> 0 or INT32_MIN + (either (v128.const i32x4 0 0 0 0) + (v128.const i32x4 0x80000000 0x80000000 0x80000000 0x80000000))) + +(assert_return (invoke "i32x4.relaxed_trunc_f32x4_u" + ;; UINT32_MIN UINT32_MIN-1 + (v128.const f32x4 0.0 -1.0 4294967040.0 4294967296.0)) + ;; out of range -> saturate or UINT32_MAX + (either (v128.const i32x4 0 0 4294967040 0xffffffff) + (v128.const i32x4 0 0xffffffff 4294967040 0xffffffff))) + +(assert_return (invoke "i32x4.relaxed_trunc_f32x4_u" + (v128.const f32x4 nan -nan nan:0x444444 -nan:0x444444)) + ;; nans -> 0 or UINT32_MAX + (either (v128.const i32x4 0 0 0 0) + (v128.const i32x4 0xffffffff 0xffffffff 0xffffffff 0xffffffff))) + +(assert_return (invoke "i32x4.relaxed_trunc_f64x2_s_zero" + (v128.const f64x2 -2147483904.0 2147483904.0)) + ;; out of range -> saturate or INT32_MIN + (either (v128.const i32x4 -2147483648 2147483647 0 0) + (v128.const i32x4 -2147483648 -2147483648 0 0))) + +(assert_return (invoke "i32x4.relaxed_trunc_f64x2_s_zero" + (v128.const f64x2 nan -nan)) + (either (v128.const i32x4 0 0 0 0) + (v128.const i32x4 0x80000000 0x80000000 0 0))) + +(assert_return (invoke "i32x4.relaxed_trunc_f64x2_u_zero" + (v128.const f64x2 -1.0 4294967296.0)) + ;; out of range -> saturate or UINT32_MAX + (either (v128.const i32x4 0 0xffffffff 0 0) + (v128.const i32x4 0xffffffff 0xffffffff 0 0))) + +(assert_return (invoke "i32x4.relaxed_trunc_f64x2_u_zero" + (v128.const f64x2 nan -nan)) + (either (v128.const i32x4 0 0 0 0) + (v128.const i32x4 0xffffffff 0xffffffff 0 0))) + +;; Check that multiple calls to the relaxed instruction with same inputs returns same results.
+ +(assert_return (invoke "i32x4.relaxed_trunc_f32x4_s_cmp" + ;; INT32_MIN INT32_MAX + (v128.const f32x4 -2147483648.0 -2147483904.0 2147483647.0 2147483904.0)) + ;; out of range -> saturate or INT32_MIN + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "i32x4.relaxed_trunc_f32x4_s_cmp" + (v128.const f32x4 nan -nan nan:0x444444 -nan:0x444444)) + ;; nans -> 0 or INT32_MIN + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "i32x4.relaxed_trunc_f32x4_u_cmp" + ;; UINT32_MIN UINT32_MIN-1 + (v128.const f32x4 0.0 -1.0 4294967040.0 4294967296.0)) + ;; out of range -> saturate or UINT32_MAX + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "i32x4.relaxed_trunc_f32x4_u_cmp" + (v128.const f32x4 nan -nan nan:0x444444 -nan:0x444444)) + ;; nans -> 0 or UINT32_MAX + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "i32x4.relaxed_trunc_f64x2_s_zero_cmp" + (v128.const f64x2 -2147483904.0 2147483904.0)) + ;; out of range -> saturate or INT32_MIN + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "i32x4.relaxed_trunc_f64x2_s_zero_cmp" + (v128.const f64x2 nan -nan)) + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "i32x4.relaxed_trunc_f64x2_u_zero_cmp" + (v128.const f64x2 -1.0 4294967296.0)) + ;; out of range -> saturate or UINT32_MAX + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "i32x4.relaxed_trunc_f64x2_u_zero_cmp" + (v128.const f64x2 nan -nan)) + (v128.const i32x4 -1 -1 -1 -1)) diff --git a/tests/wast/spec/proposals/relaxed-simd/i8x16_relaxed_swizzle.wast b/tests/wast/spec/proposals/relaxed-simd/i8x16_relaxed_swizzle.wast new file mode 100644 index 00000000000..f1bcb455209 --- /dev/null +++ b/tests/wast/spec/proposals/relaxed-simd/i8x16_relaxed_swizzle.wast @@ -0,0 +1,45 @@ +;; Tests for relaxed i8x16 swizzle. +;; `either` comes from https://github.com/WebAssembly/threads.
+ +(module + (func (export "i8x16.relaxed_swizzle") (param v128 v128) (result v128) (i8x16.relaxed_swizzle (local.get 0) (local.get 1))) + + (func (export "i8x16.relaxed_swizzle_cmp") (param v128 v128) (result v128) + (i8x16.eq + (i8x16.relaxed_swizzle (local.get 0) (local.get 1)) + (i8x16.relaxed_swizzle (local.get 0) (local.get 1)))) +) + +(assert_return (invoke "i8x16.relaxed_swizzle" + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)) + (either (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15))) + +;; out of range, returns 0 or modulo 15 if < 128 +(assert_return (invoke "i8x16.relaxed_swizzle" + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)) + (either (v128.const i8x16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15))) + +;; out of range, returns 0 if >= 128 +(assert_return (invoke "i8x16.relaxed_swizzle" + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 128 129 130 131 132 133 134 135 248 249 250 251 252 253 254 255)) + (either (v128.const i8x16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15))) + +;; Check that multiple calls to the relaxed instruction with same inputs returns same results. 
+ +;; out of range, returns 0 or modulo 15 if < 128 +(assert_return (invoke "i8x16.relaxed_swizzle_cmp" + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)) + (v128.const i8x16 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1)) + +;; out of range, returns 0 if >= 128 +(assert_return (invoke "i8x16.relaxed_swizzle_cmp" + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 128 129 130 131 132 133 134 135 248 249 250 251 252 253 254 255)) + (v128.const i8x16 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1)) diff --git a/tests/wast/spec/proposals/relaxed-simd/relaxed_dot_product.wast b/tests/wast/spec/proposals/relaxed-simd/relaxed_dot_product.wast new file mode 100644 index 00000000000..48714b87bd8 --- /dev/null +++ b/tests/wast/spec/proposals/relaxed-simd/relaxed_dot_product.wast @@ -0,0 +1,107 @@ +;; Tests for relaxed dot products. +;; `either` comes from https://github.com/WebAssembly/threads. + +(module + (func (export "i16x8.relaxed_dot_i8x16_i7x16_s") (param v128 v128) (result v128) (i16x8.relaxed_dot_i8x16_i7x16_s (local.get 0) (local.get 1))) + (func (export "i32x4.relaxed_dot_i8x16_i7x16_add_s") (param v128 v128 v128) (result v128) (i32x4.relaxed_dot_i8x16_i7x16_add_s (local.get 0) (local.get 1) (local.get 2))) + + (func (export "i16x8.relaxed_dot_i8x16_i7x16_s_cmp") (param v128 v128) (result v128) + (i16x8.eq + (i16x8.relaxed_dot_i8x16_i7x16_s (local.get 0) (local.get 1)) + (i16x8.relaxed_dot_i8x16_i7x16_s (local.get 0) (local.get 1)))) + (func (export "i32x4.relaxed_dot_i8x16_i7x16_add_s_cmp") (param v128 v128 v128) (result v128) + (i16x8.eq + (i32x4.relaxed_dot_i8x16_i7x16_add_s (local.get 0) (local.get 1) (local.get 2)) + (i32x4.relaxed_dot_i8x16_i7x16_add_s (local.get 0) (local.get 1) (local.get 2)))) +) + +;; Simple values to ensure things are functional. 
+(assert_return (invoke "i16x8.relaxed_dot_i8x16_i7x16_s" + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)) + (v128.const i16x8 1 13 41 85 145 221 313 421)) + +;; Test max and min i8 values; +(assert_return (invoke "i16x8.relaxed_dot_i8x16_i7x16_s" + (v128.const i8x16 -128 -128 127 127 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i8x16 127 127 127 127 0 0 0 0 0 0 0 0 0 0 0 0)) + (v128.const i16x8 -32512 32258 0 0 0 0 0 0)) + +;; signed * unsigned : -128 * 129 * 2 = -33,024 saturated to -32,768 +;; signed * signed : -128 * -127 * 2 = 32,512 +;; unsigned * unsigned : 128 * 129 * 2 = 33,024 +(assert_return (invoke "i16x8.relaxed_dot_i8x16_i7x16_s" + (v128.const i8x16 -128 -128 0 0 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i8x16 -127 -127 0 0 0 0 0 0 0 0 0 0 0 0 0 0)) + (either + (v128.const i16x8 -32768 0 0 0 0 0 0 0) + (v128.const i16x8 32512 0 0 0 0 0 0 0) + (v128.const i16x8 33024 0 0 0 0 0 0 0))) + +;; Simple values to ensure things are functional. 
+(assert_return (invoke "i32x4.relaxed_dot_i8x16_i7x16_add_s" + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i32x4 0 1 2 3)) + ;; intermediate result is [14, 126, 366, 734] + (v128.const i32x4 14 127 368 737)) + +;; Test max and min i8 values; +(assert_return (invoke "i32x4.relaxed_dot_i8x16_i7x16_add_s" + (v128.const i8x16 -128 -128 -128 -128 127 127 127 127 0 0 0 0 0 0 0 0) + (v128.const i8x16 127 127 127 127 127 127 127 127 0 0 0 0 0 0 0 0) + (v128.const i32x4 1 2 3 4)) + ;; intermediate result is [-65024, 64516, 0, 0] + (v128.const i32x4 -65023 64518 3 4)) + +;; signed * unsigned : -128 * 129 * 4 = -66,048 (+ 1) VPDPBUSD AVX2-VNNI or AVX512-VNNI +;; signed * unsigned with intermediate saturation : +;; (-128 * 129) + (-128 * 129) = -33024 saturated to -32768 (PMADDUBSW) +;; -32768 + -32768 = -65536 (+ 1) +;; signed * signed : -128 * -127 * 4 = 65,024 (+ 1) +;; unsigned * unsigned : 128 * 129 * 2 = 66,048 (+ 1) +(assert_return (invoke "i32x4.relaxed_dot_i8x16_i7x16_add_s" + (v128.const i8x16 -128 -128 -128 -128 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i8x16 -127 -127 -127 -127 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i32x4 1 2 3 4)) + (either + (v128.const i32x4 -66047 2 3 4) + (v128.const i32x4 -65535 2 3 4) + (v128.const i32x4 65025 2 3 4) + (v128.const i32x4 66049 2 3 4))) + +;; Check that multiple calls to the relaxed instruction with same inputs returns same results. 
+ +;; Test max and min i8 values; +(assert_return (invoke "i16x8.relaxed_dot_i8x16_i7x16_s_cmp" + (v128.const i8x16 -128 -128 127 127 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i8x16 127 127 127 127 0 0 0 0 0 0 0 0 0 0 0 0)) + (v128.const i16x8 -1 -1 -1 -1 -1 -1 -1 -1)) + +;; Test max and min i8 values; +(assert_return (invoke "i32x4.relaxed_dot_i8x16_i7x16_add_s_cmp" + (v128.const i8x16 -128 -128 -128 -128 127 127 127 127 0 0 0 0 0 0 0 0) + (v128.const i8x16 127 127 127 127 127 127 127 127 0 0 0 0 0 0 0 0) + (v128.const i32x4 1 2 3 4)) + ;; intermediate result is [-65024, 64516, 0, 0] + (v128.const i32x4 -1 -1 -1 -1)) + +;; signed * unsigned : -128 * 129 * 2 = -33,024 saturated to -32,768 +;; signed * signed : -128 * -127 * 2 = 32,512 +;; unsigned * unsigned : 128 * 129 * 2 = 33,024 +(assert_return (invoke "i16x8.relaxed_dot_i8x16_i7x16_s_cmp" + (v128.const i8x16 -128 -128 0 0 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i8x16 -127 -127 0 0 0 0 0 0 0 0 0 0 0 0 0 0)) + (v128.const i16x8 -1 -1 -1 -1 -1 -1 -1 -1)) + +;; signed * unsigned : -128 * 129 * 4 = -66,048 (+ 1) VPDPBUSD AVX2-VNNI or AVX512-VNNI +;; signed * unsigned with intermediate saturation : +;; (-128 * 129) + (-128 * 129) = -33024 saturated to -32768 (PMADDUBSW) +;; -32768 + -32768 = -65536 (+ 1) +;; signed * signed : -128 * -127 * 4 = 65,024 (+ 1) +;; unsigned * unsigned : 128 * 129 * 2 = 66,048 (+ 1) +(assert_return (invoke "i32x4.relaxed_dot_i8x16_i7x16_add_s_cmp" + (v128.const i8x16 -128 -128 -128 -128 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i8x16 -127 -127 -127 -127 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i32x4 1 2 3 4)) + (v128.const i32x4 -1 -1 -1 -1)) diff --git a/tests/wast/spec/proposals/relaxed-simd/relaxed_laneselect.wast b/tests/wast/spec/proposals/relaxed-simd/relaxed_laneselect.wast new file mode 100644 index 00000000000..10913816b0b --- /dev/null +++ b/tests/wast/spec/proposals/relaxed-simd/relaxed_laneselect.wast @@ -0,0 +1,103 @@ +;; Tests for i8x16.relaxed_laneselect, i16x8.relaxed_laneselect, 
i32x4.relaxed_laneselect, and i64x2.relaxed_laneselect. +;; `either` comes from https://github.com/WebAssembly/threads. + +(module + (func (export "i8x16.relaxed_laneselect") (param v128 v128 v128) (result v128) (i8x16.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2))) + (func (export "i16x8.relaxed_laneselect") (param v128 v128 v128) (result v128) (i16x8.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2))) + (func (export "i32x4.relaxed_laneselect") (param v128 v128 v128) (result v128) (i32x4.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2))) + (func (export "i64x2.relaxed_laneselect") (param v128 v128 v128) (result v128) (i64x2.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2))) + + (func (export "i8x16.relaxed_laneselect_cmp") (param v128 v128 v128) (result v128) + (i8x16.eq + (i8x16.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)) + (i8x16.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)))) + (func (export "i16x8.relaxed_laneselect_cmp") (param v128 v128 v128) (result v128) + (i16x8.eq + (i16x8.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)) + (i16x8.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)))) + (func (export "i32x4.relaxed_laneselect_cmp") (param v128 v128 v128) (result v128) + (i32x4.eq + (i32x4.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)) + (i32x4.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)))) + (func (export "i64x2.relaxed_laneselect_cmp") (param v128 v128 v128) (result v128) + (i64x2.eq + (i64x2.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)) + (i64x2.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)))) +) + +(assert_return (invoke "i8x16.relaxed_laneselect" + (v128.const i8x16 0 1 0x12 0x12 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 16 17 0x34 0x34 20 21 22 23 24 25 26 27 28 29 30 31) + (v128.const i8x16 0xff 0 0xf0 0x0f 0 0 0 0 0 0 0 0 0 0 0 0)) + (either (v128.const i8x16 0 17 0x14 
0x32 20 21 22 23 24 25 26 27 28 29 30 31) + (v128.const i8x16 0 17 0x12 0x34 20 21 22 23 24 25 26 27 28 29 30 31))) + +(assert_return (invoke "i16x8.relaxed_laneselect" + (v128.const i16x8 0 1 0x1234 0x1234 4 5 6 7) + (v128.const i16x8 8 9 0x5678 0x5678 12 13 14 15) + (v128.const i16x8 0xffff 0 0xff00 0x00ff 0 0 0 0)) + (either (v128.const i16x8 0 9 0x1278 0x5634 12 13 14 15) + (v128.const i16x8 0 9 0x1234 0x5678 12 13 14 15))) + +;; special case for i16x8 to allow pblendvb +(assert_return (invoke "i16x8.relaxed_laneselect" + (v128.const i16x8 0 1 0x1234 0x1234 4 5 6 7) + (v128.const i16x8 8 9 0x5678 0x5678 12 13 14 15) + (v128.const i16x8 0xffff 0 0xff00 0x0080 0 0 0 0)) ;; 0x0080 is the special case + (either (v128.const i16x8 0 9 0x1278 0x5678 12 13 14 15) ;; bitselect + (v128.const i16x8 0 9 0x1234 0x5678 12 13 14 15) ;; top bit of i16 lane examined + (v128.const i16x8 0 9 0x1278 0x5634 12 13 14 15) ;; top bit of each byte + )) + +(assert_return (invoke "i32x4.relaxed_laneselect" + (v128.const i32x4 0 1 0x12341234 0x12341234) + (v128.const i32x4 4 5 0x56785678 0x56785678) + (v128.const i32x4 0xffffffff 0 0xffff0000 0x0000ffff)) + (either (v128.const i32x4 0 5 0x12345678 0x56781234) + (v128.const i32x4 0 5 0x12341234 0x56785678))) + +(assert_return (invoke "i64x2.relaxed_laneselect" + (v128.const i64x2 0 1) + (v128.const i64x2 2 3) + (v128.const i64x2 0xffffffffffffffff 0)) + (either (v128.const i64x2 0 3) + (v128.const i64x2 0 3))) + +(assert_return (invoke "i64x2.relaxed_laneselect" + (v128.const i64x2 0x1234123412341234 0x1234123412341234) + (v128.const i64x2 0x5678567856785678 0x5678567856785678) + (v128.const i64x2 0xffffffff00000000 0x00000000ffffffff)) + (either (v128.const i64x2 0x1234123456785678 0x5678567812341234) + (v128.const i64x2 0x1234123412341234 0x5678567856785678))) + +;; Check that multiple calls to the relaxed instruction with same inputs returns same results. 
+ +(assert_return (invoke "i8x16.relaxed_laneselect_cmp" + (v128.const i8x16 0 1 0x12 0x12 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 16 17 0x34 0x34 20 21 22 23 24 25 26 27 28 29 30 31) + (v128.const i8x16 0xff 0 0xf0 0x0f 0 0 0 0 0 0 0 0 0 0 0 0)) + (v128.const i8x16 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1)) + +(assert_return (invoke "i16x8.relaxed_laneselect_cmp" + (v128.const i16x8 0 1 0x1234 0x1234 4 5 6 7) + (v128.const i16x8 8 9 0x5678 0x5678 12 13 14 15) + (v128.const i16x8 0xffff 0 0xff00 0x00ff 0 0 0 0)) + (v128.const i16x8 -1 -1 -1 -1 -1 -1 -1 -1)) + +(assert_return (invoke "i32x4.relaxed_laneselect_cmp" + (v128.const i32x4 0 1 0x12341234 0x12341234) + (v128.const i32x4 4 5 0x56785678 0x56785678) + (v128.const i32x4 0xffffffff 0 0xffff0000 0x0000ffff)) + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "i64x2.relaxed_laneselect_cmp" + (v128.const i64x2 0 1) + (v128.const i64x2 2 3) + (v128.const i64x2 0xffffffffffffffff 0)) + (v128.const i64x2 -1 -1)) + +(assert_return (invoke "i64x2.relaxed_laneselect_cmp" + (v128.const i64x2 0x1234123412341234 0x1234123412341234) + (v128.const i64x2 0x5678567856785678 0x5678567856785678) + (v128.const i64x2 0xffffffff00000000 0x00000000ffffffff)) + (v128.const i64x2 -1 -1)) diff --git a/tests/wast/spec/proposals/relaxed-simd/relaxed_madd_nmadd.wast b/tests/wast/spec/proposals/relaxed-simd/relaxed_madd_nmadd.wast new file mode 100644 index 00000000000..187b71d5a3f --- /dev/null +++ b/tests/wast/spec/proposals/relaxed-simd/relaxed_madd_nmadd.wast @@ -0,0 +1,224 @@ +;; Tests for f32x4.relaxed_madd, f32x4.relaxed_nmadd, f64x2.relaxed_madd, and f64x2.relaxed_nmadd. +;; `either` comes from https://github.com/WebAssembly/threads. 
+ +(module + (func (export "f32x4.relaxed_madd") (param v128 v128 v128) (result v128) (f32x4.relaxed_madd (local.get 0) (local.get 1) (local.get 2))) + (func (export "f32x4.relaxed_nmadd") (param v128 v128 v128) (result v128) (f32x4.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2))) + (func (export "f64x2.relaxed_nmadd") (param v128 v128 v128) (result v128) (f64x2.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2))) + (func (export "f64x2.relaxed_madd") (param v128 v128 v128) (result v128) (f64x2.relaxed_madd (local.get 0) (local.get 1) (local.get 2))) + + (func (export "f32x4.relaxed_madd_cmp") (param v128 v128 v128) (result v128) + (f32x4.eq + (f32x4.relaxed_madd (local.get 0) (local.get 1) (local.get 2)) + (f32x4.relaxed_madd (local.get 0) (local.get 1) (local.get 2)))) + (func (export "f32x4.relaxed_nmadd_cmp") (param v128 v128 v128) (result v128) + (f32x4.eq + (f32x4.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2)) + (f32x4.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2)))) + (func (export "f64x2.relaxed_nmadd_cmp") (param v128 v128 v128) (result v128) + (f64x2.eq + (f64x2.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2)) + (f64x2.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2)))) + (func (export "f64x2.relaxed_madd_cmp") (param v128 v128 v128) (result v128) + (f64x2.eq + (f64x2.relaxed_madd (local.get 0) (local.get 1) (local.get 2)) + (f64x2.relaxed_madd (local.get 0) (local.get 1) (local.get 2)))) +) + + +;; FLT_MAX == 0x1.fffffep+127 +;; FLT_MAX * 2 - FLT_MAX == +;; FLT_MAX (if fma) +;; 0 (if no fma) +;; from https://www.vinc17.net/software/fma-tests.c +(assert_return (invoke "f32x4.relaxed_madd" + (v128.const f32x4 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 ) + (v128.const f32x4 2.0 2.0 2.0 2.0) + (v128.const f32x4 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127)) + (either (v128.const f32x4 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127) + (v128.const 
f32x4 inf inf inf inf))) + +;; Special values for float: +;; x = 0x1.000004p+0 (1 + 2^-22) +;; y = 0x1.0002p+0 (1 + 2^-15) +;; z = -(1.0 + 0x0.0002p+0 + 0x0.000004p+0) +;; = -0x1.000204p+0 +;; x.y = 1.0 + 0x0.0002p+0 + 0x0.000004p+0 + 0x1p-37 (round bit) +;; x.y+z = 0 (2 roundings) +;; fma(x, y, z) = (0x1p-37) 2^-37 +;; from https://accurate-algorithms.readthedocs.io/en/latest/ch09appendix.html#test-system-information +(assert_return (invoke "f32x4.relaxed_madd" + (v128.const f32x4 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0) + (v128.const f32x4 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0) + (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0)) + (either (v128.const f32x4 0x1p-37 0x1p-37 0x1p-37 0x1p-37) + (v128.const f32x4 0 0 0 0))) +;; nmadd tests with negated x, same answers are expected. +(assert_return (invoke "f32x4.relaxed_nmadd" + (v128.const f32x4 -0x1.000004p+0 -0x1.000004p+0 -0x1.000004p+0 -0x1.000004p+0) + (v128.const f32x4 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0) + (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0)) + (either (v128.const f32x4 0x1p-37 0x1p-37 0x1p-37 0x1p-37) + (v128.const f32x4 0 0 0 0))) +;; nmadd tests with negated y, same answers are expected. 
+(assert_return (invoke "f32x4.relaxed_nmadd"
+                       (v128.const f32x4 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0)
+                       (v128.const f32x4 -0x1.0002p+0 -0x1.0002p+0 -0x1.0002p+0 -0x1.0002p+0)
+                       (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0))
+               (either (v128.const f32x4 0x1p-37 0x1p-37 0x1p-37 0x1p-37)
+                       (v128.const f32x4 0 0 0 0)))
+
+;; DBL_MAX = 0x1.fffffffffffffp+1023
+;; DBL_MAX * 2 - DBL_MAX ==
+;;   DBL_MAX (if fma)
+;;   0 (if no fma)
+;; from https://www.vinc17.net/software/fma-tests.c
+(assert_return (invoke "f64x2.relaxed_madd"
+                       (v128.const f64x2 0x1.fffffffffffffp+1023 0x1.fffffffffffffp+1023)
+                       (v128.const f64x2 2.0 2.0)
+                       (v128.const f64x2 -0x1.fffffffffffffp+1023 -0x1.fffffffffffffp+1023))
+               (either (v128.const f64x2 0x1.fffffffffffffp+1023 0x1.fffffffffffffp+1023)
+                       (v128.const f64x2 inf inf)))
+
+;; Special values for double:
+;; x = 0x1.00000004p+0 (1 + 2^-30)
+;; y = 0x1.000002p+0 (1 + 2^-23)
+;; z = -(1.0 + 0x0.000002p+0 + 0x0.00000004p+0)
+;;   = -0x1.00000204p+0
+;; x.y = 1.0 + 0x0.000002p+0 + 0x0.00000004p+0 + 0x1p-53 (round bit)
+;; x.y+z = 0 (2 roundings)
+;; fma(x, y, z) = 0x1p-53
+;; from https://accurate-algorithms.readthedocs.io/en/latest/ch09appendix.html#test-system-information
+(assert_return (invoke "f64x2.relaxed_madd"
+                       (v128.const f64x2 0x1.00000004p+0 0x1.00000004p+0)
+                       (v128.const f64x2 0x1.000002p+0 0x1.000002p+0)
+                       (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0))
+               (either (v128.const f64x2 0x1p-53 0x1p-53)
+                       (v128.const f64x2 0 0)))
+;; nmadd tests with negated x, same answers are expected.
+(assert_return (invoke "f64x2.relaxed_nmadd"
+                       (v128.const f64x2 -0x1.00000004p+0 -0x1.00000004p+0)
+                       (v128.const f64x2 0x1.000002p+0 0x1.000002p+0)
+                       (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0))
+               (either (v128.const f64x2 0x1p-53 0x1p-53)
+                       (v128.const f64x2 0 0)))
+;; nmadd tests with negated y, same answers are expected.
+(assert_return (invoke "f64x2.relaxed_nmadd" + (v128.const f64x2 0x1.00000004p+0 0x1.00000004p+0) + (v128.const f64x2 -0x1.000002p+0 -0x1.000002p+0) + (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0)) + (either (v128.const f64x2 0x1p-53 0x1p-53) + (v128.const f64x2 0 0))) + +;; Check that multiple calls to the relaxed instruction with same inputs returns same results. + +;; FLT_MAX == 0x1.fffffep+127 +;; FLT_MAX * 2 - FLT_MAX == +;; FLT_MAX (if fma) +;; 0 (if no fma) +;; from https://www.vinc17.net/software/fma-tests.c +(assert_return (invoke "f32x4.relaxed_madd_cmp" + (v128.const f32x4 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 ) + (v128.const f32x4 2.0 2.0 2.0 2.0) + (v128.const f32x4 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127)) + (v128.const i32x4 -1 -1 -1 -1)) + +;; Special values for float: +;; x = 0x1.000004p+0 (1 + 2^-22) +;; y = 0x1.0002p+0 (1 + 2^-15) +;; z = -(1.0 + 0x0.0002p+0 + 0x0.000004p+0) +;; = -0x1.000204p+0 +;; x.y = 1.0 + 0x0.0002p+0 + 0x0.000004p+0 + 0x1p-37 (round bit) +;; x.y+z = 0 (2 roundings) +;; fma(x, y, z) = (0x1p-37) 2^-37 +;; from https://accurate-algorithms.readthedocs.io/en/latest/ch09appendix.html#test-system-information +(assert_return (invoke "f32x4.relaxed_madd_cmp" + (v128.const f32x4 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0) + (v128.const f32x4 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0) + (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0)) + (v128.const i32x4 -1 -1 -1 -1)) +;; nmadd tests with negated x, same answers are expected. +(assert_return (invoke "f32x4.relaxed_nmadd_cmp" + (v128.const f32x4 -0x1.000004p+0 -0x1.000004p+0 -0x1.000004p+0 -0x1.000004p+0) + (v128.const f32x4 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0) + (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0)) + (v128.const i32x4 -1 -1 -1 -1)) +;; nmadd tests with negated y, same answers are expected. 
+(assert_return (invoke "f32x4.relaxed_nmadd_cmp"
+                       (v128.const f32x4 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0)
+                       (v128.const f32x4 -0x1.0002p+0 -0x1.0002p+0 -0x1.0002p+0 -0x1.0002p+0)
+                       (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0))
+               (v128.const i32x4 -1 -1 -1 -1))
+
+;; DBL_MAX = 0x1.fffffffffffffp+1023
+;; DBL_MAX * 2 - DBL_MAX ==
+;;   DBL_MAX (if fma)
+;;   0 (if no fma)
+;; from https://www.vinc17.net/software/fma-tests.c
+(assert_return (invoke "f64x2.relaxed_madd_cmp"
+                       (v128.const f64x2 0x1.fffffffffffffp+1023 0x1.fffffffffffffp+1023)
+                       (v128.const f64x2 2.0 2.0)
+                       (v128.const f64x2 -0x1.fffffffffffffp+1023 -0x1.fffffffffffffp+1023))
+               (v128.const i64x2 -1 -1))
+
+;; Special values for double:
+;; x = 0x1.00000004p+0 (1 + 2^-30)
+;; y = 0x1.000002p+0 (1 + 2^-23)
+;; z = -(1.0 + 0x0.000002p+0 + 0x0.00000004p+0)
+;;   = -0x1.00000204p+0
+;; x.y = 1.0 + 0x0.000002p+0 + 0x0.00000004p+0 + 0x1p-53 (round bit)
+;; x.y+z = 0 (2 roundings)
+;; fma(x, y, z) = 0x1p-53
+;; from https://accurate-algorithms.readthedocs.io/en/latest/ch09appendix.html#test-system-information
+(assert_return (invoke "f64x2.relaxed_madd_cmp"
+                       (v128.const f64x2 0x1.00000004p+0 0x1.00000004p+0)
+                       (v128.const f64x2 0x1.000002p+0 0x1.000002p+0)
+                       (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0))
+               (v128.const i64x2 -1 -1))
+;; nmadd tests with negated x, same answers are expected.
+(assert_return (invoke "f64x2.relaxed_nmadd_cmp"
+                       (v128.const f64x2 -0x1.00000004p+0 -0x1.00000004p+0)
+                       (v128.const f64x2 0x1.000002p+0 0x1.000002p+0)
+                       (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0))
+               (v128.const i64x2 -1 -1))
+;; nmadd tests with negated y, same answers are expected.
+(assert_return (invoke "f64x2.relaxed_nmadd_cmp" + (v128.const f64x2 0x1.00000004p+0 0x1.00000004p+0) + (v128.const f64x2 -0x1.000002p+0 -0x1.000002p+0) + (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0)) + (v128.const i64x2 -1 -1)) + +;; Test that the non-deterministic choice of fusing and then rounding or +;; rounding multiple times in `relaxed_madd` is consistent throughout a +;; program's execution. +;; +;; This property is impossible to test exhaustively, so this is just a simple +;; smoke test for when the operands to a `relaxed_madd` are known statically +;; versus when they are dynamically supplied. This should, at least, catch +;; illegal constant-folding and -propagation by the compiler that leads to +;; inconsistent rounding behavior at compile time versus at run time. +;; +;; FLT_MAX == 0x1.fffffep+127 +;; FLT_MAX * 2 - FLT_MAX == +;; FLT_MAX (if fma) +;; 0 (if no fma) +;; from https://www.vinc17.net/software/fma-tests.c +(module + (func (export "test-consistent-nondeterminism") (param v128 v128 v128) (result v128) + (f32x4.eq + (f32x4.relaxed_madd (v128.const f32x4 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 ) + (v128.const f32x4 2.0 2.0 2.0 2.0) + (v128.const f32x4 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127)) + (f32x4.relaxed_madd (local.get 0) + (local.get 1) + (local.get 2)) + ) + ) +) +(assert_return (invoke "test-consistent-nondeterminism" + (v128.const f32x4 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 ) + (v128.const f32x4 2.0 2.0 2.0 2.0) + (v128.const f32x4 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127)) + (v128.const i32x4 -1 -1 -1 -1)) diff --git a/tests/wast/spec/proposals/relaxed-simd/relaxed_min_max.wast b/tests/wast/spec/proposals/relaxed-simd/relaxed_min_max.wast new file mode 100644 index 00000000000..ac3ebb07cac --- /dev/null +++ b/tests/wast/spec/proposals/relaxed-simd/relaxed_min_max.wast @@ -0,0 +1,184 @@ +;; Tests for f32x4.min, f32x4.max, 
f64x2.min, and f64x2.max. +;; `either` comes from https://github.com/WebAssembly/threads. + +(module + (func (export "f32x4.relaxed_min") (param v128 v128) (result v128) (f32x4.relaxed_min (local.get 0) (local.get 1))) + (func (export "f32x4.relaxed_max") (param v128 v128) (result v128) (f32x4.relaxed_max (local.get 0) (local.get 1))) + (func (export "f64x2.relaxed_min") (param v128 v128) (result v128) (f64x2.relaxed_min (local.get 0) (local.get 1))) + (func (export "f64x2.relaxed_max") (param v128 v128) (result v128) (f64x2.relaxed_max (local.get 0) (local.get 1))) + + (func (export "f32x4.relaxed_min_cmp") (param v128 v128) (result v128) + (i32x4.eq + (f32x4.relaxed_min (local.get 0) (local.get 1)) + (f32x4.relaxed_min (local.get 0) (local.get 1)))) + (func (export "f32x4.relaxed_max_cmp") (param v128 v128) (result v128) + (i32x4.eq + (f32x4.relaxed_max (local.get 0) (local.get 1)) + (f32x4.relaxed_max (local.get 0) (local.get 1)))) + (func (export "f64x2.relaxed_min_cmp") (param v128 v128) (result v128) + (i64x2.eq + (f64x2.relaxed_min (local.get 0) (local.get 1)) + (f64x2.relaxed_min (local.get 0) (local.get 1)))) + (func (export "f64x2.relaxed_max_cmp") (param v128 v128) (result v128) + (i64x2.eq + (f64x2.relaxed_max (local.get 0) (local.get 1)) + (f64x2.relaxed_max (local.get 0) (local.get 1)))) +) + +(assert_return (invoke "f32x4.relaxed_min" + (v128.const f32x4 -nan nan 0 0) + (v128.const f32x4 0 0 -nan nan)) + (either (v128.const f32x4 nan:canonical nan:canonical nan:canonical nan:canonical) + (v128.const f32x4 nan:canonical nan:canonical 0 0) + (v128.const f32x4 0 0 nan:canonical nan:canonical) + (v128.const f32x4 0 0 0 0))) + +(assert_return (invoke "f32x4.relaxed_min" + (v128.const f32x4 +0.0 -0.0 +0.0 -0.0) + (v128.const f32x4 -0.0 +0.0 +0.0 -0.0)) + (either (v128.const f32x4 -0.0 -0.0 +0.0 -0.0) + (v128.const f32x4 +0.0 -0.0 +0.0 -0.0) + (v128.const f32x4 -0.0 +0.0 +0.0 -0.0) + (v128.const f32x4 -0.0 -0.0 +0.0 -0.0))) + +(assert_return (invoke 
"f32x4.relaxed_max" + (v128.const f32x4 -nan nan 0 0) + (v128.const f32x4 0 0 -nan nan)) + (either (v128.const f32x4 nan:canonical nan:canonical nan:canonical nan:canonical) + (v128.const f32x4 nan:canonical nan:canonical 0 0) + (v128.const f32x4 0 0 nan:canonical nan:canonical) + (v128.const f32x4 0 0 0 0))) + +(assert_return (invoke "f32x4.relaxed_max" + (v128.const f32x4 +0.0 -0.0 +0.0 -0.0) + (v128.const f32x4 -0.0 +0.0 +0.0 -0.0)) + (either (v128.const f32x4 +0.0 +0.0 +0.0 -0.0) + (v128.const f32x4 +0.0 -0.0 +0.0 -0.0) + (v128.const f32x4 -0.0 +0.0 +0.0 -0.0) + (v128.const f32x4 -0.0 -0.0 +0.0 -0.0))) + +(assert_return (invoke "f64x2.relaxed_min" + (v128.const f64x2 -nan nan) + (v128.const f64x2 0 0)) + (either (v128.const f64x2 nan:canonical nan:canonical) + (v128.const f64x2 nan:canonical nan:canonical) + (v128.const f64x2 0 0) + (v128.const f64x2 0 0))) + +(assert_return (invoke "f64x2.relaxed_min" + (v128.const f64x2 0 0) + (v128.const f64x2 -nan nan)) + (either (v128.const f64x2 nan:canonical nan:canonical) + (v128.const f64x2 0 0) + (v128.const f64x2 nan:canonical nan:canonical) + (v128.const f64x2 0 0))) + +(assert_return (invoke "f64x2.relaxed_min" + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 -0.0 +0.0)) + (either (v128.const f64x2 -0.0 -0.0) + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 -0.0 +0.0) + (v128.const f64x2 -0.0 -0.0))) + +(assert_return (invoke "f64x2.relaxed_min" + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0)) + (either (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0))) + +(assert_return (invoke "f64x2.relaxed_max" + (v128.const f64x2 -nan nan) + (v128.const f64x2 0 0)) + (either (v128.const f64x2 nan:canonical nan:canonical) + (v128.const f64x2 nan:canonical nan:canonical) + (v128.const f64x2 0 0) + (v128.const f64x2 0 0))) + +(assert_return (invoke "f64x2.relaxed_max" + (v128.const f64x2 0 0) + (v128.const f64x2 -nan nan)) + (either 
(v128.const f64x2 nan:canonical nan:canonical) + (v128.const f64x2 0 0) + (v128.const f64x2 nan:canonical nan:canonical) + (v128.const f64x2 0 0))) + +(assert_return (invoke "f64x2.relaxed_max" + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 -0.0 +0.0)) + (either (v128.const f64x2 +0.0 +0.0) + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 -0.0 +0.0) + (v128.const f64x2 -0.0 -0.0))) + +(assert_return (invoke "f64x2.relaxed_max" + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0)) + (either (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0))) + +;; Check that multiple calls to the relaxed instruction with same inputs returns same results. + +(assert_return (invoke "f32x4.relaxed_min_cmp" + (v128.const f32x4 -nan nan 0 0) + (v128.const f32x4 0 0 -nan nan)) + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "f32x4.relaxed_min_cmp" + (v128.const f32x4 +0.0 -0.0 +0.0 -0.0) + (v128.const f32x4 -0.0 +0.0 +0.0 -0.0)) + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "f32x4.relaxed_max_cmp" + (v128.const f32x4 -nan nan 0 0) + (v128.const f32x4 0 0 -nan nan)) + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "f32x4.relaxed_max_cmp" + (v128.const f32x4 +0.0 -0.0 +0.0 -0.0) + (v128.const f32x4 -0.0 +0.0 +0.0 -0.0)) + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "f64x2.relaxed_min_cmp" + (v128.const f64x2 -nan nan) + (v128.const f64x2 0 0)) + (v128.const i64x2 -1 -1)) + +(assert_return (invoke "f64x2.relaxed_min_cmp" + (v128.const f64x2 0 0) + (v128.const f64x2 -nan nan)) + (v128.const i64x2 -1 -1)) + +(assert_return (invoke "f64x2.relaxed_min_cmp" + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 -0.0 +0.0)) + (v128.const i64x2 -1 -1)) + +(assert_return (invoke "f64x2.relaxed_min_cmp" + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0)) + (v128.const i64x2 -1 -1)) + +(assert_return (invoke "f64x2.relaxed_max_cmp" + (v128.const f64x2 -nan 
nan) + (v128.const f64x2 0 0)) + (v128.const i64x2 -1 -1)) + +(assert_return (invoke "f64x2.relaxed_max_cmp" + (v128.const f64x2 0 0) + (v128.const f64x2 -nan nan)) + (v128.const i64x2 -1 -1)) + +(assert_return (invoke "f64x2.relaxed_max_cmp" + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 -0.0 +0.0)) + (v128.const i64x2 -1 -1)) + +(assert_return (invoke "f64x2.relaxed_max_cmp" + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0)) + (v128.const i64x2 -1 -1)) From 662231311c98c04cb772114d09fa9e01c3cc0fe0 Mon Sep 17 00:00:00 2001 From: Syrus Akbary Date: Sun, 1 Feb 2026 20:31:07 +0100 Subject: [PATCH 02/18] Added Relaxed SIMD LLVM implementation --- lib/compiler-llvm/src/translator/code.rs | 295 ++++++++++++++++++++++- tests/ignores.txt | 1 - 2 files changed, 285 insertions(+), 11 deletions(-) diff --git a/lib/compiler-llvm/src/translator/code.rs b/lib/compiler-llvm/src/translator/code.rs index ce663fe7898..33acf0f1d96 100644 --- a/lib/compiler-llvm/src/translator/code.rs +++ b/lib/compiler-llvm/src/translator/code.rs @@ -3733,7 +3733,7 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { ); self.state.push1(res); } - Operator::I16x8Q15MulrSatS => { + Operator::I16x8Q15MulrSatS | Operator::I16x8RelaxedQ15mulrS => { let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; let (v1, _) = self.v128_into_i16x8(v1, i1)?; let (v2, _) = self.v128_into_i16x8(v2, i2)?; @@ -4018,6 +4018,174 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { ); self.state.push1(res); } + Operator::I16x8RelaxedDotI8x16I7x16S => { + let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; + let (v1, _) = self.v128_into_i8x16(v1, i1)?; + let (v2, _) = self.v128_into_i8x16(v2, i2)?; + + let left_indices = [ + self.intrinsics.i32_consts[0], + self.intrinsics.i32_consts[2], + self.intrinsics.i32_consts[4], + self.intrinsics.i32_consts[6], + self.intrinsics.i32_consts[8], + self.intrinsics.i32_consts[10], + self.intrinsics.i32_consts[12], + self.intrinsics.i32_consts[14], + ]; + let right_indices = 
[ + self.intrinsics.i32_consts[1], + self.intrinsics.i32_consts[3], + self.intrinsics.i32_consts[5], + self.intrinsics.i32_consts[7], + self.intrinsics.i32_consts[9], + self.intrinsics.i32_consts[11], + self.intrinsics.i32_consts[13], + self.intrinsics.i32_consts[15], + ]; + + let v1_left = err!(self.builder.build_shuffle_vector( + v1, + v1.get_type().get_undef(), + VectorType::const_vector(&left_indices), + "", + )); + let v1_left = + err!(self.builder.build_int_s_extend(v1_left, self.intrinsics.i16x8_ty, "")); + let v1_right = err!(self.builder.build_shuffle_vector( + v1, + v1.get_type().get_undef(), + VectorType::const_vector(&right_indices), + "", + )); + let v1_right = + err!(self.builder.build_int_s_extend(v1_right, self.intrinsics.i16x8_ty, "")); + + let v2_left = err!(self.builder.build_shuffle_vector( + v2, + v2.get_type().get_undef(), + VectorType::const_vector(&left_indices), + "", + )); + let v2_left = + err!(self.builder.build_int_s_extend(v2_left, self.intrinsics.i16x8_ty, "")); + let v2_right = err!(self.builder.build_shuffle_vector( + v2, + v2.get_type().get_undef(), + VectorType::const_vector(&right_indices), + "", + )); + let v2_right = + err!(self.builder.build_int_s_extend(v2_right, self.intrinsics.i16x8_ty, "")); + + let prod_left = err!(self.builder.build_int_mul(v1_left, v2_left, "")); + let prod_right = err!(self.builder.build_int_mul(v1_right, v2_right, "")); + let res = err!(self.builder.build_int_add(prod_left, prod_right, "")); + let res = err!( + self.builder + .build_bit_cast(res, self.intrinsics.i128_ty, "") + ); + self.state.push1(res); + } + Operator::I32x4RelaxedDotI8x16I7x16AddS => { + let ((v1, i1), (v2, i2), (acc, acc_info)) = self.state.pop3_extra()?; + let (v1, _) = self.v128_into_i8x16(v1, i1)?; + let (v2, _) = self.v128_into_i8x16(v2, i2)?; + let (acc, _) = self.v128_into_i32x4(acc, acc_info)?; + + let left_indices = [ + self.intrinsics.i32_consts[0], + self.intrinsics.i32_consts[2], + self.intrinsics.i32_consts[4], + 
self.intrinsics.i32_consts[6], + self.intrinsics.i32_consts[8], + self.intrinsics.i32_consts[10], + self.intrinsics.i32_consts[12], + self.intrinsics.i32_consts[14], + ]; + let right_indices = [ + self.intrinsics.i32_consts[1], + self.intrinsics.i32_consts[3], + self.intrinsics.i32_consts[5], + self.intrinsics.i32_consts[7], + self.intrinsics.i32_consts[9], + self.intrinsics.i32_consts[11], + self.intrinsics.i32_consts[13], + self.intrinsics.i32_consts[15], + ]; + + let v1_left = err!(self.builder.build_shuffle_vector( + v1, + v1.get_type().get_undef(), + VectorType::const_vector(&left_indices), + "", + )); + let v1_left = + err!(self.builder.build_int_s_extend(v1_left, self.intrinsics.i16x8_ty, "")); + let v1_right = err!(self.builder.build_shuffle_vector( + v1, + v1.get_type().get_undef(), + VectorType::const_vector(&right_indices), + "", + )); + let v1_right = + err!(self.builder.build_int_s_extend(v1_right, self.intrinsics.i16x8_ty, "")); + + let v2_left = err!(self.builder.build_shuffle_vector( + v2, + v2.get_type().get_undef(), + VectorType::const_vector(&left_indices), + "", + )); + let v2_left = + err!(self.builder.build_int_s_extend(v2_left, self.intrinsics.i16x8_ty, "")); + let v2_right = err!(self.builder.build_shuffle_vector( + v2, + v2.get_type().get_undef(), + VectorType::const_vector(&right_indices), + "", + )); + let v2_right = + err!(self.builder.build_int_s_extend(v2_right, self.intrinsics.i16x8_ty, "")); + + let prod_left = err!(self.builder.build_int_mul(v1_left, v2_left, "")); + let prod_right = err!(self.builder.build_int_mul(v1_right, v2_right, "")); + let dot16 = err!(self.builder.build_int_add(prod_left, prod_right, "")); + + let pair_left = err!(self.builder.build_shuffle_vector( + dot16, + dot16.get_type().get_undef(), + VectorType::const_vector(&[ + self.intrinsics.i32_consts[0], + self.intrinsics.i32_consts[2], + self.intrinsics.i32_consts[4], + self.intrinsics.i32_consts[6], + ]), + "", + )); + let pair_left = + 
err!(self.builder.build_int_s_extend(pair_left, self.intrinsics.i32x4_ty, "")); + let pair_right = err!(self.builder.build_shuffle_vector( + dot16, + dot16.get_type().get_undef(), + VectorType::const_vector(&[ + self.intrinsics.i32_consts[1], + self.intrinsics.i32_consts[3], + self.intrinsics.i32_consts[5], + self.intrinsics.i32_consts[7], + ]), + "", + )); + let pair_right = + err!(self.builder.build_int_s_extend(pair_right, self.intrinsics.i32x4_ty, "")); + let dot32 = err!(self.builder.build_int_add(pair_left, pair_right, "")); + let res = err!(self.builder.build_int_add(dot32, acc, "")); + let res = err!( + self.builder + .build_bit_cast(res, self.intrinsics.i128_ty, "") + ); + self.state.push1(res); + } Operator::I32DivS | Operator::I64DivS => { let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; let v1 = self.apply_pending_canonicalization(v1, i1)?; @@ -4139,7 +4307,11 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { let res = err!(self.builder.build_and(v1, v2, "")); self.state.push1(res); } - Operator::V128Bitselect => { + Operator::I8x16RelaxedLaneselect + | Operator::I16x8RelaxedLaneselect + | Operator::I32x4RelaxedLaneselect + | Operator::I64x2RelaxedLaneselect + | Operator::V128Bitselect => { let ((v1, i1), (v2, i2), (cond, cond_info)) = self.state.pop3_extra()?; let v1 = self.apply_pending_canonicalization(v1, i1)?; let v2 = self.apply_pending_canonicalization(v2, i2)?; @@ -5301,6 +5473,52 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { ((i1.strip_pending() & i2.strip_pending())? 
| ExtraInfo::pending_f32_nan())?, ); } + Operator::F32x4RelaxedMadd | Operator::F32x4RelaxedNmadd => { + let ((v1, i1), (v2, i2), (v3, i3)) = self.state.pop3_extra()?; + let (v1, i1) = self.v128_into_f32x4(v1, i1)?; + let (v2, i2) = self.v128_into_f32x4(v2, i2)?; + let (v3, i3) = self.v128_into_f32x4(v3, i3)?; + + let v1 = match op { + Operator::F32x4RelaxedNmadd => err!(self.builder.build_float_neg(v1, "")), + _ => v1, + }; + let mul = self + .build_call_with_param_attributes( + self.intrinsics.mul_f32x4, + &[ + v1.into(), + v2.into(), + self.intrinsics.fp_rounding_md, + self.intrinsics.fp_exception_md, + ], + "", + )? + .try_as_basic_value() + .unwrap_basic(); + let mul = mul.into_vector_value(); + let res = self + .build_call_with_param_attributes( + self.intrinsics.add_f32x4, + &[ + mul.into(), + v3.into(), + self.intrinsics.fp_rounding_md, + self.intrinsics.fp_exception_md, + ], + "", + )? + .try_as_basic_value() + .unwrap_basic(); + let res = err!( + self.builder + .build_bit_cast(res, self.intrinsics.i128_ty, "") + ); + let info = (i1.strip_pending() & i2.strip_pending())?; + let info = (info & i3.strip_pending())?; + let info = (info | ExtraInfo::pending_f32_nan())?; + self.state.push1_extra(res, info); + } Operator::F64x2Mul => { let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; let (v1, i1) = self.v128_into_f64x2(v1, i1)?; @@ -5327,6 +5545,52 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { ((i1.strip_pending() & i2.strip_pending())? 
| ExtraInfo::pending_f64_nan())?, ); } + Operator::F64x2RelaxedMadd | Operator::F64x2RelaxedNmadd => { + let ((v1, i1), (v2, i2), (v3, i3)) = self.state.pop3_extra()?; + let (v1, i1) = self.v128_into_f64x2(v1, i1)?; + let (v2, i2) = self.v128_into_f64x2(v2, i2)?; + let (v3, i3) = self.v128_into_f64x2(v3, i3)?; + + let v1 = match op { + Operator::F64x2RelaxedNmadd => err!(self.builder.build_float_neg(v1, "")), + _ => v1, + }; + let mul = self + .build_call_with_param_attributes( + self.intrinsics.mul_f64x2, + &[ + v1.into(), + v2.into(), + self.intrinsics.fp_rounding_md, + self.intrinsics.fp_exception_md, + ], + "", + )? + .try_as_basic_value() + .unwrap_basic(); + let mul = mul.into_vector_value(); + let res = self + .build_call_with_param_attributes( + self.intrinsics.add_f64x2, + &[ + mul.into(), + v3.into(), + self.intrinsics.fp_rounding_md, + self.intrinsics.fp_exception_md, + ], + "", + )? + .try_as_basic_value() + .unwrap_basic(); + let res = err!( + self.builder + .build_bit_cast(res, self.intrinsics.i128_ty, "") + ); + let info = (i1.strip_pending() & i2.strip_pending())?; + let info = (info & i3.strip_pending())?; + let info = (info | ExtraInfo::pending_f64_nan())?; + self.state.push1_extra(res, info); + } Operator::F32Div => { let (v1, v2) = self.state.pop2()?; let (v1, v2) = (v1.into_float_value(), v2.into_float_value()); @@ -5505,7 +5769,7 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { self.state.push1_extra(res, ExtraInfo::pending_f64_nan()); } - Operator::F32x4Min => { + Operator::F32x4Min | Operator::F32x4RelaxedMin => { let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; let (v1, i1) = self.v128_into_f32x4(v1, i1)?; let (v2, i2) = self.v128_into_f32x4(v2, i2)?; @@ -5546,7 +5810,7 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { ); self.state.push1(res); } - Operator::F64x2Min => { + Operator::F64x2Min | Operator::F64x2RelaxedMin => { let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; let (v1, i1) = self.v128_into_f64x2(v1, i1)?; let (v2, 
i2) = self.v128_into_f64x2(v2, i2)?; @@ -5633,7 +5897,7 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { self.state.push1_extra(res, ExtraInfo::pending_f64_nan()); } - Operator::F32x4Max => { + Operator::F32x4Max | Operator::F32x4RelaxedMax => { let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; let (v1, i1) = self.v128_into_f32x4(v1, i1)?; let (v2, i2) = self.v128_into_f32x4(v2, i2)?; @@ -5675,7 +5939,7 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { ); self.state.push1(res); } - Operator::F64x2Max => { + Operator::F64x2Max | Operator::F64x2RelaxedMax => { let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; let (v1, i1) = self.v128_into_f64x2(v1, i1)?; let (v2, i2) = self.v128_into_f64x2(v2, i2)?; @@ -7768,7 +8032,7 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { ); self.state.push1(res); } - Operator::I32x4TruncSatF32x4S => { + Operator::I32x4TruncSatF32x4S | Operator::I32x4RelaxedTruncF32x4S => { let (v, i) = self.state.pop1_extra()?; let v = self.apply_pending_canonicalization(v, i)?; let v = v.into_int_value(); @@ -7783,7 +8047,7 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { )?; self.state.push1(res); } - Operator::I32x4TruncSatF32x4U => { + Operator::I32x4TruncSatF32x4U | Operator::I32x4RelaxedTruncF32x4U => { let (v, i) = self.state.pop1_extra()?; let v = self.apply_pending_canonicalization(v, i)?; let v = v.into_int_value(); @@ -7798,7 +8062,10 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { )?; self.state.push1(res); } - Operator::I32x4TruncSatF64x2SZero | Operator::I32x4TruncSatF64x2UZero => { + Operator::I32x4TruncSatF64x2SZero + | Operator::I32x4TruncSatF64x2UZero + | Operator::I32x4RelaxedTruncF64x2SZero + | Operator::I32x4RelaxedTruncF64x2UZero => { let ((min, max), (cmp_min, cmp_max)) = match op { Operator::I32x4TruncSatF64x2SZero => ( (i32::MIN as u64, i32::MAX as u64), @@ -7808,6 +8075,14 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { (u32::MIN as u64, u32::MAX as u64), (LEF64_GEQ_U32_MIN, GEF64_LEQ_U32_MAX), ), + 
Operator::I32x4RelaxedTruncF64x2SZero => ( + (i32::MIN as u64, i32::MAX as u64), + (LEF64_GEQ_I32_MIN, GEF64_LEQ_I32_MAX), + ), + Operator::I32x4RelaxedTruncF64x2UZero => ( + (u32::MIN as u64, u32::MAX as u64), + (LEF64_GEQ_U32_MIN, GEF64_LEQ_U32_MAX), + ), _ => unreachable!("Unhandled internal variant"), }; let (v, i) = self.state.pop1_extra()?; @@ -9549,7 +9824,7 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { }; self.state.push1_extra(res, info); } - Operator::I8x16Swizzle => { + Operator::I8x16Swizzle | Operator::I8x16RelaxedSwizzle => { let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; let v1 = self.apply_pending_canonicalization(v1, i1)?; let v1 = err!( diff --git a/tests/ignores.txt b/tests/ignores.txt index 3769315d50a..bfd9f05411a 100644 --- a/tests/ignores.txt +++ b/tests/ignores.txt @@ -3,7 +3,6 @@ singlepass spec::simd # Singlepass doesn't support yet SIMD (no one asked for th singlepass spec::relaxed_simd # Singlepass doesn't support relaxed SIMD yet singlepass wasmer::simd_generated_ext_ops singlepass wasmer::simd -llvm spec::relaxed_simd # LLVM compiler doesn't support relaxed SIMD yet singlepass spec::exception_handling # Singlepass doesn't support EH yet (no one asked for this feature) singlepass wasmer::exception_handling windows spec::exception_handling # No EH support on Windows yet From dfa9fce7de816eb6178fadad2cd4c7bf340861a7 Mon Sep 17 00:00:00 2001 From: Syrus Akbary Date: Sun, 1 Feb 2026 20:49:29 +0100 Subject: [PATCH 03/18] Improved linting --- lib/compiler-llvm/src/translator/code.rs | 70 +++++++++++++++++------- 1 file changed, 50 insertions(+), 20 deletions(-) diff --git a/lib/compiler-llvm/src/translator/code.rs b/lib/compiler-llvm/src/translator/code.rs index 33acf0f1d96..6f42980425f 100644 --- a/lib/compiler-llvm/src/translator/code.rs +++ b/lib/compiler-llvm/src/translator/code.rs @@ -4050,16 +4050,22 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { VectorType::const_vector(&left_indices), "", )); - let v1_left = - 
err!(self.builder.build_int_s_extend(v1_left, self.intrinsics.i16x8_ty, "")); + let v1_left = err!(self.builder.build_int_s_extend( + v1_left, + self.intrinsics.i16x8_ty, + "" + )); let v1_right = err!(self.builder.build_shuffle_vector( v1, v1.get_type().get_undef(), VectorType::const_vector(&right_indices), "", )); - let v1_right = - err!(self.builder.build_int_s_extend(v1_right, self.intrinsics.i16x8_ty, "")); + let v1_right = err!(self.builder.build_int_s_extend( + v1_right, + self.intrinsics.i16x8_ty, + "" + )); let v2_left = err!(self.builder.build_shuffle_vector( v2, @@ -4067,16 +4073,22 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { VectorType::const_vector(&left_indices), "", )); - let v2_left = - err!(self.builder.build_int_s_extend(v2_left, self.intrinsics.i16x8_ty, "")); + let v2_left = err!(self.builder.build_int_s_extend( + v2_left, + self.intrinsics.i16x8_ty, + "" + )); let v2_right = err!(self.builder.build_shuffle_vector( v2, v2.get_type().get_undef(), VectorType::const_vector(&right_indices), "", )); - let v2_right = - err!(self.builder.build_int_s_extend(v2_right, self.intrinsics.i16x8_ty, "")); + let v2_right = err!(self.builder.build_int_s_extend( + v2_right, + self.intrinsics.i16x8_ty, + "" + )); let prod_left = err!(self.builder.build_int_mul(v1_left, v2_left, "")); let prod_right = err!(self.builder.build_int_mul(v1_right, v2_right, "")); @@ -4120,16 +4132,22 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { VectorType::const_vector(&left_indices), "", )); - let v1_left = - err!(self.builder.build_int_s_extend(v1_left, self.intrinsics.i16x8_ty, "")); + let v1_left = err!(self.builder.build_int_s_extend( + v1_left, + self.intrinsics.i16x8_ty, + "" + )); let v1_right = err!(self.builder.build_shuffle_vector( v1, v1.get_type().get_undef(), VectorType::const_vector(&right_indices), "", )); - let v1_right = - err!(self.builder.build_int_s_extend(v1_right, self.intrinsics.i16x8_ty, "")); + let v1_right = err!(self.builder.build_int_s_extend( 
+ v1_right, + self.intrinsics.i16x8_ty, + "" + )); let v2_left = err!(self.builder.build_shuffle_vector( v2, @@ -4137,16 +4155,22 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { VectorType::const_vector(&left_indices), "", )); - let v2_left = - err!(self.builder.build_int_s_extend(v2_left, self.intrinsics.i16x8_ty, "")); + let v2_left = err!(self.builder.build_int_s_extend( + v2_left, + self.intrinsics.i16x8_ty, + "" + )); let v2_right = err!(self.builder.build_shuffle_vector( v2, v2.get_type().get_undef(), VectorType::const_vector(&right_indices), "", )); - let v2_right = - err!(self.builder.build_int_s_extend(v2_right, self.intrinsics.i16x8_ty, "")); + let v2_right = err!(self.builder.build_int_s_extend( + v2_right, + self.intrinsics.i16x8_ty, + "" + )); let prod_left = err!(self.builder.build_int_mul(v1_left, v2_left, "")); let prod_right = err!(self.builder.build_int_mul(v1_right, v2_right, "")); @@ -4163,8 +4187,11 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { ]), "", )); - let pair_left = - err!(self.builder.build_int_s_extend(pair_left, self.intrinsics.i32x4_ty, "")); + let pair_left = err!(self.builder.build_int_s_extend( + pair_left, + self.intrinsics.i32x4_ty, + "" + )); let pair_right = err!(self.builder.build_shuffle_vector( dot16, dot16.get_type().get_undef(), @@ -4176,8 +4203,11 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { ]), "", )); - let pair_right = - err!(self.builder.build_int_s_extend(pair_right, self.intrinsics.i32x4_ty, "")); + let pair_right = err!(self.builder.build_int_s_extend( + pair_right, + self.intrinsics.i32x4_ty, + "" + )); let dot32 = err!(self.builder.build_int_add(pair_left, pair_right, "")); let res = err!(self.builder.build_int_add(dot32, acc, "")); let res = err!( From 685da768954e37ad7c615410aa97bb8d592417a6 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Sun, 15 Feb 2026 20:12:22 +0100 Subject: [PATCH 04/18] proper detection of relaxed SIMD feature for WASM --- lib/compiler-cranelift/src/config.rs | 1 + 
lib/compiler-llvm/src/config.rs | 1 + lib/package/src/utils.rs | 3 +++ lib/types/src/features.rs | 5 ++++- 4 files changed, 9 insertions(+), 1 deletion(-) diff --git a/lib/compiler-cranelift/src/config.rs b/lib/compiler-cranelift/src/config.rs index 787ba9d7b41..39ac34f358a 100644 --- a/lib/compiler-cranelift/src/config.rs +++ b/lib/compiler-cranelift/src/config.rs @@ -305,6 +305,7 @@ impl CompilerConfig for Cranelift { if target.triple().operating_system == OperatingSystem::Linux { feats.exceptions(true); } + feats.relaxed_simd(true); feats } } diff --git a/lib/compiler-llvm/src/config.rs b/lib/compiler-llvm/src/config.rs index 90534135922..902bda116c2 100644 --- a/lib/compiler-llvm/src/config.rs +++ b/lib/compiler-llvm/src/config.rs @@ -384,6 +384,7 @@ impl CompilerConfig for LLVM { fn supported_features_for_target(&self, _target: &Target) -> wasmer_types::Features { let mut feats = Features::default(); feats.exceptions(true); + feats.relaxed_simd(true); feats } } diff --git a/lib/package/src/utils.rs b/lib/package/src/utils.rs index 073e5686e6d..83bbf9c3ce5 100644 --- a/lib/package/src/utils.rs +++ b/lib/package/src/utils.rs @@ -197,6 +197,9 @@ pub fn wasm_annotations_to_features(feature_strings: &[String]) -> Features { "memory64" => { features.memory64(true); } + "relaxed-simd" => { + features.relaxed_simd(true); + } // Ignore unrecognized features _ => {} } diff --git a/lib/types/src/features.rs b/lib/types/src/features.rs index 8cfdb812381..d8fc16b8e37 100644 --- a/lib/types/src/features.rs +++ b/lib/types/src/features.rs @@ -371,6 +371,7 @@ impl Features { wasm_features.set(WasmFeatures::TAIL_CALL, true); wasm_features.set(WasmFeatures::MULTI_MEMORY, true); wasm_features.set(WasmFeatures::MEMORY64, true); + wasm_features.set(WasmFeatures::RELAXED_SIMD, false); let mut validator = Validator::new_with_features(wasm_features); match validator.validate_all(wasm_bytes) { @@ -390,7 +391,9 @@ impl Features { features.reference_types(true); } - if 
err_msg.contains("simd") { + if err_msg.contains("relaxed simd") { + features.relaxed_simd(true); + } else if err_msg.contains("simd") { features.simd(true); } From 31b0792ab967814e6fb5fa90c4128b29375eb64e Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Sun, 15 Feb 2026 20:21:02 +0100 Subject: [PATCH 05/18] x86_64: fast I8x16RelaxedSwizzle implementation --- Cargo.lock | 1 + lib/compiler-llvm/Cargo.toml | 1 + lib/compiler-llvm/src/compiler.rs | 2 ++ lib/compiler-llvm/src/translator/code.rs | 29 ++++++++++++++++++- .../src/translator/intrinsics.rs | 16 ++++++++++ 5 files changed, 48 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 14c20b4d8e4..c472db8818b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7110,6 +7110,7 @@ dependencies = [ "byteorder", "cc", "crossbeam-channel", + "enumset", "inkwell", "itertools 0.14.0", "libc", diff --git a/lib/compiler-llvm/Cargo.toml b/lib/compiler-llvm/Cargo.toml index 57c072139e6..6c5ed4bd995 100644 --- a/lib/compiler-llvm/Cargo.toml +++ b/lib/compiler-llvm/Cargo.toml @@ -29,6 +29,7 @@ itertools.workspace = true rayon.workspace = true phf = { workspace = true, features = ["macros"] } tracing = { workspace = true, features = ["log"] } +enumset.workspace = true inkwell = { workspace = true, features = [ "llvm21-1-prefer-static", "target-x86", diff --git a/lib/compiler-llvm/src/compiler.rs b/lib/compiler-llvm/src/compiler.rs index 2c0df6624d7..05d3f1d8fd0 100644 --- a/lib/compiler-llvm/src/compiler.rs +++ b/lib/compiler-llvm/src/compiler.rs @@ -200,6 +200,7 @@ impl LLVMCompiler { Some(target_machine_no_opt), binary_format, pointer_width, + *target.cpu_features(), ) .unwrap() }, @@ -449,6 +450,7 @@ impl Compiler for LLVMCompiler { Some(target_machine_no_opt), binary_format, pointer_width, + *target.cpu_features(), ) .unwrap() }, diff --git a/lib/compiler-llvm/src/translator/code.rs b/lib/compiler-llvm/src/translator/code.rs index 0cbc1cf3199..d227385b003 100644 --- a/lib/compiler-llvm/src/translator/code.rs +++ 
b/lib/compiler-llvm/src/translator/code.rs @@ -8,6 +8,7 @@ use super::{ state::{ControlFrame, ExtraInfo, IfElseState, State, TagCatchInfo}, }; use crate::compiler::ModuleBasedSymbolRegistry; +use enumset::EnumSet; use inkwell::{ AddressSpace, AtomicOrdering, AtomicRMWBinOp, DLLStorageClass, FloatPredicate, IntPredicate, attributes::{Attribute, AttributeLoc}, @@ -51,7 +52,7 @@ use wasmer_compiler::{ }; use wasmer_types::{ CompileError, FunctionIndex, FunctionType, GlobalIndex, LocalFunctionIndex, MemoryIndex, - ModuleInfo, SignatureIndex, TableIndex, Type, + ModuleInfo, SignatureIndex, TableIndex, Type, target::CpuFeature, }; use wasmer_types::{TagIndex, entity::PrimaryMap}; use wasmer_vm::{MemoryStyle, TableStyle, VMOffsets}; @@ -76,6 +77,7 @@ pub struct FuncTranslator { binary_fmt: BinaryFormat, func_section: String, pointer_width: u8, + cpu_features: EnumSet, } impl wasmer_compiler::FuncTranslator for FuncTranslator {} @@ -87,6 +89,7 @@ impl FuncTranslator { target_machine_no_opt: Option, binary_fmt: BinaryFormat, pointer_width: u8, + cpu_features: EnumSet, ) -> Result { let abi = get_abi(&target_machine); Ok(Self { @@ -106,6 +109,7 @@ impl FuncTranslator { }, binary_fmt, pointer_width, + cpu_features, }) } @@ -339,6 +343,7 @@ impl FuncTranslator { target_triple: self.target_triple.clone(), tags_cache: HashMap::new(), binary_fmt: self.binary_fmt, + cpu_features: self.cpu_features, }; fcg.ctx.add_func( @@ -1931,6 +1936,7 @@ pub struct LLVMFunctionCodeGenerator<'ctx, 'a> { target_triple: Triple, tags_cache: HashMap>, binary_fmt: target_lexicon::BinaryFormat, + cpu_features: EnumSet, } impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { @@ -9854,6 +9860,27 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { }; self.state.push1_extra(res, info); } + Operator::I8x16RelaxedSwizzle if self.cpu_features.contains(CpuFeature::SSSE3) => { + let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; + let v1 = self.apply_pending_canonicalization(v1, i1)?; + let v2 = 
self.apply_pending_canonicalization(v2, i2)?; + + let (v1, _) = self.v128_into_i8x16(v1, i1)?; + let (v2, _) = self.v128_into_i8x16(v2, i2)?; + let res = self + .build_call_with_param_attributes( + self.intrinsics.x86_64.pshufb128, + &[v1.into(), v2.into()], + "", + )? + .try_as_basic_value() + .unwrap_basic(); + let res = err!( + self.builder + .build_bit_cast(res, self.intrinsics.i128_ty, "") + ); + self.state.push1(res); + } Operator::I8x16Swizzle | Operator::I8x16RelaxedSwizzle => { let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; let v1 = self.apply_pending_canonicalization(v1, i1)?; diff --git a/lib/compiler-llvm/src/translator/intrinsics.rs b/lib/compiler-llvm/src/translator/intrinsics.rs index 2a0153cb3cf..0ab626bb04a 100644 --- a/lib/compiler-llvm/src/translator/intrinsics.rs +++ b/lib/compiler-llvm/src/translator/intrinsics.rs @@ -56,6 +56,12 @@ pub fn type_to_llvm<'ctx>( } } +/// Struct containing x86_64 SIMD LLVM intrinsics. +#[allow(dead_code)] +pub struct X86_64Intrinsics<'ctx> { + pub pshufb128: FunctionValue<'ctx>, +} + /// Struct containing LLVM and VM intrinsics. #[allow(dead_code)] pub struct Intrinsics<'ctx> { @@ -195,6 +201,8 @@ pub struct Intrinsics<'ctx> { pub ptr_ty: PointerType<'ctx>, + pub x86_64: X86_64Intrinsics<'ctx>, + pub anyfunc_ty: StructType<'ctx>, pub i1_zero: IntValue<'ctx>, @@ -1255,6 +1263,14 @@ impl<'ctx> Intrinsics<'ctx> { // LLVM > 15 has a single type for pointers. 
ptr_ty, + + x86_64: X86_64Intrinsics { + pshufb128: add_function_with_attrs( + "llvm.x86.ssse3.pshuf.b.128", + ret_i8x16_take_i8x16_i8x16, + None, + ), + }, }; let noreturn = From 59249890d99f733161d6d31983724e7aa307f717 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Sun, 15 Feb 2026 20:43:17 +0100 Subject: [PATCH 06/18] x86_64: fast implementation of I32x4RelaxedTruncF32x4S --- lib/compiler-llvm/src/translator/code.rs | 19 +++++++++++++++++++ .../src/translator/intrinsics.rs | 7 +++++++ 2 files changed, 26 insertions(+) diff --git a/lib/compiler-llvm/src/translator/code.rs b/lib/compiler-llvm/src/translator/code.rs index d227385b003..c4f3d950cdf 100644 --- a/lib/compiler-llvm/src/translator/code.rs +++ b/lib/compiler-llvm/src/translator/code.rs @@ -8068,6 +8068,25 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { ); self.state.push1(res); } + Operator::I32x4RelaxedTruncF32x4S + if self.cpu_features.contains(CpuFeature::SSE2) => + { + let (v, i) = self.state.pop1_extra()?; + let (v, _) = self.v128_into_f32x4(v, i)?; + let res = self + .build_call_with_param_attributes( + self.intrinsics.x86_64.cvtps2dq, + &[v.into()], + "", + )? + .try_as_basic_value() + .unwrap_basic(); + let res = err!( + self.builder + .build_bit_cast(res, self.intrinsics.i128_ty, "") + ); + self.state.push1(res); + } Operator::I32x4TruncSatF32x4S | Operator::I32x4RelaxedTruncF32x4S => { let (v, i) = self.state.pop1_extra()?; let v = self.apply_pending_canonicalization(v, i)?; diff --git a/lib/compiler-llvm/src/translator/intrinsics.rs b/lib/compiler-llvm/src/translator/intrinsics.rs index 0ab626bb04a..875964f311d 100644 --- a/lib/compiler-llvm/src/translator/intrinsics.rs +++ b/lib/compiler-llvm/src/translator/intrinsics.rs @@ -60,6 +60,7 @@ pub fn type_to_llvm<'ctx>( #[allow(dead_code)] pub struct X86_64Intrinsics<'ctx> { pub pshufb128: FunctionValue<'ctx>, + pub cvtps2dq: FunctionValue<'ctx>, } /// Struct containing LLVM and VM intrinsics. 
@@ -490,6 +491,7 @@ impl<'ctx> Intrinsics<'ctx> { ], false, ); + let ret_i32x4_take_f32x4 = i32x4_ty.fn_type(&[f32x4_ty_basic_md], false); let add_function_with_attrs = |name: &str, ty: FunctionType<'ctx>, linkage: Option| -> FunctionValue<'ctx> { @@ -1270,6 +1272,11 @@ impl<'ctx> Intrinsics<'ctx> { ret_i8x16_take_i8x16_i8x16, None, ), + cvtps2dq: add_function_with_attrs( + "llvm.x86.sse2.cvtps2dq", + ret_i32x4_take_f32x4, + None, + ), }, }; From f8a74b9cf1a0870e4e7de11e44794d4ef05b5ed3 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Sun, 15 Feb 2026 20:48:20 +0100 Subject: [PATCH 07/18] x86_64: fast implementation of I32x4RelaxedTruncF32x4U --- lib/compiler-llvm/src/translator/code.rs | 24 +++++++++++++++++++ .../src/translator/intrinsics.rs | 11 +++++++++ 2 files changed, 35 insertions(+) diff --git a/lib/compiler-llvm/src/translator/code.rs b/lib/compiler-llvm/src/translator/code.rs index c4f3d950cdf..a12937a9ac2 100644 --- a/lib/compiler-llvm/src/translator/code.rs +++ b/lib/compiler-llvm/src/translator/code.rs @@ -8102,6 +8102,30 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { )?; self.state.push1(res); } + Operator::I32x4RelaxedTruncF32x4U + if self.cpu_features.contains(CpuFeature::AVX512F) + && self.cpu_features.contains(CpuFeature::AVX512VL) => + { + let (v, i) = self.state.pop1_extra()?; + let (v, _) = self.v128_into_f32x4(v, i)?; + let res = self + .build_call_with_param_attributes( + self.intrinsics.x86_64.cvtps2udq128, + &[ + v.into(), + self.intrinsics.i32x4_ty.const_zero().into(), + self.intrinsics.i8_ty.const_int(0xff, false).into(), + ], + "", + )? 
+ .try_as_basic_value() + .unwrap_basic(); + let res = err!( + self.builder + .build_bit_cast(res, self.intrinsics.i128_ty, "") + ); + self.state.push1(res); + } Operator::I32x4TruncSatF32x4U | Operator::I32x4RelaxedTruncF32x4U => { let (v, i) = self.state.pop1_extra()?; let v = self.apply_pending_canonicalization(v, i)?; diff --git a/lib/compiler-llvm/src/translator/intrinsics.rs b/lib/compiler-llvm/src/translator/intrinsics.rs index 875964f311d..9c4f2f2ab9e 100644 --- a/lib/compiler-llvm/src/translator/intrinsics.rs +++ b/lib/compiler-llvm/src/translator/intrinsics.rs @@ -61,6 +61,7 @@ pub fn type_to_llvm<'ctx>( pub struct X86_64Intrinsics<'ctx> { pub pshufb128: FunctionValue<'ctx>, pub cvtps2dq: FunctionValue<'ctx>, + pub cvtps2udq128: FunctionValue<'ctx>, } /// Struct containing LLVM and VM intrinsics. @@ -370,6 +371,7 @@ impl<'ctx> Intrinsics<'ctx> { let f64_ty_basic_md: BasicMetadataTypeEnum = f64_ty.into(); let i8x16_ty_basic_md: BasicMetadataTypeEnum = i8x16_ty.into(); let i16x8_ty_basic_md: BasicMetadataTypeEnum = i16x8_ty.into(); + let i32x4_ty_basic_md: BasicMetadataTypeEnum = i32x4_ty.into(); let f32x4_ty_basic_md: BasicMetadataTypeEnum = f32x4_ty.into(); let f64x2_ty_basic_md: BasicMetadataTypeEnum = f64x2_ty.into(); let md_ty_basic_md: BasicMetadataTypeEnum = md_ty.into(); @@ -492,6 +494,10 @@ impl<'ctx> Intrinsics<'ctx> { false, ); let ret_i32x4_take_f32x4 = i32x4_ty.fn_type(&[f32x4_ty_basic_md], false); + let ret_i32x4_take_f32x4_i32x4_i8 = i32x4_ty.fn_type( + &[f32x4_ty_basic_md, i32x4_ty_basic_md, i8_ty.into()], + false, + ); let add_function_with_attrs = |name: &str, ty: FunctionType<'ctx>, linkage: Option| -> FunctionValue<'ctx> { @@ -1277,6 +1283,11 @@ impl<'ctx> Intrinsics<'ctx> { ret_i32x4_take_f32x4, None, ), + cvtps2udq128: add_function_with_attrs( + "llvm.x86.avx512.mask.cvtps2udq.128", + ret_i32x4_take_f32x4_i32x4_i8, + None, + ), }, }; From 95fec257748899976c93387bb7a7509eab9ee182 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Sun, 15 
Feb 2026 23:27:01 +0100 Subject: [PATCH 08/18] x86_64: fast implementation of 2 more trunc instructions --- lib/compiler-llvm/src/translator/code.rs | 47 +++++++++++++++++-- .../src/translator/intrinsics.rs | 17 +++++++ .../relaxed-simd/i32x4_relaxed_trunc.wast | 2 +- 3 files changed, 62 insertions(+), 4 deletions(-) diff --git a/lib/compiler-llvm/src/translator/code.rs b/lib/compiler-llvm/src/translator/code.rs index a12937a9ac2..253a3c57a8d 100644 --- a/lib/compiler-llvm/src/translator/code.rs +++ b/lib/compiler-llvm/src/translator/code.rs @@ -8068,9 +8068,7 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { ); self.state.push1(res); } - Operator::I32x4RelaxedTruncF32x4S - if self.cpu_features.contains(CpuFeature::SSE2) => - { + Operator::I32x4RelaxedTruncF32x4S if self.cpu_features.contains(CpuFeature::SSE2) => { let (v, i) = self.state.pop1_extra()?; let (v, _) = self.v128_into_f32x4(v, i)?; let res = self @@ -8141,6 +8139,49 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { )?; self.state.push1(res); } + Operator::I32x4RelaxedTruncF64x2SZero + if self.cpu_features.contains(CpuFeature::SSE2) => + { + let (v, i) = self.state.pop1_extra()?; + let (v, _) = self.v128_into_f64x2(v, i)?; + let res = self + .build_call_with_param_attributes( + self.intrinsics.x86_64.cvtpd2dq, + &[v.into()], + "", + )? + .try_as_basic_value() + .unwrap_basic(); + let res = err!( + self.builder + .build_bit_cast(res, self.intrinsics.i128_ty, "") + ); + self.state.push1(res); + } + Operator::I32x4RelaxedTruncF64x2UZero + if self.cpu_features.contains(CpuFeature::AVX512F) + && self.cpu_features.contains(CpuFeature::AVX512VL) => + { + let (v, i) = self.state.pop1_extra()?; + let (v, _) = self.v128_into_f64x2(v, i)?; + let res = self + .build_call_with_param_attributes( + self.intrinsics.x86_64.cvtpd2udq128, + &[ + v.into(), + self.intrinsics.i32x4_ty.const_zero().into(), + self.intrinsics.i8_ty.const_int(0xff, false).into(), + ], + "", + )? 
+ .try_as_basic_value() + .unwrap_basic(); + let res = err!( + self.builder + .build_bit_cast(res, self.intrinsics.i128_ty, "") + ); + self.state.push1(res); + } Operator::I32x4TruncSatF64x2SZero | Operator::I32x4TruncSatF64x2UZero | Operator::I32x4RelaxedTruncF64x2SZero diff --git a/lib/compiler-llvm/src/translator/intrinsics.rs b/lib/compiler-llvm/src/translator/intrinsics.rs index 9c4f2f2ab9e..94925c8fb2d 100644 --- a/lib/compiler-llvm/src/translator/intrinsics.rs +++ b/lib/compiler-llvm/src/translator/intrinsics.rs @@ -62,6 +62,8 @@ pub struct X86_64Intrinsics<'ctx> { pub pshufb128: FunctionValue<'ctx>, pub cvtps2dq: FunctionValue<'ctx>, pub cvtps2udq128: FunctionValue<'ctx>, + pub cvtpd2dq: FunctionValue<'ctx>, + pub cvtpd2udq128: FunctionValue<'ctx>, } /// Struct containing LLVM and VM intrinsics. @@ -498,6 +500,11 @@ impl<'ctx> Intrinsics<'ctx> { &[f32x4_ty_basic_md, i32x4_ty_basic_md, i8_ty.into()], false, ); + let ret_i32x4_take_f64x2 = i32x4_ty.fn_type(&[f64x2_ty_basic_md], false); + let ret_i32x4_take_f64x2_i32x4_i8 = i32x4_ty.fn_type( + &[f64x2_ty_basic_md, i32x4_ty_basic_md, i8_ty.into()], + false, + ); let add_function_with_attrs = |name: &str, ty: FunctionType<'ctx>, linkage: Option| -> FunctionValue<'ctx> { @@ -1288,6 +1295,16 @@ impl<'ctx> Intrinsics<'ctx> { ret_i32x4_take_f32x4_i32x4_i8, None, ), + cvtpd2dq: add_function_with_attrs( + "llvm.x86.sse2.cvtpd2dq", + ret_i32x4_take_f64x2, + None, + ), + cvtpd2udq128: add_function_with_attrs( + "llvm.x86.avx512.mask.cvtpd2udq.128", + ret_i32x4_take_f64x2_i32x4_i8, + None, + ), }, }; diff --git a/tests/wast/spec/proposals/relaxed-simd/i32x4_relaxed_trunc.wast b/tests/wast/spec/proposals/relaxed-simd/i32x4_relaxed_trunc.wast index cca3ecb958a..e4ea88e3643 100644 --- a/tests/wast/spec/proposals/relaxed-simd/i32x4_relaxed_trunc.wast +++ b/tests/wast/spec/proposals/relaxed-simd/i32x4_relaxed_trunc.wast @@ -79,7 +79,7 @@ (assert_return (invoke "i32x4.relaxed_trunc_f64x2_u_zero" (v128.const f64x2 nan -nan)) 
(either (v128.const i32x4 0 0 0 0) - (v128.const i32x4 0 0 0xffffffff 0xffffffff))) + (v128.const i32x4 0xffffffff 0xffffffff 0 0))) ;; Check that multiple calls to the relaxed instruction with same inputs returns same results. From d8ab31b3258f0fac664317de525183b543a06503 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Sun, 15 Feb 2026 23:40:45 +0100 Subject: [PATCH 09/18] x86_64: fast implementation for FMA isntructions --- lib/compiler-llvm/src/translator/code.rs | 70 +++++++++++++++++++ .../src/translator/intrinsics.rs | 32 +++++++++ lib/types/src/target.rs | 6 ++ 3 files changed, 108 insertions(+) diff --git a/lib/compiler-llvm/src/translator/code.rs b/lib/compiler-llvm/src/translator/code.rs index 253a3c57a8d..6ce7bde0d5b 100644 --- a/lib/compiler-llvm/src/translator/code.rs +++ b/lib/compiler-llvm/src/translator/code.rs @@ -5509,6 +5509,41 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { ((i1.strip_pending() & i2.strip_pending())? | ExtraInfo::pending_f32_nan())?, ); } + Operator::F32x4RelaxedMadd | Operator::F32x4RelaxedNmadd + if self.cpu_features.contains(CpuFeature::FMA) => + { + let ((v1, i1), (v2, i2), (v3, i3)) = self.state.pop3_extra()?; + let (v1, i1) = self.v128_into_f32x4(v1, i1)?; + let (v2, i2) = self.v128_into_f32x4(v2, i2)?; + let (v3, i3) = self.v128_into_f32x4(v3, i3)?; + + let v1 = match op { + Operator::F32x4RelaxedNmadd => err!(self.builder.build_float_neg(v1, "")), + _ => v1, + }; + let res = self + .build_call_with_param_attributes( + self.intrinsics.muladd_f32x4, + &[ + v1.into(), + v2.into(), + v3.into(), + self.intrinsics.fp_rounding_md, + self.intrinsics.fp_exception_md, + ], + "", + )? 
+ .try_as_basic_value() + .unwrap_basic(); + let res = err!( + self.builder + .build_bit_cast(res, self.intrinsics.i128_ty, "") + ); + let info = (i1.strip_pending() & i2.strip_pending())?; + let info = (info & i3.strip_pending())?; + let info = (info | ExtraInfo::pending_f32_nan())?; + self.state.push1_extra(res, info); + } Operator::F32x4RelaxedMadd | Operator::F32x4RelaxedNmadd => { let ((v1, i1), (v2, i2), (v3, i3)) = self.state.pop3_extra()?; let (v1, i1) = self.v128_into_f32x4(v1, i1)?; @@ -5581,6 +5616,41 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { ((i1.strip_pending() & i2.strip_pending())? | ExtraInfo::pending_f64_nan())?, ); } + Operator::F64x2RelaxedMadd | Operator::F64x2RelaxedNmadd + if self.cpu_features.contains(CpuFeature::FMA) => + { + let ((v1, i1), (v2, i2), (v3, i3)) = self.state.pop3_extra()?; + let (v1, i1) = self.v128_into_f64x2(v1, i1)?; + let (v2, i2) = self.v128_into_f64x2(v2, i2)?; + let (v3, i3) = self.v128_into_f64x2(v3, i3)?; + + let v1 = match op { + Operator::F64x2RelaxedNmadd => err!(self.builder.build_float_neg(v1, "")), + _ => v1, + }; + let res = self + .build_call_with_param_attributes( + self.intrinsics.muladd_f64x2, + &[ + v1.into(), + v2.into(), + v3.into(), + self.intrinsics.fp_rounding_md, + self.intrinsics.fp_exception_md, + ], + "", + )? 
+ .try_as_basic_value() + .unwrap_basic(); + let res = err!( + self.builder + .build_bit_cast(res, self.intrinsics.i128_ty, "") + ); + let info = (i1.strip_pending() & i2.strip_pending())?; + let info = (info & i3.strip_pending())?; + let info = (info | ExtraInfo::pending_f64_nan())?; + self.state.push1_extra(res, info); + } Operator::F64x2RelaxedMadd | Operator::F64x2RelaxedNmadd => { let ((v1, i1), (v2, i2), (v3, i3)) = self.state.pop3_extra()?; let (v1, i1) = self.v128_into_f64x2(v1, i1)?; diff --git a/lib/compiler-llvm/src/translator/intrinsics.rs b/lib/compiler-llvm/src/translator/intrinsics.rs index 94925c8fb2d..70570ebbc65 100644 --- a/lib/compiler-llvm/src/translator/intrinsics.rs +++ b/lib/compiler-llvm/src/translator/intrinsics.rs @@ -99,6 +99,8 @@ pub struct Intrinsics<'ctx> { pub mul_f64: FunctionValue<'ctx>, pub mul_f32x4: FunctionValue<'ctx>, pub mul_f64x2: FunctionValue<'ctx>, + pub muladd_f32x4: FunctionValue<'ctx>, + pub muladd_f64x2: FunctionValue<'ctx>, pub div_f32: FunctionValue<'ctx>, pub div_f64: FunctionValue<'ctx>, @@ -415,6 +417,26 @@ impl<'ctx> Intrinsics<'ctx> { f32x4_ty.fn_type(&[f32x4_ty_basic_md, f32x4_ty_basic_md], false); let ret_f64x2_take_f64x2_f64x2 = f64x2_ty.fn_type(&[f64x2_ty_basic_md, f64x2_ty_basic_md], false); + let ret_f32x4_take_f32x4_f32x4_f32x4_md_md = f32x4_ty.fn_type( + &[ + f32x4_ty_basic_md, + f32x4_ty_basic_md, + f32x4_ty_basic_md, + md_ty_basic_md, + md_ty_basic_md, + ], + false, + ); + let ret_f64x2_take_f64x2_f64x2_f64x2_md_md = f64x2_ty.fn_type( + &[ + f64x2_ty_basic_md, + f64x2_ty_basic_md, + f64x2_ty_basic_md, + md_ty_basic_md, + md_ty_basic_md, + ], + false, + ); let ret_f64_take_f32_md = f64_ty.fn_type(&[f32_ty_basic_md, md_ty_basic_md], false); let ret_f32_take_f64_md_md = @@ -649,6 +671,16 @@ impl<'ctx> Intrinsics<'ctx> { ret_f64x2_take_f64x2_f64x2_md_md, None, ), + muladd_f32x4: add_function_with_attrs( + "llvm.experimental.constrained.fmuladd.v4f32", + ret_f32x4_take_f32x4_f32x4_f32x4_md_md, + None, + ), 
+ muladd_f64x2: add_function_with_attrs( + "llvm.experimental.constrained.fmuladd.v2f64", + ret_f64x2_take_f64x2_f64x2_f64x2_md_md, + None, + ), div_f32: add_function_with_attrs( "llvm.experimental.constrained.fdiv.f32", diff --git a/lib/types/src/target.rs b/lib/types/src/target.rs index c0e4eada1e0..7f782b4f87d 100644 --- a/lib/types/src/target.rs +++ b/lib/types/src/target.rs @@ -42,6 +42,7 @@ pub enum CpuFeature { BMI1, BMI2, AVX2, + FMA, AVX512DQ, AVX512VL, AVX512F, @@ -87,6 +88,9 @@ impl CpuFeature { if std::is_x86_feature_detected!("avx2") { features.insert(Self::AVX2); } + if std::is_x86_feature_detected!("fma") { + features.insert(Self::FMA); + } if std::is_x86_feature_detected!("avx512dq") { features.insert(Self::AVX512DQ); } @@ -154,6 +158,7 @@ impl FromStr for CpuFeature { "bmi" => Ok(Self::BMI1), "bmi2" => Ok(Self::BMI2), "avx2" => Ok(Self::AVX2), + "fma" => Ok(Self::FMA), "avx512dq" => Ok(Self::AVX512DQ), "avx512vl" => Ok(Self::AVX512VL), "avx512f" => Ok(Self::AVX512F), @@ -180,6 +185,7 @@ impl std::fmt::Display for CpuFeature { Self::BMI1 => "bmi", Self::BMI2 => "bmi2", Self::AVX2 => "avx2", + Self::FMA => "fma", Self::AVX512DQ => "avx512dq", Self::AVX512VL => "avx512vl", Self::AVX512F => "avx512f", From 20d03eb5336a0407cda9b1f37357d9a0879ae353 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Sun, 15 Feb 2026 23:50:28 +0100 Subject: [PATCH 10/18] x86_64: fast implementation of RelaxedLaneselect --- lib/compiler-llvm/src/translator/code.rs | 28 +++++++++++++++++++ .../src/translator/intrinsics.rs | 10 +++++++ 2 files changed, 38 insertions(+) diff --git a/lib/compiler-llvm/src/translator/code.rs b/lib/compiler-llvm/src/translator/code.rs index 6ce7bde0d5b..d5c4e5f066f 100644 --- a/lib/compiler-llvm/src/translator/code.rs +++ b/lib/compiler-llvm/src/translator/code.rs @@ -4346,6 +4346,34 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { Operator::I8x16RelaxedLaneselect | Operator::I16x8RelaxedLaneselect | Operator::I32x4RelaxedLaneselect + | 
Operator::I64x2RelaxedLaneselect + if self.cpu_features.contains(CpuFeature::SSE41) => + { + let ((v1, i1), (v2, i2), (mask, mask_info)) = self.state.pop3_extra()?; + let v1 = self.apply_pending_canonicalization(v1, i1)?; + let v2 = self.apply_pending_canonicalization(v2, i2)?; + let mask = self.apply_pending_canonicalization(mask, mask_info)?; + + let (v1, _) = self.v128_into_i8x16(v1, i1)?; + let (v2, _) = self.v128_into_i8x16(v2, i2)?; + let (mask, _) = self.v128_into_i8x16(mask, mask_info)?; + let res = self + .build_call_with_param_attributes( + self.intrinsics.x86_64.pblendvb, + &[v2.into(), v1.into(), mask.into()], + "", + )? + .try_as_basic_value() + .unwrap_basic(); + let res = err!( + self.builder + .build_bit_cast(res, self.intrinsics.i128_ty, "") + ); + self.state.push1(res); + } + Operator::I8x16RelaxedLaneselect + | Operator::I16x8RelaxedLaneselect + | Operator::I32x4RelaxedLaneselect | Operator::I64x2RelaxedLaneselect | Operator::V128Bitselect => { let ((v1, i1), (v2, i2), (cond, cond_info)) = self.state.pop3_extra()?; diff --git a/lib/compiler-llvm/src/translator/intrinsics.rs b/lib/compiler-llvm/src/translator/intrinsics.rs index 70570ebbc65..f6325b014cc 100644 --- a/lib/compiler-llvm/src/translator/intrinsics.rs +++ b/lib/compiler-llvm/src/translator/intrinsics.rs @@ -60,6 +60,7 @@ pub fn type_to_llvm<'ctx>( #[allow(dead_code)] pub struct X86_64Intrinsics<'ctx> { pub pshufb128: FunctionValue<'ctx>, + pub pblendvb: FunctionValue<'ctx>, pub cvtps2dq: FunctionValue<'ctx>, pub cvtps2udq128: FunctionValue<'ctx>, pub cvtpd2dq: FunctionValue<'ctx>, @@ -397,6 +398,10 @@ impl<'ctx> Intrinsics<'ctx> { let ret_i8x16_take_i8x16 = i8x16_ty.fn_type(&[i8x16_ty_basic_md], false); let ret_i8x16_take_i8x16_i8x16 = i8x16_ty.fn_type(&[i8x16_ty_basic_md, i8x16_ty_basic_md], false); + let ret_i8x16_take_i8x16_i8x16_i8x16 = i8x16_ty.fn_type( + &[i8x16_ty_basic_md, i8x16_ty_basic_md, i8x16_ty_basic_md], + false, + ); let ret_i16x8_take_i16x8_i16x8 = 
i16x8_ty.fn_type(&[i16x8_ty_basic_md, i16x8_ty_basic_md], false); @@ -1317,6 +1322,11 @@ impl<'ctx> Intrinsics<'ctx> { ret_i8x16_take_i8x16_i8x16, None, ), + pblendvb: add_function_with_attrs( + "llvm.x86.sse41.pblendvb", + ret_i8x16_take_i8x16_i8x16_i8x16, + None, + ), cvtps2dq: add_function_with_attrs( "llvm.x86.sse2.cvtps2dq", ret_i32x4_take_f32x4, From 4c4d146861a80cda70e5c9496d53cc007d52a6f0 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Sun, 15 Feb 2026 23:57:58 +0100 Subject: [PATCH 11/18] x86_64: fast implementation of RelaxedMin/Max --- lib/compiler-llvm/src/translator/code.rs | 84 +++++++++++++++++++ .../src/translator/intrinsics.rs | 24 ++++++ 2 files changed, 108 insertions(+) diff --git a/lib/compiler-llvm/src/translator/code.rs b/lib/compiler-llvm/src/translator/code.rs index d5c4e5f066f..c6c6d5f3b02 100644 --- a/lib/compiler-llvm/src/translator/code.rs +++ b/lib/compiler-llvm/src/translator/code.rs @@ -5903,6 +5903,27 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { self.state.push1_extra(res, ExtraInfo::pending_f64_nan()); } + Operator::F32x4RelaxedMin if self.cpu_features.contains(CpuFeature::SSE2) => { + let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; + let (v1, i1) = self.v128_into_f32x4(v1, i1)?; + let (v2, i2) = self.v128_into_f32x4(v2, i2)?; + let res = self + .build_call_with_param_attributes( + self.intrinsics.x86_64.min_ps, + &[v1.into(), v2.into()], + "", + )? + .try_as_basic_value() + .unwrap_basic(); + let res = err!( + self.builder + .build_bit_cast(res, self.intrinsics.i128_ty, "") + ); + self.state.push1_extra( + res, + ((i1.strip_pending() & i2.strip_pending())? 
| ExtraInfo::pending_f32_nan())?, + ); + } Operator::F32x4Min | Operator::F32x4RelaxedMin => { let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; let (v1, i1) = self.v128_into_f32x4(v1, i1)?; @@ -5944,6 +5965,27 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { ); self.state.push1(res); } + Operator::F64x2RelaxedMin if self.cpu_features.contains(CpuFeature::SSE2) => { + let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; + let (v1, i1) = self.v128_into_f64x2(v1, i1)?; + let (v2, i2) = self.v128_into_f64x2(v2, i2)?; + let res = self + .build_call_with_param_attributes( + self.intrinsics.x86_64.min_pd, + &[v1.into(), v2.into()], + "", + )? + .try_as_basic_value() + .unwrap_basic(); + let res = err!( + self.builder + .build_bit_cast(res, self.intrinsics.i128_ty, "") + ); + self.state.push1_extra( + res, + ((i1.strip_pending() & i2.strip_pending())? | ExtraInfo::pending_f64_nan())?, + ); + } Operator::F64x2Min | Operator::F64x2RelaxedMin => { let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; let (v1, i1) = self.v128_into_f64x2(v1, i1)?; @@ -6031,6 +6073,27 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { self.state.push1_extra(res, ExtraInfo::pending_f64_nan()); } + Operator::F32x4RelaxedMax if self.cpu_features.contains(CpuFeature::SSE2) => { + let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; + let (v1, i1) = self.v128_into_f32x4(v1, i1)?; + let (v2, i2) = self.v128_into_f32x4(v2, i2)?; + let res = self + .build_call_with_param_attributes( + self.intrinsics.x86_64.max_ps, + &[v1.into(), v2.into()], + "", + )? + .try_as_basic_value() + .unwrap_basic(); + let res = err!( + self.builder + .build_bit_cast(res, self.intrinsics.i128_ty, "") + ); + self.state.push1_extra( + res, + ((i1.strip_pending() & i2.strip_pending())? 
| ExtraInfo::pending_f32_nan())?, + ); + } Operator::F32x4Max | Operator::F32x4RelaxedMax => { let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; let (v1, i1) = self.v128_into_f32x4(v1, i1)?; @@ -6073,6 +6136,27 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { ); self.state.push1(res); } + Operator::F64x2RelaxedMax if self.cpu_features.contains(CpuFeature::SSE2) => { + let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; + let (v1, i1) = self.v128_into_f64x2(v1, i1)?; + let (v2, i2) = self.v128_into_f64x2(v2, i2)?; + let res = self + .build_call_with_param_attributes( + self.intrinsics.x86_64.max_pd, + &[v1.into(), v2.into()], + "", + )? + .try_as_basic_value() + .unwrap_basic(); + let res = err!( + self.builder + .build_bit_cast(res, self.intrinsics.i128_ty, "") + ); + self.state.push1_extra( + res, + ((i1.strip_pending() & i2.strip_pending())? | ExtraInfo::pending_f64_nan())?, + ); + } Operator::F64x2Max | Operator::F64x2RelaxedMax => { let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; let (v1, i1) = self.v128_into_f64x2(v1, i1)?; diff --git a/lib/compiler-llvm/src/translator/intrinsics.rs b/lib/compiler-llvm/src/translator/intrinsics.rs index f6325b014cc..d3e16c017c0 100644 --- a/lib/compiler-llvm/src/translator/intrinsics.rs +++ b/lib/compiler-llvm/src/translator/intrinsics.rs @@ -61,6 +61,10 @@ pub fn type_to_llvm<'ctx>( pub struct X86_64Intrinsics<'ctx> { pub pshufb128: FunctionValue<'ctx>, pub pblendvb: FunctionValue<'ctx>, + pub min_ps: FunctionValue<'ctx>, + pub min_pd: FunctionValue<'ctx>, + pub max_ps: FunctionValue<'ctx>, + pub max_pd: FunctionValue<'ctx>, pub cvtps2dq: FunctionValue<'ctx>, pub cvtps2udq128: FunctionValue<'ctx>, pub cvtpd2dq: FunctionValue<'ctx>, @@ -1327,6 +1331,26 @@ impl<'ctx> Intrinsics<'ctx> { ret_i8x16_take_i8x16_i8x16_i8x16, None, ), + min_ps: add_function_with_attrs( + "llvm.x86.sse.min.ps", + ret_f32x4_take_f32x4_f32x4, + None, + ), + min_pd: add_function_with_attrs( + "llvm.x86.sse2.min.pd", + 
ret_f64x2_take_f64x2_f64x2, + None, + ), + max_ps: add_function_with_attrs( + "llvm.x86.sse.max.ps", + ret_f32x4_take_f32x4_f32x4, + None, + ), + max_pd: add_function_with_attrs( + "llvm.x86.sse2.max.pd", + ret_f64x2_take_f64x2_f64x2, + None, + ), cvtps2dq: add_function_with_attrs( "llvm.x86.sse2.cvtps2dq", ret_i32x4_take_f32x4, From 04ee7589b83d7615e618c35603f9390a7d077840 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Mon, 16 Feb 2026 00:05:30 +0100 Subject: [PATCH 12/18] x86_64: fast implementation of I16x8RelaxedQ15mulrS --- lib/compiler-llvm/src/translator/code.rs | 18 ++++++++++++++++++ lib/compiler-llvm/src/translator/intrinsics.rs | 6 ++++++ 2 files changed, 24 insertions(+) diff --git a/lib/compiler-llvm/src/translator/code.rs b/lib/compiler-llvm/src/translator/code.rs index c6c6d5f3b02..7048c42da63 100644 --- a/lib/compiler-llvm/src/translator/code.rs +++ b/lib/compiler-llvm/src/translator/code.rs @@ -3739,6 +3739,24 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { ); self.state.push1(res); } + Operator::I16x8RelaxedQ15mulrS if self.cpu_features.contains(CpuFeature::SSSE3) => { + let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; + let (v1, _) = self.v128_into_i16x8(v1, i1)?; + let (v2, _) = self.v128_into_i16x8(v2, i2)?; + let res = self + .build_call_with_param_attributes( + self.intrinsics.x86_64.pmulhrsw128, + &[v1.into(), v2.into()], + "", + )? 
+ .try_as_basic_value() + .unwrap_basic(); + let res = err!( + self.builder + .build_bit_cast(res, self.intrinsics.i128_ty, "") + ); + self.state.push1(res); + } Operator::I16x8Q15MulrSatS | Operator::I16x8RelaxedQ15mulrS => { let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; let (v1, _) = self.v128_into_i16x8(v1, i1)?; diff --git a/lib/compiler-llvm/src/translator/intrinsics.rs b/lib/compiler-llvm/src/translator/intrinsics.rs index d3e16c017c0..a5b5be07a9a 100644 --- a/lib/compiler-llvm/src/translator/intrinsics.rs +++ b/lib/compiler-llvm/src/translator/intrinsics.rs @@ -60,6 +60,7 @@ pub fn type_to_llvm<'ctx>( #[allow(dead_code)] pub struct X86_64Intrinsics<'ctx> { pub pshufb128: FunctionValue<'ctx>, + pub pmulhrsw128: FunctionValue<'ctx>, pub pblendvb: FunctionValue<'ctx>, pub min_ps: FunctionValue<'ctx>, pub min_pd: FunctionValue<'ctx>, @@ -1326,6 +1327,11 @@ impl<'ctx> Intrinsics<'ctx> { ret_i8x16_take_i8x16_i8x16, None, ), + pmulhrsw128: add_function_with_attrs( + "llvm.x86.ssse3.pmul.hr.sw.128", + ret_i16x8_take_i16x8_i16x8, + None, + ), pblendvb: add_function_with_attrs( "llvm.x86.sse41.pblendvb", ret_i8x16_take_i8x16_i8x16_i8x16, From 013c5ce049872c6ccd77872d598430b65a56740a Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Mon, 16 Feb 2026 00:14:41 +0100 Subject: [PATCH 13/18] x86_64: fast implementation of I16x8RelaxedDotI8x16I7x16S --- lib/compiler-llvm/src/translator/code.rs | 22 +++++++++++++++++++ .../src/translator/intrinsics.rs | 6 +++++ 2 files changed, 28 insertions(+) diff --git a/lib/compiler-llvm/src/translator/code.rs b/lib/compiler-llvm/src/translator/code.rs index 7048c42da63..01a05c9f9d8 100644 --- a/lib/compiler-llvm/src/translator/code.rs +++ b/lib/compiler-llvm/src/translator/code.rs @@ -4042,6 +4042,28 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { ); self.state.push1(res); } + Operator::I16x8RelaxedDotI8x16I7x16S + if self.cpu_features.contains(CpuFeature::SSSE3) => + { + let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; + 
let (a, _) = self.v128_into_i8x16(v1, i1)?; + let (b, _) = self.v128_into_i8x16(v2, i2)?; + + let res = self + .build_call_with_param_attributes( + self.intrinsics.x86_64.pmaddubsw128, + &[b.into(), a.into()], + "", + )? + .try_as_basic_value() + .unwrap_basic() + .into_vector_value(); + let res = err!( + self.builder + .build_bit_cast(res, self.intrinsics.i128_ty, "") + ); + self.state.push1(res); + } Operator::I16x8RelaxedDotI8x16I7x16S => { let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; let (v1, _) = self.v128_into_i8x16(v1, i1)?; diff --git a/lib/compiler-llvm/src/translator/intrinsics.rs b/lib/compiler-llvm/src/translator/intrinsics.rs index a5b5be07a9a..e49e033b9b9 100644 --- a/lib/compiler-llvm/src/translator/intrinsics.rs +++ b/lib/compiler-llvm/src/translator/intrinsics.rs @@ -60,6 +60,7 @@ pub fn type_to_llvm<'ctx>( #[allow(dead_code)] pub struct X86_64Intrinsics<'ctx> { pub pshufb128: FunctionValue<'ctx>, + pub pmaddubsw128: FunctionValue<'ctx>, pub pmulhrsw128: FunctionValue<'ctx>, pub pblendvb: FunctionValue<'ctx>, pub min_ps: FunctionValue<'ctx>, @@ -1327,6 +1328,11 @@ impl<'ctx> Intrinsics<'ctx> { ret_i8x16_take_i8x16_i8x16, None, ), + pmaddubsw128: add_function_with_attrs( + "llvm.x86.ssse3.pmadd.ub.sw.128", + i16x8_ty.fn_type(&[i8x16_ty_basic_md, i8x16_ty_basic_md], false), + None, + ), pmulhrsw128: add_function_with_attrs( "llvm.x86.ssse3.pmul.hr.sw.128", ret_i16x8_take_i16x8_i16x8, From b447a0539ce62e3be2664f2f52faa0b2a883920d Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Mon, 16 Feb 2026 00:19:20 +0100 Subject: [PATCH 14/18] x86_64: fast implementation of I32x4RelaxedDotI8x16I7x16AddS --- lib/compiler-llvm/src/translator/code.rs | 36 +++++++++++++++++++ .../src/translator/intrinsics.rs | 6 ++++ 2 files changed, 42 insertions(+) diff --git a/lib/compiler-llvm/src/translator/code.rs b/lib/compiler-llvm/src/translator/code.rs index 01a05c9f9d8..40f64437d2e 100644 --- a/lib/compiler-llvm/src/translator/code.rs +++ 
b/lib/compiler-llvm/src/translator/code.rs @@ -4145,6 +4145,42 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { ); self.state.push1(res); } + Operator::I32x4RelaxedDotI8x16I7x16AddS + if self.cpu_features.contains(CpuFeature::SSSE3) => + { + let ((v1, i1), (v2, i2), (acc, acc_info)) = self.state.pop3_extra()?; + let (v1, _) = self.v128_into_i8x16(v1, i1)?; + let (v2, _) = self.v128_into_i8x16(v2, i2)?; + let (acc, _) = self.v128_into_i32x4(acc, acc_info)?; + + // PMADDUBSW computes pairwise u8*i8 with i16 saturation, which + // is one of the valid relaxed dot-product behaviors. + let dot16 = self + .build_call_with_param_attributes( + self.intrinsics.x86_64.pmaddubsw128, + &[v2.into(), v1.into()], + "", + )? + .try_as_basic_value() + .unwrap_basic() + .into_vector_value(); + let ones = VectorType::const_vector(&[self.intrinsics.i16_ty.const_int(1, false); 8]); + let dot32 = self + .build_call_with_param_attributes( + self.intrinsics.x86_64.pmaddwd128, + &[dot16.into(), ones.into()], + "", + )? 
+ .try_as_basic_value() + .unwrap_basic() + .into_vector_value(); + let res = err!(self.builder.build_int_add(dot32, acc, "")); + let res = err!( + self.builder + .build_bit_cast(res, self.intrinsics.i128_ty, "") + ); + self.state.push1(res); + } Operator::I32x4RelaxedDotI8x16I7x16AddS => { let ((v1, i1), (v2, i2), (acc, acc_info)) = self.state.pop3_extra()?; let (v1, _) = self.v128_into_i8x16(v1, i1)?; diff --git a/lib/compiler-llvm/src/translator/intrinsics.rs b/lib/compiler-llvm/src/translator/intrinsics.rs index e49e033b9b9..09e99bc6e0a 100644 --- a/lib/compiler-llvm/src/translator/intrinsics.rs +++ b/lib/compiler-llvm/src/translator/intrinsics.rs @@ -61,6 +61,7 @@ pub fn type_to_llvm<'ctx>( pub struct X86_64Intrinsics<'ctx> { pub pshufb128: FunctionValue<'ctx>, pub pmaddubsw128: FunctionValue<'ctx>, + pub pmaddwd128: FunctionValue<'ctx>, pub pmulhrsw128: FunctionValue<'ctx>, pub pblendvb: FunctionValue<'ctx>, pub min_ps: FunctionValue<'ctx>, @@ -1333,6 +1334,11 @@ impl<'ctx> Intrinsics<'ctx> { i16x8_ty.fn_type(&[i8x16_ty_basic_md, i8x16_ty_basic_md], false), None, ), + pmaddwd128: add_function_with_attrs( + "llvm.x86.sse2.pmadd.wd", + i32x4_ty.fn_type(&[i16x8_ty_basic_md, i16x8_ty_basic_md], false), + None, + ), pmulhrsw128: add_function_with_attrs( "llvm.x86.ssse3.pmul.hr.sw.128", ret_i16x8_take_i16x8_i16x8, From 004861e9438cb21f49b490cbe71feb2a31811504 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Mon, 16 Feb 2026 09:30:33 +0100 Subject: [PATCH 15/18] fix enum order in CpuFeature --- lib/types/src/target.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/types/src/target.rs b/lib/types/src/target.rs index 7f782b4f87d..e47748ef412 100644 --- a/lib/types/src/target.rs +++ b/lib/types/src/target.rs @@ -42,14 +42,14 @@ pub enum CpuFeature { BMI1, BMI2, AVX2, - FMA, AVX512DQ, AVX512VL, AVX512F, LZCNT, // ARM features NEON, - // RISC-V features + // X86 features (TODO: reorganize at some point) + FMA, } impl CpuFeature { From 
45a44a7b05c34a306749fd9339f97bbac94fd08a Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Mon, 16 Feb 2026 09:30:55 +0100 Subject: [PATCH 16/18] run cargo fmt --- lib/compiler-llvm/src/translator/code.rs | 3 ++- lib/compiler-llvm/src/translator/intrinsics.rs | 12 ++++-------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/lib/compiler-llvm/src/translator/code.rs b/lib/compiler-llvm/src/translator/code.rs index 40f64437d2e..d5bca2bb08b 100644 --- a/lib/compiler-llvm/src/translator/code.rs +++ b/lib/compiler-llvm/src/translator/code.rs @@ -4164,7 +4164,8 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { .try_as_basic_value() .unwrap_basic() .into_vector_value(); - let ones = VectorType::const_vector(&[self.intrinsics.i16_ty.const_int(1, false); 8]); + let ones = + VectorType::const_vector(&[self.intrinsics.i16_ty.const_int(1, false); 8]); let dot32 = self .build_call_with_param_attributes( self.intrinsics.x86_64.pmaddwd128, diff --git a/lib/compiler-llvm/src/translator/intrinsics.rs b/lib/compiler-llvm/src/translator/intrinsics.rs index 09e99bc6e0a..032518080c3 100644 --- a/lib/compiler-llvm/src/translator/intrinsics.rs +++ b/lib/compiler-llvm/src/translator/intrinsics.rs @@ -530,15 +530,11 @@ impl<'ctx> Intrinsics<'ctx> { false, ); let ret_i32x4_take_f32x4 = i32x4_ty.fn_type(&[f32x4_ty_basic_md], false); - let ret_i32x4_take_f32x4_i32x4_i8 = i32x4_ty.fn_type( - &[f32x4_ty_basic_md, i32x4_ty_basic_md, i8_ty.into()], - false, - ); + let ret_i32x4_take_f32x4_i32x4_i8 = + i32x4_ty.fn_type(&[f32x4_ty_basic_md, i32x4_ty_basic_md, i8_ty.into()], false); let ret_i32x4_take_f64x2 = i32x4_ty.fn_type(&[f64x2_ty_basic_md], false); - let ret_i32x4_take_f64x2_i32x4_i8 = i32x4_ty.fn_type( - &[f64x2_ty_basic_md, i32x4_ty_basic_md, i8_ty.into()], - false, - ); + let ret_i32x4_take_f64x2_i32x4_i8 = + i32x4_ty.fn_type(&[f64x2_ty_basic_md, i32x4_ty_basic_md, i8_ty.into()], false); let add_function_with_attrs = |name: &str, ty: FunctionType<'ctx>, linkage: Option| -> 
FunctionValue<'ctx> { From f80ce65c46d1f3ba97f1a2578b45ff7ba83f4818 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Mon, 16 Feb 2026 11:33:58 +0100 Subject: [PATCH 17/18] enable relaxed_simd for the fuzzer --- fuzz/fuzz_targets/universal_llvm.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fuzz/fuzz_targets/universal_llvm.rs b/fuzz/fuzz_targets/universal_llvm.rs index 40c2af74c06..0a2b3ddc86d 100644 --- a/fuzz/fuzz_targets/universal_llvm.rs +++ b/fuzz/fuzz_targets/universal_llvm.rs @@ -25,7 +25,7 @@ impl Arbitrary<'_> for LLVMPassFuzzModule { config.memory64_enabled = false; config.max_memories = 1; config.tail_call_enabled = false; - config.relaxed_simd_enabled = false; + config.relaxed_simd_enabled = true; Ok(Self(wasm_smith::Module::new(config, u)?)) } } From fbfbecddb8a66720b7a37d3963d8c7cdfd5f73eb Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Tue, 17 Feb 2026 09:25:59 +0100 Subject: [PATCH 18/18] replace cvtps2dq with cvttps2dq --- lib/compiler-llvm/src/translator/code.rs | 2 +- lib/compiler-llvm/src/translator/intrinsics.rs | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/compiler-llvm/src/translator/code.rs b/lib/compiler-llvm/src/translator/code.rs index d5bca2bb08b..99ebe9d6b74 100644 --- a/lib/compiler-llvm/src/translator/code.rs +++ b/lib/compiler-llvm/src/translator/code.rs @@ -8332,7 +8332,7 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { let (v, _) = self.v128_into_f32x4(v, i)?; let res = self .build_call_with_param_attributes( - self.intrinsics.x86_64.cvtps2dq, + self.intrinsics.x86_64.cvttps2dq, &[v.into()], "", )? 
diff --git a/lib/compiler-llvm/src/translator/intrinsics.rs b/lib/compiler-llvm/src/translator/intrinsics.rs index 032518080c3..2e59cfaf177 100644 --- a/lib/compiler-llvm/src/translator/intrinsics.rs +++ b/lib/compiler-llvm/src/translator/intrinsics.rs @@ -68,7 +68,7 @@ pub struct X86_64Intrinsics<'ctx> { pub min_pd: FunctionValue<'ctx>, pub max_ps: FunctionValue<'ctx>, pub max_pd: FunctionValue<'ctx>, - pub cvtps2dq: FunctionValue<'ctx>, + pub cvttps2dq: FunctionValue<'ctx>, pub cvtps2udq128: FunctionValue<'ctx>, pub cvtpd2dq: FunctionValue<'ctx>, pub cvtpd2udq128: FunctionValue<'ctx>, @@ -1365,8 +1365,8 @@ impl<'ctx> Intrinsics<'ctx> { ret_f64x2_take_f64x2_f64x2, None, ), - cvtps2dq: add_function_with_attrs( - "llvm.x86.sse2.cvtps2dq", + cvttps2dq: add_function_with_attrs( + "llvm.x86.sse2.cvttps2dq", ret_i32x4_take_f32x4, None, ),