From 5b840b94a5501cb943b163ccd1662e09197f2c32 Mon Sep 17 00:00:00 2001 From: Syrus Akbary Date: Sun, 1 Feb 2026 20:16:28 +0100 Subject: [PATCH 01/18] Implemented Relaxed SIMD in Cranelift --- build.rs | 5 + .../src/translator/code_translator.rs | 141 +++++++++-- tests/compilers/wast.rs | 4 + tests/ignores.txt | 3 + tests/lib/wast/src/wast.rs | 3 + .../relaxed-simd/i16x8_relaxed_q15mulr_s.wast | 28 +++ .../relaxed-simd/i32x4_relaxed_trunc.wast | 124 ++++++++++ .../relaxed-simd/i8x16_relaxed_swizzle.wast | 45 ++++ .../relaxed-simd/relaxed_dot_product.wast | 107 +++++++++ .../relaxed-simd/relaxed_laneselect.wast | 103 ++++++++ .../relaxed-simd/relaxed_madd_nmadd.wast | 224 ++++++++++++++++++ .../relaxed-simd/relaxed_min_max.wast | 184 ++++++++++++++ 12 files changed, 949 insertions(+), 22 deletions(-) create mode 100644 tests/wast/spec/proposals/relaxed-simd/i16x8_relaxed_q15mulr_s.wast create mode 100644 tests/wast/spec/proposals/relaxed-simd/i32x4_relaxed_trunc.wast create mode 100644 tests/wast/spec/proposals/relaxed-simd/i8x16_relaxed_swizzle.wast create mode 100644 tests/wast/spec/proposals/relaxed-simd/relaxed_dot_product.wast create mode 100644 tests/wast/spec/proposals/relaxed-simd/relaxed_laneselect.wast create mode 100644 tests/wast/spec/proposals/relaxed-simd/relaxed_madd_nmadd.wast create mode 100644 tests/wast/spec/proposals/relaxed-simd/relaxed_min_max.wast diff --git a/build.rs b/build.rs index 07de79eaad7..cb0a5c11b96 100644 --- a/build.rs +++ b/build.rs @@ -59,6 +59,11 @@ fn main() -> anyhow::Result<()> { wast_processor, )?; test_directory_module(spectests, "tests/wast/spec/proposals/simd", wast_processor)?; + test_directory_module( + spectests, + "tests/wast/spec/proposals/relaxed-simd", + wast_processor, + )?; test_directory_module( spectests, "tests/wast/spec/proposals/exception-handling", diff --git a/lib/compiler-cranelift/src/translator/code_translator.rs b/lib/compiler-cranelift/src/translator/code_translator.rs index 54a80a608a9..2bab41c7810 
100644 --- a/lib/compiler-cranelift/src/translator/code_translator.rs +++ b/lib/compiler-cranelift/src/translator/code_translator.rs @@ -1740,6 +1740,10 @@ pub fn translate_operator( let (a, b) = pop2_with_bitcast(state, I8X16, builder); state.push1(builder.ins().swizzle(a, b)) } + Operator::I8x16RelaxedSwizzle => { + let (a, b) = pop2_with_bitcast(state, I8X16, builder); + state.push1(builder.ins().swizzle(a, b)) + } Operator::I8x16Add | Operator::I16x8Add | Operator::I32x4Add | Operator::I64x2Add => { let (a, b) = pop2_with_bitcast(state, type_of(op), builder); state.push1(builder.ins().iadd(a, b)) @@ -1852,6 +1856,19 @@ pub fn translate_operator( // operands must match (hence the bitcast). state.push1(builder.ins().bitselect(bitcast_c, bitcast_a, bitcast_b)) } + Operator::I8x16RelaxedLaneselect + | Operator::I16x8RelaxedLaneselect + | Operator::I32x4RelaxedLaneselect + | Operator::I64x2RelaxedLaneselect => { + let (a, b, c) = state.pop3(); + let ty = type_of(op); + let bitcast_a = optionally_bitcast_vector(a, ty, builder); + let bitcast_b = optionally_bitcast_vector(b, ty, builder); + let bitcast_c = optionally_bitcast_vector(c, ty, builder); + // The CLIF operand ordering is slightly different and the types of all three + // operands must match (hence the bitcast). 
+ state.push1(builder.ins().bitselect(bitcast_c, bitcast_a, bitcast_b)) + } Operator::V128AnyTrue => { let a = pop1_with_bitcast(state, type_of(op), builder); let bool_result = builder.ins().vany_true(a); @@ -1935,6 +1952,25 @@ pub fn translate_operator( let (a, b) = pop2_with_bitcast(state, type_of(op), builder); state.push1(builder.ins().fmul(a, b)) } + Operator::F32x4RelaxedMadd | Operator::F64x2RelaxedMadd => { + let ty = type_of(op); + let (a, b, c) = state.pop3(); + let a = optionally_bitcast_vector(a, ty, builder); + let b = optionally_bitcast_vector(b, ty, builder); + let c = optionally_bitcast_vector(c, ty, builder); + let mul = builder.ins().fmul(a, b); + state.push1(builder.ins().fadd(mul, c)) + } + Operator::F32x4RelaxedNmadd | Operator::F64x2RelaxedNmadd => { + let ty = type_of(op); + let (a, b, c) = state.pop3(); + let a = optionally_bitcast_vector(a, ty, builder); + let b = optionally_bitcast_vector(b, ty, builder); + let c = optionally_bitcast_vector(c, ty, builder); + let a = builder.ins().fneg(a); + let mul = builder.ins().fmul(a, b); + state.push1(builder.ins().fadd(mul, c)) + } Operator::F32x4Div | Operator::F64x2Div => { let (a, b) = pop2_with_bitcast(state, type_of(op), builder); state.push1(builder.ins().fdiv(a, b)) @@ -1943,10 +1979,18 @@ pub fn translate_operator( let (a, b) = pop2_with_bitcast(state, type_of(op), builder); state.push1(builder.ins().fmax(a, b)) } + Operator::F32x4RelaxedMax | Operator::F64x2RelaxedMax => { + let (a, b) = pop2_with_bitcast(state, type_of(op), builder); + state.push1(builder.ins().fmax(a, b)) + } Operator::F32x4Min | Operator::F64x2Min => { let (a, b) = pop2_with_bitcast(state, type_of(op), builder); state.push1(builder.ins().fmin(a, b)) } + Operator::F32x4RelaxedMin | Operator::F64x2RelaxedMin => { + let (a, b) = pop2_with_bitcast(state, type_of(op), builder); + state.push1(builder.ins().fmin(a, b)) + } Operator::F32x4PMax | Operator::F64x2PMax => { // Note the careful ordering here with respect to `fcmp` 
and // `bitselect`. This matches the spec definition of: @@ -2014,6 +2058,10 @@ pub fn translate_operator( let a = pop1_with_bitcast(state, F32X4, builder); state.push1(builder.ins().fcvt_to_sint_sat(I32X4, a)) } + Operator::I32x4RelaxedTruncF32x4S => { + let a = pop1_with_bitcast(state, F32X4, builder); + state.push1(builder.ins().fcvt_to_sint_sat(I32X4, a)) + } Operator::I32x4TruncSatF64x2SZero => { let a = pop1_with_bitcast(state, F64X2, builder); let converted_a = builder.ins().fcvt_to_sint_sat(I64X2, a); @@ -2022,10 +2070,22 @@ pub fn translate_operator( state.push1(builder.ins().snarrow(converted_a, zero)); } + Operator::I32x4RelaxedTruncF64x2SZero => { + let a = pop1_with_bitcast(state, F64X2, builder); + let converted_a = builder.ins().fcvt_to_sint_sat(I64X2, a); + let handle = builder.func.dfg.constants.insert(vec![0u8; 16].into()); + let zero = builder.ins().vconst(I64X2, handle); + + state.push1(builder.ins().snarrow(converted_a, zero)); + } Operator::I32x4TruncSatF32x4U => { let a = pop1_with_bitcast(state, F32X4, builder); state.push1(builder.ins().fcvt_to_uint_sat(I32X4, a)) } + Operator::I32x4RelaxedTruncF32x4U => { + let a = pop1_with_bitcast(state, F32X4, builder); + state.push1(builder.ins().fcvt_to_uint_sat(I32X4, a)) + } Operator::I32x4TruncSatF64x2UZero => { let a = pop1_with_bitcast(state, F64X2, builder); let converted_a = builder.ins().fcvt_to_uint_sat(I64X2, a); @@ -2034,6 +2094,14 @@ pub fn translate_operator( state.push1(builder.ins().uunarrow(converted_a, zero)); } + Operator::I32x4RelaxedTruncF64x2UZero => { + let a = pop1_with_bitcast(state, F64X2, builder); + let converted_a = builder.ins().fcvt_to_uint_sat(I64X2, a); + let handle = builder.func.dfg.constants.insert(vec![0u8; 16].into()); + let zero = builder.ins().vconst(I64X2, handle); + + state.push1(builder.ins().uunarrow(converted_a, zero)); + } Operator::I8x16NarrowI16x8S => { let (a, b) = pop2_with_bitcast(state, I16X8, builder); state.push1(builder.ins().snarrow(a, b)) @@ 
-2152,6 +2220,16 @@ pub fn translate_operator( let high = builder.ins().imul(ahigh, bhigh); state.push1(builder.ins().iadd_pairwise(low, high)); } + Operator::I16x8RelaxedDotI8x16I7x16S => { + let (a, b) = pop2_with_bitcast(state, I8X16, builder); + let alow = builder.ins().swiden_low(a); + let blow = builder.ins().swiden_low(b); + let low = builder.ins().imul(alow, blow); + let ahigh = builder.ins().swiden_high(a); + let bhigh = builder.ins().swiden_high(b); + let high = builder.ins().imul(ahigh, bhigh); + state.push1(builder.ins().iadd_pairwise(low, high)); + } Operator::I8x16Popcnt => { let arg = pop1_with_bitcast(state, type_of(op), builder); state.push1(builder.ins().popcnt(arg)); @@ -2160,6 +2238,27 @@ pub fn translate_operator( let (a, b) = pop2_with_bitcast(state, I16X8, builder); state.push1(builder.ins().sqmul_round_sat(a, b)) } + Operator::I16x8RelaxedQ15mulrS => { + let (a, b) = pop2_with_bitcast(state, I16X8, builder); + state.push1(builder.ins().sqmul_round_sat(a, b)) + } + Operator::I32x4RelaxedDotI8x16I7x16AddS => { + let (a, b, c) = state.pop3(); + let a = optionally_bitcast_vector(a, I8X16, builder); + let b = optionally_bitcast_vector(b, I8X16, builder); + let c = optionally_bitcast_vector(c, I32X4, builder); + let alow = builder.ins().swiden_low(a); + let blow = builder.ins().swiden_low(b); + let low = builder.ins().imul(alow, blow); + let ahigh = builder.ins().swiden_high(a); + let bhigh = builder.ins().swiden_high(b); + let high = builder.ins().imul(ahigh, bhigh); + let dot = builder.ins().iadd_pairwise(low, high); + let dotlo = builder.ins().swiden_low(dot); + let dothi = builder.ins().swiden_high(dot); + let dot32 = builder.ins().iadd_pairwise(dotlo, dothi); + state.push1(builder.ins().iadd(dot32, c)); + } Operator::I16x8ExtMulLowI8x16S => { let (a, b) = pop2_with_bitcast(state, I8X16, builder); let a_low = builder.ins().swiden_low(a); @@ -2235,28 +2334,6 @@ pub fn translate_operator( Operator::ReturnCall { .. 
} | Operator::ReturnCallIndirect { .. } => { return Err(wasm_unsupported!("proposed tail-call operator {:?}", op)); } - Operator::I8x16RelaxedSwizzle - | Operator::I32x4RelaxedTruncF32x4S - | Operator::I32x4RelaxedTruncF32x4U - | Operator::I32x4RelaxedTruncF64x2SZero - | Operator::I32x4RelaxedTruncF64x2UZero - | Operator::F32x4RelaxedNmadd - | Operator::F32x4RelaxedMadd - | Operator::I8x16RelaxedLaneselect - | Operator::I16x8RelaxedLaneselect - | Operator::I32x4RelaxedLaneselect - | Operator::I64x2RelaxedLaneselect - | Operator::F32x4RelaxedMin - | Operator::F32x4RelaxedMax - | Operator::F64x2RelaxedMin - | Operator::F64x2RelaxedMax - | Operator::F64x2RelaxedMadd - | Operator::F64x2RelaxedNmadd - | Operator::I16x8RelaxedDotI8x16I7x16S - | Operator::I32x4RelaxedDotI8x16I7x16AddS - | Operator::I16x8RelaxedQ15mulrS => { - return Err(wasm_unsupported!("proposed relaxed-simd operator {:?}", op)); - } Operator::RefEq | Operator::StructNew { .. } | Operator::StructNewDefault { .. } @@ -3113,6 +3190,8 @@ fn type_of(operator: &Operator) -> Type { | Operator::I8x16ExtractLaneS { .. } | Operator::I8x16ExtractLaneU { .. } | Operator::I8x16ReplaceLane { .. } + | Operator::I8x16RelaxedSwizzle + | Operator::I8x16RelaxedLaneselect | Operator::I8x16Eq | Operator::I8x16Ne | Operator::I8x16LtS @@ -3150,6 +3229,7 @@ fn type_of(operator: &Operator) -> Type { | Operator::I16x8ExtractLaneS { .. } | Operator::I16x8ExtractLaneU { .. } | Operator::I16x8ReplaceLane { .. } + | Operator::I16x8RelaxedLaneselect | Operator::I16x8Eq | Operator::I16x8Ne | Operator::I16x8LtS @@ -3178,6 +3258,8 @@ fn type_of(operator: &Operator) -> Type { | Operator::I16x8MaxU | Operator::I16x8AvgrU | Operator::I16x8Mul + | Operator::I16x8RelaxedQ15mulrS + | Operator::I16x8RelaxedDotI8x16I7x16S | Operator::I16x8Bitmask => I16X8, Operator::I32x4Splat @@ -3186,6 +3268,7 @@ fn type_of(operator: &Operator) -> Type { | Operator::V128Store32Lane { .. } | Operator::I32x4ExtractLane { .. } | Operator::I32x4ReplaceLane { .. 
} + | Operator::I32x4RelaxedLaneselect | Operator::I32x4Eq | Operator::I32x4Ne | Operator::I32x4LtS @@ -3212,6 +3295,11 @@ fn type_of(operator: &Operator) -> Type { | Operator::I32x4Bitmask | Operator::I32x4TruncSatF32x4S | Operator::I32x4TruncSatF32x4U + | Operator::I32x4RelaxedTruncF32x4S + | Operator::I32x4RelaxedTruncF32x4U + | Operator::I32x4RelaxedTruncF64x2SZero + | Operator::I32x4RelaxedTruncF64x2UZero + | Operator::I32x4RelaxedDotI8x16I7x16AddS | Operator::V128Load32Zero { .. } => I32X4, Operator::I64x2Splat @@ -3220,6 +3308,7 @@ fn type_of(operator: &Operator) -> Type { | Operator::V128Store64Lane { .. } | Operator::I64x2ExtractLane { .. } | Operator::I64x2ReplaceLane { .. } + | Operator::I64x2RelaxedLaneselect | Operator::I64x2Eq | Operator::I64x2Ne | Operator::I64x2LtS @@ -3258,6 +3347,10 @@ fn type_of(operator: &Operator) -> Type { | Operator::F32x4Max | Operator::F32x4PMin | Operator::F32x4PMax + | Operator::F32x4RelaxedMin + | Operator::F32x4RelaxedMax + | Operator::F32x4RelaxedMadd + | Operator::F32x4RelaxedNmadd | Operator::F32x4ConvertI32x4S | Operator::F32x4ConvertI32x4U | Operator::F32x4Ceil @@ -3285,6 +3378,10 @@ fn type_of(operator: &Operator) -> Type { | Operator::F64x2Max | Operator::F64x2PMin | Operator::F64x2PMax + | Operator::F64x2RelaxedMin + | Operator::F64x2RelaxedMax + | Operator::F64x2RelaxedMadd + | Operator::F64x2RelaxedNmadd | Operator::F64x2Ceil | Operator::F64x2Floor | Operator::F64x2Trunc diff --git a/tests/compilers/wast.rs b/tests/compilers/wast.rs index c637d8a32dc..eee48e7db74 100644 --- a/tests/compilers/wast.rs +++ b/tests/compilers/wast.rs @@ -22,6 +22,7 @@ pub fn run_wast(mut config: crate::Config, wast_path: &str) -> anyhow::Result<() let mut features = Features::default(); let is_bulkmemory = wast_path.contains("bulk-memory"); let is_simd = wast_path.contains("simd"); + let is_relaxed_simd = wast_path.contains("relaxed-simd"); let is_threads = wast_path.contains("threads"); let is_exception_handling = 
wast_path.contains("exception-handling"); if is_bulkmemory { @@ -30,6 +31,9 @@ pub fn run_wast(mut config: crate::Config, wast_path: &str) -> anyhow::Result<() if is_simd { features.simd(true); } + if is_relaxed_simd { + features.relaxed_simd(true); + } if is_threads { features.threads(true); } diff --git a/tests/ignores.txt b/tests/ignores.txt index c671963d48e..3769315d50a 100644 --- a/tests/ignores.txt +++ b/tests/ignores.txt @@ -1,7 +1,9 @@ # Compilers singlepass spec::simd # Singlepass doesn't support yet SIMD (no one asked for this feature) +singlepass spec::relaxed_simd # Singlepass doesn't support relaxed SIMD yet singlepass wasmer::simd_generated_ext_ops singlepass wasmer::simd +llvm spec::relaxed_simd # LLVM compiler doesn't support relaxed SIMD yet singlepass spec::exception_handling # Singlepass doesn't support EH yet (no one asked for this feature) singlepass wasmer::exception_handling windows spec::exception_handling # No EH support on Windows yet @@ -69,6 +71,7 @@ cranelift+riscv64 spec::r#if::cranelift # no SIMD on riscv, Cranelift will not handle them cranelift+riscv64 spec::simd +cranelift+riscv64 spec::relaxed_simd # 6078 cranelift+riscv64 wasmer::simd_generated_ext_ops cranelift+riscv64 wasmer::simd diff --git a/tests/lib/wast/src/wast.rs b/tests/lib/wast/src/wast.rs index 5368e89cf85..c1fff494629 100644 --- a/tests/lib/wast/src/wast.rs +++ b/tests/lib/wast/src/wast.rs @@ -557,6 +557,9 @@ impl Wast { (Value::F32(a), WastRetCore::F32(b)) => f32_matches(*a, b), (Value::F64(a), WastRetCore::F64(b)) => f64_matches(*a, b), (Value::V128(a), WastRetCore::V128(b)) => v128_matches(*a, b), + (actual, WastRetCore::Either(cases)) => cases + .iter() + .any(|case| self.val_matches(actual, case).unwrap_or(false)), ( Value::FuncRef(None), WastRetCore::RefNull(Some(wast::core::HeapType::Abstract { diff --git a/tests/wast/spec/proposals/relaxed-simd/i16x8_relaxed_q15mulr_s.wast b/tests/wast/spec/proposals/relaxed-simd/i16x8_relaxed_q15mulr_s.wast new file mode 
100644 index 00000000000..00f901cbc2a --- /dev/null +++ b/tests/wast/spec/proposals/relaxed-simd/i16x8_relaxed_q15mulr_s.wast @@ -0,0 +1,28 @@ +;; Tests for i16x8.relaxed_q15mulr_s. +;; `either` comes from https://github.com/WebAssembly/threads. + +(module + (func (export "i16x8.relaxed_q15mulr_s") (param v128 v128) (result v128) (i16x8.relaxed_q15mulr_s (local.get 0) (local.get 1))) + + (func (export "i16x8.relaxed_q15mulr_s_cmp") (param v128 v128) (result v128) + (i16x8.eq + (i16x8.relaxed_q15mulr_s (local.get 0) (local.get 1)) + (i16x8.relaxed_q15mulr_s (local.get 0) (local.get 1)))) +) + +;; INT16_MIN = -32768 +(assert_return (invoke "i16x8.relaxed_q15mulr_s" + (v128.const i16x8 -32768 -32767 32767 0 0 0 0 0) + (v128.const i16x8 -32768 -32768 32767 0 0 0 0 0)) + ;; overflows, return either INT16_MIN or INT16_MAX + (either (v128.const i16x8 -32768 32767 32766 0 0 0 0 0) + (v128.const i16x8 32767 32767 32766 0 0 0 0 0))) + +;; Check that multiple calls to the relaxed instruction with same inputs returns same results. + +(assert_return (invoke "i16x8.relaxed_q15mulr_s_cmp" + (v128.const i16x8 -32768 -32767 32767 0 0 0 0 0) + (v128.const i16x8 -32768 -32768 32767 0 0 0 0 0)) + ;; overflows, return either INT16_MIN or INT16_MAX + (v128.const i16x8 -1 -1 -1 -1 -1 -1 -1 -1)) + diff --git a/tests/wast/spec/proposals/relaxed-simd/i32x4_relaxed_trunc.wast b/tests/wast/spec/proposals/relaxed-simd/i32x4_relaxed_trunc.wast new file mode 100644 index 00000000000..cca3ecb958a --- /dev/null +++ b/tests/wast/spec/proposals/relaxed-simd/i32x4_relaxed_trunc.wast @@ -0,0 +1,124 @@ +;; Tests for i32x4.relaxed_trunc_f32x4_s, i32x4.relaxed_trunc_f32x4_u, i32x4.relaxed_trunc_f64x2_s_zero, and i32x4.relaxed_trunc_f64x2_u_zero. +;; `either` comes from https://github.com/WebAssembly/threads. 
+ +(module + (func (export "i32x4.relaxed_trunc_f32x4_s") (param v128) (result v128) (i32x4.relaxed_trunc_f32x4_s (local.get 0))) + (func (export "i32x4.relaxed_trunc_f32x4_u") (param v128) (result v128) (i32x4.relaxed_trunc_f32x4_u (local.get 0))) + (func (export "i32x4.relaxed_trunc_f64x2_s_zero") (param v128) (result v128) (i32x4.relaxed_trunc_f64x2_s_zero (local.get 0))) + (func (export "i32x4.relaxed_trunc_f64x2_u_zero") (param v128) (result v128) (i32x4.relaxed_trunc_f64x2_u_zero (local.get 0))) + + (func (export "i32x4.relaxed_trunc_f32x4_s_cmp") (param v128) (result v128) + (i32x4.eq + (i32x4.relaxed_trunc_f32x4_s (local.get 0)) + (i32x4.relaxed_trunc_f32x4_s (local.get 0)))) + (func (export "i32x4.relaxed_trunc_f32x4_u_cmp") (param v128) (result v128) + (i32x4.eq + (i32x4.relaxed_trunc_f32x4_u (local.get 0)) + (i32x4.relaxed_trunc_f32x4_u (local.get 0)))) + (func (export "i32x4.relaxed_trunc_f64x2_s_zero_cmp") (param v128) (result v128) + (i32x4.eq + (i32x4.relaxed_trunc_f64x2_s_zero (local.get 0)) + (i32x4.relaxed_trunc_f64x2_s_zero (local.get 0)))) + (func (export "i32x4.relaxed_trunc_f64x2_u_zero_cmp") (param v128) (result v128) + (i32x4.eq + (i32x4.relaxed_trunc_f64x2_u_zero (local.get 0)) + (i32x4.relaxed_trunc_f64x2_u_zero (local.get 0)))) +) + +;; Test some edge cases around min/max to ensure that the instruction either +;; saturates correctly or returns INT_MIN. +;; +;; Note, though, that INT_MAX itself is not tested. The value for INT_MAX is +;; 2147483647 but that is not representable in a `f32` since it requires 31 bits +;; when a f32 has only 24 bits available. This means that the closest integers +;; to INT_MAX which can be represented are 2147483520 and 2147483648, meaning +;; that the INT_MAX test case cannot be tested. 
+(assert_return (invoke "i32x4.relaxed_trunc_f32x4_s" + ;; INT32_MIN INT32_MAX + (v128.const f32x4 -2147483648.0 -2147483904.0 2.0 2147483904.0)) + ;; out of range -> saturate or INT32_MIN + (either (v128.const i32x4 -2147483648 -2147483648 2 2147483647) + (v128.const i32x4 -2147483648 -2147483648 2 -2147483648))) + +(assert_return (invoke "i32x4.relaxed_trunc_f32x4_s" + (v128.const f32x4 nan -nan nan:0x444444 -nan:0x444444)) + ;; nans -> 0 or INT32_MIN + (either (v128.const i32x4 0 0 0 0) + (v128.const i32x4 0x80000000 0x80000000 0x80000000 0x80000000))) + +(assert_return (invoke "i32x4.relaxed_trunc_f32x4_u" + ;; UINT32_MIN UINT32_MIN-1 + (v128.const f32x4 0.0 -1.0 4294967040.0 4294967296.0)) + ;; out of range -> saturate or UINT32_MAX + (either (v128.const i32x4 0 0 4294967040 0xffffffff) + (v128.const i32x4 0 0xffffffff 4294967040 0xffffffff))) + +(assert_return (invoke "i32x4.relaxed_trunc_f32x4_u" + (v128.const f32x4 nan -nan nan:0x444444 -nan:0x444444)) + ;; nans -> 0 or UINT32_MAX + (either (v128.const i32x4 0 0 0 0) + (v128.const i32x4 0xffffffff 0xffffffff 0xffffffff 0xffffffff))) + +(assert_return (invoke "i32x4.relaxed_trunc_f64x2_s_zero" + (v128.const f64x2 -2147483904.0 2147483904.0)) + ;; out of range -> saturate or INT32_MIN + (either (v128.const i32x4 -2147483648 2147483647 0 0) + (v128.const i32x4 -2147483648 -2147483648 0 0))) + +(assert_return (invoke "i32x4.relaxed_trunc_f64x2_s_zero" + (v128.const f64x2 nan -nan)) + (either (v128.const i32x4 0 0 0 0) + (v128.const i32x4 0x80000000 0x80000000 0 0))) + +(assert_return (invoke "i32x4.relaxed_trunc_f64x2_u_zero" + (v128.const f64x2 -1.0 4294967296.0)) + ;; out of range -> saturate or UINT32_MAX + (either (v128.const i32x4 0 0xffffffff 0 0) + (v128.const i32x4 0xffffffff 0xffffffff 0 0))) + +(assert_return (invoke "i32x4.relaxed_trunc_f64x2_u_zero" + (v128.const f64x2 nan -nan)) + (either (v128.const i32x4 0 0 0 0) + (v128.const i32x4 0xffffffff 0xffffffff 0 0))) + +;; Check that multiple calls to the relaxed instruction with same inputs returns same results.
+ +(assert_return (invoke "i32x4.relaxed_trunc_f32x4_s_cmp" + ;; INT32_MIN INT32_MAX + (v128.const f32x4 -2147483648.0 -2147483904.0 2147483647.0 2147483904.0)) + ;; out of range -> saturate or INT32_MIN + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "i32x4.relaxed_trunc_f32x4_s_cmp" + (v128.const f32x4 nan -nan nan:0x444444 -nan:0x444444)) + ;; nans -> 0 or INT32_MIN + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "i32x4.relaxed_trunc_f32x4_u_cmp" + ;; UINT32_MIN UINT32_MIN-1 + (v128.const f32x4 0.0 -1.0 4294967040.0 4294967296.0)) + ;; out of range -> saturate or UINT32_MAX + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "i32x4.relaxed_trunc_f32x4_u_cmp" + (v128.const f32x4 nan -nan nan:0x444444 -nan:0x444444)) + ;; nans -> 0 or UINT32_MAX + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "i32x4.relaxed_trunc_f64x2_s_zero_cmp" + (v128.const f64x2 -2147483904.0 2147483904.0)) + ;; out of range -> saturate or INT32_MIN + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "i32x4.relaxed_trunc_f64x2_s_zero_cmp" + (v128.const f64x2 nan -nan)) + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "i32x4.relaxed_trunc_f64x2_u_zero_cmp" + (v128.const f64x2 -1.0 4294967296.0)) + ;; out of range -> saturate or UINT32_MAX + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "i32x4.relaxed_trunc_f64x2_u_zero_cmp" + (v128.const f64x2 nan -nan)) + (v128.const i32x4 -1 -1 -1 -1)) diff --git a/tests/wast/spec/proposals/relaxed-simd/i8x16_relaxed_swizzle.wast b/tests/wast/spec/proposals/relaxed-simd/i8x16_relaxed_swizzle.wast new file mode 100644 index 00000000000..f1bcb455209 --- /dev/null +++ b/tests/wast/spec/proposals/relaxed-simd/i8x16_relaxed_swizzle.wast @@ -0,0 +1,45 @@ +;; Tests for relaxed i8x16 swizzle. +;; `either` comes from https://github.com/WebAssembly/threads.
+ +(module + (func (export "i8x16.relaxed_swizzle") (param v128 v128) (result v128) (i8x16.relaxed_swizzle (local.get 0) (local.get 1))) + + (func (export "i8x16.relaxed_swizzle_cmp") (param v128 v128) (result v128) + (i8x16.eq + (i8x16.relaxed_swizzle (local.get 0) (local.get 1)) + (i8x16.relaxed_swizzle (local.get 0) (local.get 1)))) +) + +(assert_return (invoke "i8x16.relaxed_swizzle" + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)) + (either (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15))) + +;; out of range, returns 0 or modulo 15 if < 128 +(assert_return (invoke "i8x16.relaxed_swizzle" + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)) + (either (v128.const i8x16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15))) + +;; out of range, returns 0 if >= 128 +(assert_return (invoke "i8x16.relaxed_swizzle" + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 128 129 130 131 132 133 134 135 248 249 250 251 252 253 254 255)) + (either (v128.const i8x16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15))) + +;; Check that multiple calls to the relaxed instruction with same inputs returns same results. 
+ +;; out of range, returns 0 or modulo 15 if < 128 +(assert_return (invoke "i8x16.relaxed_swizzle_cmp" + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)) + (v128.const i8x16 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1)) + +;; out of range, returns 0 if >= 128 +(assert_return (invoke "i8x16.relaxed_swizzle_cmp" + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 128 129 130 131 132 133 134 135 248 249 250 251 252 253 254 255)) + (v128.const i8x16 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1)) diff --git a/tests/wast/spec/proposals/relaxed-simd/relaxed_dot_product.wast b/tests/wast/spec/proposals/relaxed-simd/relaxed_dot_product.wast new file mode 100644 index 00000000000..48714b87bd8 --- /dev/null +++ b/tests/wast/spec/proposals/relaxed-simd/relaxed_dot_product.wast @@ -0,0 +1,107 @@ +;; Tests for relaxed dot products. +;; `either` comes from https://github.com/WebAssembly/threads. + +(module + (func (export "i16x8.relaxed_dot_i8x16_i7x16_s") (param v128 v128) (result v128) (i16x8.relaxed_dot_i8x16_i7x16_s (local.get 0) (local.get 1))) + (func (export "i32x4.relaxed_dot_i8x16_i7x16_add_s") (param v128 v128 v128) (result v128) (i32x4.relaxed_dot_i8x16_i7x16_add_s (local.get 0) (local.get 1) (local.get 2))) + + (func (export "i16x8.relaxed_dot_i8x16_i7x16_s_cmp") (param v128 v128) (result v128) + (i16x8.eq + (i16x8.relaxed_dot_i8x16_i7x16_s (local.get 0) (local.get 1)) + (i16x8.relaxed_dot_i8x16_i7x16_s (local.get 0) (local.get 1)))) + (func (export "i32x4.relaxed_dot_i8x16_i7x16_add_s_cmp") (param v128 v128 v128) (result v128) + (i16x8.eq + (i32x4.relaxed_dot_i8x16_i7x16_add_s (local.get 0) (local.get 1) (local.get 2)) + (i32x4.relaxed_dot_i8x16_i7x16_add_s (local.get 0) (local.get 1) (local.get 2)))) +) + +;; Simple values to ensure things are functional. 
+(assert_return (invoke "i16x8.relaxed_dot_i8x16_i7x16_s" + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)) + (v128.const i16x8 1 13 41 85 145 221 313 421)) + +;; Test max and min i8 values; +(assert_return (invoke "i16x8.relaxed_dot_i8x16_i7x16_s" + (v128.const i8x16 -128 -128 127 127 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i8x16 127 127 127 127 0 0 0 0 0 0 0 0 0 0 0 0)) + (v128.const i16x8 -32512 32258 0 0 0 0 0 0)) + +;; signed * unsigned : -128 * 129 * 2 = -33,024 saturated to -32,768 +;; signed * signed : -128 * -127 * 2 = 32,512 +;; unsigned * unsigned : 128 * 129 * 2 = 33,024 +(assert_return (invoke "i16x8.relaxed_dot_i8x16_i7x16_s" + (v128.const i8x16 -128 -128 0 0 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i8x16 -127 -127 0 0 0 0 0 0 0 0 0 0 0 0 0 0)) + (either + (v128.const i16x8 -32768 0 0 0 0 0 0 0) + (v128.const i16x8 32512 0 0 0 0 0 0 0) + (v128.const i16x8 33024 0 0 0 0 0 0 0))) + +;; Simple values to ensure things are functional. 
+(assert_return (invoke "i32x4.relaxed_dot_i8x16_i7x16_add_s" + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i32x4 0 1 2 3)) + ;; intermediate result is [14, 126, 366, 734] + (v128.const i32x4 14 127 368 737)) + +;; Test max and min i8 values; +(assert_return (invoke "i32x4.relaxed_dot_i8x16_i7x16_add_s" + (v128.const i8x16 -128 -128 -128 -128 127 127 127 127 0 0 0 0 0 0 0 0) + (v128.const i8x16 127 127 127 127 127 127 127 127 0 0 0 0 0 0 0 0) + (v128.const i32x4 1 2 3 4)) + ;; intermediate result is [-65024, 64516, 0, 0] + (v128.const i32x4 -65023 64518 3 4)) + +;; signed * unsigned : -128 * 129 * 4 = -66,048 (+ 1) VPDPBUSD AVX2-VNNI or AVX512-VNNI +;; signed * unsigned with intermediate saturation : +;; (-128 * 129) + (-128 * 129) = -33024 saturated to -32768 (PMADDUBSW) +;; -32768 + -32768 = -65536 (+ 1) +;; signed * signed : -128 * -127 * 4 = 65,024 (+ 1) +;; unsigned * unsigned : 128 * 129 * 2 = 66,048 (+ 1) +(assert_return (invoke "i32x4.relaxed_dot_i8x16_i7x16_add_s" + (v128.const i8x16 -128 -128 -128 -128 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i8x16 -127 -127 -127 -127 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i32x4 1 2 3 4)) + (either + (v128.const i32x4 -66047 2 3 4) + (v128.const i32x4 -65535 2 3 4) + (v128.const i32x4 65025 2 3 4) + (v128.const i32x4 66049 2 3 4))) + +;; Check that multiple calls to the relaxed instruction with same inputs returns same results. 
+ +;; Test max and min i8 values; +(assert_return (invoke "i16x8.relaxed_dot_i8x16_i7x16_s_cmp" + (v128.const i8x16 -128 -128 127 127 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i8x16 127 127 127 127 0 0 0 0 0 0 0 0 0 0 0 0)) + (v128.const i16x8 -1 -1 -1 -1 -1 -1 -1 -1)) + +;; Test max and min i8 values; +(assert_return (invoke "i32x4.relaxed_dot_i8x16_i7x16_add_s_cmp" + (v128.const i8x16 -128 -128 -128 -128 127 127 127 127 0 0 0 0 0 0 0 0) + (v128.const i8x16 127 127 127 127 127 127 127 127 0 0 0 0 0 0 0 0) + (v128.const i32x4 1 2 3 4)) + ;; intermediate result is [-65024, 64516, 0, 0] + (v128.const i32x4 -1 -1 -1 -1)) + +;; signed * unsigned : -128 * 129 * 2 = -33,024 saturated to -32,768 +;; signed * signed : -128 * -127 * 2 = 32,512 +;; unsigned * unsigned : 128 * 129 * 2 = 33,024 +(assert_return (invoke "i16x8.relaxed_dot_i8x16_i7x16_s_cmp" + (v128.const i8x16 -128 -128 0 0 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i8x16 -127 -127 0 0 0 0 0 0 0 0 0 0 0 0 0 0)) + (v128.const i16x8 -1 -1 -1 -1 -1 -1 -1 -1)) + +;; signed * unsigned : -128 * 129 * 4 = -66,048 (+ 1) VPDPBUSD AVX2-VNNI or AVX512-VNNI +;; signed * unsigned with intermediate saturation : +;; (-128 * 129) + (-128 * 129) = -33024 saturated to -32768 (PMADDUBSW) +;; -32768 + -32768 = -65536 (+ 1) +;; signed * signed : -128 * -127 * 4 = 65,024 (+ 1) +;; unsigned * unsigned : 128 * 129 * 2 = 66,048 (+ 1) +(assert_return (invoke "i32x4.relaxed_dot_i8x16_i7x16_add_s_cmp" + (v128.const i8x16 -128 -128 -128 -128 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i8x16 -127 -127 -127 -127 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i32x4 1 2 3 4)) + (v128.const i32x4 -1 -1 -1 -1)) diff --git a/tests/wast/spec/proposals/relaxed-simd/relaxed_laneselect.wast b/tests/wast/spec/proposals/relaxed-simd/relaxed_laneselect.wast new file mode 100644 index 00000000000..10913816b0b --- /dev/null +++ b/tests/wast/spec/proposals/relaxed-simd/relaxed_laneselect.wast @@ -0,0 +1,103 @@ +;; Tests for i8x16.relaxed_laneselect, i16x8.relaxed_laneselect, 
i32x4.relaxed_laneselect, and i64x2.relaxed_laneselect. +;; `either` comes from https://github.com/WebAssembly/threads. + +(module + (func (export "i8x16.relaxed_laneselect") (param v128 v128 v128) (result v128) (i8x16.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2))) + (func (export "i16x8.relaxed_laneselect") (param v128 v128 v128) (result v128) (i16x8.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2))) + (func (export "i32x4.relaxed_laneselect") (param v128 v128 v128) (result v128) (i32x4.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2))) + (func (export "i64x2.relaxed_laneselect") (param v128 v128 v128) (result v128) (i64x2.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2))) + + (func (export "i8x16.relaxed_laneselect_cmp") (param v128 v128 v128) (result v128) + (i8x16.eq + (i8x16.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)) + (i8x16.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)))) + (func (export "i16x8.relaxed_laneselect_cmp") (param v128 v128 v128) (result v128) + (i16x8.eq + (i16x8.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)) + (i16x8.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)))) + (func (export "i32x4.relaxed_laneselect_cmp") (param v128 v128 v128) (result v128) + (i32x4.eq + (i32x4.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)) + (i32x4.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)))) + (func (export "i64x2.relaxed_laneselect_cmp") (param v128 v128 v128) (result v128) + (i64x2.eq + (i64x2.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)) + (i64x2.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)))) +) + +(assert_return (invoke "i8x16.relaxed_laneselect" + (v128.const i8x16 0 1 0x12 0x12 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 16 17 0x34 0x34 20 21 22 23 24 25 26 27 28 29 30 31) + (v128.const i8x16 0xff 0 0xf0 0x0f 0 0 0 0 0 0 0 0 0 0 0 0)) + (either (v128.const i8x16 0 17 0x14 
0x32 20 21 22 23 24 25 26 27 28 29 30 31) + (v128.const i8x16 0 17 0x12 0x34 20 21 22 23 24 25 26 27 28 29 30 31))) + +(assert_return (invoke "i16x8.relaxed_laneselect" + (v128.const i16x8 0 1 0x1234 0x1234 4 5 6 7) + (v128.const i16x8 8 9 0x5678 0x5678 12 13 14 15) + (v128.const i16x8 0xffff 0 0xff00 0x00ff 0 0 0 0)) + (either (v128.const i16x8 0 9 0x1278 0x5634 12 13 14 15) + (v128.const i16x8 0 9 0x1234 0x5678 12 13 14 15))) + +;; special case for i16x8 to allow pblendvb +(assert_return (invoke "i16x8.relaxed_laneselect" + (v128.const i16x8 0 1 0x1234 0x1234 4 5 6 7) + (v128.const i16x8 8 9 0x5678 0x5678 12 13 14 15) + (v128.const i16x8 0xffff 0 0xff00 0x0080 0 0 0 0)) ;; 0x0080 is the special case + (either (v128.const i16x8 0 9 0x1278 0x5678 12 13 14 15) ;; bitselect + (v128.const i16x8 0 9 0x1234 0x5678 12 13 14 15) ;; top bit of i16 lane examined + (v128.const i16x8 0 9 0x1278 0x5634 12 13 14 15) ;; top bit of each byte + )) + +(assert_return (invoke "i32x4.relaxed_laneselect" + (v128.const i32x4 0 1 0x12341234 0x12341234) + (v128.const i32x4 4 5 0x56785678 0x56785678) + (v128.const i32x4 0xffffffff 0 0xffff0000 0x0000ffff)) + (either (v128.const i32x4 0 5 0x12345678 0x56781234) + (v128.const i32x4 0 5 0x12341234 0x56785678))) + +(assert_return (invoke "i64x2.relaxed_laneselect" + (v128.const i64x2 0 1) + (v128.const i64x2 2 3) + (v128.const i64x2 0xffffffffffffffff 0)) + (either (v128.const i64x2 0 3) + (v128.const i64x2 0 3))) + +(assert_return (invoke "i64x2.relaxed_laneselect" + (v128.const i64x2 0x1234123412341234 0x1234123412341234) + (v128.const i64x2 0x5678567856785678 0x5678567856785678) + (v128.const i64x2 0xffffffff00000000 0x00000000ffffffff)) + (either (v128.const i64x2 0x1234123456785678 0x5678567812341234) + (v128.const i64x2 0x1234123412341234 0x5678567856785678))) + +;; Check that multiple calls to the relaxed instruction with same inputs returns same results. 
+ +(assert_return (invoke "i8x16.relaxed_laneselect_cmp" + (v128.const i8x16 0 1 0x12 0x12 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 16 17 0x34 0x34 20 21 22 23 24 25 26 27 28 29 30 31) + (v128.const i8x16 0xff 0 0xf0 0x0f 0 0 0 0 0 0 0 0 0 0 0 0)) + (v128.const i8x16 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1)) + +(assert_return (invoke "i16x8.relaxed_laneselect_cmp" + (v128.const i16x8 0 1 0x1234 0x1234 4 5 6 7) + (v128.const i16x8 8 9 0x5678 0x5678 12 13 14 15) + (v128.const i16x8 0xffff 0 0xff00 0x00ff 0 0 0 0)) + (v128.const i16x8 -1 -1 -1 -1 -1 -1 -1 -1)) + +(assert_return (invoke "i32x4.relaxed_laneselect_cmp" + (v128.const i32x4 0 1 0x12341234 0x12341234) + (v128.const i32x4 4 5 0x56785678 0x56785678) + (v128.const i32x4 0xffffffff 0 0xffff0000 0x0000ffff)) + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "i64x2.relaxed_laneselect_cmp" + (v128.const i64x2 0 1) + (v128.const i64x2 2 3) + (v128.const i64x2 0xffffffffffffffff 0)) + (v128.const i64x2 -1 -1)) + +(assert_return (invoke "i64x2.relaxed_laneselect_cmp" + (v128.const i64x2 0x1234123412341234 0x1234123412341234) + (v128.const i64x2 0x5678567856785678 0x5678567856785678) + (v128.const i64x2 0xffffffff00000000 0x00000000ffffffff)) + (v128.const i64x2 -1 -1)) diff --git a/tests/wast/spec/proposals/relaxed-simd/relaxed_madd_nmadd.wast b/tests/wast/spec/proposals/relaxed-simd/relaxed_madd_nmadd.wast new file mode 100644 index 00000000000..187b71d5a3f --- /dev/null +++ b/tests/wast/spec/proposals/relaxed-simd/relaxed_madd_nmadd.wast @@ -0,0 +1,224 @@ +;; Tests for f32x4.relaxed_madd, f32x4.relaxed_nmadd, f64x2.relaxed_madd, and f64x2.relaxed_nmadd. +;; `either` comes from https://github.com/WebAssembly/threads. 
+ +(module + (func (export "f32x4.relaxed_madd") (param v128 v128 v128) (result v128) (f32x4.relaxed_madd (local.get 0) (local.get 1) (local.get 2))) + (func (export "f32x4.relaxed_nmadd") (param v128 v128 v128) (result v128) (f32x4.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2))) + (func (export "f64x2.relaxed_nmadd") (param v128 v128 v128) (result v128) (f64x2.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2))) + (func (export "f64x2.relaxed_madd") (param v128 v128 v128) (result v128) (f64x2.relaxed_madd (local.get 0) (local.get 1) (local.get 2))) + + (func (export "f32x4.relaxed_madd_cmp") (param v128 v128 v128) (result v128) + (f32x4.eq + (f32x4.relaxed_madd (local.get 0) (local.get 1) (local.get 2)) + (f32x4.relaxed_madd (local.get 0) (local.get 1) (local.get 2)))) + (func (export "f32x4.relaxed_nmadd_cmp") (param v128 v128 v128) (result v128) + (f32x4.eq + (f32x4.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2)) + (f32x4.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2)))) + (func (export "f64x2.relaxed_nmadd_cmp") (param v128 v128 v128) (result v128) + (f64x2.eq + (f64x2.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2)) + (f64x2.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2)))) + (func (export "f64x2.relaxed_madd_cmp") (param v128 v128 v128) (result v128) + (f64x2.eq + (f64x2.relaxed_madd (local.get 0) (local.get 1) (local.get 2)) + (f64x2.relaxed_madd (local.get 0) (local.get 1) (local.get 2)))) +) + + +;; FLT_MAX == 0x1.fffffep+127 +;; FLT_MAX * 2 - FLT_MAX == +;; FLT_MAX (if fma) +;; 0 (if no fma) +;; from https://www.vinc17.net/software/fma-tests.c +(assert_return (invoke "f32x4.relaxed_madd" + (v128.const f32x4 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 ) + (v128.const f32x4 2.0 2.0 2.0 2.0) + (v128.const f32x4 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127)) + (either (v128.const f32x4 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127) + (v128.const 
f32x4 inf inf inf inf))) + +;; Special values for float: +;; x = 0x1.000004p+0 (1 + 2^-22) +;; y = 0x1.0002p+0 (1 + 2^-15) +;; z = -(1.0 + 0x0.0002p+0 + 0x0.000004p+0) +;; = -0x1.000204p+0 +;; x.y = 1.0 + 0x0.0002p+0 + 0x0.000004p+0 + 0x1p-37 (round bit) +;; x.y+z = 0 (2 roundings) +;; fma(x, y, z) = (0x1p-37) 2^-37 +;; from https://accurate-algorithms.readthedocs.io/en/latest/ch09appendix.html#test-system-information +(assert_return (invoke "f32x4.relaxed_madd" + (v128.const f32x4 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0) + (v128.const f32x4 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0) + (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0)) + (either (v128.const f32x4 0x1p-37 0x1p-37 0x1p-37 0x1p-37) + (v128.const f32x4 0 0 0 0))) +;; nmadd tests with negated x, same answers are expected. +(assert_return (invoke "f32x4.relaxed_nmadd" + (v128.const f32x4 -0x1.000004p+0 -0x1.000004p+0 -0x1.000004p+0 -0x1.000004p+0) + (v128.const f32x4 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0) + (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0)) + (either (v128.const f32x4 0x1p-37 0x1p-37 0x1p-37 0x1p-37) + (v128.const f32x4 0 0 0 0))) +;; nmadd tests with negated y, same answers are expected. 
+(assert_return (invoke "f32x4.relaxed_nmadd"
+                       (v128.const f32x4 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0)
+                       (v128.const f32x4 -0x1.0002p+0 -0x1.0002p+0 -0x1.0002p+0 -0x1.0002p+0)
+                       (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0))
+               (either (v128.const f32x4 0x1p-37 0x1p-37 0x1p-37 0x1p-37)
+                       (v128.const f32x4 0 0 0 0)))
+
+;; DBL_MAX = 0x1.fffffffffffffp+1023
+;; DBL_MAX * 2 - DBL_MAX ==
+;;   DBL_MAX (if fma)
+;;   0 (if no fma)
+;; from https://www.vinc17.net/software/fma-tests.c
+(assert_return (invoke "f64x2.relaxed_madd"
+                       (v128.const f64x2 0x1.fffffffffffffp+1023 0x1.fffffffffffffp+1023)
+                       (v128.const f64x2 2.0 2.0)
+                       (v128.const f64x2 -0x1.fffffffffffffp+1023 -0x1.fffffffffffffp+1023))
+               (either (v128.const f64x2 0x1.fffffffffffffp+1023 0x1.fffffffffffffp+1023)
+                       (v128.const f64x2 inf inf)))
+
+;; Special values for double:
+;; x = 0x1.00000004p+0 (1 + 2^-30)
+;; y = 0x1.000002p+0 (1 + 2^-23)
+;; z = -(1.0 + 0x0.000002p+0 + 0x0.00000004p+0)
+;;   = -0x1.00000204p+0
+;; x.y = 1.0 + 0x0.000002p+0 + 0x0.00000004p+0 + 0x1p-53 (round bit)
+;; x.y+z = 0 (2 roundings)
+;; fma(x, y, z) = 0x1p-53
+;; from https://accurate-algorithms.readthedocs.io/en/latest/ch09appendix.html#test-system-information
+(assert_return (invoke "f64x2.relaxed_madd"
+                       (v128.const f64x2 0x1.00000004p+0 0x1.00000004p+0)
+                       (v128.const f64x2 0x1.000002p+0 0x1.000002p+0)
+                       (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0))
+               (either (v128.const f64x2 0x1p-53 0x1p-53)
+                       (v128.const f64x2 0 0)))
+;; nmadd tests with negated x, same answers are expected.
+(assert_return (invoke "f64x2.relaxed_nmadd"
+                       (v128.const f64x2 -0x1.00000004p+0 -0x1.00000004p+0)
+                       (v128.const f64x2 0x1.000002p+0 0x1.000002p+0)
+                       (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0))
+               (either (v128.const f64x2 0x1p-53 0x1p-53)
+                       (v128.const f64x2 0 0)))
+;; nmadd tests with negated y, same answers are expected.
+(assert_return (invoke "f64x2.relaxed_nmadd" + (v128.const f64x2 0x1.00000004p+0 0x1.00000004p+0) + (v128.const f64x2 -0x1.000002p+0 -0x1.000002p+0) + (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0)) + (either (v128.const f64x2 0x1p-53 0x1p-53) + (v128.const f64x2 0 0))) + +;; Check that multiple calls to the relaxed instruction with same inputs returns same results. + +;; FLT_MAX == 0x1.fffffep+127 +;; FLT_MAX * 2 - FLT_MAX == +;; FLT_MAX (if fma) +;; 0 (if no fma) +;; from https://www.vinc17.net/software/fma-tests.c +(assert_return (invoke "f32x4.relaxed_madd_cmp" + (v128.const f32x4 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 ) + (v128.const f32x4 2.0 2.0 2.0 2.0) + (v128.const f32x4 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127)) + (v128.const i32x4 -1 -1 -1 -1)) + +;; Special values for float: +;; x = 0x1.000004p+0 (1 + 2^-22) +;; y = 0x1.0002p+0 (1 + 2^-15) +;; z = -(1.0 + 0x0.0002p+0 + 0x0.000004p+0) +;; = -0x1.000204p+0 +;; x.y = 1.0 + 0x0.0002p+0 + 0x0.000004p+0 + 0x1p-37 (round bit) +;; x.y+z = 0 (2 roundings) +;; fma(x, y, z) = (0x1p-37) 2^-37 +;; from https://accurate-algorithms.readthedocs.io/en/latest/ch09appendix.html#test-system-information +(assert_return (invoke "f32x4.relaxed_madd_cmp" + (v128.const f32x4 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0) + (v128.const f32x4 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0) + (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0)) + (v128.const i32x4 -1 -1 -1 -1)) +;; nmadd tests with negated x, same answers are expected. +(assert_return (invoke "f32x4.relaxed_nmadd_cmp" + (v128.const f32x4 -0x1.000004p+0 -0x1.000004p+0 -0x1.000004p+0 -0x1.000004p+0) + (v128.const f32x4 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0) + (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0)) + (v128.const i32x4 -1 -1 -1 -1)) +;; nmadd tests with negated y, same answers are expected. 
+(assert_return (invoke "f32x4.relaxed_nmadd_cmp"
+                       (v128.const f32x4 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0)
+                       (v128.const f32x4 -0x1.0002p+0 -0x1.0002p+0 -0x1.0002p+0 -0x1.0002p+0)
+                       (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0))
+               (v128.const i32x4 -1 -1 -1 -1))
+
+;; DBL_MAX = 0x1.fffffffffffffp+1023
+;; DBL_MAX * 2 - DBL_MAX ==
+;;   DBL_MAX (if fma)
+;;   0 (if no fma)
+;; from https://www.vinc17.net/software/fma-tests.c
+(assert_return (invoke "f64x2.relaxed_madd_cmp"
+                       (v128.const f64x2 0x1.fffffffffffffp+1023 0x1.fffffffffffffp+1023)
+                       (v128.const f64x2 2.0 2.0)
+                       (v128.const f64x2 -0x1.fffffffffffffp+1023 -0x1.fffffffffffffp+1023))
+               (v128.const i64x2 -1 -1))
+
+;; Special values for double:
+;; x = 0x1.00000004p+0 (1 + 2^-30)
+;; y = 0x1.000002p+0 (1 + 2^-23)
+;; z = -(1.0 + 0x0.000002p+0 + 0x0.00000004p+0)
+;;   = -0x1.00000204p+0
+;; x.y = 1.0 + 0x0.000002p+0 + 0x0.00000004p+0 + 0x1p-53 (round bit)
+;; x.y+z = 0 (2 roundings)
+;; fma(x, y, z) = 0x1p-53
+;; from https://accurate-algorithms.readthedocs.io/en/latest/ch09appendix.html#test-system-information
+(assert_return (invoke "f64x2.relaxed_madd_cmp"
+                       (v128.const f64x2 0x1.00000004p+0 0x1.00000004p+0)
+                       (v128.const f64x2 0x1.000002p+0 0x1.000002p+0)
+                       (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0))
+               (v128.const i64x2 -1 -1))
+;; nmadd tests with negated x, same answers are expected.
+(assert_return (invoke "f64x2.relaxed_nmadd_cmp"
+                       (v128.const f64x2 -0x1.00000004p+0 -0x1.00000004p+0)
+                       (v128.const f64x2 0x1.000002p+0 0x1.000002p+0)
+                       (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0))
+               (v128.const i64x2 -1 -1))
+;; nmadd tests with negated y, same answers are expected.
+(assert_return (invoke "f64x2.relaxed_nmadd_cmp" + (v128.const f64x2 0x1.00000004p+0 0x1.00000004p+0) + (v128.const f64x2 -0x1.000002p+0 -0x1.000002p+0) + (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0)) + (v128.const i64x2 -1 -1)) + +;; Test that the non-deterministic choice of fusing and then rounding or +;; rounding multiple times in `relaxed_madd` is consistent throughout a +;; program's execution. +;; +;; This property is impossible to test exhaustively, so this is just a simple +;; smoke test for when the operands to a `relaxed_madd` are known statically +;; versus when they are dynamically supplied. This should, at least, catch +;; illegal constant-folding and -propagation by the compiler that leads to +;; inconsistent rounding behavior at compile time versus at run time. +;; +;; FLT_MAX == 0x1.fffffep+127 +;; FLT_MAX * 2 - FLT_MAX == +;; FLT_MAX (if fma) +;; 0 (if no fma) +;; from https://www.vinc17.net/software/fma-tests.c +(module + (func (export "test-consistent-nondeterminism") (param v128 v128 v128) (result v128) + (f32x4.eq + (f32x4.relaxed_madd (v128.const f32x4 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 ) + (v128.const f32x4 2.0 2.0 2.0 2.0) + (v128.const f32x4 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127)) + (f32x4.relaxed_madd (local.get 0) + (local.get 1) + (local.get 2)) + ) + ) +) +(assert_return (invoke "test-consistent-nondeterminism" + (v128.const f32x4 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 ) + (v128.const f32x4 2.0 2.0 2.0 2.0) + (v128.const f32x4 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127)) + (v128.const i32x4 -1 -1 -1 -1)) diff --git a/tests/wast/spec/proposals/relaxed-simd/relaxed_min_max.wast b/tests/wast/spec/proposals/relaxed-simd/relaxed_min_max.wast new file mode 100644 index 00000000000..ac3ebb07cac --- /dev/null +++ b/tests/wast/spec/proposals/relaxed-simd/relaxed_min_max.wast @@ -0,0 +1,184 @@ +;; Tests for f32x4.min, f32x4.max, 
f64x2.min, and f64x2.max. +;; `either` comes from https://github.com/WebAssembly/threads. + +(module + (func (export "f32x4.relaxed_min") (param v128 v128) (result v128) (f32x4.relaxed_min (local.get 0) (local.get 1))) + (func (export "f32x4.relaxed_max") (param v128 v128) (result v128) (f32x4.relaxed_max (local.get 0) (local.get 1))) + (func (export "f64x2.relaxed_min") (param v128 v128) (result v128) (f64x2.relaxed_min (local.get 0) (local.get 1))) + (func (export "f64x2.relaxed_max") (param v128 v128) (result v128) (f64x2.relaxed_max (local.get 0) (local.get 1))) + + (func (export "f32x4.relaxed_min_cmp") (param v128 v128) (result v128) + (i32x4.eq + (f32x4.relaxed_min (local.get 0) (local.get 1)) + (f32x4.relaxed_min (local.get 0) (local.get 1)))) + (func (export "f32x4.relaxed_max_cmp") (param v128 v128) (result v128) + (i32x4.eq + (f32x4.relaxed_max (local.get 0) (local.get 1)) + (f32x4.relaxed_max (local.get 0) (local.get 1)))) + (func (export "f64x2.relaxed_min_cmp") (param v128 v128) (result v128) + (i64x2.eq + (f64x2.relaxed_min (local.get 0) (local.get 1)) + (f64x2.relaxed_min (local.get 0) (local.get 1)))) + (func (export "f64x2.relaxed_max_cmp") (param v128 v128) (result v128) + (i64x2.eq + (f64x2.relaxed_max (local.get 0) (local.get 1)) + (f64x2.relaxed_max (local.get 0) (local.get 1)))) +) + +(assert_return (invoke "f32x4.relaxed_min" + (v128.const f32x4 -nan nan 0 0) + (v128.const f32x4 0 0 -nan nan)) + (either (v128.const f32x4 nan:canonical nan:canonical nan:canonical nan:canonical) + (v128.const f32x4 nan:canonical nan:canonical 0 0) + (v128.const f32x4 0 0 nan:canonical nan:canonical) + (v128.const f32x4 0 0 0 0))) + +(assert_return (invoke "f32x4.relaxed_min" + (v128.const f32x4 +0.0 -0.0 +0.0 -0.0) + (v128.const f32x4 -0.0 +0.0 +0.0 -0.0)) + (either (v128.const f32x4 -0.0 -0.0 +0.0 -0.0) + (v128.const f32x4 +0.0 -0.0 +0.0 -0.0) + (v128.const f32x4 -0.0 +0.0 +0.0 -0.0) + (v128.const f32x4 -0.0 -0.0 +0.0 -0.0))) + +(assert_return (invoke 
"f32x4.relaxed_max" + (v128.const f32x4 -nan nan 0 0) + (v128.const f32x4 0 0 -nan nan)) + (either (v128.const f32x4 nan:canonical nan:canonical nan:canonical nan:canonical) + (v128.const f32x4 nan:canonical nan:canonical 0 0) + (v128.const f32x4 0 0 nan:canonical nan:canonical) + (v128.const f32x4 0 0 0 0))) + +(assert_return (invoke "f32x4.relaxed_max" + (v128.const f32x4 +0.0 -0.0 +0.0 -0.0) + (v128.const f32x4 -0.0 +0.0 +0.0 -0.0)) + (either (v128.const f32x4 +0.0 +0.0 +0.0 -0.0) + (v128.const f32x4 +0.0 -0.0 +0.0 -0.0) + (v128.const f32x4 -0.0 +0.0 +0.0 -0.0) + (v128.const f32x4 -0.0 -0.0 +0.0 -0.0))) + +(assert_return (invoke "f64x2.relaxed_min" + (v128.const f64x2 -nan nan) + (v128.const f64x2 0 0)) + (either (v128.const f64x2 nan:canonical nan:canonical) + (v128.const f64x2 nan:canonical nan:canonical) + (v128.const f64x2 0 0) + (v128.const f64x2 0 0))) + +(assert_return (invoke "f64x2.relaxed_min" + (v128.const f64x2 0 0) + (v128.const f64x2 -nan nan)) + (either (v128.const f64x2 nan:canonical nan:canonical) + (v128.const f64x2 0 0) + (v128.const f64x2 nan:canonical nan:canonical) + (v128.const f64x2 0 0))) + +(assert_return (invoke "f64x2.relaxed_min" + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 -0.0 +0.0)) + (either (v128.const f64x2 -0.0 -0.0) + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 -0.0 +0.0) + (v128.const f64x2 -0.0 -0.0))) + +(assert_return (invoke "f64x2.relaxed_min" + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0)) + (either (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0))) + +(assert_return (invoke "f64x2.relaxed_max" + (v128.const f64x2 -nan nan) + (v128.const f64x2 0 0)) + (either (v128.const f64x2 nan:canonical nan:canonical) + (v128.const f64x2 nan:canonical nan:canonical) + (v128.const f64x2 0 0) + (v128.const f64x2 0 0))) + +(assert_return (invoke "f64x2.relaxed_max" + (v128.const f64x2 0 0) + (v128.const f64x2 -nan nan)) + (either 
(v128.const f64x2 nan:canonical nan:canonical) + (v128.const f64x2 0 0) + (v128.const f64x2 nan:canonical nan:canonical) + (v128.const f64x2 0 0))) + +(assert_return (invoke "f64x2.relaxed_max" + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 -0.0 +0.0)) + (either (v128.const f64x2 +0.0 +0.0) + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 -0.0 +0.0) + (v128.const f64x2 -0.0 -0.0))) + +(assert_return (invoke "f64x2.relaxed_max" + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0)) + (either (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0))) + +;; Check that multiple calls to the relaxed instruction with same inputs returns same results. + +(assert_return (invoke "f32x4.relaxed_min_cmp" + (v128.const f32x4 -nan nan 0 0) + (v128.const f32x4 0 0 -nan nan)) + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "f32x4.relaxed_min_cmp" + (v128.const f32x4 +0.0 -0.0 +0.0 -0.0) + (v128.const f32x4 -0.0 +0.0 +0.0 -0.0)) + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "f32x4.relaxed_max_cmp" + (v128.const f32x4 -nan nan 0 0) + (v128.const f32x4 0 0 -nan nan)) + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "f32x4.relaxed_max_cmp" + (v128.const f32x4 +0.0 -0.0 +0.0 -0.0) + (v128.const f32x4 -0.0 +0.0 +0.0 -0.0)) + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "f64x2.relaxed_min_cmp" + (v128.const f64x2 -nan nan) + (v128.const f64x2 0 0)) + (v128.const i64x2 -1 -1)) + +(assert_return (invoke "f64x2.relaxed_min_cmp" + (v128.const f64x2 0 0) + (v128.const f64x2 -nan nan)) + (v128.const i64x2 -1 -1)) + +(assert_return (invoke "f64x2.relaxed_min_cmp" + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 -0.0 +0.0)) + (v128.const i64x2 -1 -1)) + +(assert_return (invoke "f64x2.relaxed_min_cmp" + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0)) + (v128.const i64x2 -1 -1)) + +(assert_return (invoke "f64x2.relaxed_max_cmp" + (v128.const f64x2 -nan 
nan) + (v128.const f64x2 0 0)) + (v128.const i64x2 -1 -1)) + +(assert_return (invoke "f64x2.relaxed_max_cmp" + (v128.const f64x2 0 0) + (v128.const f64x2 -nan nan)) + (v128.const i64x2 -1 -1)) + +(assert_return (invoke "f64x2.relaxed_max_cmp" + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 -0.0 +0.0)) + (v128.const i64x2 -1 -1)) + +(assert_return (invoke "f64x2.relaxed_max_cmp" + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0)) + (v128.const i64x2 -1 -1)) From 662231311c98c04cb772114d09fa9e01c3cc0fe0 Mon Sep 17 00:00:00 2001 From: Syrus Akbary Date: Sun, 1 Feb 2026 20:31:07 +0100 Subject: [PATCH 02/18] Added Relaxed SIMD LLVM implementation --- lib/compiler-llvm/src/translator/code.rs | 295 ++++++++++++++++++++++- tests/ignores.txt | 1 - 2 files changed, 285 insertions(+), 11 deletions(-) diff --git a/lib/compiler-llvm/src/translator/code.rs b/lib/compiler-llvm/src/translator/code.rs index ce663fe7898..33acf0f1d96 100644 --- a/lib/compiler-llvm/src/translator/code.rs +++ b/lib/compiler-llvm/src/translator/code.rs @@ -3733,7 +3733,7 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { ); self.state.push1(res); } - Operator::I16x8Q15MulrSatS => { + Operator::I16x8Q15MulrSatS | Operator::I16x8RelaxedQ15mulrS => { let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; let (v1, _) = self.v128_into_i16x8(v1, i1)?; let (v2, _) = self.v128_into_i16x8(v2, i2)?; @@ -4018,6 +4018,174 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { ); self.state.push1(res); } + Operator::I16x8RelaxedDotI8x16I7x16S => { + let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; + let (v1, _) = self.v128_into_i8x16(v1, i1)?; + let (v2, _) = self.v128_into_i8x16(v2, i2)?; + + let left_indices = [ + self.intrinsics.i32_consts[0], + self.intrinsics.i32_consts[2], + self.intrinsics.i32_consts[4], + self.intrinsics.i32_consts[6], + self.intrinsics.i32_consts[8], + self.intrinsics.i32_consts[10], + self.intrinsics.i32_consts[12], + self.intrinsics.i32_consts[14], + ]; + let right_indices = 
[ + self.intrinsics.i32_consts[1], + self.intrinsics.i32_consts[3], + self.intrinsics.i32_consts[5], + self.intrinsics.i32_consts[7], + self.intrinsics.i32_consts[9], + self.intrinsics.i32_consts[11], + self.intrinsics.i32_consts[13], + self.intrinsics.i32_consts[15], + ]; + + let v1_left = err!(self.builder.build_shuffle_vector( + v1, + v1.get_type().get_undef(), + VectorType::const_vector(&left_indices), + "", + )); + let v1_left = + err!(self.builder.build_int_s_extend(v1_left, self.intrinsics.i16x8_ty, "")); + let v1_right = err!(self.builder.build_shuffle_vector( + v1, + v1.get_type().get_undef(), + VectorType::const_vector(&right_indices), + "", + )); + let v1_right = + err!(self.builder.build_int_s_extend(v1_right, self.intrinsics.i16x8_ty, "")); + + let v2_left = err!(self.builder.build_shuffle_vector( + v2, + v2.get_type().get_undef(), + VectorType::const_vector(&left_indices), + "", + )); + let v2_left = + err!(self.builder.build_int_s_extend(v2_left, self.intrinsics.i16x8_ty, "")); + let v2_right = err!(self.builder.build_shuffle_vector( + v2, + v2.get_type().get_undef(), + VectorType::const_vector(&right_indices), + "", + )); + let v2_right = + err!(self.builder.build_int_s_extend(v2_right, self.intrinsics.i16x8_ty, "")); + + let prod_left = err!(self.builder.build_int_mul(v1_left, v2_left, "")); + let prod_right = err!(self.builder.build_int_mul(v1_right, v2_right, "")); + let res = err!(self.builder.build_int_add(prod_left, prod_right, "")); + let res = err!( + self.builder + .build_bit_cast(res, self.intrinsics.i128_ty, "") + ); + self.state.push1(res); + } + Operator::I32x4RelaxedDotI8x16I7x16AddS => { + let ((v1, i1), (v2, i2), (acc, acc_info)) = self.state.pop3_extra()?; + let (v1, _) = self.v128_into_i8x16(v1, i1)?; + let (v2, _) = self.v128_into_i8x16(v2, i2)?; + let (acc, _) = self.v128_into_i32x4(acc, acc_info)?; + + let left_indices = [ + self.intrinsics.i32_consts[0], + self.intrinsics.i32_consts[2], + self.intrinsics.i32_consts[4], + 
self.intrinsics.i32_consts[6], + self.intrinsics.i32_consts[8], + self.intrinsics.i32_consts[10], + self.intrinsics.i32_consts[12], + self.intrinsics.i32_consts[14], + ]; + let right_indices = [ + self.intrinsics.i32_consts[1], + self.intrinsics.i32_consts[3], + self.intrinsics.i32_consts[5], + self.intrinsics.i32_consts[7], + self.intrinsics.i32_consts[9], + self.intrinsics.i32_consts[11], + self.intrinsics.i32_consts[13], + self.intrinsics.i32_consts[15], + ]; + + let v1_left = err!(self.builder.build_shuffle_vector( + v1, + v1.get_type().get_undef(), + VectorType::const_vector(&left_indices), + "", + )); + let v1_left = + err!(self.builder.build_int_s_extend(v1_left, self.intrinsics.i16x8_ty, "")); + let v1_right = err!(self.builder.build_shuffle_vector( + v1, + v1.get_type().get_undef(), + VectorType::const_vector(&right_indices), + "", + )); + let v1_right = + err!(self.builder.build_int_s_extend(v1_right, self.intrinsics.i16x8_ty, "")); + + let v2_left = err!(self.builder.build_shuffle_vector( + v2, + v2.get_type().get_undef(), + VectorType::const_vector(&left_indices), + "", + )); + let v2_left = + err!(self.builder.build_int_s_extend(v2_left, self.intrinsics.i16x8_ty, "")); + let v2_right = err!(self.builder.build_shuffle_vector( + v2, + v2.get_type().get_undef(), + VectorType::const_vector(&right_indices), + "", + )); + let v2_right = + err!(self.builder.build_int_s_extend(v2_right, self.intrinsics.i16x8_ty, "")); + + let prod_left = err!(self.builder.build_int_mul(v1_left, v2_left, "")); + let prod_right = err!(self.builder.build_int_mul(v1_right, v2_right, "")); + let dot16 = err!(self.builder.build_int_add(prod_left, prod_right, "")); + + let pair_left = err!(self.builder.build_shuffle_vector( + dot16, + dot16.get_type().get_undef(), + VectorType::const_vector(&[ + self.intrinsics.i32_consts[0], + self.intrinsics.i32_consts[2], + self.intrinsics.i32_consts[4], + self.intrinsics.i32_consts[6], + ]), + "", + )); + let pair_left = + 
err!(self.builder.build_int_s_extend(pair_left, self.intrinsics.i32x4_ty, "")); + let pair_right = err!(self.builder.build_shuffle_vector( + dot16, + dot16.get_type().get_undef(), + VectorType::const_vector(&[ + self.intrinsics.i32_consts[1], + self.intrinsics.i32_consts[3], + self.intrinsics.i32_consts[5], + self.intrinsics.i32_consts[7], + ]), + "", + )); + let pair_right = + err!(self.builder.build_int_s_extend(pair_right, self.intrinsics.i32x4_ty, "")); + let dot32 = err!(self.builder.build_int_add(pair_left, pair_right, "")); + let res = err!(self.builder.build_int_add(dot32, acc, "")); + let res = err!( + self.builder + .build_bit_cast(res, self.intrinsics.i128_ty, "") + ); + self.state.push1(res); + } Operator::I32DivS | Operator::I64DivS => { let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; let v1 = self.apply_pending_canonicalization(v1, i1)?; @@ -4139,7 +4307,11 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { let res = err!(self.builder.build_and(v1, v2, "")); self.state.push1(res); } - Operator::V128Bitselect => { + Operator::I8x16RelaxedLaneselect + | Operator::I16x8RelaxedLaneselect + | Operator::I32x4RelaxedLaneselect + | Operator::I64x2RelaxedLaneselect + | Operator::V128Bitselect => { let ((v1, i1), (v2, i2), (cond, cond_info)) = self.state.pop3_extra()?; let v1 = self.apply_pending_canonicalization(v1, i1)?; let v2 = self.apply_pending_canonicalization(v2, i2)?; @@ -5301,6 +5473,52 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { ((i1.strip_pending() & i2.strip_pending())? 
| ExtraInfo::pending_f32_nan())?, ); } + Operator::F32x4RelaxedMadd | Operator::F32x4RelaxedNmadd => { + let ((v1, i1), (v2, i2), (v3, i3)) = self.state.pop3_extra()?; + let (v1, i1) = self.v128_into_f32x4(v1, i1)?; + let (v2, i2) = self.v128_into_f32x4(v2, i2)?; + let (v3, i3) = self.v128_into_f32x4(v3, i3)?; + + let v1 = match op { + Operator::F32x4RelaxedNmadd => err!(self.builder.build_float_neg(v1, "")), + _ => v1, + }; + let mul = self + .build_call_with_param_attributes( + self.intrinsics.mul_f32x4, + &[ + v1.into(), + v2.into(), + self.intrinsics.fp_rounding_md, + self.intrinsics.fp_exception_md, + ], + "", + )? + .try_as_basic_value() + .unwrap_basic(); + let mul = mul.into_vector_value(); + let res = self + .build_call_with_param_attributes( + self.intrinsics.add_f32x4, + &[ + mul.into(), + v3.into(), + self.intrinsics.fp_rounding_md, + self.intrinsics.fp_exception_md, + ], + "", + )? + .try_as_basic_value() + .unwrap_basic(); + let res = err!( + self.builder + .build_bit_cast(res, self.intrinsics.i128_ty, "") + ); + let info = (i1.strip_pending() & i2.strip_pending())?; + let info = (info & i3.strip_pending())?; + let info = (info | ExtraInfo::pending_f32_nan())?; + self.state.push1_extra(res, info); + } Operator::F64x2Mul => { let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; let (v1, i1) = self.v128_into_f64x2(v1, i1)?; @@ -5327,6 +5545,52 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { ((i1.strip_pending() & i2.strip_pending())? 
| ExtraInfo::pending_f64_nan())?, ); } + Operator::F64x2RelaxedMadd | Operator::F64x2RelaxedNmadd => { + let ((v1, i1), (v2, i2), (v3, i3)) = self.state.pop3_extra()?; + let (v1, i1) = self.v128_into_f64x2(v1, i1)?; + let (v2, i2) = self.v128_into_f64x2(v2, i2)?; + let (v3, i3) = self.v128_into_f64x2(v3, i3)?; + + let v1 = match op { + Operator::F64x2RelaxedNmadd => err!(self.builder.build_float_neg(v1, "")), + _ => v1, + }; + let mul = self + .build_call_with_param_attributes( + self.intrinsics.mul_f64x2, + &[ + v1.into(), + v2.into(), + self.intrinsics.fp_rounding_md, + self.intrinsics.fp_exception_md, + ], + "", + )? + .try_as_basic_value() + .unwrap_basic(); + let mul = mul.into_vector_value(); + let res = self + .build_call_with_param_attributes( + self.intrinsics.add_f64x2, + &[ + mul.into(), + v3.into(), + self.intrinsics.fp_rounding_md, + self.intrinsics.fp_exception_md, + ], + "", + )? + .try_as_basic_value() + .unwrap_basic(); + let res = err!( + self.builder + .build_bit_cast(res, self.intrinsics.i128_ty, "") + ); + let info = (i1.strip_pending() & i2.strip_pending())?; + let info = (info & i3.strip_pending())?; + let info = (info | ExtraInfo::pending_f64_nan())?; + self.state.push1_extra(res, info); + } Operator::F32Div => { let (v1, v2) = self.state.pop2()?; let (v1, v2) = (v1.into_float_value(), v2.into_float_value()); @@ -5505,7 +5769,7 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { self.state.push1_extra(res, ExtraInfo::pending_f64_nan()); } - Operator::F32x4Min => { + Operator::F32x4Min | Operator::F32x4RelaxedMin => { let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; let (v1, i1) = self.v128_into_f32x4(v1, i1)?; let (v2, i2) = self.v128_into_f32x4(v2, i2)?; @@ -5546,7 +5810,7 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { ); self.state.push1(res); } - Operator::F64x2Min => { + Operator::F64x2Min | Operator::F64x2RelaxedMin => { let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; let (v1, i1) = self.v128_into_f64x2(v1, i1)?; let (v2, 
i2) = self.v128_into_f64x2(v2, i2)?; @@ -5633,7 +5897,7 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { self.state.push1_extra(res, ExtraInfo::pending_f64_nan()); } - Operator::F32x4Max => { + Operator::F32x4Max | Operator::F32x4RelaxedMax => { let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; let (v1, i1) = self.v128_into_f32x4(v1, i1)?; let (v2, i2) = self.v128_into_f32x4(v2, i2)?; @@ -5675,7 +5939,7 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { ); self.state.push1(res); } - Operator::F64x2Max => { + Operator::F64x2Max | Operator::F64x2RelaxedMax => { let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; let (v1, i1) = self.v128_into_f64x2(v1, i1)?; let (v2, i2) = self.v128_into_f64x2(v2, i2)?; @@ -7768,7 +8032,7 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { ); self.state.push1(res); } - Operator::I32x4TruncSatF32x4S => { + Operator::I32x4TruncSatF32x4S | Operator::I32x4RelaxedTruncF32x4S => { let (v, i) = self.state.pop1_extra()?; let v = self.apply_pending_canonicalization(v, i)?; let v = v.into_int_value(); @@ -7783,7 +8047,7 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { )?; self.state.push1(res); } - Operator::I32x4TruncSatF32x4U => { + Operator::I32x4TruncSatF32x4U | Operator::I32x4RelaxedTruncF32x4U => { let (v, i) = self.state.pop1_extra()?; let v = self.apply_pending_canonicalization(v, i)?; let v = v.into_int_value(); @@ -7798,7 +8062,10 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { )?; self.state.push1(res); } - Operator::I32x4TruncSatF64x2SZero | Operator::I32x4TruncSatF64x2UZero => { + Operator::I32x4TruncSatF64x2SZero + | Operator::I32x4TruncSatF64x2UZero + | Operator::I32x4RelaxedTruncF64x2SZero + | Operator::I32x4RelaxedTruncF64x2UZero => { let ((min, max), (cmp_min, cmp_max)) = match op { Operator::I32x4TruncSatF64x2SZero => ( (i32::MIN as u64, i32::MAX as u64), @@ -7808,6 +8075,14 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { (u32::MIN as u64, u32::MAX as u64), (LEF64_GEQ_U32_MIN, GEF64_LEQ_U32_MAX), ), + 
Operator::I32x4RelaxedTruncF64x2SZero => ( + (i32::MIN as u64, i32::MAX as u64), + (LEF64_GEQ_I32_MIN, GEF64_LEQ_I32_MAX), + ), + Operator::I32x4RelaxedTruncF64x2UZero => ( + (u32::MIN as u64, u32::MAX as u64), + (LEF64_GEQ_U32_MIN, GEF64_LEQ_U32_MAX), + ), _ => unreachable!("Unhandled internal variant"), }; let (v, i) = self.state.pop1_extra()?; @@ -9549,7 +9824,7 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { }; self.state.push1_extra(res, info); } - Operator::I8x16Swizzle => { + Operator::I8x16Swizzle | Operator::I8x16RelaxedSwizzle => { let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; let v1 = self.apply_pending_canonicalization(v1, i1)?; let v1 = err!( diff --git a/tests/ignores.txt b/tests/ignores.txt index 3769315d50a..bfd9f05411a 100644 --- a/tests/ignores.txt +++ b/tests/ignores.txt @@ -3,7 +3,6 @@ singlepass spec::simd # Singlepass doesn't support yet SIMD (no one asked for th singlepass spec::relaxed_simd # Singlepass doesn't support relaxed SIMD yet singlepass wasmer::simd_generated_ext_ops singlepass wasmer::simd -llvm spec::relaxed_simd # LLVM compiler doesn't support relaxed SIMD yet singlepass spec::exception_handling # Singlepass doesn't support EH yet (no one asked for this feature) singlepass wasmer::exception_handling windows spec::exception_handling # No EH support on Windows yet From dfa9fce7de816eb6178fadad2cd4c7bf340861a7 Mon Sep 17 00:00:00 2001 From: Syrus Akbary Date: Sun, 1 Feb 2026 20:49:29 +0100 Subject: [PATCH 03/18] Improved linting --- lib/compiler-llvm/src/translator/code.rs | 70 +++++++++++++++++------- 1 file changed, 50 insertions(+), 20 deletions(-) diff --git a/lib/compiler-llvm/src/translator/code.rs b/lib/compiler-llvm/src/translator/code.rs index 33acf0f1d96..6f42980425f 100644 --- a/lib/compiler-llvm/src/translator/code.rs +++ b/lib/compiler-llvm/src/translator/code.rs @@ -4050,16 +4050,22 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { VectorType::const_vector(&left_indices), "", )); - let v1_left = - 
err!(self.builder.build_int_s_extend(v1_left, self.intrinsics.i16x8_ty, "")); + let v1_left = err!(self.builder.build_int_s_extend( + v1_left, + self.intrinsics.i16x8_ty, + "" + )); let v1_right = err!(self.builder.build_shuffle_vector( v1, v1.get_type().get_undef(), VectorType::const_vector(&right_indices), "", )); - let v1_right = - err!(self.builder.build_int_s_extend(v1_right, self.intrinsics.i16x8_ty, "")); + let v1_right = err!(self.builder.build_int_s_extend( + v1_right, + self.intrinsics.i16x8_ty, + "" + )); let v2_left = err!(self.builder.build_shuffle_vector( v2, @@ -4067,16 +4073,22 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { VectorType::const_vector(&left_indices), "", )); - let v2_left = - err!(self.builder.build_int_s_extend(v2_left, self.intrinsics.i16x8_ty, "")); + let v2_left = err!(self.builder.build_int_s_extend( + v2_left, + self.intrinsics.i16x8_ty, + "" + )); let v2_right = err!(self.builder.build_shuffle_vector( v2, v2.get_type().get_undef(), VectorType::const_vector(&right_indices), "", )); - let v2_right = - err!(self.builder.build_int_s_extend(v2_right, self.intrinsics.i16x8_ty, "")); + let v2_right = err!(self.builder.build_int_s_extend( + v2_right, + self.intrinsics.i16x8_ty, + "" + )); let prod_left = err!(self.builder.build_int_mul(v1_left, v2_left, "")); let prod_right = err!(self.builder.build_int_mul(v1_right, v2_right, "")); @@ -4120,16 +4132,22 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { VectorType::const_vector(&left_indices), "", )); - let v1_left = - err!(self.builder.build_int_s_extend(v1_left, self.intrinsics.i16x8_ty, "")); + let v1_left = err!(self.builder.build_int_s_extend( + v1_left, + self.intrinsics.i16x8_ty, + "" + )); let v1_right = err!(self.builder.build_shuffle_vector( v1, v1.get_type().get_undef(), VectorType::const_vector(&right_indices), "", )); - let v1_right = - err!(self.builder.build_int_s_extend(v1_right, self.intrinsics.i16x8_ty, "")); + let v1_right = err!(self.builder.build_int_s_extend( 
+ v1_right, + self.intrinsics.i16x8_ty, + "" + )); let v2_left = err!(self.builder.build_shuffle_vector( v2, @@ -4137,16 +4155,22 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { VectorType::const_vector(&left_indices), "", )); - let v2_left = - err!(self.builder.build_int_s_extend(v2_left, self.intrinsics.i16x8_ty, "")); + let v2_left = err!(self.builder.build_int_s_extend( + v2_left, + self.intrinsics.i16x8_ty, + "" + )); let v2_right = err!(self.builder.build_shuffle_vector( v2, v2.get_type().get_undef(), VectorType::const_vector(&right_indices), "", )); - let v2_right = - err!(self.builder.build_int_s_extend(v2_right, self.intrinsics.i16x8_ty, "")); + let v2_right = err!(self.builder.build_int_s_extend( + v2_right, + self.intrinsics.i16x8_ty, + "" + )); let prod_left = err!(self.builder.build_int_mul(v1_left, v2_left, "")); let prod_right = err!(self.builder.build_int_mul(v1_right, v2_right, "")); @@ -4163,8 +4187,11 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { ]), "", )); - let pair_left = - err!(self.builder.build_int_s_extend(pair_left, self.intrinsics.i32x4_ty, "")); + let pair_left = err!(self.builder.build_int_s_extend( + pair_left, + self.intrinsics.i32x4_ty, + "" + )); let pair_right = err!(self.builder.build_shuffle_vector( dot16, dot16.get_type().get_undef(), @@ -4176,8 +4203,11 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { ]), "", )); - let pair_right = - err!(self.builder.build_int_s_extend(pair_right, self.intrinsics.i32x4_ty, "")); + let pair_right = err!(self.builder.build_int_s_extend( + pair_right, + self.intrinsics.i32x4_ty, + "" + )); let dot32 = err!(self.builder.build_int_add(pair_left, pair_right, "")); let res = err!(self.builder.build_int_add(dot32, acc, "")); let res = err!( From 685da768954e37ad7c615410aa97bb8d592417a6 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Sun, 15 Feb 2026 20:12:22 +0100 Subject: [PATCH 04/18] proper detection of relaxed SIMD feature for WASM --- lib/compiler-cranelift/src/config.rs | 1 + 
lib/compiler-llvm/src/config.rs | 1 + lib/package/src/utils.rs | 3 +++ lib/types/src/features.rs | 5 ++++- 4 files changed, 9 insertions(+), 1 deletion(-) diff --git a/lib/compiler-cranelift/src/config.rs b/lib/compiler-cranelift/src/config.rs index 787ba9d7b41..39ac34f358a 100644 --- a/lib/compiler-cranelift/src/config.rs +++ b/lib/compiler-cranelift/src/config.rs @@ -305,6 +305,7 @@ impl CompilerConfig for Cranelift { if target.triple().operating_system == OperatingSystem::Linux { feats.exceptions(true); } + feats.relaxed_simd(true); feats } } diff --git a/lib/compiler-llvm/src/config.rs b/lib/compiler-llvm/src/config.rs index 90534135922..902bda116c2 100644 --- a/lib/compiler-llvm/src/config.rs +++ b/lib/compiler-llvm/src/config.rs @@ -384,6 +384,7 @@ impl CompilerConfig for LLVM { fn supported_features_for_target(&self, _target: &Target) -> wasmer_types::Features { let mut feats = Features::default(); feats.exceptions(true); + feats.relaxed_simd(true); feats } } diff --git a/lib/package/src/utils.rs b/lib/package/src/utils.rs index 073e5686e6d..83bbf9c3ce5 100644 --- a/lib/package/src/utils.rs +++ b/lib/package/src/utils.rs @@ -197,6 +197,9 @@ pub fn wasm_annotations_to_features(feature_strings: &[String]) -> Features { "memory64" => { features.memory64(true); } + "relaxed-simd" => { + features.relaxed_simd(true); + } // Ignore unrecognized features _ => {} } diff --git a/lib/types/src/features.rs b/lib/types/src/features.rs index 8cfdb812381..d8fc16b8e37 100644 --- a/lib/types/src/features.rs +++ b/lib/types/src/features.rs @@ -371,6 +371,7 @@ impl Features { wasm_features.set(WasmFeatures::TAIL_CALL, true); wasm_features.set(WasmFeatures::MULTI_MEMORY, true); wasm_features.set(WasmFeatures::MEMORY64, true); + wasm_features.set(WasmFeatures::RELAXED_SIMD, false); let mut validator = Validator::new_with_features(wasm_features); match validator.validate_all(wasm_bytes) { @@ -390,7 +391,9 @@ impl Features { features.reference_types(true); } - if 
err_msg.contains("simd") { + if err_msg.contains("relaxed simd") { + features.relaxed_simd(true); + } else if err_msg.contains("simd") { features.simd(true); } From 31b0792ab967814e6fb5fa90c4128b29375eb64e Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Sun, 15 Feb 2026 20:21:02 +0100 Subject: [PATCH 05/18] x86_64: fast I8x16RelaxedSwizzle implementation --- Cargo.lock | 1 + lib/compiler-llvm/Cargo.toml | 1 + lib/compiler-llvm/src/compiler.rs | 2 ++ lib/compiler-llvm/src/translator/code.rs | 29 ++++++++++++++++++- .../src/translator/intrinsics.rs | 16 ++++++++++ 5 files changed, 48 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 14c20b4d8e4..c472db8818b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7110,6 +7110,7 @@ dependencies = [ "byteorder", "cc", "crossbeam-channel", + "enumset", "inkwell", "itertools 0.14.0", "libc", diff --git a/lib/compiler-llvm/Cargo.toml b/lib/compiler-llvm/Cargo.toml index 57c072139e6..6c5ed4bd995 100644 --- a/lib/compiler-llvm/Cargo.toml +++ b/lib/compiler-llvm/Cargo.toml @@ -29,6 +29,7 @@ itertools.workspace = true rayon.workspace = true phf = { workspace = true, features = ["macros"] } tracing = { workspace = true, features = ["log"] } +enumset.workspace = true inkwell = { workspace = true, features = [ "llvm21-1-prefer-static", "target-x86", diff --git a/lib/compiler-llvm/src/compiler.rs b/lib/compiler-llvm/src/compiler.rs index 2c0df6624d7..05d3f1d8fd0 100644 --- a/lib/compiler-llvm/src/compiler.rs +++ b/lib/compiler-llvm/src/compiler.rs @@ -200,6 +200,7 @@ impl LLVMCompiler { Some(target_machine_no_opt), binary_format, pointer_width, + *target.cpu_features(), ) .unwrap() }, @@ -449,6 +450,7 @@ impl Compiler for LLVMCompiler { Some(target_machine_no_opt), binary_format, pointer_width, + *target.cpu_features(), ) .unwrap() }, diff --git a/lib/compiler-llvm/src/translator/code.rs b/lib/compiler-llvm/src/translator/code.rs index 0cbc1cf3199..d227385b003 100644 --- a/lib/compiler-llvm/src/translator/code.rs +++ 
b/lib/compiler-llvm/src/translator/code.rs @@ -8,6 +8,7 @@ use super::{ state::{ControlFrame, ExtraInfo, IfElseState, State, TagCatchInfo}, }; use crate::compiler::ModuleBasedSymbolRegistry; +use enumset::EnumSet; use inkwell::{ AddressSpace, AtomicOrdering, AtomicRMWBinOp, DLLStorageClass, FloatPredicate, IntPredicate, attributes::{Attribute, AttributeLoc}, @@ -51,7 +52,7 @@ use wasmer_compiler::{ }; use wasmer_types::{ CompileError, FunctionIndex, FunctionType, GlobalIndex, LocalFunctionIndex, MemoryIndex, - ModuleInfo, SignatureIndex, TableIndex, Type, + ModuleInfo, SignatureIndex, TableIndex, Type, target::CpuFeature, }; use wasmer_types::{TagIndex, entity::PrimaryMap}; use wasmer_vm::{MemoryStyle, TableStyle, VMOffsets}; @@ -76,6 +77,7 @@ pub struct FuncTranslator { binary_fmt: BinaryFormat, func_section: String, pointer_width: u8, + cpu_features: EnumSet, } impl wasmer_compiler::FuncTranslator for FuncTranslator {} @@ -87,6 +89,7 @@ impl FuncTranslator { target_machine_no_opt: Option, binary_fmt: BinaryFormat, pointer_width: u8, + cpu_features: EnumSet, ) -> Result { let abi = get_abi(&target_machine); Ok(Self { @@ -106,6 +109,7 @@ impl FuncTranslator { }, binary_fmt, pointer_width, + cpu_features, }) } @@ -339,6 +343,7 @@ impl FuncTranslator { target_triple: self.target_triple.clone(), tags_cache: HashMap::new(), binary_fmt: self.binary_fmt, + cpu_features: self.cpu_features, }; fcg.ctx.add_func( @@ -1931,6 +1936,7 @@ pub struct LLVMFunctionCodeGenerator<'ctx, 'a> { target_triple: Triple, tags_cache: HashMap>, binary_fmt: target_lexicon::BinaryFormat, + cpu_features: EnumSet, } impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { @@ -9854,6 +9860,27 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { }; self.state.push1_extra(res, info); } + Operator::I8x16RelaxedSwizzle if self.cpu_features.contains(CpuFeature::SSSE3) => { + let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; + let v1 = self.apply_pending_canonicalization(v1, i1)?; + let v2 = 
self.apply_pending_canonicalization(v2, i2)?; + + let (v1, _) = self.v128_into_i8x16(v1, i1)?; + let (v2, _) = self.v128_into_i8x16(v2, i2)?; + let res = self + .build_call_with_param_attributes( + self.intrinsics.x86_64.pshufb128, + &[v1.into(), v2.into()], + "", + )? + .try_as_basic_value() + .unwrap_basic(); + let res = err!( + self.builder + .build_bit_cast(res, self.intrinsics.i128_ty, "") + ); + self.state.push1(res); + } Operator::I8x16Swizzle | Operator::I8x16RelaxedSwizzle => { let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; let v1 = self.apply_pending_canonicalization(v1, i1)?; diff --git a/lib/compiler-llvm/src/translator/intrinsics.rs b/lib/compiler-llvm/src/translator/intrinsics.rs index 2a0153cb3cf..0ab626bb04a 100644 --- a/lib/compiler-llvm/src/translator/intrinsics.rs +++ b/lib/compiler-llvm/src/translator/intrinsics.rs @@ -56,6 +56,12 @@ pub fn type_to_llvm<'ctx>( } } +/// Struct containing x86_64 SIMD LLVM intrinsics. +#[allow(dead_code)] +pub struct X86_64Intrinsics<'ctx> { + pub pshufb128: FunctionValue<'ctx>, +} + /// Struct containing LLVM and VM intrinsics. #[allow(dead_code)] pub struct Intrinsics<'ctx> { @@ -195,6 +201,8 @@ pub struct Intrinsics<'ctx> { pub ptr_ty: PointerType<'ctx>, + pub x86_64: X86_64Intrinsics<'ctx>, + pub anyfunc_ty: StructType<'ctx>, pub i1_zero: IntValue<'ctx>, @@ -1255,6 +1263,14 @@ impl<'ctx> Intrinsics<'ctx> { // LLVM > 15 has a single type for pointers. 
ptr_ty, + + x86_64: X86_64Intrinsics { + pshufb128: add_function_with_attrs( + "llvm.x86.ssse3.pshuf.b.128", + ret_i8x16_take_i8x16_i8x16, + None, + ), + }, }; let noreturn = From 59249890d99f733161d6d31983724e7aa307f717 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Sun, 15 Feb 2026 20:43:17 +0100 Subject: [PATCH 06/18] x86_64: fast implementation of I32x4RelaxedTruncF32x4S --- lib/compiler-llvm/src/translator/code.rs | 19 +++++++++++++++++++ .../src/translator/intrinsics.rs | 7 +++++++ 2 files changed, 26 insertions(+) diff --git a/lib/compiler-llvm/src/translator/code.rs b/lib/compiler-llvm/src/translator/code.rs index d227385b003..c4f3d950cdf 100644 --- a/lib/compiler-llvm/src/translator/code.rs +++ b/lib/compiler-llvm/src/translator/code.rs @@ -8068,6 +8068,25 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { ); self.state.push1(res); } + Operator::I32x4RelaxedTruncF32x4S + if self.cpu_features.contains(CpuFeature::SSE2) => + { + let (v, i) = self.state.pop1_extra()?; + let (v, _) = self.v128_into_f32x4(v, i)?; + let res = self + .build_call_with_param_attributes( + self.intrinsics.x86_64.cvtps2dq, + &[v.into()], + "", + )? + .try_as_basic_value() + .unwrap_basic(); + let res = err!( + self.builder + .build_bit_cast(res, self.intrinsics.i128_ty, "") + ); + self.state.push1(res); + } Operator::I32x4TruncSatF32x4S | Operator::I32x4RelaxedTruncF32x4S => { let (v, i) = self.state.pop1_extra()?; let v = self.apply_pending_canonicalization(v, i)?; diff --git a/lib/compiler-llvm/src/translator/intrinsics.rs b/lib/compiler-llvm/src/translator/intrinsics.rs index 0ab626bb04a..875964f311d 100644 --- a/lib/compiler-llvm/src/translator/intrinsics.rs +++ b/lib/compiler-llvm/src/translator/intrinsics.rs @@ -60,6 +60,7 @@ pub fn type_to_llvm<'ctx>( #[allow(dead_code)] pub struct X86_64Intrinsics<'ctx> { pub pshufb128: FunctionValue<'ctx>, + pub cvtps2dq: FunctionValue<'ctx>, } /// Struct containing LLVM and VM intrinsics. 
@@ -490,6 +491,7 @@ impl<'ctx> Intrinsics<'ctx> { ], false, ); + let ret_i32x4_take_f32x4 = i32x4_ty.fn_type(&[f32x4_ty_basic_md], false); let add_function_with_attrs = |name: &str, ty: FunctionType<'ctx>, linkage: Option| -> FunctionValue<'ctx> { @@ -1270,6 +1272,11 @@ impl<'ctx> Intrinsics<'ctx> { ret_i8x16_take_i8x16_i8x16, None, ), + cvtps2dq: add_function_with_attrs( + "llvm.x86.sse2.cvtps2dq", + ret_i32x4_take_f32x4, + None, + ), }, }; From f8a74b9cf1a0870e4e7de11e44794d4ef05b5ed3 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Sun, 15 Feb 2026 20:48:20 +0100 Subject: [PATCH 07/18] x86_64: fast implementation of I32x4RelaxedTruncF32x4U --- lib/compiler-llvm/src/translator/code.rs | 24 +++++++++++++++++++ .../src/translator/intrinsics.rs | 11 +++++++++ 2 files changed, 35 insertions(+) diff --git a/lib/compiler-llvm/src/translator/code.rs b/lib/compiler-llvm/src/translator/code.rs index c4f3d950cdf..a12937a9ac2 100644 --- a/lib/compiler-llvm/src/translator/code.rs +++ b/lib/compiler-llvm/src/translator/code.rs @@ -8102,6 +8102,30 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { )?; self.state.push1(res); } + Operator::I32x4RelaxedTruncF32x4U + if self.cpu_features.contains(CpuFeature::AVX512F) + && self.cpu_features.contains(CpuFeature::AVX512VL) => + { + let (v, i) = self.state.pop1_extra()?; + let (v, _) = self.v128_into_f32x4(v, i)?; + let res = self + .build_call_with_param_attributes( + self.intrinsics.x86_64.cvtps2udq128, + &[ + v.into(), + self.intrinsics.i32x4_ty.const_zero().into(), + self.intrinsics.i8_ty.const_int(0xff, false).into(), + ], + "", + )? 
+ .try_as_basic_value() + .unwrap_basic(); + let res = err!( + self.builder + .build_bit_cast(res, self.intrinsics.i128_ty, "") + ); + self.state.push1(res); + } Operator::I32x4TruncSatF32x4U | Operator::I32x4RelaxedTruncF32x4U => { let (v, i) = self.state.pop1_extra()?; let v = self.apply_pending_canonicalization(v, i)?; diff --git a/lib/compiler-llvm/src/translator/intrinsics.rs b/lib/compiler-llvm/src/translator/intrinsics.rs index 875964f311d..9c4f2f2ab9e 100644 --- a/lib/compiler-llvm/src/translator/intrinsics.rs +++ b/lib/compiler-llvm/src/translator/intrinsics.rs @@ -61,6 +61,7 @@ pub fn type_to_llvm<'ctx>( pub struct X86_64Intrinsics<'ctx> { pub pshufb128: FunctionValue<'ctx>, pub cvtps2dq: FunctionValue<'ctx>, + pub cvtps2udq128: FunctionValue<'ctx>, } /// Struct containing LLVM and VM intrinsics. @@ -370,6 +371,7 @@ impl<'ctx> Intrinsics<'ctx> { let f64_ty_basic_md: BasicMetadataTypeEnum = f64_ty.into(); let i8x16_ty_basic_md: BasicMetadataTypeEnum = i8x16_ty.into(); let i16x8_ty_basic_md: BasicMetadataTypeEnum = i16x8_ty.into(); + let i32x4_ty_basic_md: BasicMetadataTypeEnum = i32x4_ty.into(); let f32x4_ty_basic_md: BasicMetadataTypeEnum = f32x4_ty.into(); let f64x2_ty_basic_md: BasicMetadataTypeEnum = f64x2_ty.into(); let md_ty_basic_md: BasicMetadataTypeEnum = md_ty.into(); @@ -492,6 +494,10 @@ impl<'ctx> Intrinsics<'ctx> { false, ); let ret_i32x4_take_f32x4 = i32x4_ty.fn_type(&[f32x4_ty_basic_md], false); + let ret_i32x4_take_f32x4_i32x4_i8 = i32x4_ty.fn_type( + &[f32x4_ty_basic_md, i32x4_ty_basic_md, i8_ty.into()], + false, + ); let add_function_with_attrs = |name: &str, ty: FunctionType<'ctx>, linkage: Option| -> FunctionValue<'ctx> { @@ -1277,6 +1283,11 @@ impl<'ctx> Intrinsics<'ctx> { ret_i32x4_take_f32x4, None, ), + cvtps2udq128: add_function_with_attrs( + "llvm.x86.avx512.mask.cvtps2udq.128", + ret_i32x4_take_f32x4_i32x4_i8, + None, + ), }, }; From 95fec257748899976c93387bb7a7509eab9ee182 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Sun, 15 
Feb 2026 23:27:01 +0100 Subject: [PATCH 08/18] x86_64: fast implementation of 2 more trunc instructions --- lib/compiler-llvm/src/translator/code.rs | 47 +++++++++++++++++-- .../src/translator/intrinsics.rs | 17 +++++++ .../relaxed-simd/i32x4_relaxed_trunc.wast | 2 +- 3 files changed, 62 insertions(+), 4 deletions(-) diff --git a/lib/compiler-llvm/src/translator/code.rs b/lib/compiler-llvm/src/translator/code.rs index a12937a9ac2..253a3c57a8d 100644 --- a/lib/compiler-llvm/src/translator/code.rs +++ b/lib/compiler-llvm/src/translator/code.rs @@ -8068,9 +8068,7 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { ); self.state.push1(res); } - Operator::I32x4RelaxedTruncF32x4S - if self.cpu_features.contains(CpuFeature::SSE2) => - { + Operator::I32x4RelaxedTruncF32x4S if self.cpu_features.contains(CpuFeature::SSE2) => { let (v, i) = self.state.pop1_extra()?; let (v, _) = self.v128_into_f32x4(v, i)?; let res = self @@ -8141,6 +8139,49 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { )?; self.state.push1(res); } + Operator::I32x4RelaxedTruncF64x2SZero + if self.cpu_features.contains(CpuFeature::SSE2) => + { + let (v, i) = self.state.pop1_extra()?; + let (v, _) = self.v128_into_f64x2(v, i)?; + let res = self + .build_call_with_param_attributes( + self.intrinsics.x86_64.cvtpd2dq, + &[v.into()], + "", + )? + .try_as_basic_value() + .unwrap_basic(); + let res = err!( + self.builder + .build_bit_cast(res, self.intrinsics.i128_ty, "") + ); + self.state.push1(res); + } + Operator::I32x4RelaxedTruncF64x2UZero + if self.cpu_features.contains(CpuFeature::AVX512F) + && self.cpu_features.contains(CpuFeature::AVX512VL) => + { + let (v, i) = self.state.pop1_extra()?; + let (v, _) = self.v128_into_f64x2(v, i)?; + let res = self + .build_call_with_param_attributes( + self.intrinsics.x86_64.cvtpd2udq128, + &[ + v.into(), + self.intrinsics.i32x4_ty.const_zero().into(), + self.intrinsics.i8_ty.const_int(0xff, false).into(), + ], + "", + )? 
+ .try_as_basic_value() + .unwrap_basic(); + let res = err!( + self.builder + .build_bit_cast(res, self.intrinsics.i128_ty, "") + ); + self.state.push1(res); + } Operator::I32x4TruncSatF64x2SZero | Operator::I32x4TruncSatF64x2UZero | Operator::I32x4RelaxedTruncF64x2SZero diff --git a/lib/compiler-llvm/src/translator/intrinsics.rs b/lib/compiler-llvm/src/translator/intrinsics.rs index 9c4f2f2ab9e..94925c8fb2d 100644 --- a/lib/compiler-llvm/src/translator/intrinsics.rs +++ b/lib/compiler-llvm/src/translator/intrinsics.rs @@ -62,6 +62,8 @@ pub struct X86_64Intrinsics<'ctx> { pub pshufb128: FunctionValue<'ctx>, pub cvtps2dq: FunctionValue<'ctx>, pub cvtps2udq128: FunctionValue<'ctx>, + pub cvtpd2dq: FunctionValue<'ctx>, + pub cvtpd2udq128: FunctionValue<'ctx>, } /// Struct containing LLVM and VM intrinsics. @@ -498,6 +500,11 @@ impl<'ctx> Intrinsics<'ctx> { &[f32x4_ty_basic_md, i32x4_ty_basic_md, i8_ty.into()], false, ); + let ret_i32x4_take_f64x2 = i32x4_ty.fn_type(&[f64x2_ty_basic_md], false); + let ret_i32x4_take_f64x2_i32x4_i8 = i32x4_ty.fn_type( + &[f64x2_ty_basic_md, i32x4_ty_basic_md, i8_ty.into()], + false, + ); let add_function_with_attrs = |name: &str, ty: FunctionType<'ctx>, linkage: Option| -> FunctionValue<'ctx> { @@ -1288,6 +1295,16 @@ impl<'ctx> Intrinsics<'ctx> { ret_i32x4_take_f32x4_i32x4_i8, None, ), + cvtpd2dq: add_function_with_attrs( + "llvm.x86.sse2.cvtpd2dq", + ret_i32x4_take_f64x2, + None, + ), + cvtpd2udq128: add_function_with_attrs( + "llvm.x86.avx512.mask.cvtpd2udq.128", + ret_i32x4_take_f64x2_i32x4_i8, + None, + ), }, }; diff --git a/tests/wast/spec/proposals/relaxed-simd/i32x4_relaxed_trunc.wast b/tests/wast/spec/proposals/relaxed-simd/i32x4_relaxed_trunc.wast index cca3ecb958a..e4ea88e3643 100644 --- a/tests/wast/spec/proposals/relaxed-simd/i32x4_relaxed_trunc.wast +++ b/tests/wast/spec/proposals/relaxed-simd/i32x4_relaxed_trunc.wast @@ -79,7 +79,7 @@ (assert_return (invoke "i32x4.relaxed_trunc_f64x2_u_zero" (v128.const f64x2 nan -nan)) 
(either (v128.const i32x4 0 0 0 0) - (v128.const i32x4 0 0 0xffffffff 0xffffffff))) + (v128.const i32x4 0xffffffff 0xffffffff 0 0))) ;; Check that multiple calls to the relaxed instruction with same inputs returns same results. From d8ab31b3258f0fac664317de525183b543a06503 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Sun, 15 Feb 2026 23:40:45 +0100 Subject: [PATCH 09/18] x86_64: fast implementation for FMA isntructions --- lib/compiler-llvm/src/translator/code.rs | 70 +++++++++++++++++++ .../src/translator/intrinsics.rs | 32 +++++++++ lib/types/src/target.rs | 6 ++ 3 files changed, 108 insertions(+) diff --git a/lib/compiler-llvm/src/translator/code.rs b/lib/compiler-llvm/src/translator/code.rs index 253a3c57a8d..6ce7bde0d5b 100644 --- a/lib/compiler-llvm/src/translator/code.rs +++ b/lib/compiler-llvm/src/translator/code.rs @@ -5509,6 +5509,41 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { ((i1.strip_pending() & i2.strip_pending())? | ExtraInfo::pending_f32_nan())?, ); } + Operator::F32x4RelaxedMadd | Operator::F32x4RelaxedNmadd + if self.cpu_features.contains(CpuFeature::FMA) => + { + let ((v1, i1), (v2, i2), (v3, i3)) = self.state.pop3_extra()?; + let (v1, i1) = self.v128_into_f32x4(v1, i1)?; + let (v2, i2) = self.v128_into_f32x4(v2, i2)?; + let (v3, i3) = self.v128_into_f32x4(v3, i3)?; + + let v1 = match op { + Operator::F32x4RelaxedNmadd => err!(self.builder.build_float_neg(v1, "")), + _ => v1, + }; + let res = self + .build_call_with_param_attributes( + self.intrinsics.muladd_f32x4, + &[ + v1.into(), + v2.into(), + v3.into(), + self.intrinsics.fp_rounding_md, + self.intrinsics.fp_exception_md, + ], + "", + )? 
+ .try_as_basic_value() + .unwrap_basic(); + let res = err!( + self.builder + .build_bit_cast(res, self.intrinsics.i128_ty, "") + ); + let info = (i1.strip_pending() & i2.strip_pending())?; + let info = (info & i3.strip_pending())?; + let info = (info | ExtraInfo::pending_f32_nan())?; + self.state.push1_extra(res, info); + } Operator::F32x4RelaxedMadd | Operator::F32x4RelaxedNmadd => { let ((v1, i1), (v2, i2), (v3, i3)) = self.state.pop3_extra()?; let (v1, i1) = self.v128_into_f32x4(v1, i1)?; @@ -5581,6 +5616,41 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { ((i1.strip_pending() & i2.strip_pending())? | ExtraInfo::pending_f64_nan())?, ); } + Operator::F64x2RelaxedMadd | Operator::F64x2RelaxedNmadd + if self.cpu_features.contains(CpuFeature::FMA) => + { + let ((v1, i1), (v2, i2), (v3, i3)) = self.state.pop3_extra()?; + let (v1, i1) = self.v128_into_f64x2(v1, i1)?; + let (v2, i2) = self.v128_into_f64x2(v2, i2)?; + let (v3, i3) = self.v128_into_f64x2(v3, i3)?; + + let v1 = match op { + Operator::F64x2RelaxedNmadd => err!(self.builder.build_float_neg(v1, "")), + _ => v1, + }; + let res = self + .build_call_with_param_attributes( + self.intrinsics.muladd_f64x2, + &[ + v1.into(), + v2.into(), + v3.into(), + self.intrinsics.fp_rounding_md, + self.intrinsics.fp_exception_md, + ], + "", + )? 
+ .try_as_basic_value() + .unwrap_basic(); + let res = err!( + self.builder + .build_bit_cast(res, self.intrinsics.i128_ty, "") + ); + let info = (i1.strip_pending() & i2.strip_pending())?; + let info = (info & i3.strip_pending())?; + let info = (info | ExtraInfo::pending_f64_nan())?; + self.state.push1_extra(res, info); + } Operator::F64x2RelaxedMadd | Operator::F64x2RelaxedNmadd => { let ((v1, i1), (v2, i2), (v3, i3)) = self.state.pop3_extra()?; let (v1, i1) = self.v128_into_f64x2(v1, i1)?; diff --git a/lib/compiler-llvm/src/translator/intrinsics.rs b/lib/compiler-llvm/src/translator/intrinsics.rs index 94925c8fb2d..70570ebbc65 100644 --- a/lib/compiler-llvm/src/translator/intrinsics.rs +++ b/lib/compiler-llvm/src/translator/intrinsics.rs @@ -99,6 +99,8 @@ pub struct Intrinsics<'ctx> { pub mul_f64: FunctionValue<'ctx>, pub mul_f32x4: FunctionValue<'ctx>, pub mul_f64x2: FunctionValue<'ctx>, + pub muladd_f32x4: FunctionValue<'ctx>, + pub muladd_f64x2: FunctionValue<'ctx>, pub div_f32: FunctionValue<'ctx>, pub div_f64: FunctionValue<'ctx>, @@ -415,6 +417,26 @@ impl<'ctx> Intrinsics<'ctx> { f32x4_ty.fn_type(&[f32x4_ty_basic_md, f32x4_ty_basic_md], false); let ret_f64x2_take_f64x2_f64x2 = f64x2_ty.fn_type(&[f64x2_ty_basic_md, f64x2_ty_basic_md], false); + let ret_f32x4_take_f32x4_f32x4_f32x4_md_md = f32x4_ty.fn_type( + &[ + f32x4_ty_basic_md, + f32x4_ty_basic_md, + f32x4_ty_basic_md, + md_ty_basic_md, + md_ty_basic_md, + ], + false, + ); + let ret_f64x2_take_f64x2_f64x2_f64x2_md_md = f64x2_ty.fn_type( + &[ + f64x2_ty_basic_md, + f64x2_ty_basic_md, + f64x2_ty_basic_md, + md_ty_basic_md, + md_ty_basic_md, + ], + false, + ); let ret_f64_take_f32_md = f64_ty.fn_type(&[f32_ty_basic_md, md_ty_basic_md], false); let ret_f32_take_f64_md_md = @@ -649,6 +671,16 @@ impl<'ctx> Intrinsics<'ctx> { ret_f64x2_take_f64x2_f64x2_md_md, None, ), + muladd_f32x4: add_function_with_attrs( + "llvm.experimental.constrained.fmuladd.v4f32", + ret_f32x4_take_f32x4_f32x4_f32x4_md_md, + None, + ), 
+ muladd_f64x2: add_function_with_attrs( + "llvm.experimental.constrained.fmuladd.v2f64", + ret_f64x2_take_f64x2_f64x2_f64x2_md_md, + None, + ), div_f32: add_function_with_attrs( "llvm.experimental.constrained.fdiv.f32", diff --git a/lib/types/src/target.rs b/lib/types/src/target.rs index c0e4eada1e0..7f782b4f87d 100644 --- a/lib/types/src/target.rs +++ b/lib/types/src/target.rs @@ -42,6 +42,7 @@ pub enum CpuFeature { BMI1, BMI2, AVX2, + FMA, AVX512DQ, AVX512VL, AVX512F, @@ -87,6 +88,9 @@ impl CpuFeature { if std::is_x86_feature_detected!("avx2") { features.insert(Self::AVX2); } + if std::is_x86_feature_detected!("fma") { + features.insert(Self::FMA); + } if std::is_x86_feature_detected!("avx512dq") { features.insert(Self::AVX512DQ); } @@ -154,6 +158,7 @@ impl FromStr for CpuFeature { "bmi" => Ok(Self::BMI1), "bmi2" => Ok(Self::BMI2), "avx2" => Ok(Self::AVX2), + "fma" => Ok(Self::FMA), "avx512dq" => Ok(Self::AVX512DQ), "avx512vl" => Ok(Self::AVX512VL), "avx512f" => Ok(Self::AVX512F), @@ -180,6 +185,7 @@ impl std::fmt::Display for CpuFeature { Self::BMI1 => "bmi", Self::BMI2 => "bmi2", Self::AVX2 => "avx2", + Self::FMA => "fma", Self::AVX512DQ => "avx512dq", Self::AVX512VL => "avx512vl", Self::AVX512F => "avx512f", From 20d03eb5336a0407cda9b1f37357d9a0879ae353 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Sun, 15 Feb 2026 23:50:28 +0100 Subject: [PATCH 10/18] x86_64: fast implementation of RelaxedLaneselect --- lib/compiler-llvm/src/translator/code.rs | 28 +++++++++++++++++++ .../src/translator/intrinsics.rs | 10 +++++++ 2 files changed, 38 insertions(+) diff --git a/lib/compiler-llvm/src/translator/code.rs b/lib/compiler-llvm/src/translator/code.rs index 6ce7bde0d5b..d5c4e5f066f 100644 --- a/lib/compiler-llvm/src/translator/code.rs +++ b/lib/compiler-llvm/src/translator/code.rs @@ -4346,6 +4346,34 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { Operator::I8x16RelaxedLaneselect | Operator::I16x8RelaxedLaneselect | Operator::I32x4RelaxedLaneselect + | 
Operator::I64x2RelaxedLaneselect + if self.cpu_features.contains(CpuFeature::SSE41) => + { + let ((v1, i1), (v2, i2), (mask, mask_info)) = self.state.pop3_extra()?; + let v1 = self.apply_pending_canonicalization(v1, i1)?; + let v2 = self.apply_pending_canonicalization(v2, i2)?; + let mask = self.apply_pending_canonicalization(mask, mask_info)?; + + let (v1, _) = self.v128_into_i8x16(v1, i1)?; + let (v2, _) = self.v128_into_i8x16(v2, i2)?; + let (mask, _) = self.v128_into_i8x16(mask, mask_info)?; + let res = self + .build_call_with_param_attributes( + self.intrinsics.x86_64.pblendvb, + &[v2.into(), v1.into(), mask.into()], + "", + )? + .try_as_basic_value() + .unwrap_basic(); + let res = err!( + self.builder + .build_bit_cast(res, self.intrinsics.i128_ty, "") + ); + self.state.push1(res); + } + Operator::I8x16RelaxedLaneselect + | Operator::I16x8RelaxedLaneselect + | Operator::I32x4RelaxedLaneselect | Operator::I64x2RelaxedLaneselect | Operator::V128Bitselect => { let ((v1, i1), (v2, i2), (cond, cond_info)) = self.state.pop3_extra()?; diff --git a/lib/compiler-llvm/src/translator/intrinsics.rs b/lib/compiler-llvm/src/translator/intrinsics.rs index 70570ebbc65..f6325b014cc 100644 --- a/lib/compiler-llvm/src/translator/intrinsics.rs +++ b/lib/compiler-llvm/src/translator/intrinsics.rs @@ -60,6 +60,7 @@ pub fn type_to_llvm<'ctx>( #[allow(dead_code)] pub struct X86_64Intrinsics<'ctx> { pub pshufb128: FunctionValue<'ctx>, + pub pblendvb: FunctionValue<'ctx>, pub cvtps2dq: FunctionValue<'ctx>, pub cvtps2udq128: FunctionValue<'ctx>, pub cvtpd2dq: FunctionValue<'ctx>, @@ -397,6 +398,10 @@ impl<'ctx> Intrinsics<'ctx> { let ret_i8x16_take_i8x16 = i8x16_ty.fn_type(&[i8x16_ty_basic_md], false); let ret_i8x16_take_i8x16_i8x16 = i8x16_ty.fn_type(&[i8x16_ty_basic_md, i8x16_ty_basic_md], false); + let ret_i8x16_take_i8x16_i8x16_i8x16 = i8x16_ty.fn_type( + &[i8x16_ty_basic_md, i8x16_ty_basic_md, i8x16_ty_basic_md], + false, + ); let ret_i16x8_take_i16x8_i16x8 = 
i16x8_ty.fn_type(&[i16x8_ty_basic_md, i16x8_ty_basic_md], false); @@ -1317,6 +1322,11 @@ impl<'ctx> Intrinsics<'ctx> { ret_i8x16_take_i8x16_i8x16, None, ), + pblendvb: add_function_with_attrs( + "llvm.x86.sse41.pblendvb", + ret_i8x16_take_i8x16_i8x16_i8x16, + None, + ), cvtps2dq: add_function_with_attrs( "llvm.x86.sse2.cvtps2dq", ret_i32x4_take_f32x4, From 4c4d146861a80cda70e5c9496d53cc007d52a6f0 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Sun, 15 Feb 2026 23:57:58 +0100 Subject: [PATCH 11/18] x86_64: fast implementation of RelaxedMin/Max --- lib/compiler-llvm/src/translator/code.rs | 84 +++++++++++++++++++ .../src/translator/intrinsics.rs | 24 ++++++ 2 files changed, 108 insertions(+) diff --git a/lib/compiler-llvm/src/translator/code.rs b/lib/compiler-llvm/src/translator/code.rs index d5c4e5f066f..c6c6d5f3b02 100644 --- a/lib/compiler-llvm/src/translator/code.rs +++ b/lib/compiler-llvm/src/translator/code.rs @@ -5903,6 +5903,27 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { self.state.push1_extra(res, ExtraInfo::pending_f64_nan()); } + Operator::F32x4RelaxedMin if self.cpu_features.contains(CpuFeature::SSE2) => { + let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; + let (v1, i1) = self.v128_into_f32x4(v1, i1)?; + let (v2, i2) = self.v128_into_f32x4(v2, i2)?; + let res = self + .build_call_with_param_attributes( + self.intrinsics.x86_64.min_ps, + &[v1.into(), v2.into()], + "", + )? + .try_as_basic_value() + .unwrap_basic(); + let res = err!( + self.builder + .build_bit_cast(res, self.intrinsics.i128_ty, "") + ); + self.state.push1_extra( + res, + ((i1.strip_pending() & i2.strip_pending())? 
| ExtraInfo::pending_f32_nan())?, + ); + } Operator::F32x4Min | Operator::F32x4RelaxedMin => { let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; let (v1, i1) = self.v128_into_f32x4(v1, i1)?; @@ -5944,6 +5965,27 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { ); self.state.push1(res); } + Operator::F64x2RelaxedMin if self.cpu_features.contains(CpuFeature::SSE2) => { + let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; + let (v1, i1) = self.v128_into_f64x2(v1, i1)?; + let (v2, i2) = self.v128_into_f64x2(v2, i2)?; + let res = self + .build_call_with_param_attributes( + self.intrinsics.x86_64.min_pd, + &[v1.into(), v2.into()], + "", + )? + .try_as_basic_value() + .unwrap_basic(); + let res = err!( + self.builder + .build_bit_cast(res, self.intrinsics.i128_ty, "") + ); + self.state.push1_extra( + res, + ((i1.strip_pending() & i2.strip_pending())? | ExtraInfo::pending_f64_nan())?, + ); + } Operator::F64x2Min | Operator::F64x2RelaxedMin => { let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; let (v1, i1) = self.v128_into_f64x2(v1, i1)?; @@ -6031,6 +6073,27 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { self.state.push1_extra(res, ExtraInfo::pending_f64_nan()); } + Operator::F32x4RelaxedMax if self.cpu_features.contains(CpuFeature::SSE2) => { + let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; + let (v1, i1) = self.v128_into_f32x4(v1, i1)?; + let (v2, i2) = self.v128_into_f32x4(v2, i2)?; + let res = self + .build_call_with_param_attributes( + self.intrinsics.x86_64.max_ps, + &[v1.into(), v2.into()], + "", + )? + .try_as_basic_value() + .unwrap_basic(); + let res = err!( + self.builder + .build_bit_cast(res, self.intrinsics.i128_ty, "") + ); + self.state.push1_extra( + res, + ((i1.strip_pending() & i2.strip_pending())? 
| ExtraInfo::pending_f32_nan())?, + ); + } Operator::F32x4Max | Operator::F32x4RelaxedMax => { let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; let (v1, i1) = self.v128_into_f32x4(v1, i1)?; @@ -6073,6 +6136,27 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { ); self.state.push1(res); } + Operator::F64x2RelaxedMax if self.cpu_features.contains(CpuFeature::SSE2) => { + let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; + let (v1, i1) = self.v128_into_f64x2(v1, i1)?; + let (v2, i2) = self.v128_into_f64x2(v2, i2)?; + let res = self + .build_call_with_param_attributes( + self.intrinsics.x86_64.max_pd, + &[v1.into(), v2.into()], + "", + )? + .try_as_basic_value() + .unwrap_basic(); + let res = err!( + self.builder + .build_bit_cast(res, self.intrinsics.i128_ty, "") + ); + self.state.push1_extra( + res, + ((i1.strip_pending() & i2.strip_pending())? | ExtraInfo::pending_f64_nan())?, + ); + } Operator::F64x2Max | Operator::F64x2RelaxedMax => { let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; let (v1, i1) = self.v128_into_f64x2(v1, i1)?; diff --git a/lib/compiler-llvm/src/translator/intrinsics.rs b/lib/compiler-llvm/src/translator/intrinsics.rs index f6325b014cc..d3e16c017c0 100644 --- a/lib/compiler-llvm/src/translator/intrinsics.rs +++ b/lib/compiler-llvm/src/translator/intrinsics.rs @@ -61,6 +61,10 @@ pub fn type_to_llvm<'ctx>( pub struct X86_64Intrinsics<'ctx> { pub pshufb128: FunctionValue<'ctx>, pub pblendvb: FunctionValue<'ctx>, + pub min_ps: FunctionValue<'ctx>, + pub min_pd: FunctionValue<'ctx>, + pub max_ps: FunctionValue<'ctx>, + pub max_pd: FunctionValue<'ctx>, pub cvtps2dq: FunctionValue<'ctx>, pub cvtps2udq128: FunctionValue<'ctx>, pub cvtpd2dq: FunctionValue<'ctx>, @@ -1327,6 +1331,26 @@ impl<'ctx> Intrinsics<'ctx> { ret_i8x16_take_i8x16_i8x16_i8x16, None, ), + min_ps: add_function_with_attrs( + "llvm.x86.sse.min.ps", + ret_f32x4_take_f32x4_f32x4, + None, + ), + min_pd: add_function_with_attrs( + "llvm.x86.sse2.min.pd", + 
ret_f64x2_take_f64x2_f64x2, + None, + ), + max_ps: add_function_with_attrs( + "llvm.x86.sse.max.ps", + ret_f32x4_take_f32x4_f32x4, + None, + ), + max_pd: add_function_with_attrs( + "llvm.x86.sse2.max.pd", + ret_f64x2_take_f64x2_f64x2, + None, + ), cvtps2dq: add_function_with_attrs( "llvm.x86.sse2.cvtps2dq", ret_i32x4_take_f32x4, From 04ee7589b83d7615e618c35603f9390a7d077840 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Mon, 16 Feb 2026 00:05:30 +0100 Subject: [PATCH 12/18] x86_64: fast implementation of I16x8RelaxedQ15mulrS --- lib/compiler-llvm/src/translator/code.rs | 18 ++++++++++++++++++ lib/compiler-llvm/src/translator/intrinsics.rs | 6 ++++++ 2 files changed, 24 insertions(+) diff --git a/lib/compiler-llvm/src/translator/code.rs b/lib/compiler-llvm/src/translator/code.rs index c6c6d5f3b02..7048c42da63 100644 --- a/lib/compiler-llvm/src/translator/code.rs +++ b/lib/compiler-llvm/src/translator/code.rs @@ -3739,6 +3739,24 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { ); self.state.push1(res); } + Operator::I16x8RelaxedQ15mulrS if self.cpu_features.contains(CpuFeature::SSSE3) => { + let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; + let (v1, _) = self.v128_into_i16x8(v1, i1)?; + let (v2, _) = self.v128_into_i16x8(v2, i2)?; + let res = self + .build_call_with_param_attributes( + self.intrinsics.x86_64.pmulhrsw128, + &[v1.into(), v2.into()], + "", + )? 
+ .try_as_basic_value() + .unwrap_basic(); + let res = err!( + self.builder + .build_bit_cast(res, self.intrinsics.i128_ty, "") + ); + self.state.push1(res); + } Operator::I16x8Q15MulrSatS | Operator::I16x8RelaxedQ15mulrS => { let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; let (v1, _) = self.v128_into_i16x8(v1, i1)?; diff --git a/lib/compiler-llvm/src/translator/intrinsics.rs b/lib/compiler-llvm/src/translator/intrinsics.rs index d3e16c017c0..a5b5be07a9a 100644 --- a/lib/compiler-llvm/src/translator/intrinsics.rs +++ b/lib/compiler-llvm/src/translator/intrinsics.rs @@ -60,6 +60,7 @@ pub fn type_to_llvm<'ctx>( #[allow(dead_code)] pub struct X86_64Intrinsics<'ctx> { pub pshufb128: FunctionValue<'ctx>, + pub pmulhrsw128: FunctionValue<'ctx>, pub pblendvb: FunctionValue<'ctx>, pub min_ps: FunctionValue<'ctx>, pub min_pd: FunctionValue<'ctx>, @@ -1326,6 +1327,11 @@ impl<'ctx> Intrinsics<'ctx> { ret_i8x16_take_i8x16_i8x16, None, ), + pmulhrsw128: add_function_with_attrs( + "llvm.x86.ssse3.pmul.hr.sw.128", + ret_i16x8_take_i16x8_i16x8, + None, + ), pblendvb: add_function_with_attrs( "llvm.x86.sse41.pblendvb", ret_i8x16_take_i8x16_i8x16_i8x16, From 013c5ce049872c6ccd77872d598430b65a56740a Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Mon, 16 Feb 2026 00:14:41 +0100 Subject: [PATCH 13/18] x86_64: fast implementation of I16x8RelaxedDotI8x16I7x16S --- lib/compiler-llvm/src/translator/code.rs | 22 +++++++++++++++++++ .../src/translator/intrinsics.rs | 6 +++++ 2 files changed, 28 insertions(+) diff --git a/lib/compiler-llvm/src/translator/code.rs b/lib/compiler-llvm/src/translator/code.rs index 7048c42da63..01a05c9f9d8 100644 --- a/lib/compiler-llvm/src/translator/code.rs +++ b/lib/compiler-llvm/src/translator/code.rs @@ -4042,6 +4042,28 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { ); self.state.push1(res); } + Operator::I16x8RelaxedDotI8x16I7x16S + if self.cpu_features.contains(CpuFeature::SSSE3) => + { + let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; + 
let (a, _) = self.v128_into_i8x16(v1, i1)?; + let (b, _) = self.v128_into_i8x16(v2, i2)?; + + let res = self + .build_call_with_param_attributes( + self.intrinsics.x86_64.pmaddubsw128, + &[b.into(), a.into()], + "", + )? + .try_as_basic_value() + .unwrap_basic() + .into_vector_value(); + let res = err!( + self.builder + .build_bit_cast(res, self.intrinsics.i128_ty, "") + ); + self.state.push1(res); + } Operator::I16x8RelaxedDotI8x16I7x16S => { let ((v1, i1), (v2, i2)) = self.state.pop2_extra()?; let (v1, _) = self.v128_into_i8x16(v1, i1)?; diff --git a/lib/compiler-llvm/src/translator/intrinsics.rs b/lib/compiler-llvm/src/translator/intrinsics.rs index a5b5be07a9a..e49e033b9b9 100644 --- a/lib/compiler-llvm/src/translator/intrinsics.rs +++ b/lib/compiler-llvm/src/translator/intrinsics.rs @@ -60,6 +60,7 @@ pub fn type_to_llvm<'ctx>( #[allow(dead_code)] pub struct X86_64Intrinsics<'ctx> { pub pshufb128: FunctionValue<'ctx>, + pub pmaddubsw128: FunctionValue<'ctx>, pub pmulhrsw128: FunctionValue<'ctx>, pub pblendvb: FunctionValue<'ctx>, pub min_ps: FunctionValue<'ctx>, @@ -1327,6 +1328,11 @@ impl<'ctx> Intrinsics<'ctx> { ret_i8x16_take_i8x16_i8x16, None, ), + pmaddubsw128: add_function_with_attrs( + "llvm.x86.ssse3.pmadd.ub.sw.128", + i16x8_ty.fn_type(&[i8x16_ty_basic_md, i8x16_ty_basic_md], false), + None, + ), pmulhrsw128: add_function_with_attrs( "llvm.x86.ssse3.pmul.hr.sw.128", ret_i16x8_take_i16x8_i16x8, From b447a0539ce62e3be2664f2f52faa0b2a883920d Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Mon, 16 Feb 2026 00:19:20 +0100 Subject: [PATCH 14/18] x86_64: fast implementation of I32x4RelaxedDotI8x16I7x16AddS --- lib/compiler-llvm/src/translator/code.rs | 36 +++++++++++++++++++ .../src/translator/intrinsics.rs | 6 ++++ 2 files changed, 42 insertions(+) diff --git a/lib/compiler-llvm/src/translator/code.rs b/lib/compiler-llvm/src/translator/code.rs index 01a05c9f9d8..40f64437d2e 100644 --- a/lib/compiler-llvm/src/translator/code.rs +++ 
b/lib/compiler-llvm/src/translator/code.rs @@ -4145,6 +4145,42 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { ); self.state.push1(res); } + Operator::I32x4RelaxedDotI8x16I7x16AddS + if self.cpu_features.contains(CpuFeature::SSSE3) => + { + let ((v1, i1), (v2, i2), (acc, acc_info)) = self.state.pop3_extra()?; + let (v1, _) = self.v128_into_i8x16(v1, i1)?; + let (v2, _) = self.v128_into_i8x16(v2, i2)?; + let (acc, _) = self.v128_into_i32x4(acc, acc_info)?; + + // PMADDUBSW computes pairwise u8*i8 with i16 saturation, which + // is one of the valid relaxed dot-product behaviors. + let dot16 = self + .build_call_with_param_attributes( + self.intrinsics.x86_64.pmaddubsw128, + &[v2.into(), v1.into()], + "", + )? + .try_as_basic_value() + .unwrap_basic() + .into_vector_value(); + let ones = VectorType::const_vector(&[self.intrinsics.i16_ty.const_int(1, false); 8]); + let dot32 = self + .build_call_with_param_attributes( + self.intrinsics.x86_64.pmaddwd128, + &[dot16.into(), ones.into()], + "", + )? 
+ .try_as_basic_value() + .unwrap_basic() + .into_vector_value(); + let res = err!(self.builder.build_int_add(dot32, acc, "")); + let res = err!( + self.builder + .build_bit_cast(res, self.intrinsics.i128_ty, "") + ); + self.state.push1(res); + } Operator::I32x4RelaxedDotI8x16I7x16AddS => { let ((v1, i1), (v2, i2), (acc, acc_info)) = self.state.pop3_extra()?; let (v1, _) = self.v128_into_i8x16(v1, i1)?; diff --git a/lib/compiler-llvm/src/translator/intrinsics.rs b/lib/compiler-llvm/src/translator/intrinsics.rs index e49e033b9b9..09e99bc6e0a 100644 --- a/lib/compiler-llvm/src/translator/intrinsics.rs +++ b/lib/compiler-llvm/src/translator/intrinsics.rs @@ -61,6 +61,7 @@ pub fn type_to_llvm<'ctx>( pub struct X86_64Intrinsics<'ctx> { pub pshufb128: FunctionValue<'ctx>, pub pmaddubsw128: FunctionValue<'ctx>, + pub pmaddwd128: FunctionValue<'ctx>, pub pmulhrsw128: FunctionValue<'ctx>, pub pblendvb: FunctionValue<'ctx>, pub min_ps: FunctionValue<'ctx>, @@ -1333,6 +1334,11 @@ impl<'ctx> Intrinsics<'ctx> { i16x8_ty.fn_type(&[i8x16_ty_basic_md, i8x16_ty_basic_md], false), None, ), + pmaddwd128: add_function_with_attrs( + "llvm.x86.sse2.pmadd.wd", + i32x4_ty.fn_type(&[i16x8_ty_basic_md, i16x8_ty_basic_md], false), + None, + ), pmulhrsw128: add_function_with_attrs( "llvm.x86.ssse3.pmul.hr.sw.128", ret_i16x8_take_i16x8_i16x8, From 004861e9438cb21f49b490cbe71feb2a31811504 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Mon, 16 Feb 2026 09:30:33 +0100 Subject: [PATCH 15/18] fix enum order in CpuFeature --- lib/types/src/target.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/types/src/target.rs b/lib/types/src/target.rs index 7f782b4f87d..e47748ef412 100644 --- a/lib/types/src/target.rs +++ b/lib/types/src/target.rs @@ -42,14 +42,14 @@ pub enum CpuFeature { BMI1, BMI2, AVX2, - FMA, AVX512DQ, AVX512VL, AVX512F, LZCNT, // ARM features NEON, - // RISC-V features + // X86 features (TODO: reorganize at some point) + FMA, } impl CpuFeature { From 
45a44a7b05c34a306749fd9339f97bbac94fd08a Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Mon, 16 Feb 2026 09:30:55 +0100 Subject: [PATCH 16/18] run cargo fmt --- lib/compiler-llvm/src/translator/code.rs | 3 ++- lib/compiler-llvm/src/translator/intrinsics.rs | 12 ++++-------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/lib/compiler-llvm/src/translator/code.rs b/lib/compiler-llvm/src/translator/code.rs index 40f64437d2e..d5bca2bb08b 100644 --- a/lib/compiler-llvm/src/translator/code.rs +++ b/lib/compiler-llvm/src/translator/code.rs @@ -4164,7 +4164,8 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { .try_as_basic_value() .unwrap_basic() .into_vector_value(); - let ones = VectorType::const_vector(&[self.intrinsics.i16_ty.const_int(1, false); 8]); + let ones = + VectorType::const_vector(&[self.intrinsics.i16_ty.const_int(1, false); 8]); let dot32 = self .build_call_with_param_attributes( self.intrinsics.x86_64.pmaddwd128, diff --git a/lib/compiler-llvm/src/translator/intrinsics.rs b/lib/compiler-llvm/src/translator/intrinsics.rs index 09e99bc6e0a..032518080c3 100644 --- a/lib/compiler-llvm/src/translator/intrinsics.rs +++ b/lib/compiler-llvm/src/translator/intrinsics.rs @@ -530,15 +530,11 @@ impl<'ctx> Intrinsics<'ctx> { false, ); let ret_i32x4_take_f32x4 = i32x4_ty.fn_type(&[f32x4_ty_basic_md], false); - let ret_i32x4_take_f32x4_i32x4_i8 = i32x4_ty.fn_type( - &[f32x4_ty_basic_md, i32x4_ty_basic_md, i8_ty.into()], - false, - ); + let ret_i32x4_take_f32x4_i32x4_i8 = + i32x4_ty.fn_type(&[f32x4_ty_basic_md, i32x4_ty_basic_md, i8_ty.into()], false); let ret_i32x4_take_f64x2 = i32x4_ty.fn_type(&[f64x2_ty_basic_md], false); - let ret_i32x4_take_f64x2_i32x4_i8 = i32x4_ty.fn_type( - &[f64x2_ty_basic_md, i32x4_ty_basic_md, i8_ty.into()], - false, - ); + let ret_i32x4_take_f64x2_i32x4_i8 = + i32x4_ty.fn_type(&[f64x2_ty_basic_md, i32x4_ty_basic_md, i8_ty.into()], false); let add_function_with_attrs = |name: &str, ty: FunctionType<'ctx>, linkage: Option| -> 
FunctionValue<'ctx> { From f80ce65c46d1f3ba97f1a2578b45ff7ba83f4818 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Mon, 16 Feb 2026 11:33:58 +0100 Subject: [PATCH 17/18] enable relaxed_simd for the fuzzer --- fuzz/fuzz_targets/universal_llvm.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fuzz/fuzz_targets/universal_llvm.rs b/fuzz/fuzz_targets/universal_llvm.rs index 40c2af74c06..0a2b3ddc86d 100644 --- a/fuzz/fuzz_targets/universal_llvm.rs +++ b/fuzz/fuzz_targets/universal_llvm.rs @@ -25,7 +25,7 @@ impl Arbitrary<'_> for LLVMPassFuzzModule { config.memory64_enabled = false; config.max_memories = 1; config.tail_call_enabled = false; - config.relaxed_simd_enabled = false; + config.relaxed_simd_enabled = true; Ok(Self(wasm_smith::Module::new(config, u)?)) } } From fbfbecddb8a66720b7a37d3963d8c7cdfd5f73eb Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Tue, 17 Feb 2026 09:25:59 +0100 Subject: [PATCH 18/18] replace cvtps2dq with cvttps2dq --- lib/compiler-llvm/src/translator/code.rs | 2 +- lib/compiler-llvm/src/translator/intrinsics.rs | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/compiler-llvm/src/translator/code.rs b/lib/compiler-llvm/src/translator/code.rs index d5bca2bb08b..99ebe9d6b74 100644 --- a/lib/compiler-llvm/src/translator/code.rs +++ b/lib/compiler-llvm/src/translator/code.rs @@ -8332,7 +8332,7 @@ impl<'ctx> LLVMFunctionCodeGenerator<'ctx, '_> { let (v, _) = self.v128_into_f32x4(v, i)?; let res = self .build_call_with_param_attributes( - self.intrinsics.x86_64.cvtps2dq, + self.intrinsics.x86_64.cvttps2dq, &[v.into()], "", )? 
diff --git a/lib/compiler-llvm/src/translator/intrinsics.rs b/lib/compiler-llvm/src/translator/intrinsics.rs index 032518080c3..2e59cfaf177 100644 --- a/lib/compiler-llvm/src/translator/intrinsics.rs +++ b/lib/compiler-llvm/src/translator/intrinsics.rs @@ -68,7 +68,7 @@ pub struct X86_64Intrinsics<'ctx> { pub min_pd: FunctionValue<'ctx>, pub max_ps: FunctionValue<'ctx>, pub max_pd: FunctionValue<'ctx>, - pub cvtps2dq: FunctionValue<'ctx>, + pub cvttps2dq: FunctionValue<'ctx>, pub cvtps2udq128: FunctionValue<'ctx>, pub cvtpd2dq: FunctionValue<'ctx>, pub cvtpd2udq128: FunctionValue<'ctx>, @@ -1365,8 +1365,8 @@ impl<'ctx> Intrinsics<'ctx> { ret_f64x2_take_f64x2_f64x2, None, ), - cvtps2dq: add_function_with_attrs( - "llvm.x86.sse2.cvtps2dq", + cvttps2dq: add_function_with_attrs( + "llvm.x86.sse2.cvttps2dq", ret_i32x4_take_f32x4, None, ),