From e05f6af6c2f2ddcdb9869774741b9abbefd48b38 Mon Sep 17 00:00:00 2001
From: Julian Eager
Date: Mon, 23 Dec 2024 20:35:15 +0800
Subject: [PATCH 1/4] fcmp vector comparisons

---
 .../codegen/src/isa/pulley_shared/lower.isle |  27 +++
 pulley/src/interp.rs                         | 208 ++++++++++++++++++
 pulley/src/lib.rs                            |  33 +++
 3 files changed, 268 insertions(+)

diff --git a/cranelift/codegen/src/isa/pulley_shared/lower.isle b/cranelift/codegen/src/isa/pulley_shared/lower.isle
index 280bc770c228..6b52f49725ff 100644
--- a/cranelift/codegen/src/isa/pulley_shared/lower.isle
+++ b/cranelift/codegen/src/isa/pulley_shared/lower.isle
@@ -751,6 +751,33 @@
       (if-let true (floatcc_unordered cc))
       (pulley_xbxor32_s8 (lower_fcmp ty (floatcc_complement cc) a b) 1))
 
+;; vector comparisons
+
+(rule 1 (lower (has_type (ty_vec128 _) (fcmp cc a b @ (value_type (ty_vec128 ty)))))
+  (lower_vfcmp ty cc a b))
+
+(decl lower_vfcmp (Type FloatCC Value Value) VReg)
+(rule (lower_vfcmp $F32X4 (FloatCC.Ordered) a b) (pulley_vordf32x4 a b))
+(rule (lower_vfcmp $F32X4 (FloatCC.Unordered) a b) (pulley_vunof32x4 a b))
+(rule (lower_vfcmp $F32X4 (FloatCC.Equal) a b) (pulley_veqf32x4 a b))
+(rule (lower_vfcmp $F32X4 (FloatCC.NotEqual) a b) (pulley_vneqf32x4 a b))
+(rule (lower_vfcmp $F32X4 (FloatCC.OrderedNotEqual) a b) (pulley_vordneqf32x4 a b))
+(rule (lower_vfcmp $F32X4 (FloatCC.UnorderedOrEqual) a b) (pulley_vunoeqf32x4 a b))
+(rule (lower_vfcmp $F32X4 (FloatCC.LessThan) a b) (pulley_vltf32x4 a b))
+(rule (lower_vfcmp $F32X4 (FloatCC.LessThanOrEqual) a b) (pulley_vlteqf32x4 a b))
+(rule (lower_vfcmp $F32X4 (FloatCC.GreaterThan) a b) (pulley_vltf32x4 b a))
+(rule (lower_vfcmp $F32X4 (FloatCC.GreaterThanOrEqual) a b) (pulley_vlteqf32x4 b a))
+(rule (lower_vfcmp $F64X2 (FloatCC.Ordered) a b) (pulley_vordf64x2 a b))
+(rule (lower_vfcmp $F64X2 (FloatCC.Unordered) a b) (pulley_vunof64x2 a b))
+(rule (lower_vfcmp $F64X2 (FloatCC.Equal) a b) (pulley_veqf64x2 a b))
+(rule (lower_vfcmp $F64X2 (FloatCC.NotEqual) a b) (pulley_vneqf64x2 a b))
+(rule (lower_vfcmp $F64X2 (FloatCC.OrderedNotEqual) a b) (pulley_vordneqf64x2 a b))
+(rule (lower_vfcmp $F64X2 (FloatCC.UnorderedOrEqual) a b) (pulley_vunoeqf64x2 a b))
+(rule (lower_vfcmp $F64X2 (FloatCC.LessThan) a b) (pulley_vltf64x2 a b))
+(rule (lower_vfcmp $F64X2 (FloatCC.LessThanOrEqual) a b) (pulley_vlteqf64x2 a b))
+(rule (lower_vfcmp $F64X2 (FloatCC.GreaterThan) a b) (pulley_vltf64x2 b a))
+(rule (lower_vfcmp $F64X2 (FloatCC.GreaterThanOrEqual) a b) (pulley_vlteqf64x2 b a))
+
 ;;;; Rules for `load` and friends ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (decl amode (Value Offset32) Amode)
diff --git a/pulley/src/interp.rs b/pulley/src/interp.rs
index d0e999ad8c2b..7452d4d7229d 100644
--- a/pulley/src/interp.rs
+++ b/pulley/src/interp.rs
@@ -4207,6 +4207,214 @@ impl ExtendedOpVisitor for Interpreter<'_> {
         ControlFlow::Continue(())
     }
 
+    fn vordf32x4(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_f32x4();
+        let b = self.state[operands.src2].get_f32x4();
+        let mut c = [0; 4];
+        for ((a, b), c) in a.iter().zip(&b).zip(&mut c) {
+            *c = if a.is_nan() || b.is_nan() {
+                0
+            } else {
+                u32::MAX
+            };
+        }
+        self.state[operands.dst].set_u32x4(c);
+        ControlFlow::Continue(())
+    }
+
+    fn vunof32x4(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_f32x4();
+        let b = self.state[operands.src2].get_f32x4();
+        let mut c = [0; 4];
+        for ((a, b), c) in a.iter().zip(&b).zip(&mut c) {
+            *c = if a.is_nan() || b.is_nan() {
+                u32::MAX
+            } else {
+                0
+            };
+        }
+        self.state[operands.dst].set_u32x4(c);
+        ControlFlow::Continue(())
+    }
+
+    fn veqf32x4(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_f32x4();
+        let b = self.state[operands.src2].get_f32x4();
+        let mut c = [0; 4];
+        for ((a, b), c) in a.iter().zip(&b).zip(&mut c) {
+            *c = if a == b { u32::MAX } else { 0 };
+        }
+        self.state[operands.dst].set_u32x4(c);
+        ControlFlow::Continue(())
+    }
+
+    fn vneqf32x4(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_f32x4();
+        let b = self.state[operands.src2].get_f32x4();
+        let mut c = [0; 4];
+        for ((a, b), c) in a.iter().zip(&b).zip(&mut c) {
+            *c = if a != b { u32::MAX } else { 0 };
+        }
+        self.state[operands.dst].set_u32x4(c);
+        ControlFlow::Continue(())
+    }
+
+    fn vordneqf32x4(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_f32x4();
+        let b = self.state[operands.src2].get_f32x4();
+        let mut c = [0; 4];
+        for ((a, b), c) in a.iter().zip(&b).zip(&mut c) {
+            *c = if a == b || a.is_nan() || b.is_nan() {
+                0
+            } else {
+                u32::MAX
+            }
+        }
+        self.state[operands.dst].set_u32x4(c);
+        ControlFlow::Continue(())
+    }
+
+    fn vunoeqf32x4(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_f32x4();
+        let b = self.state[operands.src2].get_f32x4();
+        let mut c = [0; 4];
+        for ((a, b), c) in a.iter().zip(&b).zip(&mut c) {
+            *c = if a == b || a.is_nan() || b.is_nan() {
+                u32::MAX
+            } else {
+                0
+            }
+        }
+        self.state[operands.dst].set_u32x4(c);
+        ControlFlow::Continue(())
+    }
+
+    fn vltf32x4(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_f32x4();
+        let b = self.state[operands.src2].get_f32x4();
+        let mut c = [0; 4];
+        for ((a, b), c) in a.iter().zip(&b).zip(&mut c) {
+            *c = if a < b { u32::MAX } else { 0 };
+        }
+        self.state[operands.dst].set_u32x4(c);
+        ControlFlow::Continue(())
+    }
+
+    fn vlteqf32x4(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_f32x4();
+        let b = self.state[operands.src2].get_f32x4();
+        let mut c = [0; 4];
+        for ((a, b), c) in a.iter().zip(&b).zip(&mut c) {
+            *c = if a <= b { u32::MAX } else { 0 };
+        }
+        self.state[operands.dst].set_u32x4(c);
+        ControlFlow::Continue(())
+    }
+
+    fn vordf64x2(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_f64x2();
+        let b = self.state[operands.src2].get_f64x2();
+        let mut c = [0; 2];
+        for ((a, b), c) in a.iter().zip(&b).zip(&mut c) {
+            *c = if a.is_nan() || b.is_nan() {
+                0
+            } else {
+                u64::MAX
+            };
+        }
+        self.state[operands.dst].set_u64x2(c);
+        ControlFlow::Continue(())
+    }
+
+    fn vunof64x2(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_f64x2();
+        let b = self.state[operands.src2].get_f64x2();
+        let mut c = [0; 2];
+        for ((a, b), c) in a.iter().zip(&b).zip(&mut c) {
+            *c = if a.is_nan() || b.is_nan() {
+                u64::MAX
+            } else {
+                0
+            };
+        }
+        self.state[operands.dst].set_u64x2(c);
+        ControlFlow::Continue(())
+    }
+
+    fn veqf64x2(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_f64x2();
+        let b = self.state[operands.src2].get_f64x2();
+        let mut c = [0; 2];
+        for ((a, b), c) in a.iter().zip(&b).zip(&mut c) {
+            *c = if a == b { u64::MAX } else { 0 };
+        }
+        self.state[operands.dst].set_u64x2(c);
+        ControlFlow::Continue(())
+    }
+
+    fn vneqf64x2(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_f64x2();
+        let b = self.state[operands.src2].get_f64x2();
+        let mut c = [0; 2];
+        for ((a, b), c) in a.iter().zip(&b).zip(&mut c) {
+            *c = if a != b { u64::MAX } else { 0 };
+        }
+        self.state[operands.dst].set_u64x2(c);
+        ControlFlow::Continue(())
+    }
+
+    fn vordneqf64x2(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_f64x2();
+        let b = self.state[operands.src2].get_f64x2();
+        let mut c = [0; 2];
+        for ((a, b), c) in a.iter().zip(&b).zip(&mut c) {
+            *c = if a == b || a.is_nan() || b.is_nan() {
+                0
+            } else {
+                u64::MAX
+            }
+        }
+        self.state[operands.dst].set_u64x2(c);
+        ControlFlow::Continue(())
+    }
+
+    fn vunoeqf64x2(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_f64x2();
+        let b = self.state[operands.src2].get_f64x2();
+        let mut c = [0; 2];
+        for ((a, b), c) in a.iter().zip(&b).zip(&mut c) {
+            *c = if a == b || a.is_nan() || b.is_nan() {
+                u64::MAX
+            } else {
+                0
+            }
+        }
+        self.state[operands.dst].set_u64x2(c);
+        ControlFlow::Continue(())
+    }
+
+    fn vltf64x2(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_f64x2();
+        let b = self.state[operands.src2].get_f64x2();
+        let mut c = [0; 2];
+        for ((a, b), c) in a.iter().zip(&b).zip(&mut c) {
+            *c = if a < b { u64::MAX } else { 0 };
+        }
+        self.state[operands.dst].set_u64x2(c);
+        ControlFlow::Continue(())
+    }
+
+    fn vlteqf64x2(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_f64x2();
+        let b = self.state[operands.src2].get_f64x2();
+        let mut c = [0; 2];
+        for ((a, b), c) in a.iter().zip(&b).zip(&mut c) {
+            *c = if a <= b { u64::MAX } else { 0 };
+        }
+        self.state[operands.dst].set_u64x2(c);
+        ControlFlow::Continue(())
+    }
+
     fn vneg8x16(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
         let a = self.state[src].get_i8x16();
         self.state[dst].set_i8x16(a.map(|i| i.wrapping_neg()));
diff --git a/pulley/src/lib.rs b/pulley/src/lib.rs
index f83a24606e8d..d8336be2c9cf 100644
--- a/pulley/src/lib.rs
+++ b/pulley/src/lib.rs
@@ -1144,6 +1144,39 @@ macro_rules! for_each_extended_op {
         /// `dst = src <= dst` (unsigned)
         vulteq64x2 = Vulteq64x2 { operands: BinaryOperands<VReg> };
 
+        /// `dst = !src1.is_nan() && !src2.is_nan()`
+        vordf32x4 = Vordf32x4 { operands: BinaryOperands<VReg> };
+        /// `dst = src1.is_nan() || src2.is_nan()`
+        vunof32x4 = Vunof32x4 { operands: BinaryOperands<VReg> };
+        /// `dst = src1 == src2`
+        veqf32x4 = Veqf32x4 { operands: BinaryOperands<VReg> };
+        /// `dst = src1 != src2`
+        vneqf32x4 = Vneqf32x4 { operands: BinaryOperands<VReg> };
+        /// `dst = !src1.is_nan() && !src2.is_nan() && src1 != src2`
+        vordneqf32x4 = Vordneqf32x4 { operands: BinaryOperands<VReg> };
+        /// `dst = src1.is_nan() || src2.is_nan() || src1 == src2`
+        vunoeqf32x4 = Vunoeqf32x4 { operands: BinaryOperands<VReg> };
+        /// `dst = src1 < src2`
+        vltf32x4 = Vltf32x4 { operands: BinaryOperands<VReg> };
+        /// `dst = src1 <= src2`
+        vlteqf32x4 = Vlteqf32x4 { operands: BinaryOperands<VReg> };
+        /// `dst = !src1.is_nan() && !src2.is_nan()`
+        vordf64x2 = Vordf64x2 { operands: BinaryOperands<VReg> };
+        /// `dst = src1.is_nan() || src2.is_nan()`
+        vunof64x2 = Vunof64x2 { operands: BinaryOperands<VReg> };
+        /// `dst = src1 == src2`
+        veqf64x2 = Veqf64x2 { operands: BinaryOperands<VReg> };
+        /// `dst = src1 != src2`
+        vneqf64x2 = Vneqf64x2 { operands: BinaryOperands<VReg> };
+        /// `dst = !src1.is_nan() && !src2.is_nan() && src1 != src2`
+        vordneqf64x2 = Vordneqf64x2 { operands: BinaryOperands<VReg> };
+        /// `dst = src1.is_nan() || src2.is_nan() || src1 == src2`
+        vunoeqf64x2 = Vunoeqf64x2 { operands: BinaryOperands<VReg> };
+        /// `dst = src1 < src2`
+        vltf64x2 = Vltf64x2 { operands: BinaryOperands<VReg> };
+        /// `dst = src1 <= src2`
+        vlteqf64x2 = Vlteqf64x2 { operands: BinaryOperands<VReg> };
+
         /// `dst = -src`
         vneg8x16 = Vneg8x16 { dst: VReg, src: VReg };
         /// `dst = -src`

From 7851d5ee06dd4566a7dd3733f8858f1466ddf1c7 Mon Sep 17 00:00:00 2001
From: Julian Eager
Date: Mon, 23 Dec 2024 22:31:35 +0800
Subject: [PATCH 2/4] reenable canon-nan tests

---
 crates/wast-util/src/lib.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/crates/wast-util/src/lib.rs b/crates/wast-util/src/lib.rs
index c700cf6aea2d..694814806c4a 100644
--- a/crates/wast-util/src/lib.rs
+++ b/crates/wast-util/src/lib.rs
@@ -401,7 +401,6 @@ impl WastTest {
         // features in Pulley are implemented.
         if config.compiler == Compiler::CraneliftPulley {
             let unsupported = [
-                "misc_testsuite/simd/canonicalize-nan.wast",
                 "misc_testsuite/simd/issue_3327_bnot_lowering.wast",
                 "misc_testsuite/simd/v128-select.wast",
                 "spec_testsuite/proposals/annotations/simd_lane.wast",

From 35be5cb68597991f01a2aec958cbf37087c1d3f8 Mon Sep 17 00:00:00 2001
From: Julian Eager
Date: Tue, 24 Dec 2024 14:53:11 +0800
Subject: [PATCH 3/4] unflag more tests

---
 crates/wast-util/src/lib.rs | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/crates/wast-util/src/lib.rs b/crates/wast-util/src/lib.rs
index 694814806c4a..598f81274b26 100644
--- a/crates/wast-util/src/lib.rs
+++ b/crates/wast-util/src/lib.rs
@@ -416,11 +416,7 @@ impl WastTest {
                 "spec_testsuite/proposals/memory64/i32x4_relaxed_trunc.wast",
                 "spec_testsuite/proposals/memory64/i8x16_relaxed_swizzle.wast",
                 "spec_testsuite/simd_f32x4_arith.wast",
-                "spec_testsuite/simd_f32x4_cmp.wast",
-                "spec_testsuite/simd_f32x4_pmin_pmax.wast",
                 "spec_testsuite/simd_f64x2_arith.wast",
-                "spec_testsuite/simd_f64x2_cmp.wast",
-                "spec_testsuite/simd_f64x2_pmin_pmax.wast",
                 "spec_testsuite/simd_i16x8_arith2.wast",
                 "spec_testsuite/simd_i16x8_extadd_pairwise_i8x16.wast",
                 "spec_testsuite/simd_i16x8_q15mulr_sat_s.wast",

From 767feaf026902a234683559afd5ae4d3a445021e Mon Sep 17 00:00:00 2001
From: Julian Eager
Date: Tue, 24 Dec 2024 20:12:24 +0800
Subject: [PATCH 4/4] rev instead of negate

---
 pulley/src/interp.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pulley/src/interp.rs b/pulley/src/interp.rs
index 7452d4d7229d..950009bb7232 100644
--- a/pulley/src/interp.rs
+++ b/pulley/src/interp.rs
@@ -4253,7 +4253,7 @@ impl ExtendedOpVisitor for Interpreter<'_> {
         let b = self.state[operands.src2].get_f32x4();
         let mut c = [0; 4];
         for ((a, b), c) in a.iter().zip(&b).zip(&mut c) {
-            *c = if a != b { u32::MAX } else { 0 };
+            *c = if a == b { 0 } else { u32::MAX };
         }
         self.state[operands.dst].set_u32x4(c);
         ControlFlow::Continue(())
@@ -4357,7 +4357,7 @@ impl ExtendedOpVisitor for Interpreter<'_> {
         let b = self.state[operands.src2].get_f64x2();
         let mut c = [0; 2];
         for ((a, b), c) in a.iter().zip(&b).zip(&mut c) {
-            *c = if a != b { u64::MAX } else { 0 };
+            *c = if a == b { 0 } else { u64::MAX };
         }
         self.state[operands.dst].set_u64x2(c);
         ControlFlow::Continue(())