bytecodealliance · alexcrichton · Dec 30, 2024 · Dec 28, 2024 · Dec 28, 2024 · Dec 28, 2024
@@ -221,6 +221,11 @@
 (rule 1 (lower (has_type $I32X4 (iadd a b))) (pulley_vaddi32x4 a b))
 (rule 1 (lower (has_type $I64X2 (iadd a b))) (pulley_vaddi64x2 a b))
 
+;; Saturating adds: each rule matches one of the `$I8X16` / `$I16X8` vector
+;; types and hands both operands to the matching `pulley_vadd*_sat`
+;; constructor (`sadd_sat` -> `vaddi*`, `uadd_sat` -> `vaddu*`).
+(rule 1 (lower (has_type $I8X16 (sadd_sat a b))) (pulley_vaddi8x16_sat a b))
+(rule 1 (lower (has_type $I8X16 (uadd_sat a b))) (pulley_vaddu8x16_sat a b))
+(rule 1 (lower (has_type $I16X8 (sadd_sat a b))) (pulley_vaddi16x8_sat a b))
+(rule 1 (lower (has_type $I16X8 (uadd_sat a b))) (pulley_vaddu16x8_sat a b))
+
 ;;;; Rules for `isub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule 0 (lower (has_type (ty_int (fits_in_32 _)) (isub a b))) (pulley_xsub32 a b))
@@ -256,6 +261,11 @@
 (rule 1 (lower (has_type $I32X4 (isub a b))) (pulley_vsubi32x4 a b))
 (rule 1 (lower (has_type $I64X2 (isub a b))) (pulley_vsubi64x2 a b))
 
+;; Saturating subtracts: each rule matches one of the `$I8X16` / `$I16X8`
+;; vector types and hands both operands to the matching `pulley_vsub*_sat`
+;; constructor (`ssub_sat` -> `vsubi*`, `usub_sat` -> `vsubu*`).
+(rule 1 (lower (has_type $I8X16 (ssub_sat a b))) (pulley_vsubi8x16_sat a b))
+(rule 1 (lower (has_type $I8X16 (usub_sat a b))) (pulley_vsubu8x16_sat a b))
+(rule 1 (lower (has_type $I16X8 (ssub_sat a b))) (pulley_vsubi16x8_sat a b))
+(rule 1 (lower (has_type $I16X8 (usub_sat a b))) (pulley_vsubu16x8_sat a b))
+
 ;;;; Rules for `imul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule (lower (has_type $I8 (imul a b))) (pulley_xmul32 a b))
@@ -341,6 +351,7 @@
 
 ;;;; Rules for `avg_round` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
+;; `avg_round` is lowered for the `$I8X16` and `$I16X8` vector types, each
+;; through its own `pulley_vavground*` constructor.
+(rule (lower (has_type $I8X16 (avg_round a b))) (pulley_vavground8x16 a b))
 (rule (lower (has_type $I16X8 (avg_round a b))) (pulley_vavground16x8 a b))
 
 ;;;; Rules for `ishl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -512,6 +523,7 @@
+;; Types matching `fits_in_32` pass both operands through `zext32` and use the
+;; 32-bit op; `$I64` and each listed vector type have a dedicated constructor.
 (rule 0 (lower (has_type (fits_in_32 _) (umin a b)))
   (pulley_xmin32_u (zext32 a) (zext32 b)))
 (rule 1 (lower (has_type $I64 (umin a b))) (pulley_xmin64_u a b))
+(rule 1 (lower (has_type $I8X16 (umin a b))) (pulley_vmin8x16_u a b))
 (rule 1 (lower (has_type $I16X8 (umin a b))) (pulley_vmin16x8_u a b))
 (rule 1 (lower (has_type $I32X4 (umin a b))) (pulley_vmin32x4_u a b))
 
@@ -520,6 +532,7 @@
+;; Types matching `fits_in_32` pass both operands through `sext32` and use the
+;; 32-bit op; `$I64` and each listed vector type have a dedicated constructor.
 (rule 0 (lower (has_type (fits_in_32 _) (smin a b)))
   (pulley_xmin32_s (sext32 a) (sext32 b)))
 (rule 1 (lower (has_type $I64 (smin a b))) (pulley_xmin64_s a b))
+(rule 1 (lower (has_type $I8X16 (smin a b))) (pulley_vmin8x16_s a b))
 (rule 1 (lower (has_type $I16X8 (smin a b))) (pulley_vmin16x8_s a b))
 (rule 1 (lower (has_type $I32X4 (smin a b))) (pulley_vmin32x4_s a b))
 
@@ -528,6 +541,7 @@
+;; Types matching `fits_in_32` pass both operands through `zext32` and use the
+;; 32-bit op; `$I64` and each listed vector type have a dedicated constructor.
 (rule 0 (lower (has_type (fits_in_32 _) (umax a b)))
   (pulley_xmax32_u (zext32 a) (zext32 b)))
 (rule 1 (lower (has_type $I64 (umax a b))) (pulley_xmax64_u a b))
+(rule 1 (lower (has_type $I8X16 (umax a b))) (pulley_vmax8x16_u a b))
 (rule 1 (lower (has_type $I16X8 (umax a b))) (pulley_vmax16x8_u a b))
 (rule 1 (lower (has_type $I32X4 (umax a b))) (pulley_vmax32x4_u a b))
 
@@ -536,6 +550,7 @@
+;; Types matching `fits_in_32` pass both operands through `sext32` and use the
+;; 32-bit op; `$I64` and each listed vector type have a dedicated constructor.
 (rule 0 (lower (has_type (fits_in_32 _) (smax a b)))
   (pulley_xmax32_s (sext32 a) (sext32 b)))
 (rule 1 (lower (has_type $I64 (smax a b))) (pulley_xmax64_s a b))
+(rule 1 (lower (has_type $I8X16 (smax a b))) (pulley_vmax8x16_s a b))
 (rule 1 (lower (has_type $I16X8 (smax a b))) (pulley_vmax16x8_s a b))
 (rule 1 (lower (has_type $I32X4 (smax a b))) (pulley_vmax32x4_s a b))
 
@@ -570,6 +585,7 @@
 
+;; Types matching `fits_in_32` pass the operand through `zext32` and use the
+;; 32-bit op; `$I64` and `$I8X16` each have a dedicated constructor.
 (rule 0 (lower (has_type (fits_in_32 _) (popcnt a))) (pulley_xpopcnt32 (zext32 a)))
 (rule 1 (lower (has_type $I64 (popcnt a))) (pulley_xpopcnt64 a))
+(rule 1 (lower (has_type $I8X16 (popcnt a))) (pulley_vpopcnt8x16 a))
 
 ;;;; Rules for `rotl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
@@ -1246,8 +1262,10 @@
 
+;; Types matching `fits_in_32` pass the operand through `sext32` and use the
+;; 32-bit op; `$I64` and each listed vector type have a dedicated constructor.
 (rule 0 (lower (has_type (fits_in_32 _) (iabs a))) (pulley_xabs32 (sext32 a)))
 (rule 1 (lower (has_type $I64 (iabs a))) (pulley_xabs64 a))
+(rule 1 (lower (has_type $I8X16 (iabs a))) (pulley_vabs8x16 a))
 (rule 1 (lower (has_type $I16X8 (iabs a))) (pulley_vabs16x8 a))
 (rule 1 (lower (has_type $I32X4 (iabs a))) (pulley_vabs32x4 a))
+(rule 1 (lower (has_type $I64X2 (iabs a))) (pulley_vabs64x2 a))
 
 ;;;; Rules for `splat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 

@@ -419,14 +419,10 @@ impl WastTest {
                 "spec_testsuite/simd_f64x2_cmp.wast",
                 "spec_testsuite/simd_f64x2_pmin_pmax.wast",
                 "spec_testsuite/simd_i16x8_extadd_pairwise_i8x16.wast",
-                "spec_testsuite/simd_i16x8_sat_arith.wast",
                 "spec_testsuite/simd_i32x4_dot_i16x8.wast",
                 "spec_testsuite/simd_i32x4_extadd_pairwise_i16x8.wast",
                 "spec_testsuite/simd_i32x4_trunc_sat_f32x4.wast",
                 "spec_testsuite/simd_i32x4_trunc_sat_f64x2.wast",
-                "spec_testsuite/simd_i64x2_arith2.wast",
-                "spec_testsuite/simd_i8x16_arith2.wast",
-                "spec_testsuite/simd_i8x16_sat_arith.wast",
                 "spec_testsuite/simd_lane.wast",
                 "spec_testsuite/simd_load.wast",
                 "spec_testsuite/simd_splat.wast",

@@ -3299,6 +3299,46 @@ impl ExtendedOpVisitor for Interpreter<'_> {
         ControlFlow::Continue(())
     }
 
+    /// Lane-wise saturating addition, reading both sources as `i8x16`.
+    ///
+    /// Each lane of `operands.src1` is added to the matching lane of
+    /// `operands.src2` with `saturating_add`, and the result vector is stored
+    /// in `operands.dst`. Always returns `ControlFlow::Continue(())`.
+    fn vaddi8x16_sat(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut lanes = self.state[operands.src1].get_i8x16();
+        let addends = self.state[operands.src2].get_i8x16();
+        lanes
+            .iter_mut()
+            .zip(addends)
+            .for_each(|(lane, addend)| *lane = lane.saturating_add(addend));
+        self.state[operands.dst].set_i8x16(lanes);
+        ControlFlow::Continue(())
+    }
+
+    /// Lane-wise saturating addition, reading both sources as `u8x16`.
+    ///
+    /// Each lane of `operands.src1` is added to the matching lane of
+    /// `operands.src2` with `saturating_add`, and the result vector is stored
+    /// in `operands.dst`. Always returns `ControlFlow::Continue(())`.
+    fn vaddu8x16_sat(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut lanes = self.state[operands.src1].get_u8x16();
+        let addends = self.state[operands.src2].get_u8x16();
+        lanes
+            .iter_mut()
+            .zip(addends)
+            .for_each(|(lane, addend)| *lane = lane.saturating_add(addend));
+        self.state[operands.dst].set_u8x16(lanes);
+        ControlFlow::Continue(())
+    }
+
+    /// Lane-wise saturating addition, reading both sources as `i16x8`.
+    ///
+    /// Each lane of `operands.src1` is added to the matching lane of
+    /// `operands.src2` with `saturating_add`, and the result vector is stored
+    /// in `operands.dst`. Always returns `ControlFlow::Continue(())`.
+    fn vaddi16x8_sat(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut lanes = self.state[operands.src1].get_i16x8();
+        let addends = self.state[operands.src2].get_i16x8();
+        lanes
+            .iter_mut()
+            .zip(addends)
+            .for_each(|(lane, addend)| *lane = lane.saturating_add(addend));
+        self.state[operands.dst].set_i16x8(lanes);
+        ControlFlow::Continue(())
+    }
+
+    /// Lane-wise saturating addition, reading both sources as `u16x8`.
+    ///
+    /// Each lane of `operands.src1` is added to the matching lane of
+    /// `operands.src2` with `saturating_add`, and the result vector is stored
+    /// in `operands.dst`. Always returns `ControlFlow::Continue(())`.
+    fn vaddu16x8_sat(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut lanes = self.state[operands.src1].get_u16x8();
+        let addends = self.state[operands.src2].get_u16x8();
+        lanes
+            .iter_mut()
+            .zip(addends)
+            .for_each(|(lane, addend)| *lane = lane.saturating_add(addend));
+        self.state[operands.dst].set_u16x8(lanes);
+        ControlFlow::Continue(())
+    }
+
     fn vshli8x16(&mut self, operands: BinaryOperands<VReg, VReg, XReg>) -> ControlFlow<Done> {
         let a = self.state[operands.src1].get_i8x16();
         let b = self.state[operands.src2].get_u32();
@@ -3795,6 +3835,46 @@ impl ExtendedOpVisitor for Interpreter<'_> {
         ControlFlow::Continue(())
     }
 
+    /// Lane-wise saturating subtraction, reading both sources as `i8x16`.
+    ///
+    /// Each lane of `operands.src2` is subtracted from the matching lane of
+    /// `operands.src1` with `saturating_sub`, and the result vector is stored
+    /// in `operands.dst`. Always returns `ControlFlow::Continue(())`.
+    fn vsubi8x16_sat(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut lanes = self.state[operands.src1].get_i8x16();
+        let subtrahends = self.state[operands.src2].get_i8x16();
+        lanes
+            .iter_mut()
+            .zip(subtrahends)
+            .for_each(|(lane, subtrahend)| *lane = (*lane).saturating_sub(subtrahend));
+        self.state[operands.dst].set_i8x16(lanes);
+        ControlFlow::Continue(())
+    }
+
+    /// Lane-wise saturating subtraction, reading both sources as `u8x16`.
+    ///
+    /// Each lane of `operands.src2` is subtracted from the matching lane of
+    /// `operands.src1` with `saturating_sub`, and the result vector is stored
+    /// in `operands.dst`. Always returns `ControlFlow::Continue(())`.
+    fn vsubu8x16_sat(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut lanes = self.state[operands.src1].get_u8x16();
+        let subtrahends = self.state[operands.src2].get_u8x16();
+        lanes
+            .iter_mut()
+            .zip(subtrahends)
+            .for_each(|(lane, subtrahend)| *lane = (*lane).saturating_sub(subtrahend));
+        self.state[operands.dst].set_u8x16(lanes);
+        ControlFlow::Continue(())
+    }
+
+    /// Lane-wise saturating subtraction, reading both sources as `i16x8`.
+    ///
+    /// Each lane of `operands.src2` is subtracted from the matching lane of
+    /// `operands.src1` with `saturating_sub`, and the result vector is stored
+    /// in `operands.dst`. Always returns `ControlFlow::Continue(())`.
+    fn vsubi16x8_sat(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut lanes = self.state[operands.src1].get_i16x8();
+        let subtrahends = self.state[operands.src2].get_i16x8();
+        lanes
+            .iter_mut()
+            .zip(subtrahends)
+            .for_each(|(lane, subtrahend)| *lane = (*lane).saturating_sub(subtrahend));
+        self.state[operands.dst].set_i16x8(lanes);
+        ControlFlow::Continue(())
+    }
+
+    /// Lane-wise saturating subtraction, reading both sources as `u16x8`.
+    ///
+    /// Each lane of `operands.src2` is subtracted from the matching lane of
+    /// `operands.src1` with `saturating_sub`, and the result vector is stored
+    /// in `operands.dst`. Always returns `ControlFlow::Continue(())`.
+    fn vsubu16x8_sat(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut lanes = self.state[operands.src1].get_u16x8();
+        let subtrahends = self.state[operands.src2].get_u16x8();
+        lanes
+            .iter_mut()
+            .zip(subtrahends)
+            .for_each(|(lane, subtrahend)| *lane = (*lane).saturating_sub(subtrahend));
+        self.state[operands.dst].set_u16x8(lanes);
+        ControlFlow::Continue(())
+    }
+
     fn vmuli8x16(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
         let mut a = self.state[operands.src1].get_i8x16();
         let b = self.state[operands.src2].get_i8x16();
@@ -3848,6 +3928,12 @@ impl ExtendedOpVisitor for Interpreter<'_> {
         ControlFlow::Continue(())
     }
 
+    /// Counts the set bits in each lane of `src` (read as `u8x16`) and writes
+    /// the per-lane counts to `dst`. Always returns `ControlFlow::Continue(())`.
+    fn vpopcnt8x16(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let mut lanes = self.state[src].get_u8x16();
+        for lane in lanes.iter_mut() {
+            // A `u8` has at most 8 set bits, so narrowing the count is lossless.
+            *lane = lane.count_ones() as u8;
+        }
+        self.state[dst].set_u8x16(lanes);
+        ControlFlow::Continue(())
+    }
+
     fn xextractv8x16(&mut self, dst: XReg, src: VReg, lane: u8) -> ControlFlow<Done> {
         let a = unsafe { *self.state[src].get_u8x16().get_unchecked(usize::from(lane)) };
         self.state[dst].set_u32(u32::from(a));
@@ -4256,6 +4342,26 @@ impl ExtendedOpVisitor for Interpreter<'_> {
         ControlFlow::Continue(())
     }
 
+    /// Lane-wise minimum, reading both sources as `i8x16`.
+    ///
+    /// The smaller of each lane pair from `operands.src1` and `operands.src2`
+    /// is stored in `operands.dst`. Always returns `ControlFlow::Continue(())`.
+    fn vmin8x16_s(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut lanes = self.state[operands.src1].get_i8x16();
+        let others = self.state[operands.src2].get_i8x16();
+        lanes
+            .iter_mut()
+            .zip(others)
+            .for_each(|(lane, other)| *lane = (*lane).min(other));
+        self.state[operands.dst].set_i8x16(lanes);
+        ControlFlow::Continue(())
+    }
+
+    /// Lane-wise minimum, reading both sources as `u8x16`.
+    ///
+    /// The smaller of each lane pair from `operands.src1` and `operands.src2`
+    /// is stored in `operands.dst`. Always returns `ControlFlow::Continue(())`.
+    fn vmin8x16_u(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut lanes = self.state[operands.src1].get_u8x16();
+        let others = self.state[operands.src2].get_u8x16();
+        lanes
+            .iter_mut()
+            .zip(others)
+            .for_each(|(lane, other)| *lane = (*lane).min(other));
+        self.state[operands.dst].set_u8x16(lanes);
+        ControlFlow::Continue(())
+    }
+
     fn vmin16x8_s(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
         let mut a = self.state[operands.src1].get_i16x8();
         let b = self.state[operands.src2].get_i16x8();
@@ -4296,6 +4402,26 @@ impl ExtendedOpVisitor for Interpreter<'_> {
         ControlFlow::Continue(())
     }
 
+    /// Lane-wise maximum, reading both sources as `i8x16`.
+    ///
+    /// The larger of each lane pair from `operands.src1` and `operands.src2`
+    /// is stored in `operands.dst`. Always returns `ControlFlow::Continue(())`.
+    fn vmax8x16_s(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut lanes = self.state[operands.src1].get_i8x16();
+        let others = self.state[operands.src2].get_i8x16();
+        lanes
+            .iter_mut()
+            .zip(others)
+            .for_each(|(lane, other)| *lane = (*lane).max(other));
+        self.state[operands.dst].set_i8x16(lanes);
+        ControlFlow::Continue(())
+    }
+
+    /// Lane-wise maximum, reading both sources as `u8x16`.
+    ///
+    /// The larger of each lane pair from `operands.src1` and `operands.src2`
+    /// is stored in `operands.dst`. Always returns `ControlFlow::Continue(())`.
+    fn vmax8x16_u(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut lanes = self.state[operands.src1].get_u8x16();
+        let others = self.state[operands.src2].get_u8x16();
+        lanes
+            .iter_mut()
+            .zip(others)
+            .for_each(|(lane, other)| *lane = (*lane).max(other));
+        self.state[operands.dst].set_u8x16(lanes);
+        ControlFlow::Continue(())
+    }
+
     fn vmax16x8_s(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
         let mut a = self.state[operands.src1].get_i16x8();
         let b = self.state[operands.src2].get_i16x8();
@@ -4336,6 +4462,12 @@ impl ExtendedOpVisitor for Interpreter<'_> {
         ControlFlow::Continue(())
     }
 
+    /// Writes the lane-wise absolute value of `src`, read as `i8x16`, to `dst`.
+    ///
+    /// Uses `wrapping_abs`, so a lane holding `i8::MIN` wraps back to `i8::MIN`
+    /// instead of overflowing. Always returns `ControlFlow::Continue(())`.
+    fn vabs8x16(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let magnitudes = self.state[src].get_i8x16().map(i8::wrapping_abs);
+        self.state[dst].set_i8x16(magnitudes);
+        ControlFlow::Continue(())
+    }
+
     fn vabs16x8(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
         let a = self.state[src].get_i16x8();
         self.state[dst].set_i16x8(a.map(|i| i.wrapping_abs()));
@@ -4348,6 +4480,12 @@ impl ExtendedOpVisitor for Interpreter<'_> {
         ControlFlow::Continue(())
     }
 
+    /// Writes the lane-wise absolute value of `src`, read as `i64x2`, to `dst`.
+    ///
+    /// Uses `wrapping_abs`, so a lane holding `i64::MIN` wraps back to
+    /// `i64::MIN` instead of overflowing. Always returns
+    /// `ControlFlow::Continue(())`.
+    fn vabs64x2(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let magnitudes = self.state[src].get_i64x2().map(i64::wrapping_abs);
+        self.state[dst].set_i64x2(magnitudes);
+        ControlFlow::Continue(())
+    }
+
     fn vabsf32x4(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
         let a = self.state[src].get_f32x4();
         self.state[dst].set_f32x4(a.map(|i| i.wasm_abs()));
@@ -4415,11 +4553,22 @@ impl ExtendedOpVisitor for Interpreter<'_> {
         ControlFlow::Continue(())
     }
 
+    /// Lane-wise rounding average, reading both sources as `u8x16`.
+    ///
+    /// Each result lane is `(src1 + src2 + 1) / 2`, stored in `operands.dst`.
+    /// Always returns `ControlFlow::Continue(())`.
+    fn vavground8x16(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut lanes = self.state[operands.src1].get_u8x16();
+        let others = self.state[operands.src2].get_u8x16();
+        for (lane, other) in lanes.iter_mut().zip(others) {
+            // Sum in `u32` so `lane + other + 1` cannot wrap before halving;
+            // the halved value is at most 255, so it fits back into a `u8`.
+            let rounded_sum = u32::from(*lane) + u32::from(other) + 1;
+            *lane = (rounded_sum / 2) as u8;
+        }
+        self.state[operands.dst].set_u8x16(lanes);
+        ControlFlow::Continue(())
+    }
+
     fn vavground16x8(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
         let mut a = self.state[operands.src1].get_u16x8();
         let b = self.state[operands.src2].get_u16x8();
         for (a, b) in a.iter_mut().zip(&b) {
-            // rounding average
+            // use wider precision to avoid overflow
             *a = ((u32::from(*a) + u32::from(*b) + 1) / 2) as u16;
         }
         self.state[operands.dst].set_u16x8(a);

@@ -899,6 +899,15 @@ macro_rules! for_each_extended_op {
             /// `dst = src1 + src2`
             vaddf64x2 = VAddF64x2 { operands: BinaryOperands<VReg> };
 
+            /// `dst = saturating_add(src1, src2)`
+            vaddi8x16_sat = VAddI8x16Sat { operands: BinaryOperands<VReg> };
+            /// `dst = saturating_add(src1, src2)`
+            vaddu8x16_sat = VAddU8x16Sat { operands: BinaryOperands<VReg> };
+            /// `dst = saturating_add(src1, src2)`
+            vaddi16x8_sat = VAddI16x8Sat { operands: BinaryOperands<VReg> };
+            /// `dst = saturating_add(src1, src2)`
+            vaddu16x8_sat = VAddU16x8Sat { operands: BinaryOperands<VReg> };
+
             /// `dst = src1 << src2`
             vshli8x16 = VShlI8x16 { operands: BinaryOperands<VReg, VReg, XReg> };
             /// `dst = src1 << src2`
@@ -1062,6 +1071,15 @@ macro_rules! for_each_extended_op {
             /// `dst = src1 - src2`
             vsubi64x2 = VSubI64x2 { operands: BinaryOperands<VReg> };
 
+            /// `dst = saturating_sub(src1, src2)`
+            vsubi8x16_sat = VSubI8x16Sat { operands: BinaryOperands<VReg> };
+            /// `dst = saturating_sub(src1, src2)`
+            vsubu8x16_sat = VSubU8x16Sat { operands: BinaryOperands<VReg> };
+            /// `dst = saturating_sub(src1, src2)`
+            vsubi16x8_sat = VSubI16x8Sat { operands: BinaryOperands<VReg> };
+            /// `dst = saturating_sub(src1, src2)`
+            vsubu16x8_sat = VSubU16x8Sat { operands: BinaryOperands<VReg> };
+
             /// `dst = src1 * src2`
             vmuli8x16 = VMulI8x16 { operands: BinaryOperands<VReg> };
             /// `dst = src1 * src2`
@@ -1074,6 +1092,9 @@ macro_rules! for_each_extended_op {
             /// `dst = signed_saturate(src1 * src2 + (1 << (Q - 1)) >> Q)`
             vqmulrsi16x8 = VQmulrsI16x8 { operands: BinaryOperands<VReg> };
 
+            /// `dst = count_ones(src)`
+            vpopcnt8x16 = VPopcnt8x16 { dst: VReg, src: VReg };
+
             /// `low32(dst) = zext(src[lane])`
             xextractv8x16 = XExtractV8x16 { dst: XReg, src: VReg, lane: u8 };
             /// `low32(dst) = zext(src[lane])`
@@ -1158,11 +1179,19 @@ macro_rules! for_each_extended_op {
             /// `dst = -src`
             vneg64x2 = Vneg64x2 { dst: VReg, src: VReg };
 
+            /// `dst = min(src1, src2)` (signed)
+            vmin8x16_s = Vmin8x16S { operands: BinaryOperands<VReg> };
+            /// `dst = min(src1, src2)` (unsigned)
+            vmin8x16_u = Vmin8x16U { operands: BinaryOperands<VReg> };
             /// `dst = min(src1, src2)` (signed)
             vmin16x8_s = Vmin16x8S { operands: BinaryOperands<VReg> };
             /// `dst = min(src1, src2)` (unsigned)
             vmin16x8_u = Vmin16x8U { operands: BinaryOperands<VReg> };
             /// `dst = max(src1, src2)` (signed)
+            vmax8x16_s = Vmax8x16S { operands: BinaryOperands<VReg> };
+            /// `dst = max(src1, src2)` (unsigned)
+            vmax8x16_u = Vmax8x16U { operands: BinaryOperands<VReg> };
+            /// `dst = max(src1, src2)` (signed)
             vmax16x8_s = Vmax16x8S { operands: BinaryOperands<VReg> };
             /// `dst = max(src1, src2)` (unsigned)
             vmax16x8_u = Vmax16x8U { operands: BinaryOperands<VReg> };
@@ -1176,10 +1205,14 @@ macro_rules! for_each_extended_op {
             /// `dst = max(src1, src2)` (unsigned)
             vmax32x4_u = Vmax32x4U { operands: BinaryOperands<VReg> };
 
+            /// `dst = |src|`
+            vabs8x16 = Vabs8x16 { dst: VReg, src: VReg };
             /// `dst = |src|`
             vabs16x8 = Vabs16x8 { dst: VReg, src: VReg };
             /// `dst = |src|`
             vabs32x4 = Vabs32x4 { dst: VReg, src: VReg };
+            /// `dst = |src|`
+            vabs64x2 = Vabs64x2 { dst: VReg, src: VReg };
 
             /// `dst = |src|`
             vabsf32x4 = Vabsf32x4 { dst: VReg, src: VReg };
@@ -1197,6 +1230,8 @@ macro_rules! for_each_extended_op {
             /// `dst = swizzle(src1, src2)`
             vswizzlei8x16 = Vswizzlei8x16 { operands: BinaryOperands<VReg> };
 
+            /// `dst = (src1 + src2 + 1) // 2`
+            vavground8x16 = Vavground8x16 { operands: BinaryOperands<VReg> };
             /// `dst = (src1 + src2 + 1) // 2`
             vavground16x8 = Vavground16x8 { operands: BinaryOperands<VReg> };
         }