pulley: Implement 16x8 arithmetics (#9904)

* pulley: Implement 16x8 arith.
* leverage 32-bit arithmetic
* fix
bytecodealliance · Dec 28, 2024 · 01a43ed
1 parent af5a789
commit 01a43ed
Show file tree

Hide file tree

Showing 4 changed files with 82 additions and 1 deletion.
diff --git a/cranelift/codegen/src/isa/pulley_shared/lower.isle b/cranelift/codegen/src/isa/pulley_shared/lower.isle
@@ -335,6 +335,10 @@
   (pulley_xrem32_u (zext32 a) (zext32 b)))
 (rule 1 (lower (has_type $I64 (urem a b))) (pulley_xrem64_u a b))
 
+;;;; Rules for `avg_round` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (has_type $I16X8 (avg_round a b))) (pulley_vavground16x8 a b))
+
 ;;;; Rules for `ishl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule (lower (has_type $I8 (ishl a b)))
@@ -504,24 +508,28 @@
 (rule 0 (lower (has_type (fits_in_32 _) (umin a b)))
   (pulley_xmin32_u (zext32 a) (zext32 b)))
 (rule 1 (lower (has_type $I64 (umin a b))) (pulley_xmin64_u a b))
+(rule 1 (lower (has_type $I16X8 (umin a b))) (pulley_vmin16x8_u a b))
 
 ;;;; Rules for `smin` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule 0 (lower (has_type (fits_in_32 _) (smin a b)))
   (pulley_xmin32_s (sext32 a) (sext32 b)))
 (rule 1 (lower (has_type $I64 (smin a b))) (pulley_xmin64_s a b))
+(rule 1 (lower (has_type $I16X8 (smin a b))) (pulley_vmin16x8_s a b))
 
 ;;;; Rules for `umax` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule 0 (lower (has_type (fits_in_32 _) (umax a b)))
   (pulley_xmax32_u (zext32 a) (zext32 b)))
 (rule 1 (lower (has_type $I64 (umax a b))) (pulley_xmax64_u a b))
+(rule 1 (lower (has_type $I16X8 (umax a b))) (pulley_vmax16x8_u a b))
 
 ;;;; Rules for `smax` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule 0 (lower (has_type (fits_in_32 _) (smax a b)))
   (pulley_xmax32_s (sext32 a) (sext32 b)))
 (rule 1 (lower (has_type $I64 (smax a b))) (pulley_xmax64_s a b))
+(rule 1 (lower (has_type $I16X8 (smax a b))) (pulley_vmax16x8_s a b))
 
 ;;;; Rules for `bmask` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
@@ -1230,6 +1238,7 @@
 
 (rule 0 (lower (has_type (fits_in_32 _) (iabs a))) (pulley_xabs32 (sext32 a)))
 (rule 1 (lower (has_type $I64 (iabs a))) (pulley_xabs64 a))
+(rule 1 (lower (has_type $I16X8 (iabs a))) (pulley_vabs16x8 a))
 
 ;;;; Rules for `splat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 

diff --git a/crates/wast-util/src/lib.rs b/crates/wast-util/src/lib.rs
@@ -420,7 +420,6 @@ impl WastTest {
                 "spec_testsuite/simd_f64x2_arith.wast",
                 "spec_testsuite/simd_f64x2_cmp.wast",
                 "spec_testsuite/simd_f64x2_pmin_pmax.wast",
-                "spec_testsuite/simd_i16x8_arith2.wast",
                 "spec_testsuite/simd_i16x8_extadd_pairwise_i8x16.wast",
                 "spec_testsuite/simd_i16x8_q15mulr_sat_s.wast",
                 "spec_testsuite/simd_i16x8_sat_arith.wast",

diff --git a/pulley/src/interp.rs b/pulley/src/interp.rs
@@ -4243,6 +4243,52 @@ impl ExtendedOpVisitor for Interpreter<'_> {
         ControlFlow::Continue(())
     }
 
+    fn vmin16x8_s(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut a = self.state[operands.src1].get_i16x8();
+        let b = self.state[operands.src2].get_i16x8();
+        for (a, b) in a.iter_mut().zip(&b) {
+            *a = (*a).min(*b);
+        }
+        self.state[operands.dst].set_i16x8(a);
+        ControlFlow::Continue(())
+    }
+
+    fn vmin16x8_u(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut a = self.state[operands.src1].get_u16x8();
+        let b = self.state[operands.src2].get_u16x8();
+        for (a, b) in a.iter_mut().zip(&b) {
+            *a = (*a).min(*b);
+        }
+        self.state[operands.dst].set_u16x8(a);
+        ControlFlow::Continue(())
+    }
+
+    fn vmax16x8_s(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut a = self.state[operands.src1].get_i16x8();
+        let b = self.state[operands.src2].get_i16x8();
+        for (a, b) in a.iter_mut().zip(&b) {
+            *a = (*a).max(*b);
+        }
+        self.state[operands.dst].set_i16x8(a);
+        ControlFlow::Continue(())
+    }
+
+    fn vmax16x8_u(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut a = self.state[operands.src1].get_u16x8();
+        let b = self.state[operands.src2].get_u16x8();
+        for (a, b) in a.iter_mut().zip(&b) {
+            *a = (*a).max(*b);
+        }
+        self.state[operands.dst].set_u16x8(a);
+        ControlFlow::Continue(())
+    }
+
+    fn vabs16x8(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let a = self.state[src].get_i16x8();
+        self.state[dst].set_i16x8(a.map(|i| i.wrapping_abs()));
+        ControlFlow::Continue(())
+    }
+
     fn vabsf32x4(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
         let a = self.state[src].get_f32x4();
         self.state[dst].set_f32x4(a.map(|i| i.wasm_abs()));
@@ -4309,4 +4355,15 @@ impl ExtendedOpVisitor for Interpreter<'_> {
         self.state[operands.dst].set_i8x16(dst);
         ControlFlow::Continue(())
     }
+
+    fn vavground16x8(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut a = self.state[operands.src1].get_u16x8();
+        let b = self.state[operands.src2].get_u16x8();
+        for (a, b) in a.iter_mut().zip(&b) {
+            // rounding average
+            *a = ((u32::from(*a) + u32::from(*b) + 1) / 2) as u16;
+        }
+        self.state[operands.dst].set_u16x8(a);
+        ControlFlow::Continue(())
+    }
 }
diff --git a/pulley/src/lib.rs b/pulley/src/lib.rs
@@ -1155,6 +1155,18 @@ macro_rules! for_each_extended_op {
             /// `dst = -src`
             vneg64x2 = Vneg64x2 { dst: VReg, src: VReg };
 
+            /// `dst = min(src1, src2)` (signed)
+            vmin16x8_s = Vmin16x8S { operands: BinaryOperands<VReg> };
+            /// `dst = min(src1, src2)` (unsigned)
+            vmin16x8_u = Vmin16x8U { operands: BinaryOperands<VReg> };
+            /// `dst = max(src1, src2)` (signed)
+            vmax16x8_s = Vmax16x8S { operands: BinaryOperands<VReg> };
+            /// `dst = max(src1, src2)` (unsigned)
+            vmax16x8_u = Vmax16x8U { operands: BinaryOperands<VReg> };
+
+            /// `dst = |src|`
+            vabs16x8 = Vabs16x8 { dst: VReg, src: VReg };
+
             /// `dst = |src|`
             vabsf32x4 = Vabsf32x4 { dst: VReg, src: VReg };
             /// `dst = |src|`
@@ -1167,8 +1179,12 @@ macro_rules! for_each_extended_op {
             vminimumf32x4 = Vminimumf32x4 { operands: BinaryOperands<VReg> };
             /// `dst = ieee_minimum(src1, src2)`
             vminimumf64x2 = Vminimumf64x2 { operands: BinaryOperands<VReg> };
+
             /// `dst = swizzle(src1, src2)`
             vswizzlei8x16 = Vswizzlei8x16 { operands: BinaryOperands<VReg> };
+
+            /// `dst = (src1 + src2 + 1) // 2`
+            vavground16x8 = Vavground16x8 { operands: BinaryOperands<VReg> };
         }
     };
 }