cranelift: 32bit div_s, rem_u, rem_s for aarch64 (#9850)

* rename put_non_zero_in_reg to put_nonzero_in_reg_maybe_zext * handle 32bit sdiv * i32 rem_u/s * fix tests * fix rem * fmt * improve load_constant_full * fix tests, and review edits
bytecodealliance · Dec 19, 2024 · a179f95 · a179f95
1 parent 71ca453
commit a179f95
Show file tree

Hide file tree

Showing 7 changed files with 180 additions and 138 deletions.
diff --git a/cranelift/codegen/src/isa/aarch64/inst.isle b/cranelift/codegen/src/isa/aarch64/inst.isle
@@ -3598,12 +3598,17 @@
        (orr_imm ty (zero_reg) n)
        m k k))
 
-(decl load_constant64_full (Type ImmExtend u64) Reg)
-(extern constructor load_constant64_full load_constant64_full)
+(decl load_constant_full (Type ImmExtend OperandSize u64) Reg)
+(extern constructor load_constant_full load_constant_full)
+
+;; Fallback for integral 32-bit constants
+(rule (imm (fits_in_32 (integral_ty ty)) extend n)
+      (load_constant_full ty extend (operand_size $I32) n))
 
 ;; Fallback for integral 64-bit constants
-(rule (imm (integral_ty ty) extend n)
-      (load_constant64_full ty extend n))
+(rule -1 (imm (integral_ty $I64) extend n)
+      (load_constant_full $I64 extend (operand_size $I64) n))
+
 
 ;; Sign extension helpers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 

diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -758,8 +758,7 @@ impl MachInstEmit for Inst {
                     ALUOp::EorNot => 0b01001010_001,
                     ALUOp::AddS => 0b00101011_000,
                     ALUOp::SubS => 0b01101011_000,
-                    ALUOp::SDiv => 0b10011010_110,
-                    ALUOp::UDiv => 0b00011010_110,
+                    ALUOp::SDiv | ALUOp::UDiv => 0b00011010_110,
                     ALUOp::RotR | ALUOp::Lsr | ALUOp::Asr | ALUOp::Lsl => 0b00011010_110,
                     ALUOp::SMulH => 0b10011011_010,
                     ALUOp::UMulH => 0b10011011_110,

diff --git a/cranelift/codegen/src/isa/aarch64/lower.isle b/cranelift/codegen/src/isa/aarch64/lower.isle
@@ -1028,43 +1028,48 @@
 
 ;;;; Rules for `udiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-;; Note that aarch64's `udiv` doesn't trap so to respect the semantics of
-;; CLIF's `udiv` the check for zero needs to be manually performed.
-
-(rule udiv 1 (lower (has_type $I64 (udiv x y)))
-      (a64_udiv $I64 (put_in_reg x) (put_nonzero_in_reg y)))
+;; Enum representing the types of extensions
+(type ExtType
+      (enum
+        (Signed)
+        (Unsigned)))
 
-(rule udiv (lower (has_type (fits_in_32 ty) (udiv x y)))
-      (a64_udiv $I32 (put_in_reg_zext32 x) (put_nonzero_in_reg y)))
-
-;; helpers for udiv:
 ;; Helper for placing a `Value` into a `Reg` and validating that it's nonzero.
-(decl put_nonzero_in_reg (Value) Reg)
+;; It takes a value and extension type, and performs the appropriate checks.
+;; TODO: restore spec
+;  (spec (put_nonzero_in_reg_sext64 x)
+;        (provide (= (sign_ext 64 x) result))
+;        (require (not (= #x0000000000000000 result))))
+(decl put_nonzero_in_reg (Value ExtType Type) Reg)
 
 ;; Special case where if a `Value` is known to be nonzero we can trivially
 ;; move it into a register.
-(rule (put_nonzero_in_reg (and (value_type ty) (iconst (nonzero_u64_from_imm64 n))))
+
+;; zero-extend non-zero constant
+(rule (put_nonzero_in_reg (iconst (nonzero_u64_from_imm64 n)) (ExtType.Unsigned) ty)
       (imm ty (ImmExtend.Zero) n))
 
-(rule -1 (put_nonzero_in_reg (and (value_type $I64) val))
+;; sign-extend non-zero constant
+(rule (put_nonzero_in_reg (iconst (nonzero_u64_from_imm64 n)) (ExtType.Signed) ty)
+      (imm ty (ImmExtend.Sign) n))
+
+(rule -1 (put_nonzero_in_reg val _ $I64)
       (trap_if_zero_divisor (put_in_reg val) (operand_size $I64)))
 
-(rule -2 (put_nonzero_in_reg (and (value_type (fits_in_32 _)) val))
+(rule -2 (put_nonzero_in_reg val (ExtType.Signed) (fits_in_32 _))
+      (trap_if_zero_divisor (put_in_reg_sext32 val) (operand_size $I32)))
+
+(rule -2 (put_nonzero_in_reg val (ExtType.Unsigned) (fits_in_32 _))
       (trap_if_zero_divisor (put_in_reg_zext32 val) (operand_size $I32)))
 
-;; Helper for placing a `Value` into a `Reg` and validating that it's nonzero and extending it to 64 bits.
-(spec (put_nonzero_in_reg_zext64 x)
-      (provide (= result (zero_ext 64 x)))
-      (require (not (= result #x0000000000000000))))
-(decl put_nonzero_in_reg_zext64 (Value) Reg)
-(rule -1 (put_nonzero_in_reg_zext64 (and (value_type ty) val))
-      (trap_if_zero_divisor (put_in_reg_zext64 val) (operand_size ty)))
+;; Note that aarch64's `udiv` doesn't trap so to respect the semantics of
+;; CLIF's `udiv` the check for zero needs to be manually performed.
 
-;; Special case where if a `Value` is known to be nonzero we can trivially
-;; move it into a register.
-(rule (put_nonzero_in_reg_zext64 (and (value_type ty)
-                                      (iconst (nonzero_u64_from_imm64 n))))
-      (imm ty (ImmExtend.Zero) n))
+(rule udiv 1 (lower (has_type $I64 (udiv x y)))
+      (a64_udiv $I64 (put_in_reg x) (put_nonzero_in_reg y (ExtType.Unsigned) $I64)))
+
+(rule udiv (lower (has_type (fits_in_32 ty) (udiv x y)))
+      (a64_udiv $I32 (put_in_reg_zext32 x) (put_nonzero_in_reg y (ExtType.Unsigned) ty)))
 
 ;;;; Rules for `sdiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
@@ -1088,33 +1093,34 @@
 ;;
 ;; TODO: if `y` is -1 then a check that `x` is not INT_MIN is all that's
 ;; necessary, but right now `y` is checked to not be -1 as well.
-(rule sdiv_base_case (lower (has_type (fits_in_64 ty) (sdiv x y)))
+
+(rule sdiv_base_case (lower (has_type $I64 (sdiv x y)))
       (let ((x64 Reg (put_in_reg_sext64 x))
-            (y64 Reg (put_nonzero_in_reg_sext64 y))
-            (intmin_check_x Reg (intmin_check ty x64))
-            (valid_x64 Reg (trap_if_div_overflow ty intmin_check_x x64 y64))
+            (y64 Reg (put_nonzero_in_reg y (ExtType.Signed) $I64))
+            (intmin_check_x Reg (intmin_check $I64 x64))
+            (valid_x64 Reg (trap_if_div_overflow $I64 intmin_check_x x64 y64))
             (result Reg (a64_sdiv $I64 valid_x64 y64)))
         result))
 
+(rule sdiv_base_case -1 (lower (has_type (fits_in_32 ty) (sdiv x y)))
+      (let ((x32 Reg (put_in_reg_sext32 x))
+            (y32 Reg (put_nonzero_in_reg y (ExtType.Signed) ty))
+            (intmin_check_x Reg (intmin_check ty x32))
+            (valid_x32 Reg (trap_if_div_overflow ty intmin_check_x x32 y32))
+            (result Reg (a64_sdiv ty valid_x32 y32)))
+        result))
+
 ;; Special case for `sdiv` where no checks are needed due to division by a
 ;; constant meaning the checks are always passed.
-(rule sdiv_safe_divisor 1 (lower (has_type (fits_in_64 ty) (sdiv x (iconst imm))))
+(rule sdiv_safe_divisor 2 (lower (has_type $I64 (sdiv x (iconst imm))))
+      (if-let y (safe_divisor_from_imm64 $I64 imm))
+      (a64_sdiv $I64 (put_in_reg_sext64 x) (imm $I64 (ImmExtend.Sign) y)))
+
+(rule sdiv_safe_divisor 1 (lower (has_type (fits_in_32 ty) (sdiv x (iconst imm))))
       (if-let y (safe_divisor_from_imm64 ty imm))
-      (a64_sdiv $I64 (put_in_reg_sext64 x) (imm ty (ImmExtend.Sign) y)))
+      (a64_sdiv ty (put_in_reg_sext32 x) (imm ty (ImmExtend.Sign) y)))
 
 ;; Helper for placing a `Value` into a `Reg` and validating that it's nonzero.
- (spec (put_nonzero_in_reg_sext64 x)
-       (provide (= (sign_ext 64 x) result))
-       (require (not (= #x0000000000000000 result))))
-(decl put_nonzero_in_reg_sext64 (Value) Reg)
-(rule -1 (put_nonzero_in_reg_sext64 val)
-      (trap_if_zero_divisor (put_in_reg_sext64 val) (operand_size $I64)))
-
-;; Note that this has a special case where if the `Value` is a constant that's
-;; not zero we can skip the zero check.
-(rule (put_nonzero_in_reg_sext64 (and (value_type ty)
-                                      (iconst (nonzero_u64_from_imm64 n))))
-      (imm ty (ImmExtend.Sign) n))
 
 ;;;; Rules for `urem` and `srem` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
@@ -1130,20 +1136,36 @@
 ;;   div rd, x, y       ; rd = x / y
 ;;   msub rd, rd, y, x  ; rd = x - rd * y
 
-(rule urem (lower (has_type (fits_in_64 ty) (urem x y)))
+;; TODO: we can avoid a 0 check, if the dividend is a non-0 constant
+
+(rule urem (lower (has_type $I64 (urem x y)))
       (let ((x64 Reg (put_in_reg_zext64 x))
-            (y64 Reg (put_nonzero_in_reg_zext64 y))
+            (y64 Reg (put_nonzero_in_reg y (ExtType.Unsigned) $I64))
             (div Reg (a64_udiv $I64 x64 y64))
             (result Reg (msub $I64 div y64 x64)))
         result))
 
-(rule srem (lower (has_type (fits_in_64 ty) (srem x y)))
+(rule urem -1 (lower (has_type (fits_in_32 ty) (urem x y)))
+      (let ((x64 Reg (put_in_reg_zext32 x))
+            (y64 Reg (put_nonzero_in_reg y (ExtType.Unsigned) ty))
+            (div Reg (a64_udiv ty x64 y64))
+            (result Reg (msub ty div y64 x64)))
+        result))
+
+(rule srem (lower (has_type $I64 (srem x y)))
       (let ((x64 Reg (put_in_reg_sext64 x))
-            (y64 Reg (put_nonzero_in_reg_sext64 y))
+            (y64 Reg (put_nonzero_in_reg y (ExtType.Signed) $I64))
             (div Reg (a64_sdiv $I64 x64 y64))
             (result Reg (msub $I64 div y64 x64)))
         result))
 
+(rule srem -1 (lower (has_type (fits_in_32 ty) (srem x y)))
+      (let ((x64 Reg (put_in_reg_sext32 x))
+            (y64 Reg (put_nonzero_in_reg y (ExtType.Signed) ty))
+            (div Reg (a64_sdiv ty x64 y64))
+            (result Reg (msub ty div y64 x64)))
+        result))
+
 ;;; Rules for integer min/max: umin, smin, umax, smax ;;;;;;;;;;;;;;;;;;;;;;;;;
 
 ;; `i64` and smaller.

diff --git a/cranelift/codegen/src/isa/aarch64/lower/isle.rs b/cranelift/codegen/src/isa/aarch64/lower/isle.rs
@@ -2,7 +2,7 @@
 
 // Pull in the ISLE generated code.
 pub mod generated_code;
-use generated_code::Context;
+use generated_code::{Context, ImmExtend};
 
 // Types that the generated ISLE code uses via `use super::*`.
 use super::{
@@ -30,6 +30,7 @@ use crate::{
         abi::ArgPair, ty_bits, InstOutput, IsTailCall, MachInst, VCodeConstant, VCodeConstantData,
     },
 };
+use core::u32;
 use regalloc2::PReg;
 use std::boxed::Box;
 use std::vec::Vec;
@@ -206,24 +207,36 @@ impl Context for IsleContext<'_, '_, MInst, AArch64Backend> {
     ///
     /// The logic here is nontrivial enough that it's not really worth porting
     /// this over to ISLE.
-    fn load_constant64_full(
+    fn load_constant_full(
         &mut self,
         ty: Type,
-        extend: &generated_code::ImmExtend,
+        extend: &ImmExtend,
+        extend_to: &OperandSize,
         value: u64,
     ) -> Reg {
         let bits = ty.bits();
-        let value = if bits < 64 {
-            if *extend == generated_code::ImmExtend::Sign {
+
+        let value = match (extend_to, *extend) {
+            (OperandSize::Size32, ImmExtend::Sign) if bits < 32 => {
+                let shift = 32 - bits;
+                let value = value as i32;
+
+                // we cast first to a u32 and then to a u64, to ensure that we are representing a
+                // i32 in a u64, and not a i64. This is important, otherwise value will not fit in
+                // 32 bits
+                ((value << shift) >> shift) as u32 as u64
+            }
+            (OperandSize::Size32, ImmExtend::Zero) if bits < 32 => {
+                value & !((u32::MAX as u64) << bits)
+            }
+            (OperandSize::Size64, ImmExtend::Sign) if bits < 64 => {
                 let shift = 64 - bits;
                 let value = value as i64;
 
                 ((value << shift) >> shift) as u64
-            } else {
-                value & !(u64::MAX << bits)
             }
-        } else {
-            value
+            (OperandSize::Size64, ImmExtend::Zero) if bits < 64 => value & !(u64::MAX << bits),
+            _ => value,
         };
 
         // Divide the value into 16-bit slices that we can manipulate using

diff --git a/cranelift/filetests/filetests/isa/aarch64/arithmetic.clif b/cranelift/filetests/filetests/isa/aarch64/arithmetic.clif
@@ -215,24 +215,20 @@ block0(v0: i32, v1: i32):
 
 ; VCode:
 ; block0:
-;   sxtw x3, w0
-;   sxtw x5, w1
-;   cbz x5, #trap=int_divz
-;   adds wzr, w5, #1
-;   ccmp w3, #1, #nzcv, eq
+;   cbz w1, #trap=int_divz
+;   adds wzr, w1, #1
+;   ccmp w0, #1, #nzcv, eq
 ;   b.vs #trap=int_ovf
-;   sdiv x0, x3, x5
+;   sdiv w0, w0, w1
 ;   ret
 ;
 ; Disassembled:
 ; block0: ; offset 0x0
-;   sxtw x3, w0
-;   sxtw x5, w1
-;   cbz x5, #0x20
-;   cmn w5, #1
-;   ccmp w3, #1, #0, eq
-;   b.vs #0x24
-;   sdiv x0, x3, x5
+;   cbz w1, #0x18
+;   cmn w1, #1
+;   ccmp w0, #1, #0, eq
+;   b.vs #0x1c
+;   sdiv w0, w0, w1
 ;   ret
 ;   .byte 0x1f, 0xc1, 0x00, 0x00 ; trap: int_divz
 ;   .byte 0x1f, 0xc1, 0x00, 0x00 ; trap: int_ovf
@@ -246,16 +242,14 @@ block0(v0: i32):
 
 ; VCode:
 ; block0:
-;   sxtw x2, w0
-;   movz w4, #2
-;   sdiv x0, x2, x4
+;   movz w2, #2
+;   sdiv w0, w0, w2
 ;   ret
 ;
 ; Disassembled:
 ; block0: ; offset 0x0
-;   sxtw x2, w0
-;   mov w4, #2
-;   sdiv x0, x2, x4
+;   mov w2, #2
+;   sdiv w0, w0, w2
 ;   ret
 
 function %f14(i32, i32) -> i32 {
@@ -304,20 +298,16 @@ block0(v0: i32, v1: i32):
 
 ; VCode:
 ; block0:
-;   sxtw x3, w0
-;   sxtw x5, w1
-;   cbz x5, #trap=int_divz
-;   sdiv x8, x3, x5
-;   msub x0, x8, x5, x3
+;   cbz w1, #trap=int_divz
+;   sdiv w4, w0, w1
+;   msub w0, w4, w1, w0
 ;   ret
 ;
 ; Disassembled:
 ; block0: ; offset 0x0
-;   sxtw x3, w0
-;   sxtw x5, w1
-;   cbz x5, #0x18
-;   sdiv x8, x3, x5
-;   msub x0, x8, x5, x3
+;   cbz w1, #0x10
+;   sdiv w4, w0, w1
+;   msub w0, w4, w1, w0
 ;   ret
 ;   .byte 0x1f, 0xc1, 0x00, 0x00 ; trap: int_divz
 
@@ -329,20 +319,16 @@ block0(v0: i32, v1: i32):
 
 ; VCode:
 ; block0:
-;   mov w3, w0
-;   mov w5, w1
-;   cbz w5, #trap=int_divz
-;   udiv x8, x3, x5
-;   msub x0, x8, x5, x3
+;   cbz w1, #trap=int_divz
+;   udiv w4, w0, w1
+;   msub w0, w4, w1, w0
 ;   ret
 ;
 ; Disassembled:
 ; block0: ; offset 0x0
-;   mov w3, w0
-;   mov w5, w1
-;   cbz w5, #0x18
-;   udiv x8, x3, x5
-;   msub x0, x8, x5, x3
+;   cbz w1, #0x10
+;   udiv w4, w0, w1
+;   msub w0, w4, w1, w0
 ;   ret
 ;   .byte 0x1f, 0xc1, 0x00, 0x00 ; trap: int_divz
 
@@ -795,3 +781,24 @@ block0(v0: i64):
 ;   ret
 ;   .byte 0x1f, 0xc1, 0x00, 0x00 ; trap: int_ovf
 
+function %sdiv_i16_const(i16) -> i16 {
+block0(v0: i16):
+    v1 = iconst.i16 -2
+    v2 = sdiv v0, v1
+    return v2
+}
+
+; VCode:
+; block0:
+;   sxth w2, w0
+;   movn w4, #1
+;   sdiv w0, w2, w4
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   sxth w2, w0
+;   mov w4, #-2
+;   sdiv w0, w2, w4
+;   ret
+