diff --git a/cranelift/codegen/src/isa/pulley_shared/lower.isle b/cranelift/codegen/src/isa/pulley_shared/lower.isle
index 280bc770c228..9a192f5e9ca8 100644
--- a/cranelift/codegen/src/isa/pulley_shared/lower.isle
+++ b/cranelift/codegen/src/isa/pulley_shared/lower.isle
@@ -720,6 +720,9 @@
 (rule (lower (fcmp cc a b @ (value_type (ty_scalar_float ty))))
       (lower_fcmp ty cc a b))
 
+(rule 1 (lower (fcmp cc a b @ (value_type (ty_vec128 ty))))
+        (lower_vfcmp ty cc a b))
+
 (decl lower_fcmp (Type FloatCC Value Value) XReg)
 
 (rule (lower_fcmp $F32 (FloatCC.Equal) a b) (pulley_feq32 a b))
@@ -751,6 +754,32 @@
       (if-let true (floatcc_unordered cc))
       (pulley_xbxor32_s8 (lower_fcmp ty (floatcc_complement cc) a b) 1))
 
+(decl lower_vfcmp (Type FloatCC Value Value) VReg)
+
+(rule (lower_vfcmp $F32X4 (FloatCC.Equal) a b) (pulley_veqf32x4 a b))
+(rule (lower_vfcmp $F64X2 (FloatCC.Equal) a b) (pulley_veqf64x2 a b))
+(rule (lower_vfcmp $F32X4 (FloatCC.NotEqual) a b) (pulley_vneqf32x4 a b))
+(rule (lower_vfcmp $F64X2 (FloatCC.NotEqual) a b) (pulley_vneqf64x2 a b))
+(rule (lower_vfcmp $F32X4 (FloatCC.LessThan) a b) (pulley_vltf32x4 a b))
+(rule (lower_vfcmp $F64X2 (FloatCC.LessThan) a b) (pulley_vltf64x2 a b))
+(rule (lower_vfcmp $F32X4 (FloatCC.LessThanOrEqual) a b) (pulley_vlteqf32x4 a b))
+(rule (lower_vfcmp $F64X2 (FloatCC.LessThanOrEqual) a b) (pulley_vlteqf64x2 a b))
+
+(rule (lower_vfcmp ty (FloatCC.Unordered) a b)
+      (pulley_vbor128
+        (lower_vfcmp ty (FloatCC.NotEqual) a a)
+        (lower_vfcmp ty (FloatCC.NotEqual) b b)))
+
+;; NB: Pulley doesn't have lowerings for the `Ordered` or `Unordered*`
+;; `FloatCC` conditions as they aren't needed by wasm at this time.
+
+;; Pulley doesn't have instructions for `>` and `>=`, so we have to reverse
+;; the operation.
+(rule (lower_vfcmp ty (FloatCC.GreaterThan) a b)
+      (lower_vfcmp ty (FloatCC.LessThan) b a))
+(rule (lower_vfcmp ty (FloatCC.GreaterThanOrEqual) a b)
+      (lower_vfcmp ty (FloatCC.LessThanOrEqual) b a))
+
 ;;;; Rules for `load` and friends ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (decl amode (Value Offset32) Amode)
@@ -1121,16 +1150,22 @@
 (rule (lower (has_type $F32 (fsub a b))) (pulley_fsub32 a b))
 (rule (lower (has_type $F64 (fsub a b))) (pulley_fsub64 a b))
+(rule (lower (has_type $F32X4 (fsub a b))) (pulley_vsubf32x4 a b))
+(rule (lower (has_type $F64X2 (fsub a b))) (pulley_vsubf64x2 a b))
 
 ;;;; Rules for `fmul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule (lower (has_type $F32 (fmul a b))) (pulley_fmul32 a b))
 (rule (lower (has_type $F64 (fmul a b))) (pulley_fmul64 a b))
+(rule (lower (has_type $F32X4 (fmul a b))) (pulley_vmulf32x4 a b))
+(rule (lower (has_type $F64X2 (fmul a b))) (pulley_vmulf64x2 a b))
 
 ;;;; Rules for `fdiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule (lower (has_type $F32 (fdiv a b))) (pulley_fdiv32 a b))
 (rule (lower (has_type $F64 (fdiv a b))) (pulley_fdiv64 a b))
+(rule (lower (has_type $F32X4 (fdiv a b))) (pulley_vdivf32x4 a b))
+(rule (lower (has_type $F64X2 (fdiv a b))) (pulley_vdivf64x2 a b))
 
 ;;;; Rules for `fmax` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -1161,6 +1196,7 @@
       (pulley_vfloor32x4 a))
 (rule (lower (has_type $F64X2 (floor a)))
       (pulley_vfloor64x2 a))
+
 ;;;; Rules for `ceil` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule (lower (has_type $F32 (ceil a))) (pulley_fceil32 a))
@@ -1188,11 +1224,12 @@
 (rule (lower (has_type $F64X2 (sqrt a)))
       (pulley_vsqrt64x2 a))
 
-
 ;;;; Rules for `fneg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule (lower (has_type $F32 (fneg a))) (pulley_fneg32 a))
 (rule (lower (has_type $F64 (fneg a))) (pulley_fneg64 a))
+(rule (lower (has_type $F32X4 (fneg a))) (pulley_vnegf32x4 a))
+(rule (lower (has_type $F64X2 (fneg a))) (pulley_vnegf64x2 a))
 
 ;;;; Rules for `ineg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -1358,3 +1395,8 @@
   (pulley_vinsertf32 (pulley_vconst128 0) a 0))
 (rule (lower (scalar_to_vector a @ (value_type $F64)))
   (pulley_vinsertf64 (pulley_vconst128 0) a 0))
+
+;;;; Rules for `fma` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (has_type $F32X4 (fma a b c))) (pulley_vfma32x4 a b c))
+(rule (lower (has_type $F64X2 (fma a b c))) (pulley_vfma64x2 a b c))
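The `lower_vfcmp` fallback rules above lean on two IEEE 754 identities rather than dedicated opcodes. Here is a minimal single-lane sketch in Rust of what those rules compute; the function names are illustrative only and not part of Cranelift or Pulley:

```rust
/// NaN is the only value that compares unequal to itself, so
/// `unordered(a, b)` is exactly `(a != a) | (b != b)`; the ISLE rule
/// builds this from two `vneq` results OR'd together with `vbor128`.
fn lane_unordered(a: f32, b: f32) -> u32 {
    let neq = |x: f32, y: f32| if x != y { u32::MAX } else { 0 };
    neq(a, a) | neq(b, b)
}

/// Pulley only provides `<` and `<=` vector comparisons, so `>` and `>=`
/// lower to the same instruction with the operands swapped:
/// `a > b` holds iff `b < a` (both are false when a NaN is involved).
fn lane_gt(a: f32, b: f32) -> u32 {
    if b < a { u32::MAX } else { 0 }
}

fn main() {
    assert_eq!(lane_unordered(1.0, f32::NAN), u32::MAX);
    assert_eq!(lane_unordered(1.0, 2.0), 0);
    assert_eq!(lane_gt(2.0, 1.0), u32::MAX);
    assert_eq!(lane_gt(f32::NAN, 1.0), 0); // comparisons with NaN are false
}
```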
diff --git a/cranelift/filetests/filetests/runtests/simd-fadd-splat.clif b/cranelift/filetests/filetests/runtests/simd-fadd-splat.clif
index e018fd0fd7a8..c8d3035093ac 100644
--- a/cranelift/filetests/filetests/runtests/simd-fadd-splat.clif
+++ b/cranelift/filetests/filetests/runtests/simd-fadd-splat.clif
@@ -8,6 +8,10 @@ target x86_64 sse42 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %splat_f32x4_2(f32x4) -> f32x4 {
 block0(v0: f32x4):
diff --git a/cranelift/filetests/filetests/runtests/simd-fadd.clif b/cranelift/filetests/filetests/runtests/simd-fadd.clif
index 402ee9e44f08..6ca2c6c5779e 100644
--- a/cranelift/filetests/filetests/runtests/simd-fadd.clif
+++ b/cranelift/filetests/filetests/runtests/simd-fadd.clif
@@ -9,6 +9,10 @@ target x86_64 sse42 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 
 function %fadd_f32x4(f32x4, f32x4) -> f32x4 {
diff --git a/cranelift/filetests/filetests/runtests/simd-fcmp-eq.clif b/cranelift/filetests/filetests/runtests/simd-fcmp-eq.clif
index 378b5f273069..31c86ca33f35 100644
--- a/cranelift/filetests/filetests/runtests/simd-fcmp-eq.clif
+++ b/cranelift/filetests/filetests/runtests/simd-fcmp-eq.clif
@@ -6,6 +6,10 @@ target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %simd_fcmp_eq_f32(f32x4, f32x4) -> i32x4 {
 block0(v0: f32x4, v1: f32x4):
diff --git a/cranelift/filetests/filetests/runtests/simd-fcmp-ge.clif b/cranelift/filetests/filetests/runtests/simd-fcmp-ge.clif
index b9addbfaadaf..8e7c0e3354bb 100644
--- a/cranelift/filetests/filetests/runtests/simd-fcmp-ge.clif
+++ b/cranelift/filetests/filetests/runtests/simd-fcmp-ge.clif
@@ -6,6 +6,10 @@ target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %simd_fcmp_ge_f32(f32x4, f32x4) -> i32x4 {
 block0(v0: f32x4, v1: f32x4):
diff --git a/cranelift/filetests/filetests/runtests/simd-fcmp-gt.clif b/cranelift/filetests/filetests/runtests/simd-fcmp-gt.clif
index 25bf525ddda8..947feca07239 100644
--- a/cranelift/filetests/filetests/runtests/simd-fcmp-gt.clif
+++ b/cranelift/filetests/filetests/runtests/simd-fcmp-gt.clif
@@ -6,6 +6,10 @@ target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %simd_fcmp_gt_f32(f32x4, f32x4) -> i32x4 {
 block0(v0: f32x4, v1: f32x4):
diff --git a/cranelift/filetests/filetests/runtests/simd-fcmp-le.clif b/cranelift/filetests/filetests/runtests/simd-fcmp-le.clif
index e1ec0e911c25..9e498c42518f 100644
--- a/cranelift/filetests/filetests/runtests/simd-fcmp-le.clif
+++ b/cranelift/filetests/filetests/runtests/simd-fcmp-le.clif
@@ -6,6 +6,10 @@ target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %simd_fcmp_le_f32(f32x4, f32x4) -> i32x4 {
 block0(v0: f32x4, v1: f32x4):
diff --git a/cranelift/filetests/filetests/runtests/simd-fcmp-lt.clif b/cranelift/filetests/filetests/runtests/simd-fcmp-lt.clif
index 0a3fd948825f..0a5c22fc1755 100644
--- a/cranelift/filetests/filetests/runtests/simd-fcmp-lt.clif
+++ b/cranelift/filetests/filetests/runtests/simd-fcmp-lt.clif
@@ -6,6 +6,10 @@ target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %simd_fcmp_lt_f32(f32x4, f32x4) -> i32x4 {
 block0(v0: f32x4, v1: f32x4):
diff --git a/cranelift/filetests/filetests/runtests/simd-fcmp-ne.clif b/cranelift/filetests/filetests/runtests/simd-fcmp-ne.clif
index 7920996d0357..f9fd58bf54fd 100644
--- a/cranelift/filetests/filetests/runtests/simd-fcmp-ne.clif
+++ b/cranelift/filetests/filetests/runtests/simd-fcmp-ne.clif
@@ -6,6 +6,10 @@ target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %simd_fcmp_ne_f32(f32x4, f32x4) -> i32x4 {
 block0(v0: f32x4, v1: f32x4):
diff --git a/cranelift/filetests/filetests/runtests/simd-fcmp-uno.clif b/cranelift/filetests/filetests/runtests/simd-fcmp-uno.clif
index 38886bf1bb09..0fb43c749056 100644
--- a/cranelift/filetests/filetests/runtests/simd-fcmp-uno.clif
+++ b/cranelift/filetests/filetests/runtests/simd-fcmp-uno.clif
@@ -6,6 +6,10 @@ target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %simd_fcmp_uno_f32(f32x4, f32x4) -> i32x4 {
 block0(v0: f32x4, v1: f32x4):
diff --git a/cranelift/filetests/filetests/runtests/simd-fdiv.clif b/cranelift/filetests/filetests/runtests/simd-fdiv.clif
index b6707dc86be3..d491e86ab98c 100644
--- a/cranelift/filetests/filetests/runtests/simd-fdiv.clif
+++ b/cranelift/filetests/filetests/runtests/simd-fdiv.clif
@@ -9,6 +9,10 @@ target x86_64 sse42 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 
 function %fdiv_f32x4(f32x4, f32x4) -> f32x4 {
diff --git a/cranelift/filetests/filetests/runtests/simd-floor.clif b/cranelift/filetests/filetests/runtests/simd-floor.clif
index c6e59c9888c8..494cd229d07e 100644
--- a/cranelift/filetests/filetests/runtests/simd-floor.clif
+++ b/cranelift/filetests/filetests/runtests/simd-floor.clif
@@ -9,6 +9,10 @@ target s390x
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %floor_f32x4(f32x4) -> f32x4 {
 block0(v0: f32x4):
diff --git a/cranelift/filetests/filetests/runtests/simd-fma-neg.clif b/cranelift/filetests/filetests/runtests/simd-fma-neg.clif
index 1351ef34d091..cd3caabb72dd 100644
--- a/cranelift/filetests/filetests/runtests/simd-fma-neg.clif
+++ b/cranelift/filetests/filetests/runtests/simd-fma-neg.clif
@@ -5,6 +5,10 @@ target aarch64
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 ;; This file is not enabled in the interpreter since SIMD fneg is currently broken
 ;; there.
diff --git a/cranelift/filetests/filetests/runtests/simd-fma.clif b/cranelift/filetests/filetests/runtests/simd-fma.clif
index c3f143987047..91e9c270223b 100644
--- a/cranelift/filetests/filetests/runtests/simd-fma.clif
+++ b/cranelift/filetests/filetests/runtests/simd-fma.clif
@@ -6,6 +6,10 @@ target aarch64
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %fma_f32x4(f32x4, f32x4, f32x4) -> f32x4 {
 block0(v0: f32x4, v1: f32x4, v2: f32x4):
diff --git a/cranelift/filetests/filetests/runtests/simd-fmin-max-pseudo.clif b/cranelift/filetests/filetests/runtests/simd-fmin-max-pseudo.clif
index f8d537301f29..9fd071e5e4dd 100644
--- a/cranelift/filetests/filetests/runtests/simd-fmin-max-pseudo.clif
+++ b/cranelift/filetests/filetests/runtests/simd-fmin-max-pseudo.clif
@@ -6,6 +6,10 @@ target x86_64 skylake
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %fmin_pseudo_f32x4(f32x4, f32x4) -> f32x4 {
 block0(v0:f32x4, v1:f32x4):
diff --git a/cranelift/filetests/filetests/runtests/simd-fmul.clif b/cranelift/filetests/filetests/runtests/simd-fmul.clif
index 9febf85eead8..cca72e1beda7 100644
--- a/cranelift/filetests/filetests/runtests/simd-fmul.clif
+++ b/cranelift/filetests/filetests/runtests/simd-fmul.clif
@@ -8,6 +8,10 @@ target x86_64 sse42 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 
 function %fmul_f32x4(f32x4, f32x4) -> f32x4 {
diff --git a/cranelift/filetests/filetests/runtests/simd-fneg.clif b/cranelift/filetests/filetests/runtests/simd-fneg.clif
index 7b56dee100eb..6703e5281159 100644
--- a/cranelift/filetests/filetests/runtests/simd-fneg.clif
+++ b/cranelift/filetests/filetests/runtests/simd-fneg.clif
@@ -9,6 +9,10 @@ target x86_64 sse42 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %fneg_f32x4(f32x4) -> f32x4 {
 block0(v0: f32x4):
diff --git a/cranelift/filetests/filetests/runtests/simd-fsub.clif b/cranelift/filetests/filetests/runtests/simd-fsub.clif
index 0322ec2ebf49..9eadc2f38466 100644
--- a/cranelift/filetests/filetests/runtests/simd-fsub.clif
+++ b/cranelift/filetests/filetests/runtests/simd-fsub.clif
@@ -8,6 +8,10 @@ target x86_64 sse42 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 
 function %fsub_f32x4(f32x4, f32x4) -> f32x4 {
diff --git a/crates/wast-util/src/lib.rs b/crates/wast-util/src/lib.rs
index c700cf6aea2d..4e2ae6863a7d 100644
--- a/crates/wast-util/src/lib.rs
+++ b/crates/wast-util/src/lib.rs
@@ -401,7 +401,6 @@ impl WastTest {
         // features in Pulley are implemented.
         if config.compiler == Compiler::CraneliftPulley {
             let unsupported = [
-                "misc_testsuite/simd/canonicalize-nan.wast",
                 "misc_testsuite/simd/issue_3327_bnot_lowering.wast",
                 "misc_testsuite/simd/v128-select.wast",
                 "spec_testsuite/proposals/annotations/simd_lane.wast",
@@ -409,19 +408,11 @@ impl WastTest {
                 "spec_testsuite/proposals/relaxed-simd/i32x4_relaxed_trunc.wast",
                 "spec_testsuite/proposals/relaxed-simd/i8x16_relaxed_swizzle.wast",
                 "spec_testsuite/proposals/relaxed-simd/relaxed_dot_product.wast",
-                "spec_testsuite/proposals/relaxed-simd/relaxed_madd_nmadd.wast",
                 "spec_testsuite/proposals/memory64/simd_lane.wast",
-                "spec_testsuite/proposals/memory64/relaxed_madd_nmadd.wast",
                 "spec_testsuite/proposals/memory64/relaxed_dot_product.wast",
                 "spec_testsuite/proposals/memory64/i16x8_relaxed_q15mulr_s.wast",
                 "spec_testsuite/proposals/memory64/i32x4_relaxed_trunc.wast",
                 "spec_testsuite/proposals/memory64/i8x16_relaxed_swizzle.wast",
-                "spec_testsuite/simd_f32x4_arith.wast",
-                "spec_testsuite/simd_f32x4_cmp.wast",
-                "spec_testsuite/simd_f32x4_pmin_pmax.wast",
-                "spec_testsuite/simd_f64x2_arith.wast",
-                "spec_testsuite/simd_f64x2_cmp.wast",
-                "spec_testsuite/simd_f64x2_pmin_pmax.wast",
                 "spec_testsuite/simd_i16x8_arith2.wast",
                 "spec_testsuite/simd_i16x8_extadd_pairwise_i8x16.wast",
                 "spec_testsuite/simd_i16x8_q15mulr_sat_s.wast",
diff --git a/pulley/src/interp.rs b/pulley/src/interp.rs
index d0e999ad8c2b..030ec16a98a5 100644
--- a/pulley/src/interp.rs
+++ b/pulley/src/interp.rs
@@ -3267,26 +3267,6 @@ impl ExtendedOpVisitor for Interpreter<'_> {
         ControlFlow::Continue(())
     }
 
-    fn vaddf32x4(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
-        let mut a = self.state[operands.src1].get_f32x4();
-        let b = self.state[operands.src2].get_f32x4();
-        for (a, b) in a.iter_mut().zip(b) {
-            *a += b;
-        }
-        self.state[operands.dst].set_f32x4(a);
-        ControlFlow::Continue(())
-    }
-
-    fn vaddf64x2(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
-        let mut a = self.state[operands.src1].get_f64x2();
-        let b = self.state[operands.src2].get_f64x2();
-        for (a, b) in a.iter_mut().zip(b) {
-            *a += b;
-        }
-        self.state[operands.dst].set_f64x2(a);
-        ControlFlow::Continue(())
-    }
-
     fn vshli8x16(&mut self, operands: BinaryOperands<VReg, VReg, XReg>) -> ControlFlow<Done> {
         let a = self.state[operands.src1].get_i8x16();
         let b = self.state[operands.src2].get_u32();
@@ -4282,4 +4262,206 @@ impl ExtendedOpVisitor for Interpreter<'_> {
         self.state[operands.dst].set_f64x2(a);
         ControlFlow::Continue(())
     }
+
+    fn vaddf32x4(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut a = self.state[operands.src1].get_f32x4();
+        let b = self.state[operands.src2].get_f32x4();
+        for (a, b) in a.iter_mut().zip(b) {
+            *a += b;
+        }
+        self.state[operands.dst].set_f32x4(a);
+        ControlFlow::Continue(())
+    }
+
+    fn vaddf64x2(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut a = self.state[operands.src1].get_f64x2();
+        let b = self.state[operands.src2].get_f64x2();
+        for (a, b) in a.iter_mut().zip(b) {
+            *a += b;
+        }
+        self.state[operands.dst].set_f64x2(a);
+        ControlFlow::Continue(())
+    }
+
+    fn vsubf32x4(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut a = self.state[operands.src1].get_f32x4();
+        let b = self.state[operands.src2].get_f32x4();
+        for (a, b) in a.iter_mut().zip(b) {
+            *a -= b;
+        }
+        self.state[operands.dst].set_f32x4(a);
+        ControlFlow::Continue(())
+    }
+
+    fn vsubf64x2(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut a = self.state[operands.src1].get_f64x2();
+        let b = self.state[operands.src2].get_f64x2();
+        for (a, b) in a.iter_mut().zip(b) {
+            *a -= b;
+        }
+        self.state[operands.dst].set_f64x2(a);
+        ControlFlow::Continue(())
+    }
+
+    fn vmulf32x4(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut a = self.state[operands.src1].get_f32x4();
+        let b = self.state[operands.src2].get_f32x4();
+        for (a, b) in a.iter_mut().zip(b) {
+            *a *= b;
+        }
+        self.state[operands.dst].set_f32x4(a);
+        ControlFlow::Continue(())
+    }
+
+    fn vmulf64x2(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut a = self.state[operands.src1].get_f64x2();
+        let b = self.state[operands.src2].get_f64x2();
+        for (a, b) in a.iter_mut().zip(b) {
+            *a *= b;
+        }
+        self.state[operands.dst].set_f64x2(a);
+        ControlFlow::Continue(())
+    }
+
+    fn vdivf32x4(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut a = self.state[operands.src1].get_f32x4();
+        let b = self.state[operands.src2].get_f32x4();
+        for (a, b) in a.iter_mut().zip(b) {
+            *a /= b;
+        }
+        self.state[operands.dst].set_f32x4(a);
+        ControlFlow::Continue(())
+    }
+
+    fn vdivf64x2(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut a = self.state[operands.src1].get_f64x2();
+        let b = self.state[operands.src2].get_f64x2();
+        for (a, b) in a.iter_mut().zip(b) {
+            *a /= b;
+        }
+        self.state[operands.dst].set_f64x2(a);
+        ControlFlow::Continue(())
+    }
+
+    fn vnegf32x4(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let a = self.state[src].get_f32x4();
+        self.state[dst].set_f32x4(a.map(|i| -i));
+        ControlFlow::Continue(())
+    }
+
+    fn vnegf64x2(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let a = self.state[src].get_f64x2();
+        self.state[dst].set_f64x2(a.map(|i| -i));
+        ControlFlow::Continue(())
+    }
+
+    fn veqf32x4(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_f32x4();
+        let b = self.state[operands.src2].get_f32x4();
+        let mut c = [0; 4];
+        for ((a, b), c) in a.iter().zip(&b).zip(&mut c) {
+            *c = if a == b { u32::MAX } else { 0 };
+        }
+        self.state[operands.dst].set_u32x4(c);
+        ControlFlow::Continue(())
+    }
+
+    fn vneqf32x4(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_f32x4();
+        let b = self.state[operands.src2].get_f32x4();
+        let mut c = [0; 4];
+        for ((a, b), c) in a.iter().zip(&b).zip(&mut c) {
+            *c = if a != b { u32::MAX } else { 0 };
+        }
+        self.state[operands.dst].set_u32x4(c);
+        ControlFlow::Continue(())
+    }
+
+    fn vltf32x4(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_f32x4();
+        let b = self.state[operands.src2].get_f32x4();
+        let mut c = [0; 4];
+        for ((a, b), c) in a.iter().zip(&b).zip(&mut c) {
+            *c = if a < b { u32::MAX } else { 0 };
+        }
+        self.state[operands.dst].set_u32x4(c);
+        ControlFlow::Continue(())
+    }
+
+    fn vlteqf32x4(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_f32x4();
+        let b = self.state[operands.src2].get_f32x4();
+        let mut c = [0; 4];
+        for ((a, b), c) in a.iter().zip(&b).zip(&mut c) {
+            *c = if a <= b { u32::MAX } else { 0 };
+        }
+        self.state[operands.dst].set_u32x4(c);
+        ControlFlow::Continue(())
+    }
+
+    fn veqf64x2(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_f64x2();
+        let b = self.state[operands.src2].get_f64x2();
+        let mut c = [0; 2];
+        for ((a, b), c) in a.iter().zip(&b).zip(&mut c) {
+            *c = if a == b { u64::MAX } else { 0 };
+        }
+        self.state[operands.dst].set_u64x2(c);
+        ControlFlow::Continue(())
+    }
+
+    fn vneqf64x2(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_f64x2();
+        let b = self.state[operands.src2].get_f64x2();
+        let mut c = [0; 2];
+        for ((a, b), c) in a.iter().zip(&b).zip(&mut c) {
+            *c = if a != b { u64::MAX } else { 0 };
+        }
+        self.state[operands.dst].set_u64x2(c);
+        ControlFlow::Continue(())
+    }
+
+    fn vltf64x2(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_f64x2();
+        let b = self.state[operands.src2].get_f64x2();
+        let mut c = [0; 2];
+        for ((a, b), c) in a.iter().zip(&b).zip(&mut c) {
+            *c = if a < b { u64::MAX } else { 0 };
+        }
+        self.state[operands.dst].set_u64x2(c);
+        ControlFlow::Continue(())
+    }
+
+    fn vlteqf64x2(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_f64x2();
+        let b = self.state[operands.src2].get_f64x2();
+        let mut c = [0; 2];
+        for ((a, b), c) in a.iter().zip(&b).zip(&mut c) {
+            *c = if a <= b { u64::MAX } else { 0 };
+        }
+        self.state[operands.dst].set_u64x2(c);
+        ControlFlow::Continue(())
+    }
+
+    fn vfma32x4(&mut self, dst: VReg, a: VReg, b: VReg, c: VReg) -> ControlFlow<Done> {
+        let mut a = self.state[a].get_f32x4();
+        let b = self.state[b].get_f32x4();
+        let c = self.state[c].get_f32x4();
+        for ((a, b), c) in a.iter_mut().zip(b).zip(c) {
+            *a = a.mul_add(b, c);
+        }
+        self.state[dst].set_f32x4(a);
+        ControlFlow::Continue(())
+    }
+
+    fn vfma64x2(&mut self, dst: VReg, a: VReg, b: VReg, c: VReg) -> ControlFlow<Done> {
+        let mut a = self.state[a].get_f64x2();
+        let b = self.state[b].get_f64x2();
+        let c = self.state[c].get_f64x2();
+        for ((a, b), c) in a.iter_mut().zip(b).zip(c) {
+            *a = a.mul_add(b, c);
+        }
+        self.state[dst].set_f64x2(a);
+        ControlFlow::Continue(())
+    }
 }
diff --git a/pulley/src/lib.rs b/pulley/src/lib.rs
index f83a24606e8d..842072a312fb 100644
--- a/pulley/src/lib.rs
+++ b/pulley/src/lib.rs
@@ -892,10 +892,6 @@ macro_rules! for_each_extended_op {
             vaddi32x4 = VAddI32x4 { operands: BinaryOperands<VReg> };
             /// `dst = src1 + src2`
             vaddi64x2 = VAddI64x2 { operands: BinaryOperands<VReg> };
-            /// `dst = src1 + src2`
-            vaddf32x4 = VAddF32x4 { operands: BinaryOperands<VReg> };
-            /// `dst = src1 + src2`
-            vaddf64x2 = VAddF64x2 { operands: BinaryOperands<VReg> };
 
             /// `dst = src1 << src2`
             vshli8x16 = VShlI8x16 { operands: BinaryOperands<VReg, VReg, XReg> };
@@ -1165,6 +1161,49 @@ macro_rules! for_each_extended_op {
             vminimumf32x4 = Vminimumf32x4 { operands: BinaryOperands<VReg> };
             /// `dst = ieee_minimum(src1, src2)`
             vminimumf64x2 = Vminimumf64x2 { operands: BinaryOperands<VReg> };
+
+            /// `dst = src1 + src2`
+            vaddf32x4 = VAddF32x4 { operands: BinaryOperands<VReg> };
+            /// `dst = src1 + src2`
+            vaddf64x2 = VAddF64x2 { operands: BinaryOperands<VReg> };
+            /// `dst = src1 - src2`
+            vsubf32x4 = VSubF32x4 { operands: BinaryOperands<VReg> };
+            /// `dst = src1 - src2`
+            vsubf64x2 = VSubF64x2 { operands: BinaryOperands<VReg> };
+            /// `dst = src1 * src2`
+            vmulf32x4 = VMulF32x4 { operands: BinaryOperands<VReg> };
+            /// `dst = src1 * src2`
+            vmulf64x2 = VMulF64x2 { operands: BinaryOperands<VReg> };
+            /// `dst = src1 / src2`
+            vdivf32x4 = VDivF32x4 { operands: BinaryOperands<VReg> };
+            /// `dst = src1 / src2`
+            vdivf64x2 = VDivF64x2 { operands: BinaryOperands<VReg> };
+            /// `dst = -src`
+            vnegf32x4 = Vnegf32x4 { dst: VReg, src: VReg };
+            /// `dst = -src`
+            vnegf64x2 = Vnegf64x2 { dst: VReg, src: VReg };
+
+            /// `dst = src1 == src2`
+            veqf32x4 = VeqF32x4 { operands: BinaryOperands<VReg> };
+            /// `dst = src1 != src2`
+            vneqf32x4 = VneqF32x4 { operands: BinaryOperands<VReg> };
+            /// `dst = src1 < src2`
+            vltf32x4 = VltF32x4 { operands: BinaryOperands<VReg> };
+            /// `dst = src1 <= src2`
+            vlteqf32x4 = VlteqF32x4 { operands: BinaryOperands<VReg> };
+            /// `dst = src1 == src2`
+            veqf64x2 = VeqF64x2 { operands: BinaryOperands<VReg> };
+            /// `dst = src1 != src2`
+            vneqf64x2 = VneqF64x2 { operands: BinaryOperands<VReg> };
+            /// `dst = src1 < src2`
+            vltf64x2 = VltF64x2 { operands: BinaryOperands<VReg> };
+            /// `dst = src1 <= src2`
+            vlteqf64x2 = VlteqF64x2 { operands: BinaryOperands<VReg> };
+
+            /// `dst = ieee_fma(a, b, c)`
+            vfma32x4 = Vfma32x4 { dst: VReg, a: VReg, b: VReg, c: VReg };
+            /// `dst = ieee_fma(a, b, c)`
+            vfma64x2 = Vfma64x2 { dst: VReg, a: VReg, b: VReg, c: VReg };
         }
     };
 }
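The new interpreter handlers all follow the same lane-wise shape. Below is a minimal standalone sketch of that pattern, using plain arrays in place of Pulley's register state; the `_sketch` function names are illustrative only, not part of the `pulley` crate. Note that `f32::mul_add` computes `a * b + c` with a single rounding, matching the `ieee_fma` semantics in the opcode docs above, which is presumably what allows `relaxed_madd_nmadd.wast` to come off the unsupported list in `wast-util`.

```rust
/// Lane-wise `==` over four f32 lanes, producing an all-ones or all-zeros
/// mask per lane, mirroring `veqf32x4` above. NaN lanes compare unequal,
/// so they always yield 0.
fn veq_f32x4_sketch(a: [f32; 4], b: [f32; 4]) -> [u32; 4] {
    let mut c = [0; 4];
    for ((a, b), c) in a.iter().zip(&b).zip(&mut c) {
        *c = if a == b { u32::MAX } else { 0 };
    }
    c
}

/// Lane-wise fused multiply-add mirroring `vfma32x4` above:
/// `a.mul_add(b, c)` is `a * b + c` with only one rounding step.
fn vfma_f32x4_sketch(mut a: [f32; 4], b: [f32; 4], c: [f32; 4]) -> [f32; 4] {
    for ((a, b), c) in a.iter_mut().zip(b).zip(c) {
        *a = a.mul_add(b, c);
    }
    a
}

fn main() {
    let mask = veq_f32x4_sketch([1.0, 2.0, 3.0, f32::NAN], [1.0, 0.0, 3.0, f32::NAN]);
    assert_eq!(mask, [u32::MAX, 0, u32::MAX, 0]);
    assert_eq!(vfma_f32x4_sketch([2.0; 4], [3.0; 4], [1.0; 4]), [7.0; 4]);
}
```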