Skip to content

Commit

Permalink
pulley: Initial scaffold of SIMD support (#9820)
Browse files Browse the repository at this point in the history
* pulley: Initial scaffold of SIMD support

This commit fills out some of the initial infrastructure necessary for
supporting the SIMD proposal to WebAssembly in the Pulley interpreter,
namely 128-bit SIMD. The `VRegVal` union has been filled out with
various types, endianness questions are settled, and initial
implementations of a suite of opcodes are added to get a basic set of
tests working throughout the backend.

cc #9783

* Avoid dealing with big-endian vectors

* Change wasm `global`s to store `v128` in little-endian format.
* Change pulley stack loads/stores to work with vectors in little-endian
  format.
  • Loading branch information
alexcrichton authored Dec 14, 2024
1 parent 9fd2b3a commit 128decd
Show file tree
Hide file tree
Showing 16 changed files with 518 additions and 77 deletions.
60 changes: 44 additions & 16 deletions cranelift/codegen/meta/src/pulley.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,19 +27,47 @@ const OPS: &[Inst<'_>] = pulley_interpreter::for_each_op!(define);
const EXTENDED_OPS: &[Inst<'_>] = pulley_interpreter::for_each_extended_op!(define);

enum Operand<'a> {
Normal { name: &'a str, ty: &'a str },
Writable { name: &'a str, ty: &'a str },
TrapCode { name: &'a str, ty: &'a str },
Binop { reg: &'a str },
Normal {
name: &'a str,
ty: &'a str,
},
Writable {
name: &'a str,
ty: &'a str,
},
TrapCode {
name: &'a str,
ty: &'a str,
},
Binop {
dst: &'a str,
src1: &'a str,
src2: &'a str,
},
}

impl Inst<'_> {
fn operands(&self) -> impl Iterator<Item = Operand<'_>> {
self.fields
.iter()
.map(|(name, ty)| match (*name, *ty) {
("operands", "BinaryOperands < XReg >") => Operand::Binop { reg: "XReg" },
("operands", "BinaryOperands < FReg >") => Operand::Binop { reg: "FReg" },
("operands", binop) => {
// Parse `BinaryOperands < A >` as A/A/A
// Parse `BinaryOperands < A, B >` as A/B/A
// Parse `BinaryOperands < A, B, C >` as A/B/C
let mut parts = binop
.strip_prefix("BinaryOperands <")
.unwrap()
.strip_suffix(">")
.unwrap()
.trim()
.split(',')
.map(|x| x.trim());
let dst = parts.next().unwrap();
let src1 = parts.next().unwrap_or(dst);
let src2 = parts.next().unwrap_or(dst);
Operand::Binop { dst, src1, src2 }
}
("dst", ty) => Operand::Writable { name, ty },
(name, ty) => Operand::Normal { name, ty },
})
Expand Down Expand Up @@ -109,7 +137,7 @@ pub fn generate_rust(filename: &str, out_dir: &Path) -> Result<(), Error> {
pat.push_str(",");
format_string.push_str(&format!(" // trap={{{name}:?}}"));
}
Operand::Binop { reg: _ } => {
Operand::Binop { .. } => {
pat.push_str("dst, src1, src2,");
format_string.push_str(" {dst}, {src1}, {src2}");
locals.push_str(&format!("let dst = reg_name(*dst.to_reg());\n"));
Expand Down Expand Up @@ -161,7 +189,7 @@ pub fn generate_rust(filename: &str, out_dir: &Path) -> Result<(), Error> {
}
}
Operand::TrapCode { .. } => {}
Operand::Binop { reg: _ } => {
Operand::Binop { .. } => {
pat.push_str("dst, src1, src2,");
uses.push("src1");
uses.push("src2");
Expand Down Expand Up @@ -221,7 +249,7 @@ pub fn generate_rust(filename: &str, out_dir: &Path) -> Result<(), Error> {
pat.push_str(",");
trap.push_str(&format!("sink.add_trap({name});\n"));
}
Operand::Binop { reg: _ } => {
Operand::Binop { .. } => {
pat.push_str("dst, src1, src2,");
args.push_str(
"pulley_interpreter::regs::BinaryOperands::new(dst, src1, src2),",
Expand Down Expand Up @@ -265,10 +293,10 @@ pub fn generate_isle(filename: &str, out_dir: &Path) -> Result<(), Error> {
Operand::Writable { name, ty } => {
isle.push_str(&format!("\n ({name} Writable{ty})"));
}
Operand::Binop { reg } => {
isle.push_str(&format!("\n (dst Writable{reg})"));
isle.push_str(&format!("\n (src1 {reg})"));
isle.push_str(&format!("\n (src2 {reg})"));
Operand::Binop { dst, src1, src2 } => {
isle.push_str(&format!("\n (dst Writable{dst})"));
isle.push_str(&format!("\n (src1 {src1})"));
isle.push_str(&format!("\n (src2 {src2})"));
}
}
}
Expand Down Expand Up @@ -303,13 +331,13 @@ pub fn generate_isle(filename: &str, out_dir: &Path) -> Result<(), Error> {
assert!(result.is_none(), "{} has >1 result", inst.snake_name);
result = Some(ty);
}
Operand::Binop { reg } => {
isle.push_str(&format!("{reg} {reg}"));
Operand::Binop { dst, src1, src2 } => {
isle.push_str(&format!("{src1} {src2}"));
rule.push_str("src1 src2");
ops.push("src1");
ops.push("src2");
assert!(result.is_none(), "{} has >1 result", inst.snake_name);
result = Some(reg);
result = Some(dst);
}
}
isle.push_str(" ");
Expand Down
37 changes: 25 additions & 12 deletions cranelift/codegen/src/isa/pulley_shared/abi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -160,11 +160,23 @@ where
}

fn gen_load_stack(mem: StackAMode, into_reg: Writable<Reg>, ty: Type) -> Self::I {
Inst::gen_load(into_reg, mem.into(), ty, MemFlags::trusted()).into()
let mut flags = MemFlags::trusted();
// Stack loads/stores of vectors always use little-endianness to avoid
// implementing a byte-swap of vectors on big-endian platforms.
if ty.is_vector() {
flags.set_endianness(ir::Endianness::Little);
}
Inst::gen_load(into_reg, mem.into(), ty, flags).into()
}

fn gen_store_stack(mem: StackAMode, from_reg: Reg, ty: Type) -> Self::I {
Inst::gen_store(mem.into(), from_reg, ty, MemFlags::trusted()).into()
let mut flags = MemFlags::trusted();
// Stack loads/stores of vectors always use little-endianness to avoid
// implementing a byte-swap of vectors on big-endian platforms.
if ty.is_vector() {
flags.set_endianness(ir::Endianness::Little);
}
Inst::gen_store(mem.into(), from_reg, ty, flags).into()
}

fn gen_move(to_reg: Writable<Reg>, from_reg: Reg, ty: Type) -> Self::I {
Expand Down Expand Up @@ -510,17 +522,18 @@ where
_target_vector_bytes: u32,
_isa_flags: &PulleyFlags,
) -> u32 {
// Spill slots are the size of a "word" or a pointer, but Pulley
// registers are 8-byte for integers/floats regardless of pointer size.
// Calculate the number of slots necessary to store 8 bytes.
let slots_for_8bytes = match P::pointer_width() {
PointerWidth::PointerWidth32 => 2,
PointerWidth::PointerWidth64 => 1,
};
match rc {
// Spilling an integer or float register requires spilling 8 bytes,
// and spill slots are defined in terms of "word bytes" or the size
// of a pointer. That means on 32-bit pulley we need to take up two
// spill slots where on 64-bit pulley we need to only take up one
// spill slot for integers.
RegClass::Int | RegClass::Float => match P::pointer_width() {
PointerWidth::PointerWidth32 => 2,
PointerWidth::PointerWidth64 => 1,
},
RegClass::Vector => unreachable!(),
// Int/float registers are 8-bytes
RegClass::Int | RegClass::Float => slots_for_8bytes,
// Vector registers are 16 bytes
RegClass::Vector => 2 * slots_for_8bytes,
}
}

Expand Down
10 changes: 10 additions & 0 deletions cranelift/codegen/src/isa/pulley_shared/inst.isle
Original file line number Diff line number Diff line change
Expand Up @@ -414,6 +414,16 @@
(rule (pulley_fstore amode src ty flags)
(SideEffectNoResult.Inst (MInst.FStore amode src ty flags)))

;; Helper for emitting a vector load: allocates a fresh writable vector
;; register, emits `MInst.VLoad` into it with the given addressing mode, type,
;; and memory flags, and returns that register.
(decl pulley_vload (Amode Type MemFlags) VReg)
(rule (pulley_vload amode ty flags)
(let ((dst WritableVReg (temp_writable_vreg))
(_ Unit (emit (MInst.VLoad dst amode ty flags))))
dst))

;; Helper for building a vector store: wraps `MInst.VStore` of `src` to the
;; given addressing mode in a `SideEffectNoResult` for the caller to emit.
(decl pulley_vstore (Amode VReg Type MemFlags) SideEffectNoResult)
(rule (pulley_vstore amode src ty flags)
(SideEffectNoResult.Inst (MInst.VStore amode src ty flags)))

(decl gen_br_table (XReg MachLabel BoxVecMachLabel) Unit)
(rule (gen_br_table idx default labels)
(emit (MInst.BrTable idx default labels)))
Expand Down
14 changes: 2 additions & 12 deletions cranelift/codegen/src/isa/pulley_shared/inst/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -453,18 +453,8 @@ where
}

fn worst_case_size() -> CodeOffset {
// `BrIfXeq32 { a, b, taken, not_taken }` expands to `br_if_xeq32 a, b, taken; jump not_taken`.
//
// The first instruction is seven bytes long:
// * 1 byte opcode
// * 1 byte `a` register encoding
// * 1 byte `b` register encoding
// * 4 byte `taken` displacement
//
// And the second instruction is five bytes long:
// * 1 byte opcode
// * 4 byte `not_taken` displacement
12
// `Vconst128 { dst, imm }` is 18 bytes (opcode + dst + 16-byte imm)
18
}

fn ref_type_regclass(_settings: &settings::Flags) -> RegClass {
Expand Down
35 changes: 35 additions & 0 deletions cranelift/codegen/src/isa/pulley_shared/lower.isle
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,11 @@
(rule (lower (has_type $I64 (iadd a b)))
(pulley_xadd64 a b))

(rule (lower (has_type $I8X16 (iadd a b))) (pulley_vaddi8x16 a b))
(rule (lower (has_type $I16X8 (iadd a b))) (pulley_vaddi16x8 a b))
(rule (lower (has_type $I32X4 (iadd a b))) (pulley_vaddi32x4 a b))
(rule (lower (has_type $I64X2 (iadd a b))) (pulley_vaddi64x2 a b))

;;;; Rules for `isub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $I8 (isub a b)))
Expand Down Expand Up @@ -192,6 +197,11 @@
(rule (lower (has_type $I64 (ishl a b)))
(pulley_xshl64 a b))

(rule (lower (has_type $I8X16 (ishl a b))) (pulley_vshli8x16 a b))
(rule (lower (has_type $I16X8 (ishl a b))) (pulley_vshli16x8 a b))
(rule (lower (has_type $I32X4 (ishl a b))) (pulley_vshli32x4 a b))
(rule (lower (has_type $I64X2 (ishl a b))) (pulley_vshli64x2 a b))

;;;; Rules for `ushr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $I32 (ushr a b)))
Expand All @@ -200,6 +210,11 @@
(rule (lower (has_type $I64 (ushr a b)))
(pulley_xshr64_u a b))

(rule (lower (has_type $I8X16 (ushr a b))) (pulley_vshri8x16_u a b))
(rule (lower (has_type $I16X8 (ushr a b))) (pulley_vshri16x8_u a b))
(rule (lower (has_type $I32X4 (ushr a b))) (pulley_vshri32x4_u a b))
(rule (lower (has_type $I64X2 (ushr a b))) (pulley_vshri64x2_u a b))

;;;; Rules for `sshr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $I32 (sshr a b)))
Expand All @@ -208,6 +223,11 @@
(rule (lower (has_type $I64 (sshr a b)))
(pulley_xshr64_s a b))

(rule (lower (has_type $I8X16 (sshr a b))) (pulley_vshri8x16_s a b))
(rule (lower (has_type $I16X8 (sshr a b))) (pulley_vshri16x8_s a b))
(rule (lower (has_type $I32X4 (sshr a b))) (pulley_vshri32x4_s a b))
(rule (lower (has_type $I64X2 (sshr a b))) (pulley_vshri64x2_s a b))

;;;; Rules for `band` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule 0 (lower (has_type (fits_in_32 _) (band a b)))
Expand Down Expand Up @@ -414,6 +434,9 @@
(rule 1 (lower (has_type $I64 (sload32 flags addr offset)))
(pulley_xload (amode addr offset) $I32 flags (ExtKind.Sign64)))

(rule 2 (lower (has_type (ty_vec128 ty) (load flags addr offset)))
(pulley_vload (amode addr offset) ty flags))

;;;; Rules for `store` and friends ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (store flags src @ (value_type (ty_int ty)) addr offset))
Expand All @@ -431,6 +454,9 @@
(rule (lower (istore32 flags src addr offset))
(side_effect (pulley_xstore (amode addr offset) src $I32 flags)))

(rule 2 (lower (store flags src @ (value_type (ty_vec128 ty)) addr offset))
(side_effect (pulley_vstore (amode addr offset) src ty flags)))

;;;; Rules for `stack_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (stack_addr stack_slot offset))
Expand Down Expand Up @@ -522,6 +548,9 @@
(rule (lower (has_type $I64 (bitcast _flags val @ (value_type $F64))))
(pulley_bitcast_int_from_float_64 val))

(rule 1 (lower (has_type (ty_vec128 _) (bitcast _flags val @ (value_type (ty_vec128 _)))))
val)

;;;; Rules for `fcvt_to_{u,s}int` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $I32 (fcvt_to_uint val @ (value_type $F32))))
Expand Down Expand Up @@ -622,6 +651,8 @@

(rule (lower (has_type $F32 (fadd a b))) (pulley_fadd32 a b))
(rule (lower (has_type $F64 (fadd a b))) (pulley_fadd64 a b))
(rule (lower (has_type $F32X4 (fadd a b))) (pulley_vaddf32x4 a b))
(rule (lower (has_type $F64X2 (fadd a b))) (pulley_vaddf64x2 a b))

;;;; Rules for `fsub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

Expand Down Expand Up @@ -687,3 +718,7 @@

(rule (lower (has_type $F32 (fabs a))) (pulley_fabs32 a))
(rule (lower (has_type $F64 (fabs a))) (pulley_fabs64 a))

;;;; Rules for `vconst` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type (ty_vec128 _) (vconst (u128_from_constant a)))) (pulley_vconst128 a))
10 changes: 10 additions & 0 deletions crates/cranelift/src/translate/code_translator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,12 @@ pub fn translate_operator(
GlobalVariable::Memory { gv, offset, ty } => {
let addr = builder.ins().global_value(environ.pointer_type(), gv);
let mut flags = ir::MemFlags::trusted();
// Store vector globals in little-endian format to avoid
// byte swaps on big-endian platforms since at-rest vectors
// should already be in little-endian format anyway.
if ty.is_vector() {
flags.set_endianness(ir::Endianness::Little);
}
// Put globals in the "table" abstract heap category as well.
flags.set_alias_region(Some(ir::AliasRegion::Table));
builder.ins().load(ty, flags, addr, offset)
Expand All @@ -191,6 +197,10 @@ pub fn translate_operator(
GlobalVariable::Memory { gv, offset, ty } => {
let addr = builder.ins().global_value(environ.pointer_type(), gv);
let mut flags = ir::MemFlags::trusted();
// Like `global.get`, vector globals are stored in little-endian format.
if ty.is_vector() {
flags.set_endianness(ir::Endianness::Little);
}
// Put globals in the "table" abstract heap category as well.
flags.set_alias_region(Some(ir::AliasRegion::Table));
let mut val = state.pop1();
Expand Down
4 changes: 2 additions & 2 deletions crates/wasmtime/src/runtime/externals/global.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ impl Global {
ValType::I64 => Val::from(*definition.as_i64()),
ValType::F32 => Val::F32(*definition.as_u32()),
ValType::F64 => Val::F64(*definition.as_u64()),
ValType::V128 => Val::V128((*definition.as_u128()).into()),
ValType::V128 => Val::V128(definition.get_u128().into()),
ValType::Ref(ref_ty) => {
let reference: Ref = match ref_ty.heap_type() {
HeapType::Func | HeapType::ConcreteFunc(_) => {
Expand Down Expand Up @@ -187,7 +187,7 @@ impl Global {
Val::I64(i) => *definition.as_i64_mut() = i,
Val::F32(f) => *definition.as_u32_mut() = f,
Val::F64(f) => *definition.as_u64_mut() = f,
Val::V128(i) => *definition.as_u128_mut() = i.into(),
Val::V128(i) => definition.set_u128(i.into()),
Val::FuncRef(f) => {
*definition.as_func_ref_mut() = f.map_or(ptr::null_mut(), |f| {
f.vm_func_ref(&mut store).as_ptr().cast()
Expand Down
2 changes: 1 addition & 1 deletion crates/wasmtime/src/runtime/trampoline/global.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ pub fn generate_global_export(
Val::I64(x) => *global.as_i64_mut() = x,
Val::F32(x) => *global.as_f32_bits_mut() = x,
Val::F64(x) => *global.as_f64_bits_mut() = x,
Val::V128(x) => *global.as_u128_mut() = x.into(),
Val::V128(x) => global.set_u128(x.into()),
Val::FuncRef(f) => {
*global.as_func_ref_mut() =
f.map_or(ptr::null_mut(), |f| f.vm_func_ref(&mut store).as_ptr());
Expand Down
22 changes: 14 additions & 8 deletions crates/wasmtime/src/runtime/vm/vmcontext.rs
Original file line number Diff line number Diff line change
Expand Up @@ -447,7 +447,7 @@ impl VMGlobalDefinition {
WasmValType::I64 => *global.as_i64_mut() = raw.get_i64(),
WasmValType::F32 => *global.as_f32_bits_mut() = raw.get_f32(),
WasmValType::F64 => *global.as_f64_bits_mut() = raw.get_f64(),
WasmValType::V128 => *global.as_u128_mut() = raw.get_v128(),
WasmValType::V128 => global.set_u128(raw.get_v128()),
WasmValType::Ref(r) => match r.heap_type.top() {
WasmHeapTopType::Extern => {
let r = VMGcRef::from_raw_u32(raw.get_externref());
Expand Down Expand Up @@ -478,7 +478,7 @@ impl VMGlobalDefinition {
WasmValType::I64 => ValRaw::i64(*self.as_i64()),
WasmValType::F32 => ValRaw::f32(*self.as_f32_bits()),
WasmValType::F64 => ValRaw::f64(*self.as_f64_bits()),
WasmValType::V128 => ValRaw::v128(*self.as_u128()),
WasmValType::V128 => ValRaw::v128(self.get_u128()),
WasmValType::Ref(r) => match r.heap_type.top() {
WasmHeapTopType::Extern => ValRaw::externref(match self.as_gc_ref() {
Some(r) => store.gc_store_mut()?.clone_gc_ref(r).as_raw_u32(),
Expand Down Expand Up @@ -575,14 +575,20 @@ impl VMGlobalDefinition {
&mut *(self.storage.as_mut().as_mut_ptr().cast::<u64>())
}

/// Return a reference to the value as an u128.
pub unsafe fn as_u128(&self) -> &u128 {
&*(self.storage.as_ref().as_ptr().cast::<u128>())
/// Reads this global's value as a 128-bit vector.
//
// Vectors are kept in little-endian byte order in the global's storage,
// whereas the other value types are kept in native-endian order, so the raw
// bits are converted with `from_le` on the way out.
pub unsafe fn get_u128(&self) -> u128 {
    let slot = self.storage.as_ref().as_ptr().cast::<u128>();
    u128::from_le(*slot)
}

/// Return a mutable reference to the value as an u128.
pub unsafe fn as_u128_mut(&mut self) -> &mut u128 {
&mut *(self.storage.as_mut().as_mut_ptr().cast::<u128>())
/// Overwrites this global's value with the given 128-bit vector.
//
// Vectors are kept in little-endian byte order in the global's storage,
// whereas the other value types are kept in native-endian order, so `val`
// is converted with `to_le` before being written.
pub unsafe fn set_u128(&mut self, val: u128) {
    let slot = self.storage.as_mut().as_mut_ptr().cast::<u128>();
    *slot = val.to_le();
}

/// Return a reference to the value as u128 bits.
Expand Down
Loading

0 comments on commit 128decd

Please sign in to comment.