Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pulley: Initial scaffold of SIMD support #9820

Merged
merged 2 commits into from
Dec 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 44 additions & 16 deletions cranelift/codegen/meta/src/pulley.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,19 +27,47 @@ const OPS: &[Inst<'_>] = pulley_interpreter::for_each_op!(define);
const EXTENDED_OPS: &[Inst<'_>] = pulley_interpreter::for_each_extended_op!(define);

enum Operand<'a> {
Normal { name: &'a str, ty: &'a str },
Writable { name: &'a str, ty: &'a str },
TrapCode { name: &'a str, ty: &'a str },
Binop { reg: &'a str },
Normal {
name: &'a str,
ty: &'a str,
},
Writable {
name: &'a str,
ty: &'a str,
},
TrapCode {
name: &'a str,
ty: &'a str,
},
Binop {
dst: &'a str,
src1: &'a str,
src2: &'a str,
},
}

impl Inst<'_> {
fn operands(&self) -> impl Iterator<Item = Operand<'_>> {
self.fields
.iter()
.map(|(name, ty)| match (*name, *ty) {
("operands", "BinaryOperands < XReg >") => Operand::Binop { reg: "XReg" },
("operands", "BinaryOperands < FReg >") => Operand::Binop { reg: "FReg" },
("operands", binop) => {
// Parse "BinaryOperands < A >"` as A/A/A
// Parse "BinaryOperands < A, B >"` as A/B/A
// Parse "BinaryOperands < A, B, C >"` as A/B/C
let mut parts = binop
.strip_prefix("BinaryOperands <")
.unwrap()
.strip_suffix(">")
.unwrap()
.trim()
.split(',')
.map(|x| x.trim());
let dst = parts.next().unwrap();
let src1 = parts.next().unwrap_or(dst);
let src2 = parts.next().unwrap_or(dst);
Operand::Binop { dst, src1, src2 }
}
("dst", ty) => Operand::Writable { name, ty },
(name, ty) => Operand::Normal { name, ty },
})
Expand Down Expand Up @@ -109,7 +137,7 @@ pub fn generate_rust(filename: &str, out_dir: &Path) -> Result<(), Error> {
pat.push_str(",");
format_string.push_str(&format!(" // trap={{{name}:?}}"));
}
Operand::Binop { reg: _ } => {
Operand::Binop { .. } => {
pat.push_str("dst, src1, src2,");
format_string.push_str(" {dst}, {src1}, {src2}");
locals.push_str(&format!("let dst = reg_name(*dst.to_reg());\n"));
Expand Down Expand Up @@ -161,7 +189,7 @@ pub fn generate_rust(filename: &str, out_dir: &Path) -> Result<(), Error> {
}
}
Operand::TrapCode { .. } => {}
Operand::Binop { reg: _ } => {
Operand::Binop { .. } => {
pat.push_str("dst, src1, src2,");
uses.push("src1");
uses.push("src2");
Expand Down Expand Up @@ -221,7 +249,7 @@ pub fn generate_rust(filename: &str, out_dir: &Path) -> Result<(), Error> {
pat.push_str(",");
trap.push_str(&format!("sink.add_trap({name});\n"));
}
Operand::Binop { reg: _ } => {
Operand::Binop { .. } => {
pat.push_str("dst, src1, src2,");
args.push_str(
"pulley_interpreter::regs::BinaryOperands::new(dst, src1, src2),",
Expand Down Expand Up @@ -265,10 +293,10 @@ pub fn generate_isle(filename: &str, out_dir: &Path) -> Result<(), Error> {
Operand::Writable { name, ty } => {
isle.push_str(&format!("\n ({name} Writable{ty})"));
}
Operand::Binop { reg } => {
isle.push_str(&format!("\n (dst Writable{reg})"));
isle.push_str(&format!("\n (src1 {reg})"));
isle.push_str(&format!("\n (src2 {reg})"));
Operand::Binop { dst, src1, src2 } => {
isle.push_str(&format!("\n (dst Writable{dst})"));
isle.push_str(&format!("\n (src1 {src1})"));
isle.push_str(&format!("\n (src2 {src2})"));
}
}
}
Expand Down Expand Up @@ -303,13 +331,13 @@ pub fn generate_isle(filename: &str, out_dir: &Path) -> Result<(), Error> {
assert!(result.is_none(), "{} has >1 result", inst.snake_name);
result = Some(ty);
}
Operand::Binop { reg } => {
isle.push_str(&format!("{reg} {reg}"));
Operand::Binop { dst, src1, src2 } => {
isle.push_str(&format!("{src1} {src2}"));
rule.push_str("src1 src2");
ops.push("src1");
ops.push("src2");
assert!(result.is_none(), "{} has >1 result", inst.snake_name);
result = Some(reg);
result = Some(dst);
}
}
isle.push_str(" ");
Expand Down
37 changes: 25 additions & 12 deletions cranelift/codegen/src/isa/pulley_shared/abi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -160,11 +160,23 @@ where
}

fn gen_load_stack(mem: StackAMode, into_reg: Writable<Reg>, ty: Type) -> Self::I {
Inst::gen_load(into_reg, mem.into(), ty, MemFlags::trusted()).into()
let mut flags = MemFlags::trusted();
// Stack loads/stores of vectors always use little-endianess to avoid
// implementing a byte-swap of vectors on big-endian platforms.
if ty.is_vector() {
flags.set_endianness(ir::Endianness::Little);
}
Inst::gen_load(into_reg, mem.into(), ty, flags).into()
}

fn gen_store_stack(mem: StackAMode, from_reg: Reg, ty: Type) -> Self::I {
Inst::gen_store(mem.into(), from_reg, ty, MemFlags::trusted()).into()
let mut flags = MemFlags::trusted();
// Stack loads/stores of vectors always use little-endianess to avoid
// implementing a byte-swap of vectors on big-endian platforms.
if ty.is_vector() {
flags.set_endianness(ir::Endianness::Little);
}
Inst::gen_store(mem.into(), from_reg, ty, flags).into()
}

fn gen_move(to_reg: Writable<Reg>, from_reg: Reg, ty: Type) -> Self::I {
Expand Down Expand Up @@ -510,17 +522,18 @@ where
_target_vector_bytes: u32,
_isa_flags: &PulleyFlags,
) -> u32 {
// Spill slots are the size of a "word" or a pointer, but Pulley
// registers are 8-byte for integers/floats regardless of pointer size.
// Calculate the number of slots necessary to store 8 bytes.
let slots_for_8bytes = match P::pointer_width() {
PointerWidth::PointerWidth32 => 2,
PointerWidth::PointerWidth64 => 1,
};
match rc {
// Spilling an integer or float register requires spilling 8 bytes,
// and spill slots are defined in terms of "word bytes" or the size
// of a pointer. That means on 32-bit pulley we need to take up two
// spill slots where on 64-bit pulley we need to only take up one
// spill slot for integers.
RegClass::Int | RegClass::Float => match P::pointer_width() {
PointerWidth::PointerWidth32 => 2,
PointerWidth::PointerWidth64 => 1,
},
RegClass::Vector => unreachable!(),
// Int/float registers are 8-bytes
RegClass::Int | RegClass::Float => slots_for_8bytes,
// Vector registers are 16 bytes
RegClass::Vector => 2 * slots_for_8bytes,
}
}

Expand Down
10 changes: 10 additions & 0 deletions cranelift/codegen/src/isa/pulley_shared/inst.isle
Original file line number Diff line number Diff line change
Expand Up @@ -414,6 +414,16 @@
(rule (pulley_fstore amode src ty flags)
(SideEffectNoResult.Inst (MInst.FStore amode src ty flags)))

(decl pulley_vload (Amode Type MemFlags) VReg)
(rule (pulley_vload amode ty flags)
(let ((dst WritableVReg (temp_writable_vreg))
(_ Unit (emit (MInst.VLoad dst amode ty flags))))
dst))

(decl pulley_vstore (Amode VReg Type MemFlags) SideEffectNoResult)
(rule (pulley_vstore amode src ty flags)
(SideEffectNoResult.Inst (MInst.VStore amode src ty flags)))

(decl gen_br_table (XReg MachLabel BoxVecMachLabel) Unit)
(rule (gen_br_table idx default labels)
(emit (MInst.BrTable idx default labels)))
Expand Down
14 changes: 2 additions & 12 deletions cranelift/codegen/src/isa/pulley_shared/inst/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -453,18 +453,8 @@ where
}

fn worst_case_size() -> CodeOffset {
// `BrIfXeq32 { a, b, taken, not_taken }` expands to `br_if_xeq32 a, b, taken; jump not_taken`.
//
// The first instruction is seven bytes long:
// * 1 byte opcode
// * 1 byte `a` register encoding
// * 1 byte `b` register encoding
// * 4 byte `taken` displacement
//
// And the second instruction is five bytes long:
// * 1 byte opcode
// * 4 byte `not_taken` displacement
12
// `Vconst128 { dst, imm }` is 18 bytes (opcode + dst + 16-byte imm)
18
}

fn ref_type_regclass(_settings: &settings::Flags) -> RegClass {
Expand Down
35 changes: 35 additions & 0 deletions cranelift/codegen/src/isa/pulley_shared/lower.isle
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,11 @@
(rule (lower (has_type $I64 (iadd a b)))
(pulley_xadd64 a b))

(rule (lower (has_type $I8X16 (iadd a b))) (pulley_vaddi8x16 a b))
(rule (lower (has_type $I16X8 (iadd a b))) (pulley_vaddi16x8 a b))
(rule (lower (has_type $I32X4 (iadd a b))) (pulley_vaddi32x4 a b))
(rule (lower (has_type $I64X2 (iadd a b))) (pulley_vaddi64x2 a b))

;;;; Rules for `isub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $I8 (isub a b)))
Expand Down Expand Up @@ -192,6 +197,11 @@
(rule (lower (has_type $I64 (ishl a b)))
(pulley_xshl64 a b))

(rule (lower (has_type $I8X16 (ishl a b))) (pulley_vshli8x16 a b))
(rule (lower (has_type $I16X8 (ishl a b))) (pulley_vshli16x8 a b))
(rule (lower (has_type $I32X4 (ishl a b))) (pulley_vshli32x4 a b))
(rule (lower (has_type $I64X2 (ishl a b))) (pulley_vshli64x2 a b))

;;;; Rules for `ushr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $I32 (ushr a b)))
Expand All @@ -200,6 +210,11 @@
(rule (lower (has_type $I64 (ushr a b)))
(pulley_xshr64_u a b))

(rule (lower (has_type $I8X16 (ushr a b))) (pulley_vshri8x16_u a b))
(rule (lower (has_type $I16X8 (ushr a b))) (pulley_vshri16x8_u a b))
(rule (lower (has_type $I32X4 (ushr a b))) (pulley_vshri32x4_u a b))
(rule (lower (has_type $I64X2 (ushr a b))) (pulley_vshri64x2_u a b))

;;;; Rules for `sshr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $I32 (sshr a b)))
Expand All @@ -208,6 +223,11 @@
(rule (lower (has_type $I64 (sshr a b)))
(pulley_xshr64_s a b))

(rule (lower (has_type $I8X16 (sshr a b))) (pulley_vshri8x16_s a b))
(rule (lower (has_type $I16X8 (sshr a b))) (pulley_vshri16x8_s a b))
(rule (lower (has_type $I32X4 (sshr a b))) (pulley_vshri32x4_s a b))
(rule (lower (has_type $I64X2 (sshr a b))) (pulley_vshri64x2_s a b))

;;;; Rules for `band` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule 0 (lower (has_type (fits_in_32 _) (band a b)))
Expand Down Expand Up @@ -414,6 +434,9 @@
(rule 1 (lower (has_type $I64 (sload32 flags addr offset)))
(pulley_xload (amode addr offset) $I32 flags (ExtKind.Sign64)))

(rule 2 (lower (has_type (ty_vec128 ty) (load flags addr offset)))
(pulley_vload (amode addr offset) ty flags))

;;;; Rules for `store` and friends ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (store flags src @ (value_type (ty_int ty)) addr offset))
Expand All @@ -431,6 +454,9 @@
(rule (lower (istore32 flags src addr offset))
(side_effect (pulley_xstore (amode addr offset) src $I32 flags)))

(rule 2 (lower (store flags src @ (value_type (ty_vec128 ty)) addr offset))
(side_effect (pulley_vstore (amode addr offset) src ty flags)))

;;;; Rules for `stack_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (stack_addr stack_slot offset))
Expand Down Expand Up @@ -522,6 +548,9 @@
(rule (lower (has_type $I64 (bitcast _flags val @ (value_type $F64))))
(pulley_bitcast_int_from_float_64 val))

(rule 1 (lower (has_type (ty_vec128 _) (bitcast _flags val @ (value_type (ty_vec128 _)))))
val)

;;;; Rules for `fcvt_to_{u,s}int` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $I32 (fcvt_to_uint val @ (value_type $F32))))
Expand Down Expand Up @@ -622,6 +651,8 @@

(rule (lower (has_type $F32 (fadd a b))) (pulley_fadd32 a b))
(rule (lower (has_type $F64 (fadd a b))) (pulley_fadd64 a b))
(rule (lower (has_type $F32X4 (fadd a b))) (pulley_vaddf32x4 a b))
(rule (lower (has_type $F64X2 (fadd a b))) (pulley_vaddf64x2 a b))

;;;; Rules for `fsub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

Expand Down Expand Up @@ -687,3 +718,7 @@

(rule (lower (has_type $F32 (fabs a))) (pulley_fabs32 a))
(rule (lower (has_type $F64 (fabs a))) (pulley_fabs64 a))

;;;; Rules for `vconst` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type (ty_vec128 _) (vconst (u128_from_constant a)))) (pulley_vconst128 a))
10 changes: 10 additions & 0 deletions crates/cranelift/src/translate/code_translator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,12 @@ pub fn translate_operator(
GlobalVariable::Memory { gv, offset, ty } => {
let addr = builder.ins().global_value(environ.pointer_type(), gv);
let mut flags = ir::MemFlags::trusted();
// Store vector globals in little-endian format to avoid
// byte swaps on big-endian platforms since at-rest vectors
// should already be in little-endian format anyway.
if ty.is_vector() {
flags.set_endianness(ir::Endianness::Little);
}
// Put globals in the "table" abstract heap category as well.
flags.set_alias_region(Some(ir::AliasRegion::Table));
builder.ins().load(ty, flags, addr, offset)
Expand All @@ -191,6 +197,10 @@ pub fn translate_operator(
GlobalVariable::Memory { gv, offset, ty } => {
let addr = builder.ins().global_value(environ.pointer_type(), gv);
let mut flags = ir::MemFlags::trusted();
// Like `global.get`, store globals in little-endian format.
if ty.is_vector() {
flags.set_endianness(ir::Endianness::Little);
}
// Put globals in the "table" abstract heap category as well.
flags.set_alias_region(Some(ir::AliasRegion::Table));
let mut val = state.pop1();
Expand Down
4 changes: 2 additions & 2 deletions crates/wasmtime/src/runtime/externals/global.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ impl Global {
ValType::I64 => Val::from(*definition.as_i64()),
ValType::F32 => Val::F32(*definition.as_u32()),
ValType::F64 => Val::F64(*definition.as_u64()),
ValType::V128 => Val::V128((*definition.as_u128()).into()),
ValType::V128 => Val::V128(definition.get_u128().into()),
ValType::Ref(ref_ty) => {
let reference: Ref = match ref_ty.heap_type() {
HeapType::Func | HeapType::ConcreteFunc(_) => {
Expand Down Expand Up @@ -187,7 +187,7 @@ impl Global {
Val::I64(i) => *definition.as_i64_mut() = i,
Val::F32(f) => *definition.as_u32_mut() = f,
Val::F64(f) => *definition.as_u64_mut() = f,
Val::V128(i) => *definition.as_u128_mut() = i.into(),
Val::V128(i) => definition.set_u128(i.into()),
Val::FuncRef(f) => {
*definition.as_func_ref_mut() = f.map_or(ptr::null_mut(), |f| {
f.vm_func_ref(&mut store).as_ptr().cast()
Expand Down
2 changes: 1 addition & 1 deletion crates/wasmtime/src/runtime/trampoline/global.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ pub fn generate_global_export(
Val::I64(x) => *global.as_i64_mut() = x,
Val::F32(x) => *global.as_f32_bits_mut() = x,
Val::F64(x) => *global.as_f64_bits_mut() = x,
Val::V128(x) => *global.as_u128_mut() = x.into(),
Val::V128(x) => global.set_u128(x.into()),
Val::FuncRef(f) => {
*global.as_func_ref_mut() =
f.map_or(ptr::null_mut(), |f| f.vm_func_ref(&mut store).as_ptr());
Expand Down
22 changes: 14 additions & 8 deletions crates/wasmtime/src/runtime/vm/vmcontext.rs
Original file line number Diff line number Diff line change
Expand Up @@ -447,7 +447,7 @@ impl VMGlobalDefinition {
WasmValType::I64 => *global.as_i64_mut() = raw.get_i64(),
WasmValType::F32 => *global.as_f32_bits_mut() = raw.get_f32(),
WasmValType::F64 => *global.as_f64_bits_mut() = raw.get_f64(),
WasmValType::V128 => *global.as_u128_mut() = raw.get_v128(),
WasmValType::V128 => global.set_u128(raw.get_v128()),
WasmValType::Ref(r) => match r.heap_type.top() {
WasmHeapTopType::Extern => {
let r = VMGcRef::from_raw_u32(raw.get_externref());
Expand Down Expand Up @@ -478,7 +478,7 @@ impl VMGlobalDefinition {
WasmValType::I64 => ValRaw::i64(*self.as_i64()),
WasmValType::F32 => ValRaw::f32(*self.as_f32_bits()),
WasmValType::F64 => ValRaw::f64(*self.as_f64_bits()),
WasmValType::V128 => ValRaw::v128(*self.as_u128()),
WasmValType::V128 => ValRaw::v128(self.get_u128()),
WasmValType::Ref(r) => match r.heap_type.top() {
WasmHeapTopType::Extern => ValRaw::externref(match self.as_gc_ref() {
Some(r) => store.gc_store_mut()?.clone_gc_ref(r).as_raw_u32(),
Expand Down Expand Up @@ -575,14 +575,20 @@ impl VMGlobalDefinition {
&mut *(self.storage.as_mut().as_mut_ptr().cast::<u64>())
}

/// Return a reference to the value as an u128.
pub unsafe fn as_u128(&self) -> &u128 {
&*(self.storage.as_ref().as_ptr().cast::<u128>())
/// Gets the underlying 128-bit vector value.
//
// Note that vectors are stored in little-endian format while other types
// are stored in native-endian format.
pub unsafe fn get_u128(&self) -> u128 {
u128::from_le(*(self.storage.as_ref().as_ptr().cast::<u128>()))
}

/// Return a mutable reference to the value as an u128.
pub unsafe fn as_u128_mut(&mut self) -> &mut u128 {
&mut *(self.storage.as_mut().as_mut_ptr().cast::<u128>())
/// Sets the 128-bit vector values.
//
// Note that vectors are stored in little-endian format while other types
// are stored in native-endian format.
pub unsafe fn set_u128(&mut self, val: u128) {
*self.storage.as_mut().as_mut_ptr().cast::<u128>() = val.to_le();
}

/// Return a reference to the value as u128 bits.
Expand Down
Loading
Loading