From d4707c3ae457da49eff7bdc07cfb6fd51f8420b2 Mon Sep 17 00:00:00 2001 From: Denys Zadorozhnyi Date: Thu, 12 Mar 2026 11:39:43 +0200 Subject: [PATCH 01/29] fix: byte-version of the `memcpy` (iteration guard) --- codegen/masm/src/emit/mem.rs | 12 +- .../src/rust_masm_tests/instructions.rs | 154 +++++++++++++++++- 2 files changed, 159 insertions(+), 7 deletions(-) diff --git a/codegen/masm/src/emit/mem.rs b/codegen/masm/src/emit/mem.rs index d1d2d4984..d80d675e3 100644 --- a/codegen/masm/src/emit/mem.rs +++ b/codegen/masm/src/emit/mem.rs @@ -633,9 +633,9 @@ impl OpEmitter<'_> { body_emitter.emit_all( [ masm::Instruction::U32WrappingAddImm(1.into()), - masm::Instruction::Dup0, // [i++, i++, dst, count, value] - masm::Instruction::Dup3, // [count, i++, i++, dst, count, value] - masm::Instruction::U32Gte, // [i++ >= count, i++, dst, count, value] + masm::Instruction::Dup0, // [i++, i++, dst, count, value] + masm::Instruction::Dup3, // [count, i++, i++, dst, count, value] + masm::Instruction::U32Lt, // [i++ < count, i++, dst, count, value] ], span, ); @@ -879,9 +879,9 @@ impl OpEmitter<'_> { body_emitter.emit_all( [ masm::Instruction::U32WrappingAddImm(1.into()), - masm::Instruction::Dup0, // [i++, i++, src, dst, count] - masm::Instruction::Dup4, // [count, i++, i++, src, dst, count] - masm::Instruction::U32Gte, // [i++ >= count, i++, src, dst, count] + masm::Instruction::Dup0, // [i++, i++, src, dst, count] + masm::Instruction::Dup4, // [count, i++, i++, src, dst, count] + masm::Instruction::U32Lt, // [i++ < count, i++, src, dst, count] ], span, ); diff --git a/tests/integration/src/rust_masm_tests/instructions.rs b/tests/integration/src/rust_masm_tests/instructions.rs index 1d23401c4..4f3b5b5b1 100644 --- a/tests/integration/src/rust_masm_tests/instructions.rs +++ b/tests/integration/src/rust_masm_tests/instructions.rs @@ -14,7 +14,7 @@ use proptest::{ use super::run_masm_vs_rust; use crate::{ CompilerTest, - testing::{Initializer, eval_package}, + 
testing::{Initializer, eval_package, setup}, }; macro_rules! test_bin_op { @@ -834,3 +834,155 @@ fn test_hmerge() { _ => panic!("Unexpected test result: {res:?}"), } } + +#[test] +fn test_memory_copy_unaligned() { + let main_fn = r#"() -> Felt { + #[inline(never)] + fn do_copy(dst: &mut [u8; 48], src: &[u8; 64]) { + unsafe { + let src_ptr = src.as_ptr().add(3); + let dst_ptr = dst.as_mut_ptr(); + core::ptr::copy_nonoverlapping(src_ptr, dst_ptr, 48); + } + } + + let mut src = [0u8; 64]; + let mut i = 0usize; + while i < 64 { + src[i] = i as u8; + i += 1; + } + + let mut dst = [0u8; 48]; + do_copy(&mut dst, &src); + + let mut mismatches = 0u32; + let mut i = 0usize; + while i < 48 { + if dst[i] != (i as u8).wrapping_add(3) { + mismatches += 1; + } + i += 1; + } + + Felt::from_u32(mismatches) + }"#; + + setup::enable_compiler_instrumentation(); + let config = WasmTranslationConfig::default(); + let mut test = CompilerTest::rust_fn_body_with_stdlib_sys( + "memory_copy_unaligned_src_len_48_u8s", + main_fn, + config, + [], + ); + + let package = test.compile_package(); + let args: [Felt; 0] = []; + + eval_package::(&package, [], &args, &test.session, |trace| { + let res: Felt = trace.parse_result().unwrap(); + assert_eq!(res, Felt::ZERO); + Ok(()) + }) + .unwrap(); +} + +#[test] +fn test_memory_copy_unaligned_dst() { + let main_fn = r#"() -> Felt { + #[inline(never)] + fn do_copy(dst: &mut [u8; 53], src: &[u8; 64]) { + unsafe { + let src_ptr = src.as_ptr().add(3); + let dst_ptr = dst.as_mut_ptr().add(5); + core::ptr::copy_nonoverlapping(src_ptr, dst_ptr, 48); + } + } + + let mut src = [0u8; 64]; + let mut i = 0usize; + while i < 64 { + src[i] = i as u8; + i += 1; + } + + let mut dst = [0xffu8; 53]; + do_copy(&mut dst, &src); + + let mut mismatches = 0u32; + let mut i = 0usize; + while i < 53 { + let expected = if i < 5 { 0xff } else { (i as u8).wrapping_sub(2) }; + if dst[i] != expected { + mismatches += 1; + } + i += 1; + } + + Felt::from_u32(mismatches) + }"#; + + 
setup::enable_compiler_instrumentation(); + let config = WasmTranslationConfig::default(); + let mut test = CompilerTest::rust_fn_body_with_stdlib_sys( + "memory_copy_unaligned_dst_len_48_u8s", + main_fn, + config, + [], + ); + + let package = test.compile_package(); + let args: [Felt; 0] = []; + + eval_package::(&package, [], &args, &test.session, |trace| { + let res: Felt = trace.parse_result().unwrap(); + assert_eq!(res, Felt::ZERO); + Ok(()) + }) + .unwrap(); +} + +#[test] +fn test_memory_set_unaligned() { + let main_fn = r#"() -> Felt { + #[inline(never)] + fn do_set(dst: &mut [u8; 11]) { + unsafe { + let dst_ptr = dst.as_mut_ptr().add(3); + core::ptr::write_bytes(dst_ptr, 0x5a, 5); + } + } + + let mut dst = [0xffu8; 11]; + do_set(&mut dst); + + let expected = [0xffu8, 0xff, 0xff, 0x5a, 0x5a, 0x5a, 0x5a, 0x5a, 0xff, 0xff, 0xff]; + let mut mismatches = 0u32; + let mut i = 0usize; + while i < 11 { + if dst[i] != expected[i] { + mismatches += 1; + } + i += 1; + } + + Felt::from_u32(mismatches) + }"#; + + setup::enable_compiler_instrumentation(); + let config = WasmTranslationConfig::default(); + let mut test = + CompilerTest::rust_fn_body_with_stdlib_sys("memory_set_unaligned_u8s", main_fn, config, []); + + let package = test.compile_package(); + let args: [Felt; 0] = []; + + eval_package::(&package, [], &args, &test.session, |trace| { + let res: Felt = trace.parse_result().unwrap(); + assert_eq!(res, Felt::ZERO); + Ok(()) + }) + .unwrap(); +} From f137a87b408e3573a7c78054e6215ed755d243f8 Mon Sep 17 00:00:00 2001 From: Denys Zadorozhnyi Date: Thu, 12 Mar 2026 16:17:05 +0200 Subject: [PATCH 02/29] fix: handle unaligned `u16` memory windows Fix the byte-addressed memory paths that cross a 32-bit element boundary. This keeps the `memcpy`/`memset` fallback coverage added in this branch working for short unaligned copies, including scalarized `u16` loads and stores at byte offset 3. 
--- codegen/masm/intrinsics/mem.masm | 28 ++-- codegen/masm/src/emit/mem.rs | 56 ++++++++ .../integration/src/codegen/intrinsics/mem.rs | 126 ++++++++++++++++++ .../src/rust_masm_tests/instructions.rs | 55 ++++++++ 4 files changed, 254 insertions(+), 11 deletions(-) diff --git a/codegen/masm/intrinsics/mem.masm b/codegen/masm/intrinsics/mem.masm index 1afe92e2f..31b0fd47d 100644 --- a/codegen/masm/intrinsics/mem.masm +++ b/codegen/masm/intrinsics/mem.masm @@ -167,21 +167,27 @@ pub proc load_sw # [addr, offset] # load the element containing the data we want mem_load else # [addr, offset] + # convert the byte offset to a bit offset + swap.1 push.8 u32wrapping_mul swap.1 # [addr, bit_offset] # the load crosses an element boundary # # 1. load the first element - dup.0 mem_load # [e0, addr, offset] + dup.0 mem_load # [e0, addr, bit_offset] # 2. load the second element - swap.1 # [addr, e0, offset] - push.1 u32overflowing_add # [overflowed, addr + 1, e0, offset] - assertz mem_load # [e1, e0, offset] - # shift low bits - push.32 dup.3 # [offset, 32, e1, e0, offset] - u32overflowing_sub assertz # [32 - offset, e1, e0, offset] - u32shr # [lo, e0, offset] - # shift high bits left by the offset - swap.2 # [offset, e0, lo] - u32shl # [hi, lo] + swap.1 # [addr, e0, bit_offset] + push.1 u32overflowing_add # [overflowed, addr + 1, e0, bit_offset] + assertz mem_load # [e1, e0, bit_offset] + # Reconstruct the 32-bit window whose first byte begins at the original byte pointer. + # `e0` contributes the low part after shifting right, and `e1` contributes the carried + # high part after shifting left into the vacated bits. 
+ swap.1 # [e0, e1, bit_offset] + dup.2 # [bit_offset, e0, e1, bit_offset] + u32shr # [lo, e1, bit_offset] + movup.2 # [bit_offset, lo, e1] + push.32 swap.1 # [bit_offset, 32, lo, e1] + u32overflowing_sub assertz # [32 - bit_offset, lo, e1] + movup.2 swap.1 # [32 - bit_offset, e1, lo] + u32shl # [hi, lo] # combine the two halves u32or # [result] end diff --git a/codegen/masm/src/emit/mem.rs b/codegen/masm/src/emit/mem.rs index d80d675e3..6e4f7f9fb 100644 --- a/codegen/masm/src/emit/mem.rs +++ b/codegen/masm/src/emit/mem.rs @@ -214,11 +214,30 @@ impl OpEmitter<'_> { self.emit(masm::Instruction::U32And, span); return; } else { + if ty.size_in_bits() == 16 { + // A 16-bit value can start at byte offset 3, so it may span two adjacent + // elements. Load the full 32-bit unaligned window first, then keep the low + // 16 bits corresponding to the requested value. + self.push_native_ptr(imm, span); + self.raw_exec("::intrinsics::mem::load_sw", span); + self.emit_push(0xffffu32, span); + self.emit(masm::Instruction::U32And, span); + return; + } self.emit_push(imm.addr, span); self.emit_push(imm.offset, span); } } + if ty.size_in_bits() == 16 { + // The dynamic case has the same boundary issue as the immediate case: offset 3 reaches + // into the next element. Reuse the unaligned 32-bit load to assemble the window. + self.raw_exec("::intrinsics::mem::load_sw", span); + self.emit_push(0xffffu32, span); + self.emit(masm::Instruction::U32And, span); + return; + } + // Stack: [element_addr, byte_offset] // First, load the aligned word containing our value @@ -1052,6 +1071,35 @@ impl OpEmitter<'_> { return; } + if type_size == 16 { + // A 16-bit store at byte offset 3 updates one byte in the current element and one in + // the next. Merge the new 16-bit payload into the 32-bit unaligned window and let the + // existing unaligned word store split it back across both elements. 
+ self.emit_all( + [ + masm::Instruction::Dup1, // [offset, addr, offset, value] + masm::Instruction::Dup1, // [addr, offset, addr, offset, value] + ], + span, + ); + self.raw_exec("::intrinsics::mem::load_sw", span); // [window, addr, offset, value] + self.emit_push(0xffff0000u32, span); + self.emit(masm::Instruction::U32And, span); // [masked_window, addr, offset, value] + self.emit(masm::Instruction::MovUp3, span); // [value, masked_window, addr, offset] + self.emit_push(0xffffu32, span); + self.emit(masm::Instruction::U32And, span); // [value16, masked_window, addr, offset] + self.emit(masm::Instruction::U32Or, span); // [combined, addr, offset] + self.emit_all( + [ + masm::Instruction::Swap2, // [offset, addr, combined] + masm::Instruction::Swap1, // [addr, offset, combined] + ], + span, + ); + self.raw_exec("::intrinsics::mem::store_sw", span); + return; + } + // Stack: [addr, offset, value] // Load the current aligned value self.emit_all( @@ -1113,6 +1161,14 @@ impl OpEmitter<'_> { /// - Before: [value] (where value is already truncated to the correct size) /// - After: [] fn store_small_imm(&mut self, ty: &Type, imm: NativePtr, span: SourceSpan) { + if ty.size_in_bits() == 16 && !imm.is_element_aligned() { + // Route unaligned 16-bit immediates through the dynamic path so they share the same + // cross-element windowing logic as byte-pointer stores. 
+ self.push_native_ptr(imm, span); + self.store_small(ty, None, span); + return; + } + assert!(imm.alignment() as usize >= ty.min_alignment()); // For immediate pointers, we always load from the element-aligned address diff --git a/tests/integration/src/codegen/intrinsics/mem.rs b/tests/integration/src/codegen/intrinsics/mem.rs index bdac4e28b..5870dce36 100644 --- a/tests/integration/src/codegen/intrinsics/mem.rs +++ b/tests/integration/src/codegen/intrinsics/mem.rs @@ -365,6 +365,55 @@ fn load_u16() { } } +/// Tests that loading a u16 from byte offset 3 correctly reconstructs the value across the next +/// element boundary. +#[test] +fn load_unaligned_u16() { + setup::enable_compiler_instrumentation(); + + let write_to = 17 * 2u32.pow(16); + let read_from = write_to + 3; + + let (package, context) = + compile_test_module([Type::from(PointerType::new(Type::U16))], [Type::U16], |builder| { + let block = builder.current_block(); + let ptr = block.borrow().arguments()[0] as ValueRef; + let loaded = builder.load(ptr, SourceSpan::default()).unwrap(); + builder.ret(Some(loaded), SourceSpan::default()).unwrap(); + }); + + let config = proptest::test_runner::Config::with_cases(10); + let res = TestRunner::new(config).run(&any::(), move |value| { + let expected = value.to_ne_bytes(); + let initial_bytes = [0xff, 0xee, 0xdd, expected[0], expected[1], 0xbb, 0xaa, 0x99]; + let initializers = [Initializer::MemoryBytes { + addr: write_to, + bytes: &initial_bytes, + }]; + + let args = [Felt::new(read_from as u64)]; + let output = eval_package::( + &package, + initializers, + &args, + context.session(), + |_| Ok(()), + )?; + + prop_assert_eq!(output, value, "expected 0x{:x}; found 0x{:x}", value, output,); + + Ok(()) + }); + + match res { + Err(TestError::Fail(reason, value)) => { + panic!("FAILURE: {}\nMinimal failing case: {value:?}", reason.message()); + } + Ok(_) => (), + _ => panic!("Unexpected test result: {res:?}"), + } +} + /// Tests the memory load intrinsic for loads of 
boolean (i.e. 1-bit) values #[test] fn load_bool() { @@ -570,6 +619,83 @@ fn store_u16() { } } +/// Tests that storing a u16 at byte offset 3 updates only the target bytes across the element +/// boundary. +#[test] +fn store_unaligned_u16() { + setup::enable_compiler_instrumentation(); + + let write_to = 17 * 2u32.pow(16); + let store_to = write_to + 3; + + let (package, context) = compile_test_module([Type::U16], [Type::U32], |builder| { + let block = builder.current_block(); + let value = block.borrow().arguments()[0] as ValueRef; + + let addr = builder.u32(store_to, SourceSpan::default()); + let ptr = builder + .inttoptr(addr, Type::from(PointerType::new(Type::U16)), SourceSpan::default()) + .unwrap(); + + builder.store(ptr, value, SourceSpan::default()).unwrap(); + + let loaded = builder.load(ptr, SourceSpan::default()).unwrap(); + builder.assert_eq(loaded, value, SourceSpan::default()).unwrap(); + + let result = builder.u32(1, SourceSpan::default()); + builder.ret(Some(result), SourceSpan::default()).unwrap(); + }); + + let config = proptest::test_runner::Config::with_cases(32); + let res = TestRunner::new(config).run(&any::(), move |store_value| { + let initial_bytes = [0xff, 0xee, 0xdd, 0xcc, 0xbb, 0xaa]; + let initializers = [Initializer::MemoryBytes { + addr: write_to, + bytes: &initial_bytes, + }]; + + let args = [Felt::new(store_value as u64)]; + let output = + eval_package::(&package, initializers, &args, context.session(), |trace| { + let expected = store_value.to_ne_bytes(); + let word0 = trace.read_from_rust_memory::(write_to).ok_or_else(|| { + TestCaseError::fail(format!("failed to read from byte address {write_to}")) + })?; + let word1 = trace.read_from_rust_memory::(write_to + 4).ok_or_else(|| { + TestCaseError::fail(format!( + "failed to read from byte address {}", + write_to + 4 + )) + })?; + let stored0 = (word0 & 0xff) as u8; + let stored1 = ((word0 >> 8) & 0xff) as u8; + let stored2 = ((word0 >> 16) & 0xff) as u8; + let stored3 = ((word0 >> 
24) & 0xff) as u8; + let stored4 = (word1 & 0xff) as u8; + let stored5 = ((word1 >> 8) & 0xff) as u8; + + prop_assert_eq!(stored0, 0xff); + prop_assert_eq!(stored1, 0xee); + prop_assert_eq!(stored2, 0xdd); + prop_assert_eq!(stored3, expected[0]); + prop_assert_eq!(stored4, expected[1]); + prop_assert_eq!(stored5, 0xaa); + Ok(()) + })?; + + prop_assert_eq!(output, 1u32); + Ok(()) + }); + + match res { + Err(TestError::Fail(reason, value)) => { + panic!("FAILURE: {}\nMinimal failing case: {value:?}", reason.message()); + } + Ok(_) => (), + _ => panic!("Unexpected test result: {res:?}"), + } +} + /// Tests that u8 stores only affect the targeted byte and don't corrupt surrounding memory #[test] fn store_u8() { diff --git a/tests/integration/src/rust_masm_tests/instructions.rs b/tests/integration/src/rust_masm_tests/instructions.rs index 4f3b5b5b1..285207ef7 100644 --- a/tests/integration/src/rust_masm_tests/instructions.rs +++ b/tests/integration/src/rust_masm_tests/instructions.rs @@ -944,6 +944,61 @@ fn test_memory_copy_unaligned_dst() { .unwrap(); } +#[test] +fn test_memory_copy_unaligned_dst_short_count() { + let main_fn = r#"() -> Felt { + #[inline(never)] + fn do_copy(dst: &mut [u8; 8], src: &[u8; 16]) { + unsafe { + let src_ptr = src.as_ptr().add(3); + let dst_ptr = dst.as_mut_ptr().add(2); + core::ptr::copy_nonoverlapping(src_ptr, dst_ptr, 3); + } + } + + let mut src = [0u8; 16]; + let mut i = 0usize; + while i < 16 { + src[i] = i as u8; + i += 1; + } + + let mut dst = [0xffu8; 8]; + do_copy(&mut dst, &src); + + let expected = [0xffu8, 0xff, 3, 4, 5, 0xff, 0xff, 0xff]; + let mut mismatches = 0u32; + let mut i = 0usize; + while i < 8 { + if dst[i] != expected[i] { + mismatches += 1; + } + i += 1; + } + + Felt::from_u32(mismatches) + }"#; + + setup::enable_compiler_instrumentation(); + let config = WasmTranslationConfig::default(); + let mut test = CompilerTest::rust_fn_body_with_stdlib_sys( + "memory_copy_unaligned_dst_short_count_u8s", + main_fn, + config, + 
[], + ); + + let package = test.compile_package(); + let args: [Felt; 0] = []; + + eval_package::(&package, [], &args, &test.session, |trace| { + let res: Felt = trace.parse_result().unwrap(); + assert_eq!(res, Felt::ZERO); + Ok(()) + }) + .unwrap(); +} + #[test] fn test_memory_set_unaligned() { let main_fn = r#"() -> Felt { From 07c2968641507409c45ef9b82a0d2f08acce02b4 Mon Sep 17 00:00:00 2001 From: Denys Zadorozhnyi Date: Mon, 16 Mar 2026 14:58:48 +0200 Subject: [PATCH 03/29] fix: guard zero-count `memcpy` and `memset` Zero-length memory operations must be no-ops, but both loop headers seeded `while.true` with `count >= 0`, which executes one iteration when `count == 0`. Switch the entry condition to a strict unsigned `count > 0` check and add regressions for zero-count unaligned copy/set paths. --- codegen/masm/src/emit/mem.rs | 8 +- .../src/rust_masm_tests/instructions.rs | 102 ++++++++++++++++++ 2 files changed, 106 insertions(+), 4 deletions(-) diff --git a/codegen/masm/src/emit/mem.rs b/codegen/masm/src/emit/mem.rs index 6e4f7f9fb..873fd8363 100644 --- a/codegen/masm/src/emit/mem.rs +++ b/codegen/masm/src/emit/mem.rs @@ -665,9 +665,9 @@ impl OpEmitter<'_> { // [dst, count, value..] self.emit_push(0u32, span); // [i, dst, count, value..] self.emit(masm::Instruction::Dup2, span); // [count, i, dst, count, value..] - self.emit_push(Felt::ZERO, span); + self.emit_push(0u32, span); self.emit( - masm::Instruction::Gte, // [count > 0, i, dst, count, value..] + masm::Instruction::U32Gt, // [count > 0, i, dst, count, value..] 
span, ); self.current_block.push(masm::Op::While { @@ -912,9 +912,9 @@ impl OpEmitter<'_> { // [src, dst, count] self.emit_push(0u32, span); // [i, src, dst, count] self.emit(masm::Instruction::Dup3, span); // [count, i, src, dst, count] - self.emit_push(Felt::ZERO, span); + self.emit_push(0u32, span); self.emit( - masm::Instruction::Gte, // [count > 0, i, src, dst, count] + masm::Instruction::U32Gt, // [count > 0, i, src, dst, count] span, ); self.current_block.push(masm::Op::While { diff --git a/tests/integration/src/rust_masm_tests/instructions.rs b/tests/integration/src/rust_masm_tests/instructions.rs index 285207ef7..934577621 100644 --- a/tests/integration/src/rust_masm_tests/instructions.rs +++ b/tests/integration/src/rust_masm_tests/instructions.rs @@ -999,6 +999,61 @@ fn test_memory_copy_unaligned_dst_short_count() { .unwrap(); } +#[test] +fn test_memory_copy_unaligned_zero_count() { + let main_fn = r#"() -> Felt { + #[inline(never)] + fn do_copy(dst: &mut [u8; 8], src: &[u8; 16]) { + unsafe { + let src_ptr = src.as_ptr().add(1); + let dst_ptr = dst.as_mut_ptr().add(2); + core::ptr::copy_nonoverlapping(src_ptr, dst_ptr, 0); + } + } + + let mut src = [0u8; 16]; + let mut i = 0usize; + while i < 16 { + src[i] = i as u8; + i += 1; + } + + let mut dst = [0xffu8; 8]; + do_copy(&mut dst, &src); + + let expected = [0xffu8; 8]; + let mut mismatches = 0u32; + let mut i = 0usize; + while i < 8 { + if dst[i] != expected[i] { + mismatches += 1; + } + i += 1; + } + + Felt::from_u32(mismatches) + }"#; + + setup::enable_compiler_instrumentation(); + let config = WasmTranslationConfig::default(); + let mut test = CompilerTest::rust_fn_body_with_stdlib_sys( + "memory_copy_unaligned_zero_count_u8s", + main_fn, + config, + [], + ); + + let package = test.compile_package(); + let args: [Felt; 0] = []; + + eval_package::(&package, [], &args, &test.session, |trace| { + let res: Felt = trace.parse_result().unwrap(); + assert_eq!(res, Felt::ZERO); + Ok(()) + }) + .unwrap(); +} + 
#[test] fn test_memory_set_unaligned() { let main_fn = r#"() -> Felt { @@ -1041,3 +1096,50 @@ fn test_memory_set_unaligned() { }) .unwrap(); } + +#[test] +fn test_memory_set_unaligned_zero_count() { + let main_fn = r#"() -> Felt { + #[inline(never)] + fn do_set(dst: &mut [u8; 11]) { + unsafe { + let dst_ptr = dst.as_mut_ptr().add(3); + core::ptr::write_bytes(dst_ptr, 0x5a, 0); + } + } + + let mut dst = [0xffu8; 11]; + do_set(&mut dst); + + let expected = [0xffu8; 11]; + let mut mismatches = 0u32; + let mut i = 0usize; + while i < 11 { + if dst[i] != expected[i] { + mismatches += 1; + } + i += 1; + } + + Felt::from_u32(mismatches) + }"#; + + setup::enable_compiler_instrumentation(); + let config = WasmTranslationConfig::default(); + let mut test = CompilerTest::rust_fn_body_with_stdlib_sys( + "memory_set_unaligned_zero_count_u8s", + main_fn, + config, + [], + ); + + let package = test.compile_package(); + let args: [Felt; 0] = []; + + eval_package::(&package, [], &args, &test.session, |trace| { + let res: Felt = trace.parse_result().unwrap(); + assert_eq!(res, Felt::ZERO); + Ok(()) + }) + .unwrap(); +} From 42f386144a2df5d18cc27236f32587057dca248c Mon Sep 17 00:00:00 2001 From: Denys Zadorozhnyi Date: Mon, 16 Mar 2026 14:59:24 +0200 Subject: [PATCH 04/29] test: use little-endian bytes in `u16` memory checks The unaligned `u16` regressions are asserting compiler memory layout, so they should not depend on the host endianness. Use `to_le_bytes()` in the expected byte construction to keep the tests portable and aligned with the byte-addressable memory model. 
--- tests/integration/src/codegen/intrinsics/mem.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/src/codegen/intrinsics/mem.rs b/tests/integration/src/codegen/intrinsics/mem.rs index 5870dce36..81a40acdf 100644 --- a/tests/integration/src/codegen/intrinsics/mem.rs +++ b/tests/integration/src/codegen/intrinsics/mem.rs @@ -384,7 +384,7 @@ fn load_unaligned_u16() { let config = proptest::test_runner::Config::with_cases(10); let res = TestRunner::new(config).run(&any::(), move |value| { - let expected = value.to_ne_bytes(); + let expected = value.to_le_bytes(); let initial_bytes = [0xff, 0xee, 0xdd, expected[0], expected[1], 0xbb, 0xaa, 0x99]; let initializers = [Initializer::MemoryBytes { addr: write_to, @@ -657,7 +657,7 @@ fn store_unaligned_u16() { let args = [Felt::new(store_value as u64)]; let output = eval_package::(&package, initializers, &args, context.session(), |trace| { - let expected = store_value.to_ne_bytes(); + let expected = store_value.to_le_bytes(); let word0 = trace.read_from_rust_memory::(write_to).ok_or_else(|| { TestCaseError::fail(format!("failed to read from byte address {write_to}")) })?; From 6caac998edd89b598c2c4ce143cfc0c2006bc07e Mon Sep 17 00:00:00 2001 From: Denys Zadorozhnyi Date: Mon, 16 Mar 2026 15:01:07 +0200 Subject: [PATCH 05/29] refactor: extract counted loop emission `memset` and fallback `memcpy` were carrying separate copies of the same counted `while.true` control flow, which makes fixes easy to miss in one path. Extract the shared loop header and back-edge emission so the counted loop protocol is defined once and reused by both sites. 
--- codegen/masm/src/emit/mem.rs | 87 +++++++++++++++++++++++------------- 1 file changed, 55 insertions(+), 32 deletions(-) diff --git a/codegen/masm/src/emit/mem.rs b/codegen/masm/src/emit/mem.rs index 873fd8363..ca8fc8f3b 100644 --- a/codegen/masm/src/emit/mem.rs +++ b/codegen/masm/src/emit/mem.rs @@ -9,6 +9,53 @@ use crate::{OperandStack, lower::NativePtr}; /// Allocation impl OpEmitter<'_> { + /// Emit the loop header for a counted `while.true` loop. + /// + /// The caller provides the `dup` instruction needed to bring `count` to the top of the stack + /// after the loop index has been seeded with zero. + /// + /// Stack transition: + /// + /// - Before: `[loop_state..]` + /// - After: `[count > 0, i = 0, loop_state..]` + /// + /// For example: + /// + /// - `memset`: `[dst, count, value..] -> [count > 0, i = 0, dst, count, value..]` + /// - `memcpy`: `[src, dst, count] -> [count > 0, i = 0, src, dst, count]` + fn emit_counted_loop_header(&mut self, count_dup: masm::Instruction, span: SourceSpan) { + self.emit_push(0u32, span); + self.emit(count_dup, span); + self.emit_push(0u32, span); + self.emit(masm::Instruction::U32Gt, span); + } + + /// Emit the loop back-edge condition for a counted `while.true` loop. + /// + /// The caller provides the `dup` instruction needed to bring `count` to the top of the stack + /// after incrementing the loop index. + /// + /// Stack transition: + /// + /// - Before: `[i, loop_state..]` + /// - After: `[i + 1 < count, i + 1, loop_state..]` + /// + /// For example: + /// + /// - `memset`: `[i, dst, count, value..] 
-> [i + 1 < count, i + 1, dst, count, value..]` + /// - `memcpy`: `[i, src, dst, count] -> [i + 1 < count, i + 1, src, dst, count]` + fn emit_counted_loop_next_condition(&mut self, count_dup: masm::Instruction, span: SourceSpan) { + self.emit_all( + [ + masm::Instruction::U32WrappingAddImm(1.into()), + masm::Instruction::Dup0, + count_dup, + masm::Instruction::U32Lt, + ], + span, + ); + } + /// Grow the heap (from the perspective of Wasm programs) by N pages, returning the previous /// size of the heap (in pages) if successful, or -1 if the heap could not be grown. pub fn mem_grow(&mut self, span: SourceSpan) { @@ -649,27 +696,15 @@ impl OpEmitter<'_> { body_emitter.store(span); // [i, dst, count, value] // Loop body - increment iteration count, determine whether to continue loop - body_emitter.emit_all( - [ - masm::Instruction::U32WrappingAddImm(1.into()), - masm::Instruction::Dup0, // [i++, i++, dst, count, value] - masm::Instruction::Dup3, // [count, i++, i++, dst, count, value] - masm::Instruction::U32Lt, // [i++ < count, i++, dst, count, value] - ], - span, - ); + body_emitter.emit_counted_loop_next_condition(masm::Instruction::Dup3, span); + // [i++ < count, i++, dst, count, value] // Switch back to original block and emit loop header and 'while.true' instruction // // Loop header - prepare to loop until `count` iterations have been performed // [dst, count, value..] - self.emit_push(0u32, span); // [i, dst, count, value..] - self.emit(masm::Instruction::Dup2, span); // [count, i, dst, count, value..] - self.emit_push(0u32, span); - self.emit( - masm::Instruction::U32Gt, // [count > 0, i, dst, count, value..] - span, - ); + self.emit_counted_loop_header(masm::Instruction::Dup2, span); + // [count > 0, i, dst, count, value..] 
self.current_block.push(masm::Op::While { span, body: masm::Block::new(span, body), @@ -895,28 +930,16 @@ impl OpEmitter<'_> { body_emitter.store(span); // [i, src, dst, count] // Increment iteration count, determine whether to continue loop - body_emitter.emit_all( - [ - masm::Instruction::U32WrappingAddImm(1.into()), - masm::Instruction::Dup0, // [i++, i++, src, dst, count] - masm::Instruction::Dup4, // [count, i++, i++, src, dst, count] - masm::Instruction::U32Lt, // [i++ < count, i++, src, dst, count] - ], - span, - ); + body_emitter.emit_counted_loop_next_condition(masm::Instruction::Dup4, span); + // [i++ < count, i++, src, dst, count] // Switch back to original block and emit loop header and 'while.true' instruction // // Loop header - prepare to loop until `count` iterations have been performed // [src, dst, count] - self.emit_push(0u32, span); // [i, src, dst, count] - self.emit(masm::Instruction::Dup3, span); // [count, i, src, dst, count] - self.emit_push(0u32, span); - self.emit( - masm::Instruction::U32Gt, // [count > 0, i, src, dst, count] - span, - ); + self.emit_counted_loop_header(masm::Instruction::Dup3, span); + // [count > 0, i, src, dst, count] self.current_block.push(masm::Op::While { span, body: masm::Block::new(span, body), From e3588f99bdec61d5637537892cdadb626c8581a7 Mon Sep 17 00:00:00 2001 From: Denys Zadorozhnyi Date: Tue, 17 Mar 2026 07:29:28 +0200 Subject: [PATCH 06/29] fix: limit cross-element `u16` memory access Only offset 3 spans two elements for a `u16` load/store. Route the other unaligned offsets through the existing single-element logic so we don't spuriously touch `addr + 1` at the end of memory. 
--- codegen/masm/src/emit/mem.rs | 140 +++++++++++++++++++++++++---------- 1 file changed, 100 insertions(+), 40 deletions(-) diff --git a/codegen/masm/src/emit/mem.rs b/codegen/masm/src/emit/mem.rs index ca8fc8f3b..2d8a336b4 100644 --- a/codegen/masm/src/emit/mem.rs +++ b/codegen/masm/src/emit/mem.rs @@ -261,30 +261,23 @@ impl OpEmitter<'_> { self.emit(masm::Instruction::U32And, span); return; } else { - if ty.size_in_bits() == 16 { - // A 16-bit value can start at byte offset 3, so it may span two adjacent - // elements. Load the full 32-bit unaligned window first, then keep the low - // 16 bits corresponding to the requested value. - self.push_native_ptr(imm, span); - self.raw_exec("::intrinsics::mem::load_sw", span); - self.emit_push(0xffffu32, span); - self.emit(masm::Instruction::U32And, span); - return; - } self.emit_push(imm.addr, span); self.emit_push(imm.offset, span); } } if ty.size_in_bits() == 16 { - // The dynamic case has the same boundary issue as the immediate case: offset 3 reaches - // into the next element. Reuse the unaligned 32-bit load to assemble the window. - self.raw_exec("::intrinsics::mem::load_sw", span); - self.emit_push(0xffffu32, span); - self.emit(masm::Instruction::U32And, span); + self.load_u16_dynamic(span); return; } + self.load_small_from_current_element(ty, span); + } + + /// Load a sub-word value which is fully contained in the current 32-bit element. + /// + /// Stack transition: `[addr, offset] -> [value]`. + fn load_small_from_current_element(&mut self, ty: &Type, span: SourceSpan) { // Stack: [element_addr, byte_offset] // First, load the aligned word containing our value @@ -323,6 +316,37 @@ impl OpEmitter<'_> { self.emit(masm::Instruction::U32And, span); } + /// Load a `u16` value from a dynamic native pointer tuple. + /// + /// Offsets `0..=2` fit within the current element and can use the regular shift/mask path. + /// Offset `3` spans the next element, so it must assemble a 32-bit unaligned window first. 
+ /// + /// Stack transition: `[addr, offset] -> [value]`. + fn load_u16_dynamic(&mut self, span: SourceSpan) { + self.emit_all( + [masm::Instruction::Dup1, masm::Instruction::EqImm(Felt::new(3).into())], + span, + ); + + let mut then_ops = Vec::default(); + let mut then_stack = OperandStack::new(self.context_rc()); + let mut then_emitter = OpEmitter::new(self.invoked, &mut then_ops, &mut then_stack); + then_emitter.raw_exec("::intrinsics::mem::load_sw", span); + then_emitter.emit_push(0xffffu32, span); + then_emitter.emit(masm::Instruction::U32And, span); + + let mut else_ops = Vec::default(); + let mut else_stack = OperandStack::new(self.context_rc()); + let mut else_emitter = OpEmitter::new(self.invoked, &mut else_ops, &mut else_stack); + else_emitter.load_small_from_current_element(&Type::U16, span); + + self.current_block.push(masm::Op::If { + span, + then_blk: masm::Block::new(span, then_ops), + else_blk: masm::Block::new(span, else_ops), + }); + } + fn load_double_word_imm(&mut self, ptr: NativePtr, span: SourceSpan) { if ptr.is_element_aligned() { self.emit_all( @@ -1095,34 +1119,20 @@ impl OpEmitter<'_> { } if type_size == 16 { - // A 16-bit store at byte offset 3 updates one byte in the current element and one in - // the next. Merge the new 16-bit payload into the 32-bit unaligned window and let the - // existing unaligned word store split it back across both elements. 
- self.emit_all( - [ - masm::Instruction::Dup1, // [offset, addr, offset, value] - masm::Instruction::Dup1, // [addr, offset, addr, offset, value] - ], - span, - ); - self.raw_exec("::intrinsics::mem::load_sw", span); // [window, addr, offset, value] - self.emit_push(0xffff0000u32, span); - self.emit(masm::Instruction::U32And, span); // [masked_window, addr, offset, value] - self.emit(masm::Instruction::MovUp3, span); // [value, masked_window, addr, offset] - self.emit_push(0xffffu32, span); - self.emit(masm::Instruction::U32And, span); // [value16, masked_window, addr, offset] - self.emit(masm::Instruction::U32Or, span); // [combined, addr, offset] - self.emit_all( - [ - masm::Instruction::Swap2, // [offset, addr, combined] - masm::Instruction::Swap1, // [addr, offset, combined] - ], - span, - ); - self.raw_exec("::intrinsics::mem::store_sw", span); + self.store_u16_dynamic(span); return; } + self.store_small_within_element( + u32::try_from(type_size).expect("invalid sub-word type size"), + span, + ); + } + + /// Store a sub-word value which is fully contained in the current 32-bit element. + /// + /// Stack transition: `[addr, offset, value] -> []`. + fn store_small_within_element(&mut self, type_size: u32, span: SourceSpan) { // Stack: [addr, offset, value] // Load the current aligned value self.emit_all( @@ -1171,6 +1181,56 @@ impl OpEmitter<'_> { ); } + /// Store a `u16` to a dynamic native pointer tuple. + /// + /// Offsets `0..=2` fit within the current element and can update that element in place. + /// Offset `3` spans into the next element and must use the unaligned 32-bit store intrinsic. + /// + /// Stack transition: `[addr, offset, value] -> []`. 
+ fn store_u16_dynamic(&mut self, span: SourceSpan) { + self.emit_all( + [masm::Instruction::Dup1, masm::Instruction::EqImm(Felt::new(3).into())], + span, + ); + + let mut then_ops = Vec::default(); + let mut then_stack = OperandStack::new(self.context_rc()); + let mut then_emitter = OpEmitter::new(self.invoked, &mut then_ops, &mut then_stack); + then_emitter.emit_all( + [ + masm::Instruction::Dup1, // [offset, addr, offset, value] + masm::Instruction::Dup1, // [addr, offset, addr, offset, value] + ], + span, + ); + then_emitter.raw_exec("::intrinsics::mem::load_sw", span); // [window, addr, offset, value] + then_emitter.emit_push(0xffff0000u32, span); + then_emitter.emit(masm::Instruction::U32And, span); // [masked_window, addr, offset, value] + then_emitter.emit(masm::Instruction::MovUp3, span); // [value, masked_window, addr, offset] + then_emitter.emit_push(0xffffu32, span); + then_emitter.emit(masm::Instruction::U32And, span); // [value16, masked_window, addr, offset] + then_emitter.emit(masm::Instruction::U32Or, span); // [combined, addr, offset] + then_emitter.emit_all( + [ + masm::Instruction::Swap2, // [offset, addr, combined] + masm::Instruction::Swap1, // [addr, offset, combined] + ], + span, + ); + then_emitter.raw_exec("::intrinsics::mem::store_sw", span); + + let mut else_ops = Vec::default(); + let mut else_stack = OperandStack::new(self.context_rc()); + let mut else_emitter = OpEmitter::new(self.invoked, &mut else_ops, &mut else_stack); + else_emitter.store_small_within_element(16, span); + + self.current_block.push(masm::Op::If { + span, + then_blk: masm::Block::new(span, then_ops), + else_blk: masm::Block::new(span, else_ops), + }); + } + /// Store a sub-word value using an immediate pointer /// /// This function stores sub-word values (u8, u16, etc.) to memory at a specific immediate address. 
From 744b707f1c9a484d1f3d3e3ec5a1dcbd6134979d Mon Sep 17 00:00:00 2001 From: Denys Zadorozhnyi Date: Tue, 17 Mar 2026 07:36:58 +0200 Subject: [PATCH 07/29] test: cover unaligned `u16` offsets and immediates Add regression cases for byte offsets 1 and 2 in the integration suite, and add emitter-level tests that exercise unaligned `load_imm` and `store_imm` for `u16` addresses. --- codegen/masm/src/emit/mod.rs | 36 +++++++ .../integration/src/codegen/intrinsics/mem.rs | 101 +++++++++++++----- 2 files changed, 109 insertions(+), 28 deletions(-) diff --git a/codegen/masm/src/emit/mod.rs b/codegen/masm/src/emit/mod.rs index 16e37a332..cc41d2bda 100644 --- a/codegen/masm/src/emit/mod.rs +++ b/codegen/masm/src/emit/mod.rs @@ -2137,6 +2137,42 @@ mod tests { assert_eq!(emitter.stack()[1], Type::U32); } + #[test] + fn op_emitter_unaligned_u16_load_imm_test() { + let mut block = Vec::default(); + let context = Rc::new(Context::default()); + let mut stack = OperandStack::new(context.clone()); + let mut invoked = BTreeSet::default(); + let mut emitter = OpEmitter::new(&mut invoked, &mut block, &mut stack); + + emitter.load_imm(130, Type::U16, SourceSpan::default()); + + assert_eq!(emitter.stack_len(), 1); + assert_eq!(emitter.stack()[0], Type::U16); + assert!( + block.iter().any(|op| matches!(op, Op::If { .. })), + "expected unaligned `u16` immediate load to emit the dynamic offset split" + ); + } + + #[test] + fn op_emitter_unaligned_u16_store_imm_test() { + let mut block = Vec::default(); + let context = Rc::new(Context::default()); + let mut stack = OperandStack::new(context.clone()); + let mut invoked = BTreeSet::default(); + let mut emitter = OpEmitter::new(&mut invoked, &mut block, &mut stack); + + emitter.push(Type::U16); + emitter.store_imm(130, SourceSpan::default()); + + assert_eq!(emitter.stack_len(), 0); + assert!( + block.iter().any(|op| matches!(op, Op::If { .. 
})), + "expected unaligned `u16` immediate store to emit the dynamic offset split" + ); + } + #[test] fn op_emitter_truncate_stack_drops_all_with_remainder() { let mut block = Vec::default(); diff --git a/tests/integration/src/codegen/intrinsics/mem.rs b/tests/integration/src/codegen/intrinsics/mem.rs index 81a40acdf..9d7021928 100644 --- a/tests/integration/src/codegen/intrinsics/mem.rs +++ b/tests/integration/src/codegen/intrinsics/mem.rs @@ -365,14 +365,12 @@ fn load_u16() { } } -/// Tests that loading a u16 from byte offset 3 correctly reconstructs the value across the next -/// element boundary. -#[test] -fn load_unaligned_u16() { +/// Runs a `u16` load test from the specified unaligned byte offset. +fn run_load_unaligned_u16(offset: u32) { setup::enable_compiler_instrumentation(); let write_to = 17 * 2u32.pow(16); - let read_from = write_to + 3; + let read_from = write_to + offset; let (package, context) = compile_test_module([Type::from(PointerType::new(Type::U16))], [Type::U16], |builder| { @@ -385,7 +383,9 @@ fn load_unaligned_u16() { let config = proptest::test_runner::Config::with_cases(10); let res = TestRunner::new(config).run(&any::(), move |value| { let expected = value.to_le_bytes(); - let initial_bytes = [0xff, 0xee, 0xdd, expected[0], expected[1], 0xbb, 0xaa, 0x99]; + let mut initial_bytes = [0xff, 0xee, 0xdd, 0xcc, 0xbb, 0xaa, 0x99, 0x88]; + initial_bytes[offset as usize] = expected[0]; + initial_bytes[offset as usize + 1] = expected[1]; let initializers = [Initializer::MemoryBytes { addr: write_to, bytes: &initial_bytes, @@ -414,6 +414,25 @@ fn load_unaligned_u16() { } } +/// Tests that loading a `u16` from byte offset 1 stays within the current element. +#[test] +fn load_unaligned_u16_offset_1() { + run_load_unaligned_u16(1); +} + +/// Tests that loading a `u16` from byte offset 2 stays within the current element. 
+#[test] +fn load_unaligned_u16_offset_2() { + run_load_unaligned_u16(2); +} + +/// Tests that loading a `u16` from byte offset 3 correctly reconstructs the value across the next +/// element boundary. +#[test] +fn load_unaligned_u16() { + run_load_unaligned_u16(3); +} + /// Tests the memory load intrinsic for loads of boolean (i.e. 1-bit) values #[test] fn load_bool() { @@ -619,14 +638,12 @@ fn store_u16() { } } -/// Tests that storing a u16 at byte offset 3 updates only the target bytes across the element -/// boundary. -#[test] -fn store_unaligned_u16() { +/// Runs a `u16` store test at the specified unaligned byte offset. +fn run_store_unaligned_u16(offset: u32) { setup::enable_compiler_instrumentation(); let write_to = 17 * 2u32.pow(16); - let store_to = write_to + 3; + let store_to = write_to + offset; let (package, context) = compile_test_module([Type::U16], [Type::U32], |builder| { let block = builder.current_block(); @@ -639,16 +656,13 @@ fn store_unaligned_u16() { builder.store(ptr, value, SourceSpan::default()).unwrap(); - let loaded = builder.load(ptr, SourceSpan::default()).unwrap(); - builder.assert_eq(loaded, value, SourceSpan::default()).unwrap(); - let result = builder.u32(1, SourceSpan::default()); builder.ret(Some(result), SourceSpan::default()).unwrap(); }); let config = proptest::test_runner::Config::with_cases(32); let res = TestRunner::new(config).run(&any::(), move |store_value| { - let initial_bytes = [0xff, 0xee, 0xdd, 0xcc, 0xbb, 0xaa]; + let initial_bytes = [0xff, 0xee, 0xdd, 0xcc, 0xbb, 0xaa, 0x99, 0x88]; let initializers = [Initializer::MemoryBytes { addr: write_to, bytes: &initial_bytes, @@ -658,6 +672,10 @@ fn store_unaligned_u16() { let output = eval_package::(&package, initializers, &args, context.session(), |trace| { let expected = store_value.to_le_bytes(); + let mut expected_bytes = initial_bytes; + expected_bytes[offset as usize] = expected[0]; + expected_bytes[offset as usize + 1] = expected[1]; + let word0 = 
trace.read_from_rust_memory::(write_to).ok_or_else(|| { TestCaseError::fail(format!("failed to read from byte address {write_to}")) })?; @@ -667,19 +685,27 @@ fn store_unaligned_u16() { write_to + 4 )) })?; - let stored0 = (word0 & 0xff) as u8; - let stored1 = ((word0 >> 8) & 0xff) as u8; - let stored2 = ((word0 >> 16) & 0xff) as u8; - let stored3 = ((word0 >> 24) & 0xff) as u8; - let stored4 = (word1 & 0xff) as u8; - let stored5 = ((word1 >> 8) & 0xff) as u8; - - prop_assert_eq!(stored0, 0xff); - prop_assert_eq!(stored1, 0xee); - prop_assert_eq!(stored2, 0xdd); - prop_assert_eq!(stored3, expected[0]); - prop_assert_eq!(stored4, expected[1]); - prop_assert_eq!(stored5, 0xaa); + let observed_bytes = [ + (word0 & 0xff) as u8, + ((word0 >> 8) & 0xff) as u8, + ((word0 >> 16) & 0xff) as u8, + ((word0 >> 24) & 0xff) as u8, + (word1 & 0xff) as u8, + ((word1 >> 8) & 0xff) as u8, + ((word1 >> 16) & 0xff) as u8, + ((word1 >> 24) & 0xff) as u8, + ]; + + for (index, (stored, expected_byte)) in + observed_bytes.into_iter().zip(expected_bytes).enumerate() + { + prop_assert_eq!( + stored, + expected_byte, + "unexpected byte at address {}", + write_to + index as u32 + ); + } Ok(()) })?; @@ -696,6 +722,25 @@ fn store_unaligned_u16() { } } +/// Tests that storing a `u16` at byte offset 1 updates only the target bytes. +#[test] +fn store_unaligned_u16_offset_1() { + run_store_unaligned_u16(1); +} + +/// Tests that storing a `u16` at byte offset 2 updates only the target bytes. +#[test] +fn store_unaligned_u16_offset_2() { + run_store_unaligned_u16(2); +} + +/// Tests that storing a `u16` at byte offset 3 updates only the target bytes across the element +/// boundary. 
+#[test] +fn store_unaligned_u16() { + run_store_unaligned_u16(3); +} + /// Tests that u8 stores only affect the targeted byte and don't corrupt surrounding memory #[test] fn store_u8() { From 125fa5cd030f9a8b79370983761a3ace8419a3db Mon Sep 17 00:00:00 2001 From: Denys Zadorozhnyi Date: Tue, 17 Mar 2026 07:38:41 +0200 Subject: [PATCH 08/29] test: add aligned byte `memcpy` coverage Cover the aligned byte-copy fast path and a case where only `count` is misaligned so the fast-path predicate is regression-tested as well. --- .../src/rust_masm_tests/instructions.rs | 119 ++++++++++++++++++ 1 file changed, 119 insertions(+) diff --git a/tests/integration/src/rust_masm_tests/instructions.rs b/tests/integration/src/rust_masm_tests/instructions.rs index 934577621..5842e0445 100644 --- a/tests/integration/src/rust_masm_tests/instructions.rs +++ b/tests/integration/src/rust_masm_tests/instructions.rs @@ -835,6 +835,125 @@ fn test_hmerge() { } } +#[test] +fn test_memory_copy_aligned_fast_path() { + let main_fn = r#"() -> Felt { + #[inline(never)] + fn do_copy(dst: &mut [u32; 12], src: &[u32; 16]) { + unsafe { + let src_ptr = (src.as_ptr() as *const u8).add(4); + let dst_ptr = dst.as_mut_ptr() as *mut u8; + core::ptr::copy_nonoverlapping(src_ptr, dst_ptr, 48); + } + } + + let mut src = [0u32; 16]; + let src_bytes = src.as_mut_ptr() as *mut u8; + let mut i = 0usize; + while i < 64 { + unsafe { *src_bytes.add(i) = i as u8; } + i += 1; + } + + let mut dst = [0u32; 12]; + do_copy(&mut dst, &src); + + let dst_bytes = dst.as_ptr() as *const u8; + let mut mismatches = 0u32; + let mut i = 0usize; + while i < 48 { + let observed = unsafe { *dst_bytes.add(i) }; + if observed != (i as u8).wrapping_add(4) { + mismatches += 1; + } + i += 1; + } + + Felt::from_u32(mismatches) + }"#; + + setup::enable_compiler_instrumentation(); + let config = WasmTranslationConfig::default(); + let mut test = CompilerTest::rust_fn_body_with_stdlib_sys( + "memory_copy_aligned_fast_path_u8s", + main_fn, + 
config, + [], + ); + + let package = test.compile_package(); + let args: [Felt; 0] = []; + + eval_package::(&package, [], &args, &test.session, |trace| { + let res: Felt = trace.parse_result().unwrap(); + assert_eq!(res, Felt::ZERO); + Ok(()) + }) + .unwrap(); +} + +#[test] +fn test_memory_copy_aligned_addresses_misaligned_count() { + let main_fn = r#"() -> Felt { + #[inline(never)] + fn do_copy(dst: &mut [u32; 12], src: &[u32; 16]) { + unsafe { + let src_ptr = (src.as_ptr() as *const u8).add(4); + let dst_ptr = dst.as_mut_ptr() as *mut u8; + core::ptr::copy_nonoverlapping(src_ptr, dst_ptr, 47); + } + } + + let mut src = [0u32; 16]; + let src_bytes = src.as_mut_ptr() as *mut u8; + let mut i = 0usize; + while i < 64 { + unsafe { *src_bytes.add(i) = i as u8; } + i += 1; + } + + let mut dst = [0xffff_ffffu32; 12]; + do_copy(&mut dst, &src); + + let dst_bytes = dst.as_ptr() as *const u8; + let mut mismatches = 0u32; + let mut i = 0usize; + while i < 48 { + let observed = unsafe { *dst_bytes.add(i) }; + let expected = if i < 47 { + (i as u8).wrapping_add(4) + } else { + 0xff + }; + if observed != expected { + mismatches += 1; + } + i += 1; + } + + Felt::from_u32(mismatches) + }"#; + + setup::enable_compiler_instrumentation(); + let config = WasmTranslationConfig::default(); + let mut test = CompilerTest::rust_fn_body_with_stdlib_sys( + "memory_copy_aligned_addresses_misaligned_count_u8s", + main_fn, + config, + [], + ); + + let package = test.compile_package(); + let args: [Felt; 0] = []; + + eval_package::(&package, [], &args, &test.session, |trace| { + let res: Felt = trace.parse_result().unwrap(); + assert_eq!(res, Felt::ZERO); + Ok(()) + }) + .unwrap(); +} + #[test] fn test_memory_copy_unaligned() { let main_fn = r#"() -> Felt { From 54b4f7436e737f47c91092eda1c85931e986148f Mon Sep 17 00:00:00 2001 From: Denys Zadorozhnyi Date: Tue, 17 Mar 2026 13:52:39 +0200 Subject: [PATCH 09/29] test: strengthen unaligned u16 emitter assertions --- codegen/masm/src/emit/mod.rs | 
44 +++++++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/codegen/masm/src/emit/mod.rs b/codegen/masm/src/emit/mod.rs index cc41d2bda..ca625d50b 100644 --- a/codegen/masm/src/emit/mod.rs +++ b/codegen/masm/src/emit/mod.rs @@ -748,6 +748,40 @@ mod tests { }; } + fn assert_unaligned_u16_split(block: &[Op], intrinsic: &str) { + assert!( + matches!( + block.get(block.len().saturating_sub(2)), + Some(Op::Inst(inst)) + if matches!(inst.inner(), masm::Instruction::EqImm(imm) if *imm == Felt::new(3)) + ), + "expected the `offset == 3` guard before the unaligned `u16` split" + ); + + let Some(Op::If { + then_blk, else_blk, .. + }) = block.last() + else { + panic!("expected the unaligned `u16` path to end in a split `if`"); + }; + + let execs = then_blk + .iter() + .filter_map(|op| match op { + Op::Inst(inst) => match inst.inner() { + masm::Instruction::Exec(target) => Some(target.to_string()), + _ => None, + }, + _ => None, + }) + .collect::>(); + assert!( + execs.iter().any(|target| target == intrinsic), + "expected then-branch to delegate to `{intrinsic}`, found execs: {execs:?}" + ); + assert!(!else_blk.is_empty(), "expected else-branch to preserve the within-element path"); + } + #[test] fn op_emitter_stack_manipulation_test() { let mut block = Vec::default(); @@ -2149,10 +2183,7 @@ mod tests { assert_eq!(emitter.stack_len(), 1); assert_eq!(emitter.stack()[0], Type::U16); - assert!( - block.iter().any(|op| matches!(op, Op::If { .. })), - "expected unaligned `u16` immediate load to emit the dynamic offset split" - ); + assert_unaligned_u16_split(&block, "::intrinsics::mem::load_sw"); } #[test] @@ -2167,10 +2198,7 @@ mod tests { emitter.store_imm(130, SourceSpan::default()); assert_eq!(emitter.stack_len(), 0); - assert!( - block.iter().any(|op| matches!(op, Op::If { .. 
})), - "expected unaligned `u16` immediate store to emit the dynamic offset split" - ); + assert_unaligned_u16_split(&block, "::intrinsics::mem::store_sw"); } #[test] From 8c82689b033ee4b0bda3dad458f9a3caf92c1c32 Mon Sep 17 00:00:00 2001 From: Denys Zadorozhnyi Date: Tue, 17 Mar 2026 14:00:45 +0200 Subject: [PATCH 10/29] test: cover signed unaligned i16 memory access --- .../integration/src/codegen/intrinsics/mem.rs | 171 ++++++++++++++++++ 1 file changed, 171 insertions(+) diff --git a/tests/integration/src/codegen/intrinsics/mem.rs b/tests/integration/src/codegen/intrinsics/mem.rs index 9d7021928..687ad95fa 100644 --- a/tests/integration/src/codegen/intrinsics/mem.rs +++ b/tests/integration/src/codegen/intrinsics/mem.rs @@ -433,6 +433,74 @@ fn load_unaligned_u16() { run_load_unaligned_u16(3); } +/// Runs an `i16` load test from the specified unaligned byte offset. +fn run_load_unaligned_i16(offset: u32) { + setup::enable_compiler_instrumentation(); + + let write_to = 17 * 2u32.pow(16); + let read_from = write_to + offset; + + let (package, context) = + compile_test_module([Type::from(PointerType::new(Type::I16))], [Type::I16], |builder| { + let block = builder.current_block(); + let ptr = block.borrow().arguments()[0] as ValueRef; + let loaded = builder.load(ptr, SourceSpan::default()).unwrap(); + builder.ret(Some(loaded), SourceSpan::default()).unwrap(); + }); + + let config = proptest::test_runner::Config::with_cases(10); + let res = TestRunner::new(config).run(&any::(), move |value| { + let expected = value.to_le_bytes(); + let mut initial_bytes = [0xff, 0xee, 0xdd, 0xcc, 0xbb, 0xaa, 0x99, 0x88]; + initial_bytes[offset as usize] = expected[0]; + initial_bytes[offset as usize + 1] = expected[1]; + let initializers = [Initializer::MemoryBytes { + addr: write_to, + bytes: &initial_bytes, + }]; + + let args = [Felt::new(read_from as u64)]; + let output = eval_package::( + &package, + initializers, + &args, + context.session(), + |_| Ok(()), + )?; + + 
prop_assert_eq!(output, value, "expected 0x{:x}; found 0x{:x}", value, output,); + + Ok(()) + }); + + match res { + Err(TestError::Fail(reason, value)) => { + panic!("FAILURE: {}\nMinimal failing case: {value:?}", reason.message()); + } + Ok(_) => (), + _ => panic!("Unexpected test result: {res:?}"), + } +} + +/// Tests that loading an `i16` from byte offset 1 stays within the current element. +#[test] +fn load_unaligned_i16_offset_1() { + run_load_unaligned_i16(1); +} + +/// Tests that loading an `i16` from byte offset 2 stays within the current element. +#[test] +fn load_unaligned_i16_offset_2() { + run_load_unaligned_i16(2); +} + +/// Tests that loading an `i16` from byte offset 3 correctly reconstructs the value across the next +/// element boundary. +#[test] +fn load_unaligned_i16() { + run_load_unaligned_i16(3); +} + /// Tests the memory load intrinsic for loads of boolean (i.e. 1-bit) values #[test] fn load_bool() { @@ -741,6 +809,109 @@ fn store_unaligned_u16() { run_store_unaligned_u16(3); } +/// Runs an `i16` store test at the specified unaligned byte offset. 
+fn run_store_unaligned_i16(offset: u32) {
+    setup::enable_compiler_instrumentation();
+
+    let write_to = 17 * 2u32.pow(16);
+    let store_to = write_to + offset;
+
+    let (package, context) = compile_test_module([Type::I16], [Type::U32], |builder| {
+        let block = builder.current_block();
+        let value = block.borrow().arguments()[0] as ValueRef;
+
+        let addr = builder.u32(store_to, SourceSpan::default());
+        let ptr = builder
+            .inttoptr(addr, Type::from(PointerType::new(Type::I16)), SourceSpan::default())
+            .unwrap();
+
+        builder.store(ptr, value, SourceSpan::default()).unwrap();
+
+        let result = builder.u32(1, SourceSpan::default());
+        builder.ret(Some(result), SourceSpan::default()).unwrap();
+    });
+
+    let config = proptest::test_runner::Config::with_cases(32);
+    let res = TestRunner::new(config).run(&any::<i16>(), move |store_value| {
+        let initial_bytes = [0xff, 0xee, 0xdd, 0xcc, 0xbb, 0xaa, 0x99, 0x88];
+        let initializers = [Initializer::MemoryBytes {
+            addr: write_to,
+            bytes: &initial_bytes,
+        }];
+
+        let args = [Felt::new(store_value as u16 as u64)];
+        let output =
+            eval_package::<u32>(&package, initializers, &args, context.session(), |trace| {
+                let expected = store_value.to_le_bytes();
+                let mut expected_bytes = initial_bytes;
+                expected_bytes[offset as usize] = expected[0];
+                expected_bytes[offset as usize + 1] = expected[1];
+
+                let word0 = trace.read_from_rust_memory::<u32>(write_to).ok_or_else(|| {
+                    TestCaseError::fail(format!("failed to read from byte address {write_to}"))
+                })?;
+                let word1 = trace.read_from_rust_memory::<u32>(write_to + 4).ok_or_else(|| {
+                    TestCaseError::fail(format!(
+                        "failed to read from byte address {}",
+                        write_to + 4
+                    ))
+                })?;
+                let observed_bytes = [
+                    (word0 & 0xff) as u8,
+                    ((word0 >> 8) & 0xff) as u8,
+                    ((word0 >> 16) & 0xff) as u8,
+                    ((word0 >> 24) & 0xff) as u8,
+                    (word1 & 0xff) as u8,
+                    ((word1 >> 8) & 0xff) as u8,
+                    ((word1 >> 16) & 0xff) as u8,
+                    ((word1 >> 24) & 0xff) as u8,
+                ];
+
+                for (index, (stored, expected_byte)) in
observed_bytes.into_iter().zip(expected_bytes).enumerate() + { + prop_assert_eq!( + stored, + expected_byte, + "unexpected byte at address {}", + write_to + index as u32 + ); + } + Ok(()) + })?; + + prop_assert_eq!(output, 1u32); + Ok(()) + }); + + match res { + Err(TestError::Fail(reason, value)) => { + panic!("FAILURE: {}\nMinimal failing case: {value:?}", reason.message()); + } + Ok(_) => (), + _ => panic!("Unexpected test result: {res:?}"), + } +} + +/// Tests that storing an `i16` at byte offset 1 updates only the target bytes. +#[test] +fn store_unaligned_i16_offset_1() { + run_store_unaligned_i16(1); +} + +/// Tests that storing an `i16` at byte offset 2 updates only the target bytes. +#[test] +fn store_unaligned_i16_offset_2() { + run_store_unaligned_i16(2); +} + +/// Tests that storing an `i16` at byte offset 3 updates only the target bytes across the element +/// boundary. +#[test] +fn store_unaligned_i16() { + run_store_unaligned_i16(3); +} + /// Tests that u8 stores only affect the targeted byte and don't corrupt surrounding memory #[test] fn store_u8() { From 83b71fa3c0d13f4df737e508ddc5c3fb8625a1ac Mon Sep 17 00:00:00 2001 From: Denys Zadorozhnyi Date: Tue, 17 Mar 2026 14:01:59 +0200 Subject: [PATCH 11/29] refactor: share split-element u16 branch emission --- codegen/masm/src/emit/mem.rs | 44 ++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/codegen/masm/src/emit/mem.rs b/codegen/masm/src/emit/mem.rs index 2d8a336b4..9c3c61279 100644 --- a/codegen/masm/src/emit/mem.rs +++ b/codegen/masm/src/emit/mem.rs @@ -56,6 +56,28 @@ impl OpEmitter<'_> { ); } + /// Emit the branch used by dynamic `u16` accesses to detect the cross-element case. + /// + /// The current stack must contain a native pointer tuple where the byte offset is one element + /// below the top of the stack, e.g. `[addr, offset]` for loads or `[addr, offset, value]` for + /// stores. 
+    fn emit_u16_split_offset_branch(
+        &mut self,
+        then_ops: Vec<masm::Op>,
+        else_ops: Vec<masm::Op>,
+        span: SourceSpan,
+    ) {
+        self.emit_all(
+            [masm::Instruction::Dup1, masm::Instruction::EqImm(Felt::new(3).into())],
+            span,
+        );
+        self.current_block.push(masm::Op::If {
+            span,
+            then_blk: masm::Block::new(span, then_ops),
+            else_blk: masm::Block::new(span, else_ops),
+        });
+    }
+
     /// Grow the heap (from the perspective of Wasm programs) by N pages, returning the previous
     /// size of the heap (in pages) if successful, or -1 if the heap could not be grown.
     pub fn mem_grow(&mut self, span: SourceSpan) {
@@ -323,11 +345,6 @@ impl OpEmitter<'_> {
     ///
     /// Stack transition: `[addr, offset] -> [value]`.
     fn load_u16_dynamic(&mut self, span: SourceSpan) {
-        self.emit_all(
-            [masm::Instruction::Dup1, masm::Instruction::EqImm(Felt::new(3).into())],
-            span,
-        );
-
         let mut then_ops = Vec::default();
         let mut then_stack = OperandStack::new(self.context_rc());
         let mut then_emitter = OpEmitter::new(self.invoked, &mut then_ops, &mut then_stack);
@@ -340,11 +357,7 @@ impl OpEmitter<'_> {
         let mut else_emitter = OpEmitter::new(self.invoked, &mut else_ops, &mut else_stack);
         else_emitter.load_small_from_current_element(&Type::U16, span);
 
-        self.current_block.push(masm::Op::If {
-            span,
-            then_blk: masm::Block::new(span, then_ops),
-            else_blk: masm::Block::new(span, else_ops),
-        });
+        self.emit_u16_split_offset_branch(then_ops, else_ops, span);
     }
 
     fn load_double_word_imm(&mut self, ptr: NativePtr, span: SourceSpan) {
@@ -1188,11 +1201,6 @@ impl OpEmitter<'_> {
     ///
     /// Stack transition: `[addr, offset, value] -> []`.
fn store_u16_dynamic(&mut self, span: SourceSpan) { - self.emit_all( - [masm::Instruction::Dup1, masm::Instruction::EqImm(Felt::new(3).into())], - span, - ); - let mut then_ops = Vec::default(); let mut then_stack = OperandStack::new(self.context_rc()); let mut then_emitter = OpEmitter::new(self.invoked, &mut then_ops, &mut then_stack); @@ -1224,11 +1232,7 @@ impl OpEmitter<'_> { let mut else_emitter = OpEmitter::new(self.invoked, &mut else_ops, &mut else_stack); else_emitter.store_small_within_element(16, span); - self.current_block.push(masm::Op::If { - span, - then_blk: masm::Block::new(span, then_ops), - else_blk: masm::Block::new(span, else_ops), - }); + self.emit_u16_split_offset_branch(then_ops, else_ops, span); } /// Store a sub-word value using an immediate pointer From da329740cc2c98f01f097f46e22d129eb4403c1e Mon Sep 17 00:00:00 2001 From: Denys Zadorozhnyi Date: Tue, 17 Mar 2026 14:02:39 +0200 Subject: [PATCH 12/29] docs: complete memcpy semantics comment --- codegen/masm/src/emit/mem.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/codegen/masm/src/emit/mem.rs b/codegen/masm/src/emit/mem.rs index 9c3c61279..cbd6da037 100644 --- a/codegen/masm/src/emit/mem.rs +++ b/codegen/masm/src/emit/mem.rs @@ -761,7 +761,11 @@ impl OpEmitter<'_> { /// /// The semantics of this instruction are as follows: /// - /// * The `` + /// * `count` is expressed in units of the pointee type, not bytes + /// * the effective byte length is `count * size_of(*src)` + /// * `count == 0` leaves memory unchanged and performs no copy + /// * source and destination pointers are interpreted in the address space described by their + /// pointer type pub fn memcpy(&mut self, span: SourceSpan) { let src = self.stack.pop().expect("operand stack is empty"); let dst = self.stack.pop().expect("operand stack is empty"); From 35c1cd58c9f1c1572e0ab93c38e3986b6f680d0a Mon Sep 17 00:00:00 2001 From: Denys Zadorozhnyi Date: Tue, 17 Mar 2026 14:03:47 +0200 Subject: [PATCH 
13/29] fix: assert word alignment in memcpy word fast paths --- codegen/masm/src/emit/mem.rs | 59 +++++++++++++++++++++++++----------- 1 file changed, 41 insertions(+), 18 deletions(-) diff --git a/codegen/masm/src/emit/mem.rs b/codegen/masm/src/emit/mem.rs index cbd6da037..42746eff8 100644 --- a/codegen/masm/src/emit/mem.rs +++ b/codegen/masm/src/emit/mem.rs @@ -78,6 +78,21 @@ impl OpEmitter<'_> { }); } + /// Convert the byte pointer on top of the stack to a word-aligned element address. + /// + /// This traps unless the input byte address is aligned to a 16-byte Miden word boundary. + fn emit_word_aligned_element_addr_from_byte_ptr(&mut self, span: SourceSpan) { + self.emit_all( + [ + masm::Instruction::U32DivModImm(16.into()), + masm::Instruction::Assertz, + masm::Instruction::U32OverflowingMulImm(4.into()), + masm::Instruction::Assertz, + ], + span, + ); + } + /// Grow the heap (from the perspective of Wasm programs) by N pages, returning the previous /// size of the heap (in pages) if successful, or -1 if the heap could not be grown. pub fn mem_grow(&mut self, span: SourceSpan) { @@ -859,17 +874,21 @@ impl OpEmitter<'_> { // We have to convert byte addresses to element addresses self.emit_all( [ - // Convert `src` to element address, and assert aligned to an element address - // - // TODO: We should probably also assert that the address is word-aligned, but - // that is going to happen anyway. That said, the closer to the source the - // better for debugging. 
- masm::Instruction::U32DivModImm(4.into()), - masm::Instruction::Assertz, + // Convert `src` to a word-aligned element address + ], + span, + ); + self.emit_word_aligned_element_addr_from_byte_ptr(span); + self.emit_all( + [ // Convert `dst` to an element address the same way masm::Instruction::Swap1, - masm::Instruction::U32DivModImm(4.into()), - masm::Instruction::Assertz, + ], + span, + ); + self.emit_word_aligned_element_addr_from_byte_ptr(span); + self.emit_all( + [ // Swap with `count` to get us into the correct ordering: [count, src, dst] masm::Instruction::Swap2, ], @@ -885,17 +904,21 @@ impl OpEmitter<'_> { let factor = size / 16; self.emit_all( [ - // Convert `src` to element address, and assert aligned to an element address - // - // TODO: We should probably also assert that the address is word-aligned, but - // that is going to happen anyway. That said, the closer to the source the - // better for debugging. - masm::Instruction::U32DivModImm(4.into()), - masm::Instruction::Assertz, + // Convert `src` to a word-aligned element address + ], + span, + ); + self.emit_word_aligned_element_addr_from_byte_ptr(span); + self.emit_all( + [ // Convert `dst` to an element address the same way masm::Instruction::Swap1, - masm::Instruction::U32DivModImm(4.into()), - masm::Instruction::Assertz, + ], + span, + ); + self.emit_word_aligned_element_addr_from_byte_ptr(span); + self.emit_all( + [ // Swap with `count` to get us into the correct ordering: [count, src, dst] masm::Instruction::Swap2, // Compute the corrected count From efbc2761e288a52875e9b01eb691caf87c3ef482 Mon Sep 17 00:00:00 2001 From: Denys Zadorozhnyi Date: Wed, 18 Mar 2026 11:21:05 +0200 Subject: [PATCH 14/29] Clarify 16-bit memcpy helper naming --- codegen/masm/src/emit/mem.rs | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/codegen/masm/src/emit/mem.rs b/codegen/masm/src/emit/mem.rs index 42746eff8..f2710f61c 100644 --- a/codegen/masm/src/emit/mem.rs +++ 
b/codegen/masm/src/emit/mem.rs @@ -56,12 +56,12 @@ impl OpEmitter<'_> { ); } - /// Emit the branch used by dynamic `u16` accesses to detect the cross-element case. + /// Emit the branch used by dynamic 16-bit accesses to detect the cross-element case. /// /// The current stack must contain a native pointer tuple where the byte offset is one element /// below the top of the stack, e.g. `[addr, offset]` for loads or `[addr, offset, value]` for /// stores. - fn emit_u16_split_offset_branch( + fn emit_16bit_split_offset_branch( &mut self, then_ops: Vec, else_ops: Vec, @@ -304,7 +304,7 @@ impl OpEmitter<'_> { } if ty.size_in_bits() == 16 { - self.load_u16_dynamic(span); + self.load_16bit_dynamic(span); return; } @@ -353,13 +353,16 @@ impl OpEmitter<'_> { self.emit(masm::Instruction::U32And, span); } - /// Load a `u16` value from a dynamic native pointer tuple. + /// Load a 16-bit value from a dynamic native pointer tuple. /// /// Offsets `0..=2` fit within the current element and can use the regular shift/mask path. /// Offset `3` spans the next element, so it must assemble a 32-bit unaligned window first. /// + /// This helper moves raw 16-bit payloads only. Signedness is preserved by the caller's typed + /// result rather than by the load sequence itself. + /// /// Stack transition: `[addr, offset] -> [value]`. 
- fn load_u16_dynamic(&mut self, span: SourceSpan) { + fn load_16bit_dynamic(&mut self, span: SourceSpan) { let mut then_ops = Vec::default(); let mut then_stack = OperandStack::new(self.context_rc()); let mut then_emitter = OpEmitter::new(self.invoked, &mut then_ops, &mut then_stack); @@ -372,7 +375,7 @@ impl OpEmitter<'_> { let mut else_emitter = OpEmitter::new(self.invoked, &mut else_ops, &mut else_stack); else_emitter.load_small_from_current_element(&Type::U16, span); - self.emit_u16_split_offset_branch(then_ops, else_ops, span); + self.emit_16bit_split_offset_branch(then_ops, else_ops, span); } fn load_double_word_imm(&mut self, ptr: NativePtr, span: SourceSpan) { @@ -1159,7 +1162,7 @@ impl OpEmitter<'_> { } if type_size == 16 { - self.store_u16_dynamic(span); + self.store_16bit_dynamic(span); return; } @@ -1221,13 +1224,16 @@ impl OpEmitter<'_> { ); } - /// Store a `u16` to a dynamic native pointer tuple. + /// Store a 16-bit value to a dynamic native pointer tuple. /// /// Offsets `0..=2` fit within the current element and can update that element in place. /// Offset `3` spans into the next element and must use the unaligned 32-bit store intrinsic. /// + /// This helper moves raw 16-bit payloads only. Signedness is preserved by the caller's typed + /// value rather than by the store sequence itself. + /// /// Stack transition: `[addr, offset, value] -> []`. 
- fn store_u16_dynamic(&mut self, span: SourceSpan) { + fn store_16bit_dynamic(&mut self, span: SourceSpan) { let mut then_ops = Vec::default(); let mut then_stack = OperandStack::new(self.context_rc()); let mut then_emitter = OpEmitter::new(self.invoked, &mut then_ops, &mut then_stack); @@ -1259,7 +1265,7 @@ impl OpEmitter<'_> { let mut else_emitter = OpEmitter::new(self.invoked, &mut else_ops, &mut else_stack); else_emitter.store_small_within_element(16, span); - self.emit_u16_split_offset_branch(then_ops, else_ops, span); + self.emit_16bit_split_offset_branch(then_ops, else_ops, span); } /// Store a sub-word value using an immediate pointer From 9ca44998c43b3e26431cf29d6e7c2ed424295517 Mon Sep 17 00:00:00 2001 From: Denys Zadorozhnyi Date: Wed, 18 Mar 2026 11:23:10 +0200 Subject: [PATCH 15/29] Clean up memcpy word-copy fast path flow --- codegen/masm/src/emit/mem.rs | 42 +++++++----------------------------- 1 file changed, 8 insertions(+), 34 deletions(-) diff --git a/codegen/masm/src/emit/mem.rs b/codegen/masm/src/emit/mem.rs index f2710f61c..b73c01015 100644 --- a/codegen/masm/src/emit/mem.rs +++ b/codegen/masm/src/emit/mem.rs @@ -874,29 +874,13 @@ impl OpEmitter<'_> { } // Word-sized values have an optimized intrinsic we can lean on 16 => { - // We have to convert byte addresses to element addresses - self.emit_all( - [ - // Convert `src` to a word-aligned element address - ], - span, - ); + // Convert `src` to a word-aligned element address. self.emit_word_aligned_element_addr_from_byte_ptr(span); - self.emit_all( - [ - // Convert `dst` to an element address the same way - masm::Instruction::Swap1, - ], - span, - ); + // Convert `dst` to an element address the same way. 
+ self.emit(masm::Instruction::Swap1, span); self.emit_word_aligned_element_addr_from_byte_ptr(span); - self.emit_all( - [ - // Swap with `count` to get us into the correct ordering: [count, src, dst] - masm::Instruction::Swap2, - ], - span, - ); + // Swap with `count` to get us into the correct ordering: [count, src, dst]. + self.emit(masm::Instruction::Swap2, span); self.raw_exec("::miden::core::mem::memcopy_words", span); return; } @@ -905,20 +889,10 @@ impl OpEmitter<'_> { // multiplying `count` by the number of words in each value size if size > 16 && size.is_multiple_of(16) => { let factor = size / 16; - self.emit_all( - [ - // Convert `src` to a word-aligned element address - ], - span, - ); + // Convert `src` to a word-aligned element address. self.emit_word_aligned_element_addr_from_byte_ptr(span); - self.emit_all( - [ - // Convert `dst` to an element address the same way - masm::Instruction::Swap1, - ], - span, - ); + // Convert `dst` to an element address the same way. + self.emit(masm::Instruction::Swap1, span); self.emit_word_aligned_element_addr_from_byte_ptr(span); self.emit_all( [ From 6639266361345323a44d528f92fbf4197837f595 Mon Sep 17 00:00:00 2001 From: Denys Zadorozhnyi Date: Wed, 18 Mar 2026 11:25:56 +0200 Subject: [PATCH 16/29] Deduplicate unaligned 16-bit intrinsic tests --- .../integration/src/codegen/intrinsics/mem.rs | 576 ++++++++---------- 1 file changed, 254 insertions(+), 322 deletions(-) diff --git a/tests/integration/src/codegen/intrinsics/mem.rs b/tests/integration/src/codegen/intrinsics/mem.rs index 687ad95fa..2c1929ef8 100644 --- a/tests/integration/src/codegen/intrinsics/mem.rs +++ b/tests/integration/src/codegen/intrinsics/mem.rs @@ -365,141 +365,120 @@ fn load_u16() { } } -/// Runs a `u16` load test from the specified unaligned byte offset. 
-fn run_load_unaligned_u16(offset: u32) { - setup::enable_compiler_instrumentation(); - - let write_to = 17 * 2u32.pow(16); - let read_from = write_to + offset; - - let (package, context) = - compile_test_module([Type::from(PointerType::new(Type::U16))], [Type::U16], |builder| { - let block = builder.current_block(); - let ptr = block.borrow().arguments()[0] as ValueRef; - let loaded = builder.load(ptr, SourceSpan::default()).unwrap(); - builder.ret(Some(loaded), SourceSpan::default()).unwrap(); - }); - - let config = proptest::test_runner::Config::with_cases(10); - let res = TestRunner::new(config).run(&any::(), move |value| { - let expected = value.to_le_bytes(); - let mut initial_bytes = [0xff, 0xee, 0xdd, 0xcc, 0xbb, 0xaa, 0x99, 0x88]; - initial_bytes[offset as usize] = expected[0]; - initial_bytes[offset as usize + 1] = expected[1]; - let initializers = [Initializer::MemoryBytes { - addr: write_to, - bytes: &initial_bytes, - }]; - - let args = [Felt::new(read_from as u64)]; - let output = eval_package::( - &package, - initializers, - &args, - context.session(), - |_| Ok(()), - )?; - - prop_assert_eq!(output, value, "expected 0x{:x}; found 0x{:x}", value, output,); +macro_rules! define_unaligned_16bit_load_tests { + ( + $run_fn:ident, + $rust_ty:ty, + $hir_ty:expr, + $offset_1_test:ident, + $offset_2_test:ident, + $offset_3_test:ident + ) => { + #[doc = concat!( + "Runs a `", + stringify!($rust_ty), + "` load test from the specified unaligned byte offset." 
+ )] + fn $run_fn(offset: u32) { + setup::enable_compiler_instrumentation(); + + let write_to = 17 * 2u32.pow(16); + let read_from = write_to + offset; + + let (package, context) = compile_test_module( + [Type::from(PointerType::new($hir_ty))], + [$hir_ty], + |builder| { + let block = builder.current_block(); + let ptr = block.borrow().arguments()[0] as ValueRef; + let loaded = builder.load(ptr, SourceSpan::default()).unwrap(); + builder.ret(Some(loaded), SourceSpan::default()).unwrap(); + }, + ); + + let config = proptest::test_runner::Config::with_cases(10); + let res = TestRunner::new(config).run(&any::<$rust_ty>(), move |value| { + let expected = value.to_le_bytes(); + let mut initial_bytes = [0xff, 0xee, 0xdd, 0xcc, 0xbb, 0xaa, 0x99, 0x88]; + initial_bytes[offset as usize] = expected[0]; + initial_bytes[offset as usize + 1] = expected[1]; + let initializers = [Initializer::MemoryBytes { + addr: write_to, + bytes: &initial_bytes, + }]; + + let args = [Felt::new(read_from as u64)]; + let output = eval_package::<$rust_ty, _, _>( + &package, + initializers, + &args, + context.session(), + |_| Ok(()), + )?; + + prop_assert_eq!(output, value, "expected 0x{:x}; found 0x{:x}", value, output,); - Ok(()) - }); + Ok(()) + }); - match res { - Err(TestError::Fail(reason, value)) => { - panic!("FAILURE: {}\nMinimal failing case: {value:?}", reason.message()); + match res { + Err(TestError::Fail(reason, value)) => { + panic!("FAILURE: {}\nMinimal failing case: {value:?}", reason.message()); + } + Ok(_) => (), + _ => panic!("Unexpected test result: {res:?}"), + } } - Ok(_) => (), - _ => panic!("Unexpected test result: {res:?}"), - } -} - -/// Tests that loading a `u16` from byte offset 1 stays within the current element. -#[test] -fn load_unaligned_u16_offset_1() { - run_load_unaligned_u16(1); -} - -/// Tests that loading a `u16` from byte offset 2 stays within the current element. 
-#[test] -fn load_unaligned_u16_offset_2() { - run_load_unaligned_u16(2); -} - -/// Tests that loading a `u16` from byte offset 3 correctly reconstructs the value across the next -/// element boundary. -#[test] -fn load_unaligned_u16() { - run_load_unaligned_u16(3); -} - -/// Runs an `i16` load test from the specified unaligned byte offset. -fn run_load_unaligned_i16(offset: u32) { - setup::enable_compiler_instrumentation(); - - let write_to = 17 * 2u32.pow(16); - let read_from = write_to + offset; - - let (package, context) = - compile_test_module([Type::from(PointerType::new(Type::I16))], [Type::I16], |builder| { - let block = builder.current_block(); - let ptr = block.borrow().arguments()[0] as ValueRef; - let loaded = builder.load(ptr, SourceSpan::default()).unwrap(); - builder.ret(Some(loaded), SourceSpan::default()).unwrap(); - }); - - let config = proptest::test_runner::Config::with_cases(10); - let res = TestRunner::new(config).run(&any::(), move |value| { - let expected = value.to_le_bytes(); - let mut initial_bytes = [0xff, 0xee, 0xdd, 0xcc, 0xbb, 0xaa, 0x99, 0x88]; - initial_bytes[offset as usize] = expected[0]; - initial_bytes[offset as usize + 1] = expected[1]; - let initializers = [Initializer::MemoryBytes { - addr: write_to, - bytes: &initial_bytes, - }]; - - let args = [Felt::new(read_from as u64)]; - let output = eval_package::( - &package, - initializers, - &args, - context.session(), - |_| Ok(()), - )?; - prop_assert_eq!(output, value, "expected 0x{:x}; found 0x{:x}", value, output,); - - Ok(()) - }); - - match res { - Err(TestError::Fail(reason, value)) => { - panic!("FAILURE: {}\nMinimal failing case: {value:?}", reason.message()); + #[doc = concat!( + "Tests that loading a `", + stringify!($rust_ty), + "` from byte offset 1 stays within the current element." 
+ )] + #[test] + fn $offset_1_test() { + $run_fn(1); } - Ok(_) => (), - _ => panic!("Unexpected test result: {res:?}"), - } -} -/// Tests that loading an `i16` from byte offset 1 stays within the current element. -#[test] -fn load_unaligned_i16_offset_1() { - run_load_unaligned_i16(1); -} + #[doc = concat!( + "Tests that loading a `", + stringify!($rust_ty), + "` from byte offset 2 stays within the current element." + )] + #[test] + fn $offset_2_test() { + $run_fn(2); + } -/// Tests that loading an `i16` from byte offset 2 stays within the current element. -#[test] -fn load_unaligned_i16_offset_2() { - run_load_unaligned_i16(2); + #[doc = concat!( + "Tests that loading a `", + stringify!($rust_ty), + "` from byte offset 3 correctly reconstructs the value across the next element \ + boundary." + )] + #[test] + fn $offset_3_test() { + $run_fn(3); + } + }; } -/// Tests that loading an `i16` from byte offset 3 correctly reconstructs the value across the next -/// element boundary. -#[test] -fn load_unaligned_i16() { - run_load_unaligned_i16(3); -} +define_unaligned_16bit_load_tests!( + run_load_unaligned_u16, + u16, + Type::U16, + load_unaligned_u16_offset_1, + load_unaligned_u16_offset_2, + load_unaligned_u16 +); +define_unaligned_16bit_load_tests!( + run_load_unaligned_i16, + i16, + Type::I16, + load_unaligned_i16_offset_1, + load_unaligned_i16_offset_2, + load_unaligned_i16 +); /// Tests the memory load intrinsic for loads of boolean (i.e. 1-bit) values #[test] @@ -706,211 +685,164 @@ fn store_u16() { } } -/// Runs a `u16` store test at the specified unaligned byte offset. 
-fn run_store_unaligned_u16(offset: u32) { - setup::enable_compiler_instrumentation(); - - let write_to = 17 * 2u32.pow(16); - let store_to = write_to + offset; - - let (package, context) = compile_test_module([Type::U16], [Type::U32], |builder| { - let block = builder.current_block(); - let value = block.borrow().arguments()[0] as ValueRef; - - let addr = builder.u32(store_to, SourceSpan::default()); - let ptr = builder - .inttoptr(addr, Type::from(PointerType::new(Type::U16)), SourceSpan::default()) - .unwrap(); - - builder.store(ptr, value, SourceSpan::default()).unwrap(); - - let result = builder.u32(1, SourceSpan::default()); - builder.ret(Some(result), SourceSpan::default()).unwrap(); - }); - - let config = proptest::test_runner::Config::with_cases(32); - let res = TestRunner::new(config).run(&any::(), move |store_value| { - let initial_bytes = [0xff, 0xee, 0xdd, 0xcc, 0xbb, 0xaa, 0x99, 0x88]; - let initializers = [Initializer::MemoryBytes { - addr: write_to, - bytes: &initial_bytes, - }]; - - let args = [Felt::new(store_value as u64)]; - let output = - eval_package::(&package, initializers, &args, context.session(), |trace| { - let expected = store_value.to_le_bytes(); - let mut expected_bytes = initial_bytes; - expected_bytes[offset as usize] = expected[0]; - expected_bytes[offset as usize + 1] = expected[1]; - - let word0 = trace.read_from_rust_memory::(write_to).ok_or_else(|| { - TestCaseError::fail(format!("failed to read from byte address {write_to}")) - })?; - let word1 = trace.read_from_rust_memory::(write_to + 4).ok_or_else(|| { - TestCaseError::fail(format!( - "failed to read from byte address {}", - write_to + 4 - )) - })?; - let observed_bytes = [ - (word0 & 0xff) as u8, - ((word0 >> 8) & 0xff) as u8, - ((word0 >> 16) & 0xff) as u8, - ((word0 >> 24) & 0xff) as u8, - (word1 & 0xff) as u8, - ((word1 >> 8) & 0xff) as u8, - ((word1 >> 16) & 0xff) as u8, - ((word1 >> 24) & 0xff) as u8, - ]; - - for (index, (stored, expected_byte)) in - 
observed_bytes.into_iter().zip(expected_bytes).enumerate() - { - prop_assert_eq!( - stored, - expected_byte, - "unexpected byte at address {}", - write_to + index as u32 - ); - } +macro_rules! define_unaligned_16bit_store_tests { + ( + $run_fn:ident, + $rust_ty:ty, + $hir_ty:expr, + $to_felt:expr, + $offset_1_test:ident, + $offset_2_test:ident, + $offset_3_test:ident + ) => { + #[doc = concat!( + "Runs a `", + stringify!($rust_ty), + "` store test at the specified unaligned byte offset." + )] + fn $run_fn(offset: u32) { + setup::enable_compiler_instrumentation(); + + let write_to = 17 * 2u32.pow(16); + let store_to = write_to + offset; + + let (package, context) = compile_test_module([$hir_ty], [Type::U32], |builder| { + let block = builder.current_block(); + let value = block.borrow().arguments()[0] as ValueRef; + + let addr = builder.u32(store_to, SourceSpan::default()); + let ptr = builder + .inttoptr(addr, Type::from(PointerType::new($hir_ty)), SourceSpan::default()) + .unwrap(); + + builder.store(ptr, value, SourceSpan::default()).unwrap(); + + let result = builder.u32(1, SourceSpan::default()); + builder.ret(Some(result), SourceSpan::default()).unwrap(); + }); + + let config = proptest::test_runner::Config::with_cases(32); + let res = TestRunner::new(config).run(&any::<$rust_ty>(), move |store_value| { + let initial_bytes = [0xff, 0xee, 0xdd, 0xcc, 0xbb, 0xaa, 0x99, 0x88]; + let initializers = [Initializer::MemoryBytes { + addr: write_to, + bytes: &initial_bytes, + }]; + + let args = [($to_felt)(store_value)]; + let output = eval_package::( + &package, + initializers, + &args, + context.session(), + |trace| { + let expected = store_value.to_le_bytes(); + let mut expected_bytes = initial_bytes; + expected_bytes[offset as usize] = expected[0]; + expected_bytes[offset as usize + 1] = expected[1]; + + let word0 = + trace.read_from_rust_memory::(write_to).ok_or_else(|| { + TestCaseError::fail(format!( + "failed to read from byte address {write_to}" + )) + })?; + 
let word1 = + trace.read_from_rust_memory::(write_to + 4).ok_or_else(|| { + TestCaseError::fail(format!( + "failed to read from byte address {}", + write_to + 4 + )) + })?; + let observed_bytes = [ + (word0 & 0xff) as u8, + ((word0 >> 8) & 0xff) as u8, + ((word0 >> 16) & 0xff) as u8, + ((word0 >> 24) & 0xff) as u8, + (word1 & 0xff) as u8, + ((word1 >> 8) & 0xff) as u8, + ((word1 >> 16) & 0xff) as u8, + ((word1 >> 24) & 0xff) as u8, + ]; + + for (index, (stored, expected_byte)) in + observed_bytes.into_iter().zip(expected_bytes).enumerate() + { + prop_assert_eq!( + stored, + expected_byte, + "unexpected byte at address {}", + write_to + index as u32 + ); + } + + Ok(()) + }, + )?; + + prop_assert_eq!(output, 1u32); Ok(()) - })?; - - prop_assert_eq!(output, 1u32); - Ok(()) - }); - - match res { - Err(TestError::Fail(reason, value)) => { - panic!("FAILURE: {}\nMinimal failing case: {value:?}", reason.message()); - } - Ok(_) => (), - _ => panic!("Unexpected test result: {res:?}"), - } -} - -/// Tests that storing a `u16` at byte offset 1 updates only the target bytes. -#[test] -fn store_unaligned_u16_offset_1() { - run_store_unaligned_u16(1); -} - -/// Tests that storing a `u16` at byte offset 2 updates only the target bytes. -#[test] -fn store_unaligned_u16_offset_2() { - run_store_unaligned_u16(2); -} - -/// Tests that storing a `u16` at byte offset 3 updates only the target bytes across the element -/// boundary. -#[test] -fn store_unaligned_u16() { - run_store_unaligned_u16(3); -} - -/// Runs an `i16` store test at the specified unaligned byte offset. 
-fn run_store_unaligned_i16(offset: u32) { - setup::enable_compiler_instrumentation(); + }); - let write_to = 17 * 2u32.pow(16); - let store_to = write_to + offset; - - let (package, context) = compile_test_module([Type::I16], [Type::U32], |builder| { - let block = builder.current_block(); - let value = block.borrow().arguments()[0] as ValueRef; - - let addr = builder.u32(store_to, SourceSpan::default()); - let ptr = builder - .inttoptr(addr, Type::from(PointerType::new(Type::I16)), SourceSpan::default()) - .unwrap(); - - builder.store(ptr, value, SourceSpan::default()).unwrap(); - - let result = builder.u32(1, SourceSpan::default()); - builder.ret(Some(result), SourceSpan::default()).unwrap(); - }); - - let config = proptest::test_runner::Config::with_cases(32); - let res = TestRunner::new(config).run(&any::(), move |store_value| { - let initial_bytes = [0xff, 0xee, 0xdd, 0xcc, 0xbb, 0xaa, 0x99, 0x88]; - let initializers = [Initializer::MemoryBytes { - addr: write_to, - bytes: &initial_bytes, - }]; - - let args = [Felt::new(store_value as u16 as u64)]; - let output = - eval_package::(&package, initializers, &args, context.session(), |trace| { - let expected = store_value.to_le_bytes(); - let mut expected_bytes = initial_bytes; - expected_bytes[offset as usize] = expected[0]; - expected_bytes[offset as usize + 1] = expected[1]; - - let word0 = trace.read_from_rust_memory::(write_to).ok_or_else(|| { - TestCaseError::fail(format!("failed to read from byte address {write_to}")) - })?; - let word1 = trace.read_from_rust_memory::(write_to + 4).ok_or_else(|| { - TestCaseError::fail(format!( - "failed to read from byte address {}", - write_to + 4 - )) - })?; - let observed_bytes = [ - (word0 & 0xff) as u8, - ((word0 >> 8) & 0xff) as u8, - ((word0 >> 16) & 0xff) as u8, - ((word0 >> 24) & 0xff) as u8, - (word1 & 0xff) as u8, - ((word1 >> 8) & 0xff) as u8, - ((word1 >> 16) & 0xff) as u8, - ((word1 >> 24) & 0xff) as u8, - ]; - - for (index, (stored, expected_byte)) in - 
observed_bytes.into_iter().zip(expected_bytes).enumerate() - { - prop_assert_eq!( - stored, - expected_byte, - "unexpected byte at address {}", - write_to + index as u32 - ); + match res { + Err(TestError::Fail(reason, value)) => { + panic!("FAILURE: {}\nMinimal failing case: {value:?}", reason.message()); } - Ok(()) - })?; - - prop_assert_eq!(output, 1u32); - Ok(()) - }); + Ok(_) => (), + _ => panic!("Unexpected test result: {res:?}"), + } + } - match res { - Err(TestError::Fail(reason, value)) => { - panic!("FAILURE: {}\nMinimal failing case: {value:?}", reason.message()); + #[doc = concat!( + "Tests that storing a `", + stringify!($rust_ty), + "` at byte offset 1 updates only the target bytes." + )] + #[test] + fn $offset_1_test() { + $run_fn(1); } - Ok(_) => (), - _ => panic!("Unexpected test result: {res:?}"), - } -} -/// Tests that storing an `i16` at byte offset 1 updates only the target bytes. -#[test] -fn store_unaligned_i16_offset_1() { - run_store_unaligned_i16(1); -} + #[doc = concat!( + "Tests that storing a `", + stringify!($rust_ty), + "` at byte offset 2 updates only the target bytes." + )] + #[test] + fn $offset_2_test() { + $run_fn(2); + } -/// Tests that storing an `i16` at byte offset 2 updates only the target bytes. -#[test] -fn store_unaligned_i16_offset_2() { - run_store_unaligned_i16(2); + #[doc = concat!( + "Tests that storing a `", + stringify!($rust_ty), + "` at byte offset 3 updates only the target bytes across the element boundary." + )] + #[test] + fn $offset_3_test() { + $run_fn(3); + } + }; } -/// Tests that storing an `i16` at byte offset 3 updates only the target bytes across the element -/// boundary. 
-#[test] -fn store_unaligned_i16() { - run_store_unaligned_i16(3); -} +define_unaligned_16bit_store_tests!( + run_store_unaligned_u16, + u16, + Type::U16, + |store_value: u16| Felt::new(store_value as u64), + store_unaligned_u16_offset_1, + store_unaligned_u16_offset_2, + store_unaligned_u16 +); +define_unaligned_16bit_store_tests!( + run_store_unaligned_i16, + i16, + Type::I16, + |store_value: i16| Felt::new(store_value as u16 as u64), + store_unaligned_i16_offset_1, + store_unaligned_i16_offset_2, + store_unaligned_i16 +); /// Tests that u8 stores only affect the targeted byte and don't corrupt surrounding memory #[test] From 1826cee1c20fbdaa22df70b7d73f639927f38405 Mon Sep 17 00:00:00 2001 From: Denys Zadorozhnyi Date: Wed, 18 Mar 2026 11:27:17 +0200 Subject: [PATCH 17/29] Cover memcpy word-copy fast paths --- .../src/rust_masm_tests/instructions.rs | 114 ++++++++++++++++++ 1 file changed, 114 insertions(+) diff --git a/tests/integration/src/rust_masm_tests/instructions.rs b/tests/integration/src/rust_masm_tests/instructions.rs index 5842e0445..511377088 100644 --- a/tests/integration/src/rust_masm_tests/instructions.rs +++ b/tests/integration/src/rust_masm_tests/instructions.rs @@ -892,6 +892,120 @@ fn test_memory_copy_aligned_fast_path() { .unwrap(); } +#[test] +fn test_memory_copy_u128_fast_path() { + let main_fn = r#"() -> Felt { + #[inline(never)] + fn do_copy(dst: &mut [u128; 2], src: &[u128; 3]) { + unsafe { + let src_ptr = src.as_ptr().add(1); + let dst_ptr = dst.as_mut_ptr(); + core::ptr::copy_nonoverlapping(src_ptr, dst_ptr, 2); + } + } + + let src = [ + 0x00112233445566778899aabbccddeeff_u128, + 0x102132435465768798a9bacbdcedfe0f_u128, + 0xfedcba98765432100123456789abcdef_u128, + ]; + let mut dst = [0u128; 2]; + do_copy(&mut dst, &src); + + let expected = [src[1], src[2]]; + let mut mismatches = 0u32; + let mut i = 0usize; + while i < 2 { + if dst[i] != expected[i] { + mismatches += 1; + } + i += 1; + } + + Felt::from_u32(mismatches) + }"#; + + 
setup::enable_compiler_instrumentation(); + let config = WasmTranslationConfig::default(); + let mut test = CompilerTest::rust_fn_body_with_stdlib_sys( + "memory_copy_u128_fast_path", + main_fn, + config, + [], + ); + + let package = test.compile_package(); + let args: [Felt; 0] = []; + + eval_package::(&package, [], &args, &test.session, |trace| { + let res: Felt = trace.parse_result().unwrap(); + assert_eq!(res, Felt::ZERO); + Ok(()) + }) + .unwrap(); +} + +#[test] +fn test_memory_copy_multiword_fast_path() { + let main_fn = r#"() -> Felt { + struct Chunk([u128; 2]); + + #[inline(never)] + fn do_copy(dst: &mut [Chunk; 1], src: &[Chunk; 2]) { + unsafe { + let src_ptr = src.as_ptr().add(1); + let dst_ptr = dst.as_mut_ptr(); + core::ptr::copy_nonoverlapping(src_ptr, dst_ptr, 1); + } + } + + let src = [ + Chunk([ + 0x00112233445566778899aabbccddeeff_u128, + 0x112233445566778899aabbccddeeff00_u128, + ]), + Chunk([ + 0xaabbccddeeff00112233445566778899_u128, + 0xffeeddccbbaa99887766554433221100_u128, + ]), + ]; + let mut dst = [Chunk([0u128; 2])]; + do_copy(&mut dst, &src); + + let expected = &src[1].0; + let observed = &dst[0].0; + let mut mismatches = 0u32; + let mut i = 0usize; + while i < 2 { + if observed[i] != expected[i] { + mismatches += 1; + } + i += 1; + } + + Felt::from_u32(mismatches) + }"#; + + setup::enable_compiler_instrumentation(); + let config = WasmTranslationConfig::default(); + let mut test = CompilerTest::rust_fn_body_with_stdlib_sys( + "memory_copy_multiword_fast_path", + main_fn, + config, + [], + ); + + let package = test.compile_package(); + let args: [Felt; 0] = []; + + eval_package::(&package, [], &args, &test.session, |trace| { + let res: Felt = trace.parse_result().unwrap(); + assert_eq!(res, Felt::ZERO); + Ok(()) + }) + .unwrap(); +} + #[test] fn test_memory_copy_aligned_addresses_misaligned_count() { let main_fn = r#"() -> Felt { From 5a786e81f6abeb6fa174b89f0e8cd246a3fec432 Mon Sep 17 00:00:00 2001 From: Denys Zadorozhnyi Date: Wed, 18 
Mar 2026 11:27:46 +0200 Subject: [PATCH 18/29] Document cross-element 16-bit store window --- codegen/masm/src/emit/mem.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/codegen/masm/src/emit/mem.rs b/codegen/masm/src/emit/mem.rs index b73c01015..392af8e3e 100644 --- a/codegen/masm/src/emit/mem.rs +++ b/codegen/masm/src/emit/mem.rs @@ -1219,6 +1219,8 @@ impl OpEmitter<'_> { span, ); then_emitter.raw_exec("::intrinsics::mem::load_sw", span); // [window, addr, offset, value] + // Preserve the upper half of the unaligned 32-bit window so only the two addressed bytes + // are replaced before delegating the write-back to `store_sw`. then_emitter.emit_push(0xffff0000u32, span); then_emitter.emit(masm::Instruction::U32And, span); // [masked_window, addr, offset, value] then_emitter.emit(masm::Instruction::MovUp3, span); // [value, masked_window, addr, offset] From c69fb7dbd5c8a1d845c89fb24df75a7e0c3ea92c Mon Sep 17 00:00:00 2001 From: Denys Zadorozhnyi Date: Wed, 18 Mar 2026 11:28:38 +0200 Subject: [PATCH 19/29] Restrict memcpy word fast paths to byte pointers --- codegen/masm/src/emit/mem.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/codegen/masm/src/emit/mem.rs b/codegen/masm/src/emit/mem.rs index 392af8e3e..5f5d03b4a 100644 --- a/codegen/masm/src/emit/mem.rs +++ b/codegen/masm/src/emit/mem.rs @@ -784,6 +784,8 @@ impl OpEmitter<'_> { /// * `count == 0` leaves memory unchanged and performs no copy /// * source and destination pointers are interpreted in the address space described by their /// pointer type + /// * optimized word-copy fast paths are only used for byte-addressable pointers; native + /// pointers fall back to the generic loop pub fn memcpy(&mut self, span: SourceSpan) { let src = self.stack.pop().expect("operand stack is empty"); let dst = self.stack.pop().expect("operand stack is empty"); @@ -794,6 +796,10 @@ impl OpEmitter<'_> { assert_eq!(ty, dst.ty(), "expected src and dst operands to have the same type"); let 
value_ty = ty.pointee().unwrap().clone(); let value_size = u32::try_from(value_ty.size_in_bytes()).expect("invalid value size"); + let is_byte_pointer = match &ty { + Type::Ptr(ptr_ty) => ptr_ty.is_byte_pointer(), + _ => unreachable!("memcpy expects pointer operands"), + }; // Use optimized intrinsics when available match value_size { @@ -873,7 +879,7 @@ impl OpEmitter<'_> { return; } // Word-sized values have an optimized intrinsic we can lean on - 16 => { + 16 if is_byte_pointer => { // Convert `src` to a word-aligned element address. self.emit_word_aligned_element_addr_from_byte_ptr(span); // Convert `dst` to an element address the same way. @@ -887,7 +893,7 @@ impl OpEmitter<'_> { // Values which can be broken up into word-sized chunks can piggy-back on the // intrinsic for word-sized values, but we have to compute a new `count` by // multiplying `count` by the number of words in each value - size if size > 16 && size.is_multiple_of(16) => { + size if is_byte_pointer && size > 16 && size.is_multiple_of(16) => { let factor = size / 16; // Convert `src` to a word-aligned element address. self.emit_word_aligned_element_addr_from_byte_ptr(span); From eac758a75aba4e238d8498f09d7408385eb0fde3 Mon Sep 17 00:00:00 2001 From: Denys Zadorozhnyi Date: Wed, 18 Mar 2026 11:30:18 +0200 Subject: [PATCH 20/29] Introduce raw MASM branch block builder --- codegen/masm/src/emit/mem.rs | 177 ++++++++++++++++++----------------- 1 file changed, 93 insertions(+), 84 deletions(-) diff --git a/codegen/masm/src/emit/mem.rs b/codegen/masm/src/emit/mem.rs index 5f5d03b4a..bd52fcff4 100644 --- a/codegen/masm/src/emit/mem.rs +++ b/codegen/masm/src/emit/mem.rs @@ -63,8 +63,8 @@ impl OpEmitter<'_> { /// stores. 
fn emit_16bit_split_offset_branch( &mut self, - then_ops: Vec, - else_ops: Vec, + then_blk: masm::Block, + else_blk: masm::Block, span: SourceSpan, ) { self.emit_all( @@ -73,8 +73,8 @@ impl OpEmitter<'_> { ); self.current_block.push(masm::Op::If { span, - then_blk: masm::Block::new(span, then_ops), - else_blk: masm::Block::new(span, else_ops), + then_blk, + else_blk, }); } @@ -93,6 +93,22 @@ impl OpEmitter<'_> { ); } + /// Build a raw MASM block whose stack protocol is managed by the caller. + /// + /// This is used for branch bodies which operate on a known stack shape from the enclosing + /// emitter, but which do not need to synchronize typed operand-stack state back to it. + fn build_raw_block( + &mut self, + span: SourceSpan, + emit: impl FnOnce(&mut OpEmitter<'_>), + ) -> masm::Block { + let mut ops = Vec::default(); + let mut stack = OperandStack::new(self.context_rc()); + let mut emitter = OpEmitter::new(self.invoked, &mut ops, &mut stack); + emit(&mut emitter); + masm::Block::new(span, ops) + } + /// Grow the heap (from the perspective of Wasm programs) by N pages, returning the previous /// size of the heap (in pages) if successful, or -1 if the heap could not be grown. pub fn mem_grow(&mut self, span: SourceSpan) { @@ -363,19 +379,17 @@ impl OpEmitter<'_> { /// /// Stack transition: `[addr, offset] -> [value]`. 
fn load_16bit_dynamic(&mut self, span: SourceSpan) { - let mut then_ops = Vec::default(); - let mut then_stack = OperandStack::new(self.context_rc()); - let mut then_emitter = OpEmitter::new(self.invoked, &mut then_ops, &mut then_stack); - then_emitter.raw_exec("::intrinsics::mem::load_sw", span); - then_emitter.emit_push(0xffffu32, span); - then_emitter.emit(masm::Instruction::U32And, span); - - let mut else_ops = Vec::default(); - let mut else_stack = OperandStack::new(self.context_rc()); - let mut else_emitter = OpEmitter::new(self.invoked, &mut else_ops, &mut else_stack); - else_emitter.load_small_from_current_element(&Type::U16, span); - - self.emit_16bit_split_offset_branch(then_ops, else_ops, span); + let then_blk = self.build_raw_block(span, |then_emitter| { + then_emitter.raw_exec("::intrinsics::mem::load_sw", span); + then_emitter.emit_push(0xffffu32, span); + then_emitter.emit(masm::Instruction::U32And, span); + }); + + let else_blk = self.build_raw_block(span, |else_emitter| { + else_emitter.load_small_from_current_element(&Type::U16, span); + }); + + self.emit_16bit_split_offset_branch(then_blk, else_blk, span); } fn load_double_word_imm(&mut self, ptr: NativePtr, span: SourceSpan) { @@ -837,44 +851,41 @@ impl OpEmitter<'_> { ); // then: convert byte addresses/count to element units and delegate to core - let mut then_ops = Vec::default(); - let mut then_stack = OperandStack::new(self.context_rc()); - let mut then_emitter = OpEmitter::new(self.invoked, &mut then_ops, &mut then_stack); - then_emitter.emit_all( - [ - // Convert `src` to element address - masm::Instruction::U32DivModImm(4.into()), - masm::Instruction::Assertz, - // Convert `dst` to an element address - masm::Instruction::Swap1, - masm::Instruction::U32DivModImm(4.into()), - masm::Instruction::Assertz, - // Bring `count` to top to convert to element count - masm::Instruction::Swap2, - masm::Instruction::U32DivModImm(4.into()), - masm::Instruction::Assertz, - ], - span, - ); - 
then_emitter.raw_exec("::miden::core::mem::memcopy_elements", span); - - // else: fall back to the generic implementation - let mut else_ops = Vec::default(); - let mut else_stack = OperandStack::new(self.context_rc()); - let mut else_emitter = OpEmitter::new(self.invoked, &mut else_ops, &mut else_stack); - else_emitter.emit_memcpy_fallback_loop( - src.clone(), - dst.clone(), - count.clone(), - value_ty.clone(), - value_size, - span, - ); + let then_blk = self.build_raw_block(span, |then_emitter| { + then_emitter.emit_all( + [ + // Convert `src` to element address + masm::Instruction::U32DivModImm(4.into()), + masm::Instruction::Assertz, + // Convert `dst` to an element address + masm::Instruction::Swap1, + masm::Instruction::U32DivModImm(4.into()), + masm::Instruction::Assertz, + // Bring `count` to top to convert to element count + masm::Instruction::Swap2, + masm::Instruction::U32DivModImm(4.into()), + masm::Instruction::Assertz, + ], + span, + ); + then_emitter.raw_exec("::miden::core::mem::memcopy_elements", span); + }); + + let else_blk = self.build_raw_block(span, |else_emitter| { + else_emitter.emit_memcpy_fallback_loop( + src.clone(), + dst.clone(), + count.clone(), + value_ty.clone(), + value_size, + span, + ); + }); self.current_block.push(masm::Op::If { span, - then_blk: masm::Block::new(span, then_ops), - else_blk: masm::Block::new(span, else_ops), + then_blk, + else_blk, }); return; } @@ -1214,40 +1225,38 @@ impl OpEmitter<'_> { /// /// Stack transition: `[addr, offset, value] -> []`. 
fn store_16bit_dynamic(&mut self, span: SourceSpan) { - let mut then_ops = Vec::default(); - let mut then_stack = OperandStack::new(self.context_rc()); - let mut then_emitter = OpEmitter::new(self.invoked, &mut then_ops, &mut then_stack); - then_emitter.emit_all( - [ - masm::Instruction::Dup1, // [offset, addr, offset, value] - masm::Instruction::Dup1, // [addr, offset, addr, offset, value] - ], - span, - ); - then_emitter.raw_exec("::intrinsics::mem::load_sw", span); // [window, addr, offset, value] - // Preserve the upper half of the unaligned 32-bit window so only the two addressed bytes - // are replaced before delegating the write-back to `store_sw`. - then_emitter.emit_push(0xffff0000u32, span); - then_emitter.emit(masm::Instruction::U32And, span); // [masked_window, addr, offset, value] - then_emitter.emit(masm::Instruction::MovUp3, span); // [value, masked_window, addr, offset] - then_emitter.emit_push(0xffffu32, span); - then_emitter.emit(masm::Instruction::U32And, span); // [value16, masked_window, addr, offset] - then_emitter.emit(masm::Instruction::U32Or, span); // [combined, addr, offset] - then_emitter.emit_all( - [ - masm::Instruction::Swap2, // [offset, addr, combined] - masm::Instruction::Swap1, // [addr, offset, combined] - ], - span, - ); - then_emitter.raw_exec("::intrinsics::mem::store_sw", span); + let then_blk = self.build_raw_block(span, |then_emitter| { + then_emitter.emit_all( + [ + masm::Instruction::Dup1, // [offset, addr, offset, value] + masm::Instruction::Dup1, // [addr, offset, addr, offset, value] + ], + span, + ); + then_emitter.raw_exec("::intrinsics::mem::load_sw", span); // [window, addr, offset, value] + // Preserve the upper half of the unaligned 32-bit window so only the two addressed + // bytes are replaced before delegating the write-back to `store_sw`. 
+ then_emitter.emit_push(0xffff0000u32, span); + then_emitter.emit(masm::Instruction::U32And, span); // [masked_window, addr, offset, value] + then_emitter.emit(masm::Instruction::MovUp3, span); // [value, masked_window, addr, offset] + then_emitter.emit_push(0xffffu32, span); + then_emitter.emit(masm::Instruction::U32And, span); // [value16, masked_window, addr, offset] + then_emitter.emit(masm::Instruction::U32Or, span); // [combined, addr, offset] + then_emitter.emit_all( + [ + masm::Instruction::Swap2, // [offset, addr, combined] + masm::Instruction::Swap1, // [addr, offset, combined] + ], + span, + ); + then_emitter.raw_exec("::intrinsics::mem::store_sw", span); + }); - let mut else_ops = Vec::default(); - let mut else_stack = OperandStack::new(self.context_rc()); - let mut else_emitter = OpEmitter::new(self.invoked, &mut else_ops, &mut else_stack); - else_emitter.store_small_within_element(16, span); + let else_blk = self.build_raw_block(span, |else_emitter| { + else_emitter.store_small_within_element(16, span); + }); - self.emit_16bit_split_offset_branch(then_ops, else_ops, span); + self.emit_16bit_split_offset_branch(then_blk, else_blk, span); } /// Store a sub-word value using an immediate pointer From a0ef2e6ac2abe86378e7cc28fd850885bdb556ec Mon Sep 17 00:00:00 2001 From: Denys Zadorozhnyi Date: Wed, 18 Mar 2026 11:30:43 +0200 Subject: [PATCH 21/29] Rename aligned byte memcpy regression --- tests/integration/src/rust_masm_tests/instructions.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/src/rust_masm_tests/instructions.rs b/tests/integration/src/rust_masm_tests/instructions.rs index 511377088..653cdeebd 100644 --- a/tests/integration/src/rust_masm_tests/instructions.rs +++ b/tests/integration/src/rust_masm_tests/instructions.rs @@ -836,7 +836,7 @@ fn test_hmerge() { } #[test] -fn test_memory_copy_aligned_fast_path() { +fn test_memory_copy_aligned_bytes() { let main_fn = r#"() -> Felt { #[inline(never)] fn 
do_copy(dst: &mut [u32; 12], src: &[u32; 16]) { @@ -875,7 +875,7 @@ fn test_memory_copy_aligned_fast_path() { setup::enable_compiler_instrumentation(); let config = WasmTranslationConfig::default(); let mut test = CompilerTest::rust_fn_body_with_stdlib_sys( - "memory_copy_aligned_fast_path_u8s", + "memory_copy_aligned_bytes_u8s", main_fn, config, [], From 9362cfe5c9f5a61e8698ef695ad64cb1c8e117a6 Mon Sep 17 00:00:00 2001 From: Denys Zadorozhnyi Date: Wed, 18 Mar 2026 11:32:11 +0200 Subject: [PATCH 22/29] Add signed 16-bit immediate path tests --- codegen/masm/src/emit/mod.rs | 40 +++++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/codegen/masm/src/emit/mod.rs b/codegen/masm/src/emit/mod.rs index ca625d50b..2fba21a5b 100644 --- a/codegen/masm/src/emit/mod.rs +++ b/codegen/masm/src/emit/mod.rs @@ -748,21 +748,21 @@ mod tests { }; } - fn assert_unaligned_u16_split(block: &[Op], intrinsic: &str) { + fn assert_unaligned_16bit_split(block: &[Op], intrinsic: &str) { assert!( matches!( block.get(block.len().saturating_sub(2)), Some(Op::Inst(inst)) if matches!(inst.inner(), masm::Instruction::EqImm(imm) if *imm == Felt::new(3)) ), - "expected the `offset == 3` guard before the unaligned `u16` split" + "expected the `offset == 3` guard before the unaligned 16-bit split" ); let Some(Op::If { then_blk, else_blk, .. 
}) = block.last() else { - panic!("expected the unaligned `u16` path to end in a split `if`"); + panic!("expected the unaligned 16-bit path to end in a split `if`"); }; let execs = then_blk @@ -2183,7 +2183,22 @@ mod tests { assert_eq!(emitter.stack_len(), 1); assert_eq!(emitter.stack()[0], Type::U16); - assert_unaligned_u16_split(&block, "::intrinsics::mem::load_sw"); + assert_unaligned_16bit_split(&block, "::intrinsics::mem::load_sw"); + } + + #[test] + fn op_emitter_unaligned_i16_load_imm_test() { + let mut block = Vec::default(); + let context = Rc::new(Context::default()); + let mut stack = OperandStack::new(context.clone()); + let mut invoked = BTreeSet::default(); + let mut emitter = OpEmitter::new(&mut invoked, &mut block, &mut stack); + + emitter.load_imm(130, Type::I16, SourceSpan::default()); + + assert_eq!(emitter.stack_len(), 1); + assert_eq!(emitter.stack()[0], Type::I16); + assert_unaligned_16bit_split(&block, "::intrinsics::mem::load_sw"); } #[test] @@ -2198,7 +2213,22 @@ mod tests { emitter.store_imm(130, SourceSpan::default()); assert_eq!(emitter.stack_len(), 0); - assert_unaligned_u16_split(&block, "::intrinsics::mem::store_sw"); + assert_unaligned_16bit_split(&block, "::intrinsics::mem::store_sw"); + } + + #[test] + fn op_emitter_unaligned_i16_store_imm_test() { + let mut block = Vec::default(); + let context = Rc::new(Context::default()); + let mut stack = OperandStack::new(context.clone()); + let mut invoked = BTreeSet::default(); + let mut emitter = OpEmitter::new(&mut invoked, &mut block, &mut stack); + + emitter.push(Type::I16); + emitter.store_imm(130, SourceSpan::default()); + + assert_eq!(emitter.stack_len(), 0); + assert_unaligned_16bit_split(&block, "::intrinsics::mem::store_sw"); } #[test] From a38a75b49d6c0892e8ac6fd9c9cf55c876f9f851 Mon Sep 17 00:00:00 2001 From: Denys Zadorozhnyi Date: Wed, 18 Mar 2026 13:47:50 +0200 Subject: [PATCH 23/29] Refactor u16 memory access lowering --- codegen/masm/intrinsics/mem.masm | 32 +++++++++++++ 
codegen/masm/src/emit/mem.rs | 81 +++----------------------------- codegen/masm/src/emit/mod.rs | 32 ++++--------- 3 files changed, 46 insertions(+), 99 deletions(-) diff --git a/codegen/masm/intrinsics/mem.masm b/codegen/masm/intrinsics/mem.masm index 31b0fd47d..a9309b486 100644 --- a/codegen/masm/intrinsics/mem.masm +++ b/codegen/masm/intrinsics/mem.masm @@ -193,6 +193,18 @@ pub proc load_sw # [addr, offset] end end +# Load a 16-bit integer from the given native pointer tuple. +# +# A native pointer tuple consists of an element address where the data begins, and a byte offset, +# which is the offset of the first byte, in the 32-bit representation of that element. +# +# Stack transition: [addr, offset] -> [value] +pub proc load_u16 + exec.load_sw + push.65535 + u32and +end + # This handles emitting code that handles aligning an unaligned 64-bit value which is split across # three elements. # @@ -442,6 +454,26 @@ pub proc store_sw # [addr, offset, value] end end +# Store a 16-bit integer to the given native pointer tuple. +# +# A native pointer tuple consists of an element address where the data begins, and a byte offset, +# which is the offset of the first byte, in the 32-bit representation of that element. +# +# Stack transition: [addr, offset, value] -> [] +pub proc store_u16 + # Load the current 32-bit window at the destination, keep its upper half, then overwrite the + # target two bytes before delegating the write-back to `store_sw`. + dup.1 dup.1 exec.load_sw # [window, addr, offset, value] + push.4294901760 # 0xffff0000 + u32and # [masked_window, addr, offset, value] + movup.3 # [value, masked_window, addr, offset] + push.65535 + u32and # [value16, masked_window, addr, offset] + u32or # [combined, addr, offset] + swap.2 swap.1 # [addr, offset, combined] + exec.store_sw +end + # Store two 32-bit words to the given native pointer tuple. 
# # A native pointer tuple consists of an element address where the data begins, and a byte offset, diff --git a/codegen/masm/src/emit/mem.rs b/codegen/masm/src/emit/mem.rs index bd52fcff4..8a12fd3c3 100644 --- a/codegen/masm/src/emit/mem.rs +++ b/codegen/masm/src/emit/mem.rs @@ -56,28 +56,6 @@ impl OpEmitter<'_> { ); } - /// Emit the branch used by dynamic 16-bit accesses to detect the cross-element case. - /// - /// The current stack must contain a native pointer tuple where the byte offset is one element - /// below the top of the stack, e.g. `[addr, offset]` for loads or `[addr, offset, value]` for - /// stores. - fn emit_16bit_split_offset_branch( - &mut self, - then_blk: masm::Block, - else_blk: masm::Block, - span: SourceSpan, - ) { - self.emit_all( - [masm::Instruction::Dup1, masm::Instruction::EqImm(Felt::new(3).into())], - span, - ); - self.current_block.push(masm::Op::If { - span, - then_blk, - else_blk, - }); - } - /// Convert the byte pointer on top of the stack to a word-aligned element address. /// /// This traps unless the input byte address is aligned to a 16-byte Miden word boundary. @@ -371,25 +349,12 @@ impl OpEmitter<'_> { /// Load a 16-bit value from a dynamic native pointer tuple. /// - /// Offsets `0..=2` fit within the current element and can use the regular shift/mask path. - /// Offset `3` spans the next element, so it must assemble a 32-bit unaligned window first. - /// - /// This helper moves raw 16-bit payloads only. Signedness is preserved by the caller's typed - /// result rather than by the load sequence itself. + /// This delegates to a dedicated intrinsic which owns the complete stack protocol for both the + /// within-element and cross-element cases. /// /// Stack transition: `[addr, offset] -> [value]`. 
fn load_16bit_dynamic(&mut self, span: SourceSpan) { - let then_blk = self.build_raw_block(span, |then_emitter| { - then_emitter.raw_exec("::intrinsics::mem::load_sw", span); - then_emitter.emit_push(0xffffu32, span); - then_emitter.emit(masm::Instruction::U32And, span); - }); - - let else_blk = self.build_raw_block(span, |else_emitter| { - else_emitter.load_small_from_current_element(&Type::U16, span); - }); - - self.emit_16bit_split_offset_branch(then_blk, else_blk, span); + self.raw_exec("::intrinsics::mem::load_u16", span); } fn load_double_word_imm(&mut self, ptr: NativePtr, span: SourceSpan) { @@ -1217,46 +1182,12 @@ impl OpEmitter<'_> { /// Store a 16-bit value to a dynamic native pointer tuple. /// - /// Offsets `0..=2` fit within the current element and can update that element in place. - /// Offset `3` spans into the next element and must use the unaligned 32-bit store intrinsic. - /// - /// This helper moves raw 16-bit payloads only. Signedness is preserved by the caller's typed - /// value rather than by the store sequence itself. + /// This delegates to a dedicated intrinsic which owns the complete stack protocol for both the + /// within-element and cross-element cases. /// /// Stack transition: `[addr, offset, value] -> []`. fn store_16bit_dynamic(&mut self, span: SourceSpan) { - let then_blk = self.build_raw_block(span, |then_emitter| { - then_emitter.emit_all( - [ - masm::Instruction::Dup1, // [offset, addr, offset, value] - masm::Instruction::Dup1, // [addr, offset, addr, offset, value] - ], - span, - ); - then_emitter.raw_exec("::intrinsics::mem::load_sw", span); // [window, addr, offset, value] - // Preserve the upper half of the unaligned 32-bit window so only the two addressed - // bytes are replaced before delegating the write-back to `store_sw`. 
- then_emitter.emit_push(0xffff0000u32, span); - then_emitter.emit(masm::Instruction::U32And, span); // [masked_window, addr, offset, value] - then_emitter.emit(masm::Instruction::MovUp3, span); // [value, masked_window, addr, offset] - then_emitter.emit_push(0xffffu32, span); - then_emitter.emit(masm::Instruction::U32And, span); // [value16, masked_window, addr, offset] - then_emitter.emit(masm::Instruction::U32Or, span); // [combined, addr, offset] - then_emitter.emit_all( - [ - masm::Instruction::Swap2, // [offset, addr, combined] - masm::Instruction::Swap1, // [addr, offset, combined] - ], - span, - ); - then_emitter.raw_exec("::intrinsics::mem::store_sw", span); - }); - - let else_blk = self.build_raw_block(span, |else_emitter| { - else_emitter.store_small_within_element(16, span); - }); - - self.emit_16bit_split_offset_branch(then_blk, else_blk, span); + self.raw_exec("::intrinsics::mem::store_u16", span); } /// Store a sub-word value using an immediate pointer diff --git a/codegen/masm/src/emit/mod.rs b/codegen/masm/src/emit/mod.rs index 2fba21a5b..e2149650f 100644 --- a/codegen/masm/src/emit/mod.rs +++ b/codegen/masm/src/emit/mod.rs @@ -748,24 +748,9 @@ mod tests { }; } - fn assert_unaligned_16bit_split(block: &[Op], intrinsic: &str) { - assert!( - matches!( - block.get(block.len().saturating_sub(2)), - Some(Op::Inst(inst)) - if matches!(inst.inner(), masm::Instruction::EqImm(imm) if *imm == Felt::new(3)) - ), - "expected the `offset == 3` guard before the unaligned 16-bit split" - ); - - let Some(Op::If { - then_blk, else_blk, .. - }) = block.last() - else { - panic!("expected the unaligned 16-bit path to end in a split `if`"); - }; - - let execs = then_blk + /// Assert that the emitted block ends by delegating to the dedicated 16-bit memory intrinsic. 
+ fn assert_unaligned_16bit_intrinsic(block: &[Op], intrinsic: &str) { + let execs = block .iter() .filter_map(|op| match op { Op::Inst(inst) => match inst.inner() { @@ -777,9 +762,8 @@ mod tests { .collect::>(); assert!( execs.iter().any(|target| target == intrinsic), - "expected then-branch to delegate to `{intrinsic}`, found execs: {execs:?}" + "expected block to delegate to `{intrinsic}`, found execs: {execs:?}" ); - assert!(!else_blk.is_empty(), "expected else-branch to preserve the within-element path"); } #[test] @@ -2183,7 +2167,7 @@ mod tests { assert_eq!(emitter.stack_len(), 1); assert_eq!(emitter.stack()[0], Type::U16); - assert_unaligned_16bit_split(&block, "::intrinsics::mem::load_sw"); + assert_unaligned_16bit_intrinsic(&block, "::intrinsics::mem::load_u16"); } #[test] @@ -2198,7 +2182,7 @@ mod tests { assert_eq!(emitter.stack_len(), 1); assert_eq!(emitter.stack()[0], Type::I16); - assert_unaligned_16bit_split(&block, "::intrinsics::mem::load_sw"); + assert_unaligned_16bit_intrinsic(&block, "::intrinsics::mem::load_u16"); } #[test] @@ -2213,7 +2197,7 @@ mod tests { emitter.store_imm(130, SourceSpan::default()); assert_eq!(emitter.stack_len(), 0); - assert_unaligned_16bit_split(&block, "::intrinsics::mem::store_sw"); + assert_unaligned_16bit_intrinsic(&block, "::intrinsics::mem::store_u16"); } #[test] @@ -2228,7 +2212,7 @@ mod tests { emitter.store_imm(130, SourceSpan::default()); assert_eq!(emitter.stack_len(), 0); - assert_unaligned_16bit_split(&block, "::intrinsics::mem::store_sw"); + assert_unaligned_16bit_intrinsic(&block, "::intrinsics::mem::store_u16"); } #[test] From 73694015d8f83f59bb33245a89c75928a1ff6726 Mon Sep 17 00:00:00 2001 From: Denys Zadorozhnyi Date: Wed, 18 Mar 2026 13:49:43 +0200 Subject: [PATCH 24/29] Add constant-address u16 load coverage --- .../integration/src/codegen/intrinsics/mem.rs | 112 ++++++++++++++++++ 1 file changed, 112 insertions(+) diff --git a/tests/integration/src/codegen/intrinsics/mem.rs 
b/tests/integration/src/codegen/intrinsics/mem.rs index 2c1929ef8..9c4bf3288 100644 --- a/tests/integration/src/codegen/intrinsics/mem.rs +++ b/tests/integration/src/codegen/intrinsics/mem.rs @@ -480,6 +480,118 @@ define_unaligned_16bit_load_tests!( load_unaligned_i16 ); +macro_rules! define_constant_address_16bit_load_tests { + ( + $run_fn:ident, + $rust_ty:ty, + $hir_ty:expr, + $offset_1_test:ident, + $offset_2_test:ident, + $offset_3_test:ident + ) => { + #[doc = concat!( + "Runs a `", + stringify!($rust_ty), + "` load test from a constant byte address at the specified offset." + )] + fn $run_fn(offset: u32) { + setup::enable_compiler_instrumentation(); + + let write_to = 17 * 2u32.pow(16); + let read_from = write_to + offset; + + let (package, context) = compile_test_module([], [$hir_ty], |builder| { + let addr = builder.u32(read_from, SourceSpan::default()); + let ptr = builder + .inttoptr(addr, Type::from(PointerType::new($hir_ty)), SourceSpan::default()) + .unwrap(); + let loaded = builder.load(ptr, SourceSpan::default()).unwrap(); + builder.ret(Some(loaded), SourceSpan::default()).unwrap(); + }); + + let config = proptest::test_runner::Config::with_cases(10); + let res = TestRunner::new(config).run(&any::<$rust_ty>(), move |value| { + let expected = value.to_le_bytes(); + let mut initial_bytes = [0xff, 0xee, 0xdd, 0xcc, 0xbb, 0xaa, 0x99, 0x88]; + initial_bytes[offset as usize] = expected[0]; + initial_bytes[offset as usize + 1] = expected[1]; + let initializers = [Initializer::MemoryBytes { + addr: write_to, + bytes: &initial_bytes, + }]; + + let output = eval_package::<$rust_ty, _, _>( + &package, + initializers, + &[], + context.session(), + |_| Ok(()), + )?; + + prop_assert_eq!(output, value, "expected 0x{:x}; found 0x{:x}", value, output,); + + Ok(()) + }); + + match res { + Err(TestError::Fail(reason, value)) => { + panic!("FAILURE: {}\nMinimal failing case: {value:?}", reason.message()); + } + Ok(_) => (), + _ => panic!("Unexpected test result: 
{res:?}"), + } + } + + #[doc = concat!( + "Tests that loading a `", + stringify!($rust_ty), + "` from a constant byte address at offset 1 stays within the current element." + )] + #[test] + fn $offset_1_test() { + $run_fn(1); + } + + #[doc = concat!( + "Tests that loading a `", + stringify!($rust_ty), + "` from a constant byte address at offset 2 stays within the current element." + )] + #[test] + fn $offset_2_test() { + $run_fn(2); + } + + #[doc = concat!( + "Tests that loading a `", + stringify!($rust_ty), + "` from a constant byte address at offset 3 reconstructs the value across the \ + next element boundary." + )] + #[test] + fn $offset_3_test() { + $run_fn(3); + } + }; +} + +define_constant_address_16bit_load_tests!( + run_load_const_addr_u16, + u16, + Type::U16, + load_const_addr_u16_offset_1, + load_const_addr_u16_offset_2, + load_const_addr_u16_offset_3 +); +define_constant_address_16bit_load_tests!( + run_load_const_addr_i16, + i16, + Type::I16, + load_const_addr_i16_offset_1, + load_const_addr_i16_offset_2, + load_const_addr_i16_offset_3 +); + /// Tests the memory load intrinsic for loads of boolean (i.e. 1-bit) values #[test] fn load_bool() { From 93f42b767a9baf5df75de43d9c575a19988cb395 Mon Sep 17 00:00:00 2001 From: Denys Zadorozhnyi Date: Wed, 18 Mar 2026 15:22:12 +0200 Subject: [PATCH 25/29] Remove redundant u16 load tests --- .../integration/src/codegen/intrinsics/mem.rs | 112 ------------------ 1 file changed, 112 deletions(-) diff --git a/tests/integration/src/codegen/intrinsics/mem.rs b/tests/integration/src/codegen/intrinsics/mem.rs index 9c4bf3288..2c1929ef8 100644 --- a/tests/integration/src/codegen/intrinsics/mem.rs +++ b/tests/integration/src/codegen/intrinsics/mem.rs @@ -480,118 +480,6 @@ define_unaligned_16bit_load_tests!( load_unaligned_i16 ); -macro_rules! 
define_constant_address_16bit_load_tests { - ( - $run_fn:ident, - $rust_ty:ty, - $hir_ty:expr, - $offset_1_test:ident, - $offset_2_test:ident, - $offset_3_test:ident - ) => { - #[doc = concat!( - "Runs a `", - stringify!($rust_ty), - "` load test from a constant byte address at the specified offset." - )] - fn $run_fn(offset: u32) { - setup::enable_compiler_instrumentation(); - - let write_to = 17 * 2u32.pow(16); - let read_from = write_to + offset; - - let (package, context) = compile_test_module([], [$hir_ty], |builder| { - let addr = builder.u32(read_from, SourceSpan::default()); - let ptr = builder - .inttoptr(addr, Type::from(PointerType::new($hir_ty)), SourceSpan::default()) - .unwrap(); - let loaded = builder.load(ptr, SourceSpan::default()).unwrap(); - builder.ret(Some(loaded), SourceSpan::default()).unwrap(); - }); - - let config = proptest::test_runner::Config::with_cases(10); - let res = TestRunner::new(config).run(&any::<$rust_ty>(), move |value| { - let expected = value.to_le_bytes(); - let mut initial_bytes = [0xff, 0xee, 0xdd, 0xcc, 0xbb, 0xaa, 0x99, 0x88]; - initial_bytes[offset as usize] = expected[0]; - initial_bytes[offset as usize + 1] = expected[1]; - let initializers = [Initializer::MemoryBytes { - addr: write_to, - bytes: &initial_bytes, - }]; - - let output = eval_package::<$rust_ty, _, _>( - &package, - initializers, - &[], - context.session(), - |_| Ok(()), - )?; - - prop_assert_eq!(output, value, "expected 0x{:x}; found 0x{:x}", value, output,); - - Ok(()) - }); - - match res { - Err(TestError::Fail(reason, value)) => { - panic!("FAILURE: {}\nMinimal failing case: {value:?}", reason.message()); - } - Ok(_) => (), - _ => panic!("Unexpected test result: {res:?}"), - } - } - - #[doc = concat!( - "Tests that loading a `", - stringify!($rust_ty), - "` from a constant byte address at offset 1 stays within the current element." 
- )] - #[test] - fn $offset_1_test() { - $run_fn(1); - } - - #[doc = concat!( - "Tests that loading a `", - stringify!($rust_ty), - "` from a constant byte address at offset 2 stays within the current element." - )] - #[test] - fn $offset_2_test() { - $run_fn(2); - } - - #[doc = concat!( - "Tests that loading a `", - stringify!($rust_ty), - "` from a constant byte address at offset 3 reconstructs the value across the \ - next element boundary." - )] - #[test] - fn $offset_3_test() { - $run_fn(3); - } - }; -} - -define_constant_address_16bit_load_tests!( - run_load_const_addr_u16, - u16, - Type::U16, - load_const_addr_u16_offset_1, - load_const_addr_u16_offset_2, - load_const_addr_u16_offset_3 -); -define_constant_address_16bit_load_tests!( - run_load_const_addr_i16, - i16, - Type::I16, - load_const_addr_i16_offset_1, - load_const_addr_i16_offset_2, - load_const_addr_i16_offset_3 -); - /// Tests the memory load intrinsic for loads of boolean (i.e. 1-bit) values #[test] fn load_bool() { From 17b7e4ad2f28a66967257fc4b5fd3add77d1a44b Mon Sep 17 00:00:00 2001 From: Denys Zadorozhnyi Date: Wed, 25 Mar 2026 17:30:22 +0200 Subject: [PATCH 26/29] chore: address PR 1004 review notes --- codegen/masm/intrinsics/mem.masm | 4 +-- codegen/masm/src/emit/mem.rs | 54 ++++++++++++++++++++++++-------- 2 files changed, 43 insertions(+), 15 deletions(-) diff --git a/codegen/masm/intrinsics/mem.masm b/codegen/masm/intrinsics/mem.masm index a9309b486..364ed5d1d 100644 --- a/codegen/masm/intrinsics/mem.masm +++ b/codegen/masm/intrinsics/mem.masm @@ -199,7 +199,7 @@ end # which is the offset of the first byte, in the 32-bit representation of that element. # # Stack transition: [addr, offset] -> [value] -pub proc load_u16 +pub proc load_u16(addr: ptr, offset: u8) -> u16 exec.load_sw push.65535 u32and @@ -460,7 +460,7 @@ end # which is the offset of the first byte, in the 32-bit representation of that element. 
# # Stack transition: [addr, offset, value] -> [] -pub proc store_u16 +pub proc store_u16(addr: ptr, offset: u8, value: u16) # Load the current 32-bit window at the destination, keep its upper half, then overwrite the # target two bytes before delegating the write-back to `store_sw`. dup.1 dup.1 exec.load_sw # [window, addr, offset, value] diff --git a/codegen/masm/src/emit/mem.rs b/codegen/masm/src/emit/mem.rs index 8a12fd3c3..a63927654 100644 --- a/codegen/masm/src/emit/mem.rs +++ b/codegen/masm/src/emit/mem.rs @@ -9,10 +9,19 @@ use crate::{OperandStack, lower::NativePtr}; /// Allocation impl OpEmitter<'_> { + /// Build a MASM `assertz` instruction with an inline diagnostic. + fn assertz_with_message(message: &'static str, span: SourceSpan) -> masm::Instruction { + masm::Instruction::AssertzWithError(masm::Immediate::Value(masm::Span::new( + span, + message.into(), + ))) + } + /// Emit the loop header for a counted `while.true` loop. /// - /// The caller provides the `dup` instruction needed to bring `count` to the top of the stack - /// after the loop index has been seeded with zero. + /// The caller provides the concrete `dup` instruction needed to bring `count` to the top of + /// the stack after the loop index has been seeded with zero, because each caller carries + /// `count` at a different depth in its loop state. /// /// Stack transition: /// @@ -32,8 +41,9 @@ impl OpEmitter<'_> { /// Emit the loop back-edge condition for a counted `while.true` loop. /// - /// The caller provides the `dup` instruction needed to bring `count` to the top of the stack - /// after incrementing the loop index. + /// The caller provides the concrete `dup` instruction needed to bring `count` to the top of + /// the stack after incrementing the loop index, because each caller carries `count` at a + /// different depth in its loop state. 
/// /// Stack transition: /// @@ -63,19 +73,25 @@ impl OpEmitter<'_> { self.emit_all( [ masm::Instruction::U32DivModImm(16.into()), - masm::Instruction::Assertz, + Self::assertz_with_message( + "expected a 16-byte-aligned byte pointer for the word-copy fast path", + span, + ), masm::Instruction::U32OverflowingMulImm(4.into()), - masm::Instruction::Assertz, + Self::assertz_with_message( + "word-copy fast path element address conversion overflowed", + span, + ), ], span, ); } - /// Build a raw MASM block whose stack protocol is managed by the caller. + /// Build a MASM block whose stack protocol is managed by the caller. /// /// This is used for branch bodies which operate on a known stack shape from the enclosing /// emitter, but which do not need to synchronize typed operand-stack state back to it. - fn build_raw_block( + fn build_masm_block( &mut self, span: SourceSpan, emit: impl FnOnce(&mut OpEmitter<'_>), @@ -816,27 +832,39 @@ impl OpEmitter<'_> { ); // then: convert byte addresses/count to element units and delegate to core - let then_blk = self.build_raw_block(span, |then_emitter| { + let then_blk = self.build_masm_block(span, |then_emitter| { then_emitter.emit_all( [ // Convert `src` to element address masm::Instruction::U32DivModImm(4.into()), - masm::Instruction::Assertz, + Self::assertz_with_message( + "memcpy byte-copy fast path expected the source pointer to be \ + 4-byte aligned", + span, + ), // Convert `dst` to an element address masm::Instruction::Swap1, masm::Instruction::U32DivModImm(4.into()), - masm::Instruction::Assertz, + Self::assertz_with_message( + "memcpy byte-copy fast path expected the destination pointer to \ + be 4-byte aligned", + span, + ), // Bring `count` to top to convert to element count masm::Instruction::Swap2, masm::Instruction::U32DivModImm(4.into()), - masm::Instruction::Assertz, + Self::assertz_with_message( + "memcpy byte-copy fast path expected the byte count to be \ + divisible by 4", + span, + ), ], span, ); 
then_emitter.raw_exec("::miden::core::mem::memcopy_elements", span); }); - let else_blk = self.build_raw_block(span, |else_emitter| { + let else_blk = self.build_masm_block(span, |else_emitter| { else_emitter.emit_memcpy_fallback_loop( src.clone(), dst.clone(), From 2aa50e4f7a704e2cb5880506c28d79bd3caa40c3 Mon Sep 17 00:00:00 2001 From: Denys Zadorozhnyi Date: Wed, 25 Mar 2026 18:01:14 +0200 Subject: [PATCH 27/29] chore: add error messages to all emitted `Instruction::Assert*` ops --- codegen/masm/src/emit/felt.rs | 12 +++-- codegen/masm/src/emit/int128.rs | 16 +++++-- codegen/masm/src/emit/int32.rs | 32 ++++++++++--- codegen/masm/src/emit/int64.rs | 40 ++++++++++++---- codegen/masm/src/emit/mem.rs | 42 +++++++++-------- codegen/masm/src/emit/mod.rs | 46 +++++++++++++++++- codegen/masm/src/emit/primop.rs | 75 +++++++++++++++++++++--------- codegen/masm/src/emit/smallint.rs | 8 +++- codegen/masm/src/emit/unary.rs | 18 +++++-- codegen/masm/src/lower/lowering.rs | 11 +++-- 10 files changed, 228 insertions(+), 72 deletions(-) diff --git a/codegen/masm/src/emit/felt.rs b/codegen/masm/src/emit/felt.rs index abc43d77c..6578cf5b2 100644 --- a/codegen/masm/src/emit/felt.rs +++ b/codegen/masm/src/emit/felt.rs @@ -32,7 +32,13 @@ impl OpEmitter<'_> { /// `[a, ..] => [a, ..]` #[inline(always)] pub fn assert_felt_is_zero(&mut self, span: SourceSpan) { - self.emit_all([masm::Instruction::Dup0, masm::Instruction::Assertz], span); + self.emit_all( + [ + masm::Instruction::Dup0, + Self::assertz_with_message_inst("expected felt value to be zero", span), + ], + span, + ); } /// Convert a field element to i128 by zero-extension. 
@@ -85,7 +91,7 @@ impl OpEmitter<'_> { // Split into u32 limbs masm::Instruction::U32Split, // Assert most significant 32 bits are unused - masm::Instruction::Assertz, + Self::assertz_with_message_inst("felt value does not fit in 32 bits", span), ], span, ); @@ -105,7 +111,7 @@ impl OpEmitter<'_> { // Split into u32 limbs masm::Instruction::U32Split, // Assert most significant 32 bits are unused - masm::Instruction::Assertz, + Self::assertz_with_message_inst("felt value does not fit in 32 bits", span), ], span, ); diff --git a/codegen/masm/src/emit/int128.rs b/codegen/masm/src/emit/int128.rs index 2b2db8a50..22bc850dc 100644 --- a/codegen/masm/src/emit/int128.rs +++ b/codegen/masm/src/emit/int128.rs @@ -78,7 +78,11 @@ impl OpEmitter<'_> { // // What remains on the stack at this point are the low 64-bits, // which is also our result. - self.emit_n(2, masm::Instruction::Assertz, span); + self.emit_n( + 2, + Self::assertz_with_message_inst("128-bit value does not fit in u64", span), + span, + ); } /// Convert a 128-bit value to u32 @@ -95,7 +99,11 @@ impl OpEmitter<'_> { // // What remains on the stack at this point are the low 32-bits, // which is also our result. - self.emit_n(3, masm::Instruction::Assertz, span); + self.emit_n( + 3, + Self::assertz_with_message_inst("128-bit value does not fit in u32", span), + span, + ); } /// Convert a unsigned 128-bit value to i64 @@ -139,7 +147,7 @@ impl OpEmitter<'_> { [ // Assert that both 32-bit limbs of the most significant 64 bits match, // consuming them in the process - masm::Instruction::AssertEq, + Self::assert_eq_with_message_inst("128-bit value does not fit in i64", span), // At this point, the stack is: [is_signed, x1, x0] // // Select an expected value for the sign bit based on the is_signed flag @@ -158,7 +166,7 @@ impl OpEmitter<'_> { // any other combination will trap. 
// // [x1, x0] - masm::Instruction::AssertEq, + Self::assert_eq_with_message_inst("128-bit value does not fit in i64", span), ], span, ); diff --git a/codegen/masm/src/emit/int32.rs b/codegen/masm/src/emit/int32.rs index 56afce7a8..71dcbd7c7 100644 --- a/codegen/masm/src/emit/int32.rs +++ b/codegen/masm/src/emit/int32.rs @@ -107,7 +107,7 @@ impl OpEmitter<'_> { #[inline] pub fn assert_signed_int32(&mut self, span: SourceSpan) { self.is_signed_int32(span); - self.emit(masm::Instruction::Assert, span); + self.emit(Self::assert_with_message_inst("expected a signed i32 value", span), span); } /// Emits code to assert that a 32-bit value on the operand stack does not have the i32 sign bit @@ -119,7 +119,7 @@ impl OpEmitter<'_> { #[inline] pub fn assert_unsigned_int32(&mut self, span: SourceSpan) { self.is_signed_int32(span); - self.emit(masm::Instruction::Assertz, span); + self.emit(Self::assertz_with_message_inst("expected a non-negative i32 value", span), span); } /// Assert that the 32-bit value on the stack is a valid i32 value @@ -131,7 +131,7 @@ impl OpEmitter<'_> { // the value is <= i32::MIN, which is 1 more than i32::MAX. self.push_i32(i32::MIN, span); self.emit(masm::Instruction::U32Lte, span); - self.emit(masm::Instruction::Assert, span); + self.emit(Self::assert_with_message_inst("value does not fit in i32", span), span); } /// Emits code to assert that a 32-bit value on the operand stack is equal to the given constant @@ -148,7 +148,10 @@ impl OpEmitter<'_> { [ masm::Instruction::Dup0, masm::Instruction::EqImm(Felt::new(value as u64).into()), - masm::Instruction::Assert, + Self::assert_with_message_inst( + format!("expected u32 value to equal {value}"), + span, + ), ], span, ); @@ -164,7 +167,13 @@ impl OpEmitter<'_> { /// `[expected, input, ..] 
=> [input, ..]` #[inline] pub fn assert_eq_u32(&mut self, span: SourceSpan) { - self.emit_all([masm::Instruction::Dup1, masm::Instruction::AssertEq], span); + self.emit_all( + [ + masm::Instruction::Dup1, + Self::assert_eq_with_message_inst("expected u32 values to be equal", span), + ], + span, + ); } /// Emits code to select a constant u32 value, using the `n`th value on the operand @@ -244,7 +253,10 @@ impl OpEmitter<'_> { // Apply the mask masm::Instruction::U32And, // Assert that the masked bits and the mask are equal - masm::Instruction::AssertEq, + Self::assert_eq_with_message_inst( + format!("value does not fit in signed {n}-bit range"), + span, + ), ], span, ); @@ -293,7 +305,13 @@ impl OpEmitter<'_> { self.emit_push(mask, span); self.emit(masm::Instruction::U32And, span); // Assert the masked value is all 0s - self.emit(masm::Instruction::Assertz, span); + self.emit( + Self::assertz_with_message_inst( + format!("value does not fit in unsigned {n}-bit range"), + span, + ), + span, + ); } /// Convert an i32/u32 value on the stack to an unsigned N-bit integer value diff --git a/codegen/masm/src/emit/int64.rs b/codegen/masm/src/emit/int64.rs index 53c0bd935..668e3b178 100644 --- a/codegen/masm/src/emit/int64.rs +++ b/codegen/masm/src/emit/int64.rs @@ -14,7 +14,7 @@ impl OpEmitter<'_> { // Assert that value is <= P, then unsplit the limbs to get a felt self.push_u64(P, span); self.lt_u64(span); - self.emit(masm::Instruction::Assert, span); + self.emit(Self::assert_with_message_inst("u64 value does not fit in felt", span), span); // `u32unsplit` expects `[hi, lo]` on the stack; u64 values are represented as `[lo, hi]`. self.emit(masm::Instruction::Swap1, span); self.u32unsplit(span); @@ -41,14 +41,27 @@ impl OpEmitter<'_> { // Bring `hi` to the top of the stack and assert it is zero. This consumes `hi`, // leaving only `lo` on the stack. 
masm::Instruction::Swap1, - masm::Instruction::Assertz, + // Assert hi bits are zero + Self::assertz_with_message_inst( + format!("u64 value does not fit in unsigned {n}-bit range"), + span, + ), // Check that the remaining bits fit in range masm::Instruction::Dup0, ], span, ); self.emit_push(Felt::new(2u64.pow(n) - 1), span); - self.emit_all([masm::Instruction::U32Lte, masm::Instruction::Assert], span); + self.emit_all( + [ + masm::Instruction::U32Lte, + Self::assert_with_message_inst( + format!("u64 value does not fit in unsigned {n}-bit range"), + span, + ), + ], + span, + ); } /// Convert an i64 value to a signed N-bit integer, where N <= 32 @@ -75,7 +88,10 @@ impl OpEmitter<'_> { self.emit_all( [ // [is_unsigned, x_lo] - masm::Instruction::AssertEq, + Self::assert_eq_with_message_inst( + format!("i64 value does not fit in signed {n}-bit range"), + span, + ), // [x_lo, is_unsigned, x_lo] masm::Instruction::Dup1, ], @@ -104,7 +120,10 @@ impl OpEmitter<'_> { // [expected_sign_bits, sign_bits, x_lo] masm::Instruction::CDrop, // [x_lo] - masm::Instruction::AssertEq, + Self::assert_eq_with_message_inst( + format!("i64 value does not fit in signed {n}-bit range"), + span, + ), ], span, ); @@ -220,7 +239,7 @@ impl OpEmitter<'_> { // the value is <= i64::MIN, which is 1 more than i64::MAX. 
self.push_i64(i64::MIN, span); self.lte_u64(span); - self.emit(masm::Instruction::Assert, span); + self.emit(Self::assert_with_message_inst("value does not fit in i64", span), span); } /// Duplicate the i64/u64 value on top of the stack @@ -428,7 +447,7 @@ impl OpEmitter<'_> { match overflow { Overflow::Checked => { self.raw_exec("::miden::core::math::u64::overflowing_add", span); - self.emit(masm::Instruction::Assertz, span); + self.emit(Self::assertz_with_message_inst("u64 addition overflowed", span), span); } Overflow::Unchecked | Overflow::Wrapping => { self.raw_exec("::miden::core::math::u64::wrapping_add", span); @@ -493,7 +512,10 @@ impl OpEmitter<'_> { match overflow { Overflow::Checked => { self.raw_exec("::miden::core::math::u64::overflowing_sub", span); - self.emit(masm::Instruction::Assertz, span); + self.emit( + Self::assertz_with_message_inst("u64 subtraction overflowed", span), + span, + ); } Overflow::Unchecked | Overflow::Wrapping => { self.raw_exec("::miden::core::math::u64::wrapping_sub", span); @@ -575,7 +597,7 @@ impl OpEmitter<'_> { masm::Instruction::Drop, // Bring overflow back to the top and assert it is zero masm::Instruction::MovUp2, - masm::Instruction::Assertz, + Self::assertz_with_message_inst("u64 multiplication overflowed", span), ], span, ); diff --git a/codegen/masm/src/emit/mem.rs b/codegen/masm/src/emit/mem.rs index a63927654..943e65584 100644 --- a/codegen/masm/src/emit/mem.rs +++ b/codegen/masm/src/emit/mem.rs @@ -9,14 +9,6 @@ use crate::{OperandStack, lower::NativePtr}; /// Allocation impl OpEmitter<'_> { - /// Build a MASM `assertz` instruction with an inline diagnostic. - fn assertz_with_message(message: &'static str, span: SourceSpan) -> masm::Instruction { - masm::Instruction::AssertzWithError(masm::Immediate::Value(masm::Span::new( - span, - message.into(), - ))) - } - /// Emit the loop header for a counted `while.true` loop. 
/// /// The caller provides the concrete `dup` instruction needed to bring `count` to the top of @@ -73,12 +65,12 @@ impl OpEmitter<'_> { self.emit_all( [ masm::Instruction::U32DivModImm(16.into()), - Self::assertz_with_message( + Self::assertz_with_message_inst( "expected a 16-byte-aligned byte pointer for the word-copy fast path", span, ), masm::Instruction::U32OverflowingMulImm(4.into()), - Self::assertz_with_message( + Self::assertz_with_message_inst( "word-copy fast path element address conversion overflowed", span, ), @@ -728,7 +720,10 @@ impl OpEmitter<'_> { body_emitter.emit_all( [ masm::Instruction::U32WideningMadd, // [value_size * i + dst, i, dst, count, value] - masm::Instruction::Assertz, // [aligned_dst, i, dst, count, value..] + Self::assertz_with_message_inst( + "memset destination address computation overflowed", + span, + ), // [aligned_dst, i, dst, count, value..] ], span, ); @@ -837,7 +832,7 @@ impl OpEmitter<'_> { [ // Convert `src` to element address masm::Instruction::U32DivModImm(4.into()), - Self::assertz_with_message( + Self::assertz_with_message_inst( "memcpy byte-copy fast path expected the source pointer to be \ 4-byte aligned", span, @@ -845,7 +840,7 @@ impl OpEmitter<'_> { // Convert `dst` to an element address masm::Instruction::Swap1, masm::Instruction::U32DivModImm(4.into()), - Self::assertz_with_message( + Self::assertz_with_message_inst( "memcpy byte-copy fast path expected the destination pointer to \ be 4-byte aligned", span, @@ -853,7 +848,7 @@ impl OpEmitter<'_> { // Bring `count` to top to convert to element count masm::Instruction::Swap2, masm::Instruction::U32DivModImm(4.into()), - Self::assertz_with_message( + Self::assertz_with_message_inst( "memcpy byte-copy fast path expected the byte count to be \ divisible by 4", span, @@ -910,7 +905,10 @@ impl OpEmitter<'_> { masm::Instruction::Swap2, // Compute the corrected count masm::Instruction::U32WideningMulImm(factor.into()), - masm::Instruction::Assertz, // [count * (size / 
16), src, dst] + Self::assertz_with_message_inst( + "memcpy word-copy fast path element count overflowed", + span, + ), // [count * (size / 16), src, dst] ], span, ); @@ -952,9 +950,12 @@ impl OpEmitter<'_> { body_emitter.emit_all( [ masm::Instruction::U32WideningMadd, - masm::Instruction::Assertz, // [new_dst := i * offset + dst, i, src, dst, count] - masm::Instruction::Dup2, // [src, new_dst, i, src, dst, count] - masm::Instruction::Dup2, // [i, src, new_dst, i, src, dst, count] + Self::assertz_with_message_inst( + "memcpy destination address computation overflowed", + span, + ), // [new_dst := i * offset + dst, i, src, dst, count] + masm::Instruction::Dup2, // [src, new_dst, i, src, dst, count] + masm::Instruction::Dup2, // [i, src, new_dst, i, src, dst, count] ], span, ); @@ -962,7 +963,10 @@ impl OpEmitter<'_> { body_emitter.emit_all( [ masm::Instruction::U32WideningMadd, - masm::Instruction::Assertz, // [new_src := i * offset + src, new_dst, i, src, dst, count] + Self::assertz_with_message_inst( + "memcpy source address computation overflowed", + span, + ), // [new_src := i * offset + src, new_dst, i, src, dst, count] ], span, ); diff --git a/codegen/masm/src/emit/mod.rs b/codegen/masm/src/emit/mod.rs index e2149650f..24605cea2 100644 --- a/codegen/masm/src/emit/mod.rs +++ b/codegen/masm/src/emit/mod.rs @@ -1,4 +1,4 @@ -use alloc::rc::Rc; +use alloc::{rc::Rc, sync::Arc}; use midenc_session::diagnostics::Span; @@ -155,6 +155,48 @@ pub struct OpEmitter<'a> { current_block: &'a mut Vec, } impl<'a> OpEmitter<'a> { + /// Build a MASM `assert` instruction with an inline diagnostic. + #[inline] + pub fn assert_with_message_inst( + message: impl Into>, + span: SourceSpan, + ) -> masm::Instruction { + masm::Instruction::AssertWithError(masm::Immediate::Value(Span::new(span, message.into()))) + } + + /// Build a MASM `assert_eq` instruction with an inline diagnostic. 
+ #[inline] + pub fn assert_eq_with_message_inst( + message: impl Into>, + span: SourceSpan, + ) -> masm::Instruction { + masm::Instruction::AssertEqWithError(masm::Immediate::Value(Span::new( + span, + message.into(), + ))) + } + + /// Build a MASM `assert_eqw` instruction with an inline diagnostic. + #[inline] + pub fn assert_eqw_with_message_inst( + message: impl Into>, + span: SourceSpan, + ) -> masm::Instruction { + masm::Instruction::AssertEqwWithError(masm::Immediate::Value(Span::new( + span, + message.into(), + ))) + } + + /// Build a MASM `assertz` instruction with an inline diagnostic. + #[inline] + pub fn assertz_with_message_inst( + message: impl Into>, + span: SourceSpan, + ) -> masm::Instruction { + masm::Instruction::AssertzWithError(masm::Immediate::Value(Span::new(span, message.into()))) + } + #[inline(always)] pub fn new( invoked: &'a mut BTreeSet, @@ -2077,7 +2119,7 @@ mod tests { emitter.assert_eq_imm(ten, SourceSpan::default()); assert_eq!(emitter.stack_len(), 2); - emitter.assert_eq(SourceSpan::default()); + emitter.assert_eq(None, SourceSpan::default()); assert_eq!(emitter.stack_len(), 0); } diff --git a/codegen/masm/src/emit/primop.rs b/codegen/masm/src/emit/primop.rs index 529aa9208..76f13ab25 100644 --- a/codegen/masm/src/emit/primop.rs +++ b/codegen/masm/src/emit/primop.rs @@ -1,4 +1,5 @@ use miden_assembly_syntax::parser::WordValue; +use midenc_dialect_hir::assertions; use midenc_hir::{ Felt, Immediate, SourceSpan, Type, dialects::builtin::attributes::{ArgumentExtension, Signature}, @@ -8,12 +9,26 @@ use super::{OpEmitter, int64, masm}; use crate::TraceEvent; impl OpEmitter<'_> { + /// Format a diagnostic message for a HIR assertion code when one is available. 
+ fn assertion_message(code: Option, default: impl Into) -> String { + let default = default.into(); + match code.filter(|code| *code != 0) { + Some(assertions::ASSERT_FAILED_ALIGNMENT) => { + "pointer address does not meet minimum alignment for the type".into() + } + Some(code) => format!("{default} (assertion code 0x{code:08x})"), + None => default, + } + } + /// Assert that an integer value on the stack has the value 1 /// /// This operation consumes the input value. - pub fn assert(&mut self, _code: Option, span: SourceSpan) { + pub fn assert(&mut self, code: Option, span: SourceSpan) { let arg = self.stack.pop().expect("operand stack is empty"); - match arg.ty() { + let ty = arg.ty().clone(); + let message = Self::assertion_message(code, format!("expected {ty} value to equal 1")); + match ty { Type::Felt | Type::U32 | Type::I32 @@ -22,7 +37,7 @@ impl OpEmitter<'_> { | Type::U8 | Type::I8 | Type::I1 => { - self.emit(masm::Instruction::Assert, span); + self.emit(Self::assert_with_message_inst(message, span), span); } Type::I128 | Type::U128 => { self.emit_all( @@ -31,13 +46,19 @@ impl OpEmitter<'_> { span, WordValue([Felt::ZERO, Felt::ZERO, Felt::ZERO, Felt::ONE]).into(), ))), - masm::Instruction::AssertEqw, + Self::assert_eqw_with_message_inst(message, span), ], span, ); } Type::U64 | Type::I64 => { - self.emit_all([masm::Instruction::Assertz, masm::Instruction::Assert], span); + self.emit_all( + [ + Self::assertz_with_message_inst(message.clone(), span), + Self::assert_with_message_inst(message, span), + ], + span, + ); } ty if !ty.is_integer() => { panic!("invalid argument to assert: expected integer, got {ty}") @@ -49,9 +70,11 @@ impl OpEmitter<'_> { /// Assert that an integer value on the stack has the value 0 /// /// This operation consumes the input value. 
- pub fn assertz(&mut self, _code: Option, span: SourceSpan) { + pub fn assertz(&mut self, code: Option, span: SourceSpan) { let arg = self.stack.pop().expect("operand stack is empty"); - match arg.ty() { + let ty = arg.ty().clone(); + let message = Self::assertion_message(code, format!("expected {ty} value to equal 0")); + match ty { Type::Felt | Type::U32 | Type::I32 @@ -60,10 +83,16 @@ impl OpEmitter<'_> { | Type::U8 | Type::I8 | Type::I1 => { - self.emit(masm::Instruction::Assertz, span); + self.emit(Self::assertz_with_message_inst(message, span), span); } Type::U64 | Type::I64 => { - self.emit_all([masm::Instruction::Assertz, masm::Instruction::Assertz], span); + self.emit_all( + [ + Self::assertz_with_message_inst(message.clone(), span), + Self::assertz_with_message_inst(message, span), + ], + span, + ); } Type::U128 | Type::I128 => { self.emit_all( @@ -72,7 +101,7 @@ impl OpEmitter<'_> { span, WordValue([Felt::ZERO; 4]).into(), ))), - masm::Instruction::AssertEqw, + Self::assert_eqw_with_message_inst(message, span), ], span, ); @@ -87,11 +116,12 @@ impl OpEmitter<'_> { /// Assert that the top two integer values on the stack have the same value /// /// This operation consumes the input values. 
- pub fn assert_eq(&mut self, span: SourceSpan) { + pub fn assert_eq(&mut self, code: Option, span: SourceSpan) { let rhs = self.pop().expect("operand stack is empty"); let lhs = self.pop().expect("operand stack is empty"); - let ty = lhs.ty(); + let ty = lhs.ty().clone(); assert_eq!(ty, rhs.ty(), "expected assert_eq operands to have the same type"); + let message = Self::assertion_message(code, format!("expected {ty} values to be equal")); match ty { Type::Felt | Type::U32 @@ -101,17 +131,19 @@ impl OpEmitter<'_> { | Type::U8 | Type::I8 | Type::I1 => { - self.emit(masm::Instruction::AssertEq, span); + self.emit(Self::assert_eq_with_message_inst(message, span), span); + } + Type::U128 | Type::I128 => { + self.emit(Self::assert_eqw_with_message_inst(message, span), span) } - Type::U128 | Type::I128 => self.emit(masm::Instruction::AssertEqw, span), Type::U64 | Type::I64 => { self.emit_all( [ // compare the hi bits masm::Instruction::MovUp2, - masm::Instruction::AssertEq, + Self::assert_eq_with_message_inst(message.clone(), span), // compare the low bits - masm::Instruction::AssertEq, + Self::assert_eq_with_message_inst(message, span), ], span, ); @@ -130,7 +162,8 @@ impl OpEmitter<'_> { #[allow(unused)] pub fn assert_eq_imm(&mut self, imm: Immediate, span: SourceSpan) { let lhs = self.pop().expect("operand stack is empty"); - let ty = lhs.ty(); + let ty = lhs.ty().clone(); + let message = format!("expected {ty} value to equal {imm}"); assert_eq!(ty, imm.ty(), "expected assert_eq_imm operands to have the same type"); match ty { Type::Felt @@ -144,14 +177,14 @@ impl OpEmitter<'_> { self.emit_all( [ masm::Instruction::EqImm(imm.as_felt().unwrap().into()), - masm::Instruction::Assert, + Self::assert_with_message_inst(message, span), ], span, ); } Type::I128 | Type::U128 => { self.push_immediate(imm, span); - self.emit(masm::Instruction::AssertEqw, span) + self.emit(Self::assert_eqw_with_message_inst(message, span), span) } Type::I64 | Type::U64 => { let imm = match imm { 
@@ -163,9 +196,9 @@ impl OpEmitter<'_> { self.emit_all( [ masm::Instruction::EqImm(Felt::new(hi as u64).into()), - masm::Instruction::Assert, + Self::assert_with_message_inst(message.clone(), span), masm::Instruction::EqImm(Felt::new(lo as u64).into()), - masm::Instruction::Assert, + Self::assert_with_message_inst(message, span), ], span, ) diff --git a/codegen/masm/src/emit/smallint.rs b/codegen/masm/src/emit/smallint.rs index 3a66de0ce..20abe1b77 100644 --- a/codegen/masm/src/emit/smallint.rs +++ b/codegen/masm/src/emit/smallint.rs @@ -46,7 +46,13 @@ impl OpEmitter<'_> { 1 => (), n => { self.is_signed_smallint(n, span); - self.emit(masm::Instruction::Assert, span); + self.emit( + Self::assert_with_message_inst( + format!("{n}-bit integer signedness check failed"), + span, + ), + span, + ); } } } diff --git a/codegen/masm/src/emit/unary.rs b/codegen/masm/src/emit/unary.rs index 265343a02..7cda1d5ee 100644 --- a/codegen/masm/src/emit/unary.rs +++ b/codegen/masm/src/emit/unary.rs @@ -345,7 +345,16 @@ impl OpEmitter<'_> { // bit being set will make the i8 larger than 0 or 1 self.emit(masm::Instruction::Dup0, span); self.emit_push(2u32, span); - self.emit_all([masm::Instruction::Lt, masm::Instruction::Assert], span); + self.emit_all( + [ + masm::Instruction::Lt, + Self::assert_with_message_inst( + "expected i8 value to be 0 or 1 when casting to i1", + span, + ), + ], + span, + ); } // i1 (Type::I1, _) => self.zext_smallint(src_bits, dst_bits, span), @@ -436,7 +445,7 @@ impl OpEmitter<'_> { masm::Instruction::Swap1, masm::Instruction::Sub, masm::Instruction::U32OverflowingSubImm(1.into()), - masm::Instruction::Assertz, + Self::assertz_with_message_inst("ilog2 is undefined for zero", span), ], span, ); @@ -935,7 +944,10 @@ impl OpEmitter<'_> { self.emit_all( [ // Assert that the high bits are zero - masm::Instruction::Assertz, + Self::assertz_with_message_inst( + "u64 exponent for pow2 must fit in u32", + span, + ), // This asserts if value > 63, thus result is 
guaranteed to fit in u64 masm::Instruction::Pow2, // Obtain the u64 representation by splitting the felt result diff --git a/codegen/masm/src/lower/lowering.rs b/codegen/masm/src/lower/lowering.rs index b0e1bd65f..c4a840321 100644 --- a/codegen/masm/src/lower/lowering.rs +++ b/codegen/masm/src/lower/lowering.rs @@ -13,7 +13,9 @@ use midenc_session::diagnostics::{Report, Severity, Spanned}; use smallvec::{SmallVec, smallvec}; use super::*; -use crate::{Constraint, emitter::BlockEmitter, masm, opt::operands::SolverOptions}; +use crate::{ + Constraint, emit::OpEmitter, emitter::BlockEmitter, masm, opt::operands::SolverOptions, +}; /// Convert a resolved callee [`midenc_hir::SymbolPath`] into a MASM [`masm::InvocationTarget`]. fn invocation_target_from_symbol_path( @@ -462,7 +464,9 @@ impl HirLowering for hir::Assertz { impl HirLowering for hir::AssertEq { fn emit(&self, emitter: &mut BlockEmitter<'_>) -> Result<(), Report> { - emitter.emitter().assert_eq(self.span()); + let code = *self.get_code(); + + emitter.emitter().assert_eq(Some(code), self.span()); Ok(()) } @@ -475,7 +479,8 @@ impl HirLowering for ub::Unreachable { let span = self.span(); let mut op_emitter = emitter.emitter(); op_emitter.emit_push(0u32, span); - op_emitter.emit(masm::Instruction::Assert, span); + op_emitter + .emit(OpEmitter::assert_with_message_inst("entered unreachable code", span), span); Ok(()) } From c803c426880c69823d7b79be7aa447fa31cb2a29 Mon Sep 17 00:00:00 2001 From: Denys Zadorozhnyi Date: Fri, 27 Mar 2026 10:15:06 +0300 Subject: [PATCH 28/29] fix: build after the rebase --- codegen/masm/src/emit/mem.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codegen/masm/src/emit/mem.rs b/codegen/masm/src/emit/mem.rs index 943e65584..337d23118 100644 --- a/codegen/masm/src/emit/mem.rs +++ b/codegen/masm/src/emit/mem.rs @@ -69,7 +69,7 @@ impl OpEmitter<'_> { "expected a 16-byte-aligned byte pointer for the word-copy fast path", span, ), - 
masm::Instruction::U32OverflowingMulImm(4.into()), + masm::Instruction::U32WrappingMulImm(4.into()), Self::assertz_with_message_inst( "word-copy fast path element address conversion overflowed", span, From 4a529fd7917d3af2b90af7563a7ae9bf18043e78 Mon Sep 17 00:00:00 2001 From: Denys Zadorozhnyi Date: Fri, 27 Mar 2026 12:13:14 +0300 Subject: [PATCH 29/29] fix: migrate the #1003 fix to the VM v0.22 --- codegen/masm/src/emit/mem.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/codegen/masm/src/emit/mem.rs b/codegen/masm/src/emit/mem.rs index 337d23118..8c0707bfd 100644 --- a/codegen/masm/src/emit/mem.rs +++ b/codegen/masm/src/emit/mem.rs @@ -69,7 +69,9 @@ impl OpEmitter<'_> { "expected a 16-byte-aligned byte pointer for the word-copy fast path", span, ), - masm::Instruction::U32WrappingMulImm(4.into()), + // `u32widening_mul` leaves `[lo, hi]` on the stack; assert on `hi` and keep `lo`. + masm::Instruction::U32WideningMulImm(4.into()), + masm::Instruction::Swap1, Self::assertz_with_message_inst( "word-copy fast path element address conversion overflowed", span, @@ -720,6 +722,7 @@ impl OpEmitter<'_> { body_emitter.emit_all( [ masm::Instruction::U32WideningMadd, // [value_size * i + dst, i, dst, count, value] + masm::Instruction::Swap1, Self::assertz_with_message_inst( "memset destination address computation overflowed", span, @@ -905,6 +908,7 @@ impl OpEmitter<'_> { masm::Instruction::Swap2, // Compute the corrected count masm::Instruction::U32WideningMulImm(factor.into()), + masm::Instruction::Swap1, Self::assertz_with_message_inst( "memcpy word-copy fast path element count overflowed", span, @@ -950,6 +954,7 @@ impl OpEmitter<'_> { body_emitter.emit_all( [ masm::Instruction::U32WideningMadd, + masm::Instruction::Swap1, Self::assertz_with_message_inst( "memcpy destination address computation overflowed", span, @@ -963,6 +968,7 @@ impl OpEmitter<'_> { body_emitter.emit_all( [ masm::Instruction::U32WideningMadd, + masm::Instruction::Swap1, 
Self::assertz_with_message_inst( "memcpy source address computation overflowed", span,