diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4a6c0f9799..b84bf90d9c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,7 @@
 
 #### Enhancements
 
+- Added new `memcopy` procedure for the `std::mem` module. ([#2361](https://github.com/0xMiden/miden-vm/pull/2361)).
 - Added new `adv.push_mapval_count` instruction ([#2349](https://github.com/0xMiden/miden-vm/pull/2349)).
 - Added new `memcopy_elements` procedure for the `std::mem` module ([#2352](https://github.com/0xMiden/miden-vm/pull/2352)).
 - Added new `peek` procedure for the `std::collections::smt` module ([#2387](https://github.com/0xMiden/miden-vm/pull/2387)).
diff --git a/docs/src/user_docs/libcore/mem.md b/docs/src/user_docs/libcore/mem.md
index 641bf7a157..e82b572279 100644
--- a/docs/src/user_docs/libcore/mem.md
+++ b/docs/src/user_docs/libcore/mem.md
@@ -8,8 +8,9 @@ Module `miden::core::mem` contains a set of utility procedures for working with
 
 | Procedure                              | Description   |
 | -------------------------------------- | ------------- |
+| `memcopy`                              | Copies `n` elements from `read_ptr` to `write_ptr`.<br/><br/>If possible, this procedure will copy word-aligned words internally using `memcopy_words` procedure, decreasing the total number of cycles required to make a copy.<br/><br/>It is advised to copy small chunks of memory (num_elements < 12) using `memcopy_elements` procedure instead, since it won't have a computation logic overhead.<br/><br/>**Inputs:** `[n, read_ptr, write_ptr]`<br/>**Outputs:** `[]`<br/><br/>Total cycles:<ul><li>Read and write pointers are mutually word-unaligned: $27 + 14 * num\_elements$</li><li>if `n` is less than 12 elements: 27 + 14 * num_elements</li><li>Read and write pointers are mutually word-aligned: $74 + 14 * num\_prefix\_elements + 16 * num\_words + 14 * num\_suffix\_elements$</li></ul> |
 | `memcopy_words`                        | Copies `n` words from `read_ptr` to `write_ptr`.<br/><br/>`read_ptr` and `write_ptr` pointers *must be* word-aligned.<br/><br/>**Inputs:** `[n, read_ptr, write_ptr]`<br/>**Outputs:** `[]`<br/><br/>Total cycles: $15 + 16 * num\_words$ |
-| `memcopy_elements`                        | Copies `n` elements from `read_ptr` to `write_ptr`.<br/><br/>**Inputs:** `[n, read_ptr, write_ptr]`<br/>**Outputs:** `[]`<br/><br/>Total cycles: $7 + 14 * num\_elements$ |
+| `memcopy_elements`                     | Copies `n` elements from `read_ptr` to `write_ptr`.<br/><br/>It is advised to copy big chunks of memory (num_elements >= 12) using `memcopy` procedure instead, since it will handle the memory more efficiently.<br/><br/>**Inputs:** `[n, read_ptr, write_ptr]`<br/>**Outputs:** `[]`<br/><br/>Total cycles: $7 + 14 * num\_elements$ |
 | `pipe_double_words_to_memory`          | Copies an even number of words from the advice_stack to memory.<br/><br/>**Inputs:** `[C, B, A, write_ptr, end_ptr]`<br/>**Outputs:** `[C, B, A, write_ptr]`<br/><br/>Notice that the `end_ptr - write_ptr` value must be positive and a multiple of 8.<br/><br/>Total cycles: $9 + 6 * num\_word\_pairs$ |
 | `pipe_words_to_memory`                 | Copies an arbitrary number of words from the advice stack to memory.<br/><br/>**Inputs:** `[num_words, write_ptr]`<br/>**Outputs:** `[C, B, A, write_ptr']`<br/><br/>Total cycles:<ul><li>Even `num_words`: $43 + 9 * num\_words / 2$</li><li>Odd `num_words`: $60 + 9 * round\_down(num\_words / 2)$</li></ul> |
 | `pipe_preimage_to_memory`              | Moves an arbitrary number of words from the advice stack to memory and asserts it matches the commitment.<br/><br/>**Inputs:** `[num_words, write_ptr, COMMITMENT]`<br/>**Outputs:** `[write_ptr']`<br/><br/>Total cycles:<ul><li>Even `num_words`: $62 + 9 * num\_words / 2$</li><li>Odd `num_words`: $79 + 9 * round\_down(num\_words / 2)$</li></ul> |
diff --git a/libcore/asm/mem.masm b/libcore/asm/mem.masm
index 939f1e2041..4f5b424d02 100644
--- a/libcore/asm/mem.masm
+++ b/libcore/asm/mem.masm
@@ -2,6 +2,137 @@ use miden::core::crypto::hashes::rpo256
 
 # ===== MEMORY FUNCTIONS ==========================================================================
 
+#! Copies `n` elements from `read_ptr` to `write_ptr`.
+#!
+#! If possible, this procedure will copy word-aligned words internally using `memcopy_words` 
+#! procedure, decreasing the total number of cycles required to make a copy.
+#!
+#! It is advised to copy small chunks of memory (num_elements < 12) using `memcopy_elements` 
+#! procedure instead, since it won't have a computation logic overhead.
+#!
+#! Inputs:  [n, read_ptr, write_ptr]
+#! Outputs: []
+#!
+#! Where:
+#! - n is the number of elements which should be copied.
+#! - read_ptr is the memory pointer where the elements to copy are stored.
+#! - write_ptr is the memory pointer where the elements will be copied.
+#!
+#! Panics if:
+#! - read_ptr and write_ptr are not a u32 values.
+#!
+#! Total cycles:
+#! - if read and write pointers are mutually word-unaligned: 27 + 14 * num_elements
+#! - if `n` is less than 12 elements: 27 + 14 * num_elements
+#! - if read and write pointers are mutually mutually word-aligned, it's equal to the sum of:
+#!   - prologue: 19 cycles
+#!   - prefix handling: 31 + 14 * num_prefix_elements, where num_prefix_elements is 
+#!     the number of elements required to make the read (and write) pointer word-aligned
+#!   - words handling: 15 + 16 * num_words
+#!   - suffix handling: 9 + 14 * num_suffix_elements
+#!     - in total: 74 + 14 * num_prefix_elements + 16 * num_words + 14 * num_suffix_elements
+pub proc memcopy
+    # check whether the difference between `read_ptr` and `write_ptr` is a multiple of 4, otherwise
+    # these pointers will never be simultaneously word-alined, so we won't be able to use 
+    # `memcopy_words`
+    movdn.2 u32assert2.err="memory addresses must be valid u32 values" # 2 cycles
+    # => [read_ptr, write_ptr, n]
+
+    # Compute the remainders of dividing read_ptr and write_ptr by 4. Compare them to get the 
+    # "alignment flag".
+    dup.1 u32mod.4 dup.1 u32mod.4 eq # 11 cycles
+    # => [alignment_flag, read_ptr, write_ptr, n]
+
+    # if the number of elements to copy is small (less than 12), use only the `memcopy_elements`
+    # procedure 
+    dup.3 u32lt.12 or # 6 cycles
+    # => [alignment_flag, read_ptr, write_ptr, n]
+
+    if.true
+        # if read pointer and write pointer are mutually word-aligned (i.e. 
+        # (read_ptr - write_ptr) mod 4 = 0) we will be able to use `memcopy_words` in the cycle to
+        # optimize the copy
+        # => [read_ptr, write_ptr, n]
+
+        ### "prefix" handling #####################################################################
+
+        # copy `m` values, where `m` is the number of elements needed to make pointers word-aligned.
+        # If `m` > `n` (meaning we don't have enough values to make pointers word-aligned), copy all
+        # `n` available values.  
+
+        # get the number of elements required to make read_ptr multiple of 4
+        dup.2 exec.elements_to_next_multiple_of_4 # 20 cycles
+        # => [m, read_ptr, write_ptr, n]
+
+        # update the total number of values left to copy
+        movup.3 dup.1 sub movdn.3 # 5 cycles
+        # => [m, read_ptr, write_ptr, n - m]
+
+        # copy the prefix elements
+        exec.memcopy_elements_internal # 4 + 14 * num_prefix_elements cycles
+        # => [0, read_ptr + m, write_ptr + m, n - m]
+
+        # rearrange the stack for word copy
+        drop movup.2 # 2 cycles
+        # => [n - m, read_ptr + m, write_ptr + m]
+
+        ### "words" handling ######################################################################
+
+        # get the number of words we should handle and the number of remaining felt values
+        u32divmod.4 # 2 cycles
+        # => [values_left, num_words, read_ptr', write_ptr']
+
+        # copy the words
+        movdn.3 exec.memcopy_words_internal # 13 + 16 * num_words cycles
+        # => [0, read_ptr'', write_ptr'', values_left]
+
+        ### "suffix" handling #####################################################################
+
+        # handle the remaining values
+        drop movup.2 exec.memcopy_elements # 9 + 14 * num_suffix_elements cycles
+        # => []
+    else
+        # if read pointer and write pointer are not word-aligned between each other, use 
+        # `memcopy_elements` to copy values one-by-one
+        # => [read_ptr, write_ptr, n]
+
+        movup.2 exec.memcopy_elements # 8 + 14 * num_elements cycles
+        # => []
+    end
+    # => []
+end
+
+#! Returns the number of elements required to make the provided memory pointer multiple of 4, or `n`
+#! if it is smaller than the above described value.
+#!
+#! Inputs:  [n, ptr]
+#! Outputs: [m, ptr]
+#!
+#! Where:
+#! - n is the value that limits the result from below: the result cannot be less than it.
+#! - ptr is the memory pointer relative to which the next multiple of 4 should be computed.
+#! - m is the value equal to the number of elements required to make the provided pointer multiple
+#!   of 4 or `n` if it is smaller than the above described value.
+#!
+#! Total cycles: 19 cycles
+proc elements_to_next_multiple_of_4
+    # get the quotient and the remainder of the division by 4
+    dup.1 u32divmod.4 # 3 cycles
+    # => [ptr mod 4, ptr / 4, n, ptr]
+
+    # get the memory pointer which is next multiple of 4
+    neq.0 add mul.4 # 5 cycles
+    # => [ptr + to_next_4, n, ptr]
+
+    # get the number of values required to make the pointer multiple of 4
+    dup.2 sub # 3 cycles
+    # => [to_next_4, n, ptr]
+
+    # leave the smallest of the two values on the stack
+    dup.1 dup.1 u32gte cdrop # 8 cycles
+    # => [m, ptr]
+end
+
 #! Copies `n` words from `read_ptr` to `write_ptr`.
 #!
 #! `read_ptr` and `write_ptr` pointers *must be* word-aligned.
@@ -16,52 +147,77 @@ use miden::core::crypto::hashes::rpo256
 #!
 #! Total cycles: 15 + 16 * num_words
 pub proc memcopy_words
-  # The loop variable is changed with an add instead of sub because the former
-  # uses one fewer cycle. So here the counter is negated. (1 cycles)
-  # stack: [-n, read_ptr, write_ptr, ...]
-  neg
+    exec.memcopy_words_internal
+    # => [n = 0, read_ptr + 4n, write_ptr + 4n]
+    
+    # clean the stack (3 cycles)
+    drop drop drop
+    # => []
+end
 
-  # Pad the stack because mem_load overwrites it (4 cycles)
-  # stack: [0, 0, 0, 0, -n, read_ptr, write_ptr, ...]
-  padw
+#! Copies `n` words from `read_ptr` to `write_ptr`.
+#!
+#! `read_ptr` and `write_ptr` pointers *must be* word-aligned.
+#!
+#! Inputs:  [n, read_ptr, write_ptr]
+#! Outputs: [n = 0, read_ptr + 4n, write_ptr + 4n]
+#!
+#! Where:
+#! - n is the number of words which should be copied.
+#! - read_ptr is the memory pointer where the words to copy are stored.
+#! - write_ptr is the memory pointer where the words will be copied.
+#!
+#! Total cycles: 12 + 16 * num_words
+proc memcopy_words_internal
+    # The loop variable is changed with an add instead of sub because the former
+    # uses one fewer cycle. So here the counter is negated. (1 cycles)
+    # stack: [-n, read_ptr, write_ptr, ...]
+    neg
 
-  # check loop condition (3 cycles)
-  # stack: [b, 0, 0, 0, 0, -n, read_ptr, write_ptr, ...]
-  dup.4 neq.0
+    # Pad the stack because mem_load overwrites it (4 cycles)
+    # stack: [0, 0, 0, 0, -n, read_ptr, write_ptr, ...]
+    padw
 
-  # LOOP: [0, 0, 0, 0, -n, read_ptr, write_ptr, ...]
-  # while(n!=0) (16 cycles)
-  while.true
-    # perform read (2 cycles)
-    # stack: [a3, a2, a1, a0, -n, read_ptr, write_ptr, ...]
-    dup.5 mem_loadw_be
+    # check loop condition (3 cycles)
+    # stack: [b, 0, 0, 0, 0, -n, read_ptr, write_ptr, ...]
+    dup.4 neq.0
 
-    # perform write (2 cycles)
-    # stack: [a3, a2, a1, a0, -n, read_ptr, write_ptr, ...]
-    dup.6 mem_storew_be
+    # LOOP: [0, 0, 0, 0, -n, read_ptr, write_ptr, ...]
+    # while(n != 0) (16 cycles)
+    while.true
+        # perform read (2 cycles)
+        # stack: [a3, a2, a1, a0, -n, read_ptr, write_ptr, ...]
+        dup.5 mem_loadw_be
 
-    # note: the values of `A` are no longer necessary, use 0 to signal its a
-    # padding
-    # stack: [-n, read_ptr, write_ptr, x, 0, 0, 0, 0, ...]
-    swapw
+        # perform write (2 cycles)
+        # stack: [a3, a2, a1, a0, -n, read_ptr, write_ptr, ...]
+        dup.6 mem_storew_be
 
-    # stack: [-n+1, read_ptr+4, write_ptr+4, x, 0, 0, 0, 0, ...]
-    # update counters (9 cycles)
-    add.1 movup.3 movup.3 add.4 movup.3 add.4 movup.3
+        # note: the values of `A` are no longer necessary, use 0 to signal its a
+        # padding
+        # stack: [-n, read_ptr, write_ptr, x, 0, 0, 0, 0, ...]
+        swapw
 
-    # stack: [0, 0, 0, 0, -n+1, read_ptr+4, write_ptr+4, x, ...]
-    swapw
+        # stack: [-n+1, read_ptr+4, write_ptr+4, x, 0, 0, 0, 0, ...]
+        # update counters (9 cycles)
+        add.1 movup.3 movup.3 add.4 movup.3 add.4 movup.3
 
-    dup.4 neq.0 # while(n!=0) (3 cycles)
-  end
+        # stack: [0, 0, 0, 0, -n+1, read_ptr+4, write_ptr+4, x, ...]
+        swapw
+
+        dup.4 neq.0 # while(n != 0) (3 cycles)
+    end
 
-  # clean stack (7 cycles)
-  # stack: [...]
-  dropw drop drop drop
+    # clean stack (4 cycles)
+    dropw
+    # => [-n+n = 0, read_ptr + 4n, write_ptr + 4n]
 end
 
 #! Copies `n` elements from `read_ptr` to `write_ptr`.
 #!
+#! It is advised to copy big chunks of memory (num_elements >= 12) using `memcopy` procedure 
+#! instead, since it will handle the memory more efficiently.
+#!
 #! Inputs:  [n, read_ptr, write_ptr]
 #! Outputs: []
 #!
@@ -72,6 +228,26 @@ end
 #!
 #! Total cycles: 7 + 14 * num_elements
 pub proc memcopy_elements
+    exec.memcopy_elements_internal
+    # => [n = 0, read_ptr + n, write_ptr + n]
+
+    # clean stack (3 cycles)
+    drop drop drop
+    # => []
+end
+
+#! Copies `n` elements from `read_ptr` to `write_ptr`.
+#!
+#! Inputs:  [n, read_ptr, write_ptr]
+#! Outputs: [n = 0, read_ptr + n, write_ptr + n]
+#!
+#! Where:
+#! - n is the number of elements which should be copied.
+#! - read_ptr is the memory pointer where the elements to copy are stored.
+#! - write_ptr is the memory pointer where the elements will be copied.
+#!
+#! Total cycles: 4 + 14 * num_elements
+proc memcopy_elements_internal
     # The loop variable is changed with an `add` instead of `sub` because the former uses one fewer 
     # cycle. So here the counter is negated. (1 cycle)
     neg
@@ -99,9 +275,6 @@ pub proc memcopy_elements
         # => [loop_flag, -n+1, read_ptr+1, write_ptr+1]
     end
     # => [n = 0, read_ptr + n, write_ptr + n]
-
-    # clean stack (3 cycles)
-    drop drop drop
 end
 
 #! Copies an even number of words from the advice_stack to memory, computing their permutation.
@@ -151,68 +324,68 @@ end
 #!  - Even `num_words`: 43 + 9 * num_words / 2
 #!  - Odd `num_words`: 60 + 9 * round_down(num_words / 2)
 pub proc pipe_words_to_memory
-  # check if there is an odd number of words (6 cycles)
-  dup is_odd
-  # => [is_odd, num_words, write_ptr, ...]
-
-  # copy is_odd, it defines if last word requires padding (2 cycles)
-  dup movdn.3
-  # => [is_odd, num_words, write_ptr, needs_padding, ...]
-
-  # compute `end_ptr` with an even number of words (7 cycles)
-  sub mul.4 dup.1 add swap
-  # => [write_ptr, end_ptr, needs_padding, ...]
-
-  # Prepare the capacity word. We use the padding rule which sets the first capacity
-  # element to `len % 8` where `len` is the length of the hashed sequence. Since `len % 8`
-  # is either equal to 0 or 4,  this is determined by the `needs_padding` flag multiplied
-  # by 4. (6 cycles)
-  dup.2 mul.4 push.0.0.0
-  # => [A, write_ptr, end_ptr, needs_padding, ...]
-
-  # set initial hasher state (8 cycles)
-  padw padw
-  # => [C, B, A, write_ptr, end_ptr, needs_padding, ...]
-
-  # (9 + 6 * num_words cycles)
-  exec.pipe_double_words_to_memory
-  # => [C, B, A, write_ptr, needs_padding, ...]
-
-  # (4 cycles)
-  movup.13
-  # => [needs_padding, C, B, A, write_ptr, ...]
-
-  # if(needs_padding) (17 cycles)
-  if.true
-    # Rescue Prime Optimized uses overwrite mode, drop `C`. (4 cycles)
-    dropw
-    # => [B, A, write_ptr, ...]
-
-    # Overwrite the `B` with the new data (1 cycles)
-    adv_loadw
-    # => [B', A, write_ptr, ...]
-
-    # - get the memory address that B' should be saved to
-    # - update the write_ptr to point to the next address (4 cycles)
-    movup.8 dup.0 add.4 movdn.5
-    # => [write_ptr, B', write_ptr+4, A, ...]
-
-    # save data to memory (1 cycles)
-    mem_storew_be
-    # => [B', write_ptr+1, A, ...]
-
-    # Fix write_ptr position (2 cycles)
-    movup.4 movdn.8
-    # => [B', A, write_ptr+1, ...]
-
-    # Push padding word (4 cycles)
-    padw
-    # => [C, B', A, write_ptr+1, ...]
-
-    # Run RPO permutation (1 cycles)
-    exec.rpo256::permute
-    # => [C', B', A', write_ptr+1, ...]
-  end
+    # check if there is an odd number of words (6 cycles)
+    dup is_odd
+    # => [is_odd, num_words, write_ptr, ...]
+
+    # copy is_odd, it defines if last word requires padding (2 cycles)
+    dup movdn.3
+    # => [is_odd, num_words, write_ptr, needs_padding, ...]
+
+    # compute `end_ptr` with an even number of words (7 cycles)
+    sub mul.4 dup.1 add swap
+    # => [write_ptr, end_ptr, needs_padding, ...]
+
+    # Prepare the capacity word. We use the padding rule which sets the first capacity
+    # element to `len % 8` where `len` is the length of the hashed sequence. Since `len % 8`
+    # is either equal to 0 or 4,  this is determined by the `needs_padding` flag multiplied
+    # by 4. (6 cycles)
+    dup.2 mul.4 push.0.0.0
+    # => [A, write_ptr, end_ptr, needs_padding, ...]
+
+    # set initial hasher state (8 cycles)
+    padw padw
+    # => [C, B, A, write_ptr, end_ptr, needs_padding, ...]
+
+    # (9 + 6 * num_words cycles)
+    exec.pipe_double_words_to_memory
+    # => [C, B, A, write_ptr, needs_padding, ...]
+
+    # (4 cycles)
+    movup.13
+    # => [needs_padding, C, B, A, write_ptr, ...]
+
+    # if(needs_padding) (17 cycles)
+    if.true
+        # Rescue Prime Optimized uses overwrite mode, drop `C`. (4 cycles)
+        dropw
+        # => [B, A, write_ptr, ...]
+
+        # Overwrite the `B` with the new data (1 cycles)
+        adv_loadw
+        # => [B', A, write_ptr, ...]
+
+        # - get the memory address that B' should be saved to
+        # - update the write_ptr to point to the next address (4 cycles)
+        movup.8 dup.0 add.4 movdn.5
+        # => [write_ptr, B', write_ptr+4, A, ...]
+
+        # save data to memory (1 cycles)
+        mem_storew_be
+        # => [B', write_ptr+1, A, ...]
+
+        # Fix write_ptr position (2 cycles)
+        movup.4 movdn.8
+        # => [B', A, write_ptr+1, ...]
+
+        # Push padding word (4 cycles)
+        padw
+        # => [C, B', A, write_ptr+1, ...]
+
+        # Run RPO permutation (1 cycle)
+        exec.rpo256::permute
+        # => [C', B', A', write_ptr+1, ...]
+    end
 end
 
 #! Moves an arbitrary number of words from the advice stack to memory and asserts it matches the
@@ -232,21 +405,21 @@ end
 #!  - Even `num_words`: 62 + 9 * num_words / 2
 #!  - Odd `num_words`: 79 + 9 * round_down(num_words / 2)
 pub proc pipe_preimage_to_memory
-  # Copies the advice stack data to memory
-  exec.pipe_words_to_memory
-  # => [C, B, A, write_ptr', COMMITMENT, ...]
+    # Copies the advice stack data to memory
+    exec.pipe_words_to_memory
+    # => [C, B, A, write_ptr', COMMITMENT, ...]
 
-  # Leave only the digest on the stack
-  exec.rpo256::squeeze_digest
-  # => [B, write_ptr', COMMITMENT, ...]
+    # Leave only the digest on the stack
+    exec.rpo256::squeeze_digest
+    # => [B, write_ptr', COMMITMENT, ...]
 
-  # Save the write_ptr (2 cycles)
-  movup.4 movdn.8
-  # => [HASH, COMMITMENT, write_ptr', ...]
+    # Save the write_ptr (2 cycles)
+    movup.4 movdn.8
+    # => [HASH, COMMITMENT, write_ptr', ...]
 
-  # Check the COMMITMENT (10 cycles)
-  assert_eqw
-  # => [write_ptr', ...]
+    # Check the COMMITMENT (10 cycles)
+    assert_eqw
+    # => [write_ptr', ...]
 end
 
 #! Moves an even number of words from the advice stack to memory and asserts that their sequential
@@ -264,32 +437,32 @@ end
 #!
 #! Total cycles: 56 + 3 * num_words / 2
 pub proc pipe_double_words_preimage_to_memory
-  # Assert precondition (8 cycles).
-  dup is_odd assertz.err="pipe_double_words_preimage_to_memory: num_words must be even"
-  # => [num_words, write_ptr, COMMITMENT, ...]
-
-  # Compute and move end_ptr (5 cycles)
-  mul.4 dup.1 add swap
-  # => [write_ptr, end_ptr, COMMITMENT, ...]
-
-  # Setup the initial hasher state, which starts with a capacity word `A`.
-  # For us, the capacity word is entirely empty.
-  # (12 cycles).
-  exec.rpo256::init_no_padding
-  # => [C, B, A, write_ptr, end_ptr, COMMITMENT, ...]
-
-  # (9 + 3 * num_words cycles)
-  # (e.g., 25 cycles for 4 words)
-  exec.pipe_double_words_to_memory
-  # => [C, B, A, write_ptr', COMMITMENT, ...]
-
-  # Leave just the digest on the stack (9 cycles).
-  exec.rpo256::squeeze_digest
-  # => [B, write_ptr', COMMITMENT, ...]
-
-  # Move write_ptr out of the way so we can assert the commitment (2 cycles).
-  movup.4 movdn.8
-
-  # Assert the commitment (11 cycles).
-  assert_eqw.err="pipe_double_words_preimage_to_memory: COMMITMENT does not match"
+    # Assert precondition (8 cycles).
+    dup is_odd assertz.err="pipe_double_words_preimage_to_memory: num_words must be even"
+    # => [num_words, write_ptr, COMMITMENT, ...]
+
+    # Compute and move end_ptr (5 cycles)
+    mul.4 dup.1 add swap
+    # => [write_ptr, end_ptr, COMMITMENT, ...]
+
+    # Setup the initial hasher state, which starts with a capacity word `A`.
+    # For us, the capacity word is entirely empty.
+    # (12 cycles).
+    exec.rpo256::init_no_padding
+    # => [C, B, A, write_ptr, end_ptr, COMMITMENT, ...]
+
+    # (9 + 3 * num_words cycles)
+    # (e.g., 25 cycles for 4 words)
+    exec.pipe_double_words_to_memory
+    # => [C, B, A, write_ptr', COMMITMENT, ...]
+
+    # Leave just the digest on the stack (9 cycles).
+    exec.rpo256::squeeze_digest
+    # => [B, write_ptr', COMMITMENT, ...]
+
+    # Move write_ptr out of the way so we can assert the commitment (2 cycles).
+    movup.4 movdn.8
+
+    # Assert the commitment (11 cycles).
+    assert_eqw.err="pipe_double_words_preimage_to_memory: COMMITMENT does not match"
 end
diff --git a/libcore/docs/collections/smt.md b/libcore/docs/collections/smt.md
index 793fc358a2..eec4cfdd16 100644
--- a/libcore/docs/collections/smt.md
+++ b/libcore/docs/collections/smt.md
@@ -4,4 +4,4 @@
 | ----------- | ------------- |
 | get | Returns the value located under the specified key in the Sparse Merkle Tree defined by the<br />specified root.<br /><br />If no values had been previously inserted under the specified key, an empty word (i.e.,<br />[ZERO; 4]) is returned.<br /><br />Inputs:<br />Operand stack: [K, R, ...]<br /><br />Outputs:<br />Operand stack: [V, R, ...]<br /><br />Fails if the tree with the specified root does not exits in the VM's advice provider.<br /><br />Cycles<br />Leaf empty: 66 cycles<br />Leaf single: 115 cycles<br />Leaf multiple, non-empty value: 189 + 3 * pair_count<br />Leaf multiple, empty value: 186 + 3 * pair_count<br /> |
 | set | Inserts the specified value under the specified key in a Sparse Merkle Tree defined by the<br />specified root. If the insert is successful, the old value located under the specified key<br />is returned via the stack.<br /><br />If the VALUE is an empty word (i.e., [ZERO; 4]), the new state of the tree is guaranteed to<br />be equivalent to the state as if the updated value was never inserted.<br /><br />Inputs:<br />Operand stack: [V, K, R, ...]<br />Outputs:<br />Operand stack: [V_old, R_new, ...]<br /><br />Fails if the tree with the specified root does not exits in the VM's advice provider.<br /><br />Cycles<br />Leaf empty<br />removal: 74 cycles<br />insertion: 133 cycles<br />Leaf single<br />removal: 227 cycles<br />insertion (leaf remains single): 205<br />insertion (leaf becomes multiple): X cycles<br />Leaf multiple<br />X cycles<br /> |
-| peek | Emits the "miden::core::collections::smt::smt_peek" event, which pushes onto the advice stack the<br />value associated with the specified key in a Sparse Merkle Tree defined by the specified root.<br /><br />If no value was previously associated with the specified key, [ZERO; 4] is pushed onto the advice<br />stack.<br /><br />Inputs:<br />Operand stack: [KEY, ROOT]<br />Advice stack:  []<br /><br />Outputs:<br />Operand stack: [KEY, ROOT]<br />Advice stack:  [VALUE]<br /><br />Panics if:<br />- the tree with the specified root does not exits in the VM's advice provider.<br /><br />Cycles: 3 cycles<br /> |
+| peek | Emits the "core::collections::smt::smt_peek" event, which pushes onto the advice stack the<br />value associated with the specified key in a Sparse Merkle Tree defined by the specified root.<br /><br />If no value was previously associated with the specified key, [ZERO; 4] is pushed onto the advice<br />stack.<br /><br />Inputs:<br />Operand stack: [KEY, ROOT]<br />Advice stack:  []<br /><br />Outputs:<br />Operand stack: [KEY, ROOT]<br />Advice stack:  [VALUE]<br /><br />Panics if:<br />- the tree with the specified root does not exits in the VM's advice provider.<br /><br />Cycles: 3 cycles<br /> |
diff --git a/libcore/docs/crypto/dsa/ecdsa_k256_keccak.md b/libcore/docs/crypto/dsa/ecdsa_k256_keccak.md
index 05b5708a09..b04347bd63 100644
--- a/libcore/docs/crypto/dsa/ecdsa_k256_keccak.md
+++ b/libcore/docs/crypto/dsa/ecdsa_k256_keccak.md
@@ -4,4 +4,4 @@
 | ----------- | ------------- |
 | verify_ecdsa_k256_keccak | Verifies an secp256k1 ECDSA signature compatible with `miden-crypto::ecdsa_k256_keccak`.<br /><br />This wrapper mirrors the materialization performed in `miden-crypto::ecdsa_k256_keccak`: given<br />a public key commitment and the original message, it reconstructs the calldata expected by the<br />precompile (public key bytes, Keccak256(message), signature). The public key and signature are<br />supplied via the advice stack, and can be obtained with the `ecdsa_k256_keccak` function.<br /><br />Inputs:<br />Operand stack: [PK_COMM, MSG, ...]<br />Advice stack:  [PK[9] \| SIG_BYTES[17] \| ...]<br />Outputs:<br />Operand stack: []<br />Advice stack:  []<br /><br />Local memory layout (element addresses):<br />- locaddr[0 ..9 ] : compressed public key (33 bytes packed as 9 felts)<br />- locaddr[12..20] : message bytes (MSG written as eight u32 limbs)<br />- locaddr[20..28] : keccak256(message) digest (8 felts)<br />- locaddr[28..45] : signature (66 bytes packed as 17 felts)<br /><br />The procedure traps if the public key in the advice stack does not hash to `PK_COMM`;<br />otherwise it returns cleanly after emitting the deferred verification request.<br /> |
 | verify | Verifies an ECDSA signature with pre-hashed message using deferred execution.<br /><br />The caller provides the public key, the pre-hashed message digest, and the signature data in<br />memory. This routine forwards the request to the host precompile and returns the boolean result.<br />In typical flows the digest is obtained from `keccak256::hash_bytes`, but any 32-byte prehash<br />is accepted.<br /><br />Input: `[pk_ptr, digest_ptr, sig_ptr, ...]`<br />Output: `[result, ...]`<br /><br />Where:<br />- `pk_ptr`: word-aligned memory address containing the 33-byte compressed secp256k1 public key<br />- `digest_ptr`: word-aligned memory address containing the 32-byte message digest<br />- `sig_ptr`: word-aligned memory address containing the 66-byte signature<br />- `result`: 1 if the signature is valid, 0 if invalid<br /><br />All data must be stored in memory as packed u32 values (little-endian), with unused bytes<br />in the final u32 set to zero.<br /> |
-| verify_impl | Internal implementation of ECDSA signature verification via deferred computation.<br /><br />Emits an event to trigger the precompile handler, reads the verification result from<br />the advice stack, and computes the commitment and tag for tracking deferred verification.<br /><br />This procedure mimics the `ecdsa_secp256k1::PublicKey::verify_prehash()` function from<br />`miden-crypto`, which takes a pre-hashed message that the caller must provide<br />(e.g. obtained using the keccak256 precompile).<br /><br />Input: `[pk_ptr, digest_ptr, sig_ptr, ...]`<br />Output: `[COMM, TAG, result, ...]`<br /><br />Where:<br />- `pk_ptr`: word-aligned memory address containing 33-byte public key<br />- `digest_ptr`: word-aligned memory address containing 32-byte digest<br />- `sig_ptr`: word-aligned memory address containing 66-byte signature<br />- `COMM`: commitment to calldata computed as<br />`Rpo256(Rpo256(Rpo256(pk) \|\| Rpo256(digest)) \|\| Rpo256(sig))`<br />- `TAG`: `[ECDSA_VERIFY_EVENT, result, 0, 0]`<br />- `result`: 1 if signature is valid, 0 if invalid<br /> |
+| verify_impl | Internal implementation of ECDSA signature verification via deferred computation.<br /><br />Emits an event to trigger the precompile handler, reads the verification result from<br />the advice stack, and computes the commitment and tag for tracking deferred verification.<br /><br />This procedure mimics the `ecdsa_secp256k1::PublicKey::verify_prehash()` function from<br />`miden-crypto`, which takes a pre-hashed message that the caller must provide<br />(e.g. obtained using the keccak256 precompile).<br /><br />Input: `[pk_ptr, digest_ptr, sig_ptr, ...]`<br />Output: `[COMM, TAG, result, ...]`<br /><br />Where:<br />- `pk_ptr`: word-aligned memory address containing 33-byte public key<br />- `digest_ptr`: word-aligned memory address containing 32-byte digest<br />- `sig_ptr`: word-aligned memory address containing 66-byte signature<br />- `COMM`: commitment to calldata computed as<br />`rpo256256(rpo256256(rpo256256(pk) \|\| rpo256256(digest)) \|\| rpo256256(sig))`<br />- `TAG`: `[ECDSA_VERIFY_EVENT, result, 0, 0]`<br />- `result`: 1 if signature is valid, 0 if invalid<br /> |
diff --git a/libcore/docs/crypto/hashes/rpo256.md b/libcore/docs/crypto/hashes/rpo256.md
index aac4c36c51..c60458cd1f 100644
--- a/libcore/docs/crypto/hashes/rpo256.md
+++ b/libcore/docs/crypto/hashes/rpo256.md
@@ -1,4 +1,5 @@
-## miden::core::crypto::hashes::rpo
+
+## miden::core::crypto::hashes::rpo256
 | Procedure | Description |
 | ----------- | ------------- |
 | init_no_padding | Prepares the top of the stack with the hasher initial state.<br /><br />This procedures does not handle padding, therefore, the user is expected to<br />consume an amount of data which is a multiple of the rate (2 words).<br /><br />Input: []<br />Output: [PERM, PERM, PERM, ...]<br /><br />Cycles: 12<br /> |
diff --git a/libcore/docs/mem.md b/libcore/docs/mem.md
index f3db7d15ff..f008a35719 100644
--- a/libcore/docs/mem.md
+++ b/libcore/docs/mem.md
@@ -2,8 +2,9 @@
 ## miden::core::mem
 | Procedure | Description |
 | ----------- | ------------- |
+| memcopy | Copies `n` elements from `read_ptr` to `write_ptr`.<br /><br />If possible, this procedure will copy word-aligned words internally using `memcopy_words`<br />procedure, decreasing the total number of cycles required to make a copy.<br /><br />It is advised to copy small chunks of memory (num_elements < 12) using `memcopy_elements`<br />procedure instead, since it won't have a computation logic overhead.<br /><br />Inputs:  [n, read_ptr, write_ptr]<br />Outputs: []<br /><br />Where:<br />- n is the number of elements which should be copied.<br />- read_ptr is the memory pointer where the elements to copy are stored.<br />- write_ptr is the memory pointer where the elements will be copied.<br /><br />Panics if:<br />- read_ptr and write_ptr are not a u32 values.<br /><br />Total cycles:<br />- if read and write pointers are mutually word-unaligned: 27 + 14 * num_elements<br />- if `n` is less than 12 elements: 27 + 14 * num_elements<br />- if read and write pointers are mutually mutually word-aligned, it's equal to the sum of:<br />- prologue: 19 cycles<br />- prefix handling: 31 + 14 * num_prefix_elements, where num_prefix_elements is<br />the number of elements required to make the read (and write) pointer word-aligned<br />- words handling: 15 + 16 * num_words<br />- suffix handling: 9 + 14 * num_suffix_elements<br />- in total: 74 + 14 * num_prefix_elements + 16 * num_words + 14 * num_suffix_elements<br /> |
 | memcopy_words | Copies `n` words from `read_ptr` to `write_ptr`.<br /><br />`read_ptr` and `write_ptr` pointers *must be* word-aligned.<br /><br />Inputs:  [n, read_ptr, write_ptr]<br />Outputs: []<br /><br />Where:<br />- n is the number of words which should be copied.<br />- read_ptr is the memory pointer where the words to copy are stored.<br />- write_ptr is the memory pointer where the words will be copied.<br /><br />Total cycles: 15 + 16 * num_words<br /> |
-| memcopy_elements | Copies `n` elements from `read_ptr` to `write_ptr`.<br /><br />Inputs:  [n, read_ptr, write_ptr]<br />Outputs: []<br /><br />Where:<br />- n is the number of elements which should be copied.<br />- read_ptr is the memory pointer where the elements to copy are stored.<br />- write_ptr is the memory pointer where the elements will be copied.<br /><br />Total cycles: 7 + 14 * num_elements<br /> |
+| memcopy_elements | Copies `n` elements from `read_ptr` to `write_ptr`.<br /><br />It is advised to copy big chunks of memory (num_elements >= 12) using `memcopy` procedure<br />instead, since it will handle the memory more efficiently.<br /><br />Inputs:  [n, read_ptr, write_ptr]<br />Outputs: []<br /><br />Where:<br />- n is the number of elements which should be copied.<br />- read_ptr is the memory pointer where the elements to copy are stored.<br />- write_ptr is the memory pointer where the elements will be copied.<br /><br />Total cycles: 7 + 14 * num_elements<br /> |
 | pipe_double_words_to_memory | Copies an even number of words from the advice_stack to memory, computing their permutation.<br /><br />Inputs:  [C, B, A, write_ptr, end_ptr]<br />Outputs: [C, B, A, write_ptr]<br /><br />Where:<br />- A is the capacity of the RPO hasher state<br />- C and B are the rate portion of the RPO hasher state<br />- write_ptr is the memory pointer where the words will be copied.<br />- end_ptr is the memory pointer where the copying should end.<br /><br />Notice that the `end_ptr - write_ptr` value must be positive and a multiple of 8.<br /><br />Total cycles: 9 + 6 * num_word_pairs<br /> |
 | pipe_words_to_memory | Copies an arbitrary number of words from the advice stack to memory, computing their permutation.<br /><br />Inputs:  [num_words, write_ptr]<br />Outputs: [C, B, A, write_ptr']<br /><br />Where:<br />- num_words is the number of words which will be copied to the memory.<br />- write_ptr is the memory pointer where the words will be copied.<br />- write_ptr' is the memory pointer to the end of the copied words.<br />- A is the capacity of the RPO hasher state<br />- C and B are the rate portion of the RPO hasher state<br /><br />Total cycles:<br />- Even `num_words`: 43 + 9 * num_words / 2<br />- Odd `num_words`: 60 + 9 * round_down(num_words / 2)<br /> |
 | pipe_preimage_to_memory | Moves an arbitrary number of words from the advice stack to memory and asserts it matches the<br />commitment.<br /><br />Inputs:  [num_words, write_ptr, COMMITMENT]<br />Outputs: [write_ptr']<br /><br />Where:<br />- num_words is the number of words which will be copied to the memory.<br />- write_ptr is the memory pointer where the words will be copied.<br />- write_ptr' is the memory pointer to the end of the copied words.<br />- COMMITMENT is the commitment that the one calculated during this procedure will be compared<br />with.<br /><br />Total cycles:<br />- Even `num_words`: 62 + 9 * num_words / 2<br />- Odd `num_words`: 79 + 9 * round_down(num_words / 2)<br /> |
diff --git a/libcore/tests/mem/mod.rs b/libcore/tests/mem/mod.rs
index 5a912035f8..f05622aa4e 100644
--- a/libcore/tests/mem/mod.rs
+++ b/libcore/tests/mem/mod.rs
@@ -5,6 +5,171 @@ use miden_utils_testing::{
     build_expected_perm, felt_slice_to_ints,
 };
 
+#[test]
+fn test_memcopy() {
+    // prepare testing data
+    use miden_libcore::CoreLibrary;
+
+    let store_testing_data_source = "
+        push.1.2.3.4.1000 mem_storew_be dropw
+        push.5.6.7.8.1004 mem_storew_be dropw
+        push.9.10.11.12.1008 mem_storew_be dropw
+        push.13.14.15.16.1012 mem_storew_be dropw
+        push.17.18.19.20.1016 mem_storew_be dropw
+    ";
+
+    let stdlib = CoreLibrary::default();
+
+    fn executed_process_from_source(source: String, stdlib: &CoreLibrary) -> Process {
+        let assembler = miden_assembly::Assembler::default()
+            .with_dynamic_library(stdlib)
+            .expect("failed to load stdlib");
+        let program: Program =
+            assembler.assemble_program(source).expect("Failed to compile test source.");
+        let mut host = DefaultHost::default().with_library(stdlib).unwrap();
+
+        let mut process = Process::new(
+            program.kernel().clone(),
+            StackInputs::default(),
+            AdviceInputs::default(),
+            ExecutionOptions::default(),
+        );
+
+        process.execute(&program, &mut host).unwrap();
+
+        process
+    }
+
+    // Case 1: read pointer and write pointer are mutually unaligned
+
+    let source = format!(
+        "
+    use miden::core::mem
+
+    begin
+        {store_testing_data_source}
+
+        push.2002.1001.18 exec.mem::memcopy
+    end
+    "
+    );
+
+    let process = executed_process_from_source(source, &stdlib);
+
+    for addr in 2002..2020 {
+        assert_eq!(
+            process.chiplets.memory.get_value(ContextId::root(), addr).unwrap(),
+            Felt::from(addr - 2000),
+            "Address {}",
+            addr
+        );
+    }
+
+    // Case 2: pointers are word-aligned; essentially only the "words" handler will work
+
+    let source = format!(
+        "
+    use miden::core::mem
+
+    begin
+        {store_testing_data_source}
+
+        push.2000.1000.16 exec.mem::memcopy
+    end
+    "
+    );
+
+    let process = executed_process_from_source(source, &stdlib);
+
+    for addr in 2000..2016 {
+        assert_eq!(
+            process.chiplets.memory.get_value(ContextId::root(), addr).unwrap(),
+            Felt::from(addr - 1999),
+            "Address {}",
+            addr
+        );
+    }
+
+    // Case 3: pointers are mutually word-aligned (have equal mis-aligning from the next
+    // word-aligned address); we have enough copy values to run all three handlers: prefix,
+    // words, and suffix
+
+    let source = format!(
+        "
+    use miden::core::mem
+
+    begin
+        {store_testing_data_source}
+
+        push.2001.1001.18 exec.mem::memcopy
+    end
+    "
+    );
+
+    let process = executed_process_from_source(source, &stdlib);
+
+    for addr in 2001..2019 {
+        assert_eq!(
+            process.chiplets.memory.get_value(ContextId::root(), addr).unwrap(),
+            Felt::from(addr - 1999),
+            "Address {}",
+            addr
+        );
+    }
+
+    // Case 4: pointers are mutually word-aligned (have equal mis-aligning from the next
+    // word-aligned address); we have enough copy values to run only prefix and suffix handlers.
+
+    let source = format!(
+        "
+    use miden::core::mem
+
+    begin
+        {store_testing_data_source}
+
+        push.2001.1001.5 exec.mem::memcopy
+    end
+    "
+    );
+
+    let process = executed_process_from_source(source, &stdlib);
+
+    for addr in 2001..2006 {
+        assert_eq!(
+            process.chiplets.memory.get_value(ContextId::root(), addr).unwrap(),
+            Felt::from(addr - 1999),
+            "Address {}",
+            addr
+        );
+    }
+
+    // Case 5: pointers are mutually word-aligned (have equal mis-aligning from the next
+    // word-aligned address); we have enough copy values to run only suffix handler.
+
+    let source = format!(
+        "
+    use miden::core::mem
+
+    begin
+        {store_testing_data_source}
+
+        push.2001.1001.2 exec.mem::memcopy
+    end
+    "
+    );
+
+    let process = executed_process_from_source(source, &stdlib);
+
+    for addr in 2001..2003 {
+        assert_eq!(
+            process.chiplets.memory.get_value(ContextId::root(), addr).unwrap(),
+            Felt::from(addr - 1999),
+            "Address {}",
+            addr
+        );
+    }
+}
+
 #[test]
 fn test_memcopy_words() {
     use miden_libcore::CoreLibrary;