Skip to content

Commit 130c8a0

Browse files
committed
turn load_prefix macro into helper function
1 parent 23207fd commit 130c8a0

File tree

1 file changed

+38
-30
lines changed

1 file changed

+38
-30
lines changed

src/mem/impls.rs

+38-30
Original file line numberDiff line numberDiff line change
@@ -41,30 +41,44 @@ unsafe fn read_usize_unaligned(x: *const usize) -> usize {
4141
core::mem::transmute(x_read)
4242
}
4343

44+
/// Copies one `T`-sized chunk from `src + offset` to `dst + offset`, but only when the
/// `size_of::<T>()` bit is set in `load_sz`. Both pointers must be `T`-aligned at `offset`.
/// Returns the offset advanced past the chunk if a copy happened, unchanged otherwise.
#[cfg(not(feature = "mem-unaligned"))]
#[inline(always)]
unsafe fn load_chunk_aligned<T: Copy>(
    src: *const usize,
    dst: *mut usize,
    load_sz: usize,
    offset: usize,
) -> usize {
    let sz = core::mem::size_of::<T>();
    // Each power-of-two chunk size corresponds to one bit of `load_sz`, so every
    // chunk size is processed at most once across the callers' sequence of calls.
    if load_sz & sz == 0 {
        return offset;
    }
    // The callers invoke this for chunk sizes in an order that keeps `offset`
    // aligned to `sz` whenever this branch is reached.
    let from = src.wrapping_byte_add(offset).cast::<T>();
    let to = dst.wrapping_byte_add(offset).cast::<T>();
    *to = *from;
    // `sz` is a power of two not yet present in `offset`, so OR-ing is the same as adding.
    offset | sz
}
64+
4465
/// Load `load_sz` many bytes from `src`, which must be usize-aligned. Acts as if we did a `usize`
4566
/// read with the out-of-bounds part filled with 0s.
4667
/// `load_sz` be strictly less than `WORD_SIZE`.
4768
#[cfg(not(feature = "mem-unaligned"))]
4869
#[inline(always)]
4970
unsafe fn load_aligned_partial(src: *const usize, load_sz: usize) -> usize {
5071
debug_assert!(load_sz < WORD_SIZE);
72+
// We can read up to 7 bytes here, which is enough for WORD_SIZE of 8
73+
// (since `load_sz < WORD_SIZE`).
74+
const { assert!(WORD_SIZE <= 8) };
5175

5276
let mut i = 0;
5377
let mut out = 0usize;
54-
macro_rules! load_prefix {
55-
($($ty:ty)+) => {$(
56-
let chunk_sz = core::mem::size_of::<$ty>();
57-
if (load_sz & chunk_sz) != 0 {
58-
// Since we are doing the large reads first, this must still be aligned to `chunk_sz`.
59-
*(&raw mut out).wrapping_byte_add(i).cast::<$ty>() = *src.wrapping_byte_add(i).cast::<$ty>();
60-
i |= chunk_sz;
61-
}
62-
)+};
63-
}
64-
// We can read up to 7 bytes here, which is enough for WORD_SIZE of 8
65-
// (since `load_size < WORD_SIZE`).
66-
const { assert!(WORD_SIZE <= 8) };
67-
load_prefix!(u32 u16 u8);
78+
// We load in decreasing order, so the pointers remain sufficiently aligned for the next step.
79+
i = load_chunk_aligned::<u32>(src, &raw mut out, load_sz, i);
80+
i = load_chunk_aligned::<u16>(src, &raw mut out, load_sz, i);
81+
i = load_chunk_aligned::<u8>(src, &raw mut out, load_sz, i);
6882
debug_assert!(i == load_sz);
6983
out
7084
}
@@ -77,25 +91,19 @@ unsafe fn load_aligned_partial(src: *const usize, load_sz: usize) -> usize {
7791
#[inline(always)]
7892
unsafe fn load_aligned_end_partial(src: *const usize, load_sz: usize) -> usize {
7993
debug_assert!(load_sz < WORD_SIZE);
94+
// We can read up to 7 bytes here, which is enough for WORD_SIZE of 8
95+
// (since `load_sz < WORD_SIZE`).
96+
const { assert!(WORD_SIZE <= 8) };
8097

8198
let mut i = 0;
8299
let mut out = 0usize;
83-
let start_shift = WORD_SIZE - load_sz;
84-
macro_rules! load_prefix {
85-
($($ty:ty)+) => {$(
86-
let chunk_sz = core::mem::size_of::<$ty>();
87-
if (load_sz & chunk_sz) != 0 {
88-
// Since we are doing the small reads first, `start_shift + i` has in the mean
89-
// time become aligned to `chunk_sz`.
90-
*(&raw mut out).wrapping_byte_add(start_shift + i).cast::<$ty>() = *src.wrapping_byte_add(start_shift + i).cast::<$ty>();
91-
i |= chunk_sz;
92-
}
93-
)+};
94-
}
95-
// We can read up to 7 bytes here, which is enough for WORD_SIZE of 8
96-
// (since `load_size < WORD_SIZE`).
97-
const { assert!(WORD_SIZE <= 8) };
98-
load_prefix!(u8 u16 u32);
100+
// Obtain pointers pointing to the beginning of the range we want to load.
101+
let src_shifted = src.wrapping_byte_add(WORD_SIZE - load_sz);
102+
let out_shifted = (&raw mut out).wrapping_byte_add(WORD_SIZE - load_sz);
103+
// We load in increasing order, so by the time we reach `u16` things are 2-aligned etc.
104+
i = load_chunk_aligned::<u8>(src_shifted, out_shifted, load_sz, i);
105+
i = load_chunk_aligned::<u16>(src_shifted, out_shifted, load_sz, i);
106+
i = load_chunk_aligned::<u32>(src_shifted, out_shifted, load_sz, i);
99107
debug_assert!(i == load_sz);
100108
out
101109
}

0 commit comments

Comments
 (0)