@@ -41,30 +41,44 @@ unsafe fn read_usize_unaligned(x: *const usize) -> usize {
     core::mem::transmute(x_read)
 }
 
+/// Loads a `T`-sized chunk from `src` into `dst` at offset `offset`, if that does not exceed
+/// `load_sz`. The offset pointers must both be `T`-aligned. Returns the new offset, advanced by the
+/// chunk size if a load happened.
+#[cfg(not(feature = "mem-unaligned"))]
+#[inline(always)]
+unsafe fn load_chunk_aligned<T: Copy>(
+    src: *const usize,
+    dst: *mut usize,
+    load_sz: usize,
+    offset: usize,
+) -> usize {
+    let chunk_sz = core::mem::size_of::<T>();
+    if (load_sz & chunk_sz) != 0 {
+        // Since we are doing the large reads first, this must still be aligned to `chunk_sz`.
+        *dst.wrapping_byte_add(offset).cast::<T>() = *src.wrapping_byte_add(offset).cast::<T>();
+        offset | chunk_sz
+    } else {
+        offset
+    }
+}
+
 /// Load `load_sz` many bytes from `src`, which must be usize-aligned. Acts as if we did a `usize`
 /// read with the out-of-bounds part filled with 0s.
 /// `load_sz` must be strictly less than `WORD_SIZE`.
 #[cfg(not(feature = "mem-unaligned"))]
 #[inline(always)]
 unsafe fn load_aligned_partial(src: *const usize, load_sz: usize) -> usize {
     debug_assert!(load_sz < WORD_SIZE);
+    // We can read up to 7 bytes here, which is enough for WORD_SIZE of 8
+    // (since `load_sz < WORD_SIZE`).
+    const { assert!(WORD_SIZE <= 8) };
 
     let mut i = 0;
     let mut out = 0usize;
-    macro_rules! load_prefix {
-        ($($ty:ty)+) => {$(
-            let chunk_sz = core::mem::size_of::<$ty>();
-            if (load_sz & chunk_sz) != 0 {
-                // Since we are doing the large reads first, this must still be aligned to `chunk_sz`.
-                *(&raw mut out).wrapping_byte_add(i).cast::<$ty>() = *src.wrapping_byte_add(i).cast::<$ty>();
-                i |= chunk_sz;
-            }
-        )+};
-    }
-    // We can read up to 7 bytes here, which is enough for WORD_SIZE of 8
-    // (since `load_size < WORD_SIZE`).
-    const { assert!(WORD_SIZE <= 8) };
-    load_prefix!(u32 u16 u8);
+    // We load in decreasing order, so the pointers remain sufficiently aligned for the next step.
+    i = load_chunk_aligned::<u32>(src, &raw mut out, load_sz, i);
+    i = load_chunk_aligned::<u16>(src, &raw mut out, load_sz, i);
+    i = load_chunk_aligned::<u8>(src, &raw mut out, load_sz, i);
     debug_assert!(i == load_sz);
     out
 }
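
To make the chunk decomposition concrete, here is a small standalone sketch, not part of the patch: it mirrors the order `load_aligned_partial` uses (largest chunk first, so each copy stays aligned) with a safe, slice-based helper. The name `word_from_prefix` and the `main` driver are made up for this example, and the worked values assume a 64-bit target where `WORD_SIZE` is 8.

/// Illustrative only: build a `usize` from the first `load_sz` bytes of `src`,
/// zero-filling the rest, by visiting chunk sizes 4, 2, 1 (largest first).
fn word_from_prefix(src: &[u8], load_sz: usize) -> usize {
    const WORD: usize = core::mem::size_of::<usize>();
    assert!(load_sz < WORD && load_sz <= src.len());
    let mut out = [0u8; WORD];
    let mut offset = 0;
    for chunk_sz in [4, 2, 1] {
        // Same test as `load_chunk_aligned`: bit `chunk_sz` of `load_sz` decides
        // whether a chunk of that size gets copied at `offset`.
        if load_sz & chunk_sz != 0 {
            out[offset..offset + chunk_sz].copy_from_slice(&src[offset..offset + chunk_sz]);
            // Larger chunks came first, so `offset` stays a multiple of `chunk_sz`.
            offset |= chunk_sz;
        }
    }
    // Every set bit of `load_sz` was consumed exactly once.
    assert_eq!(offset, load_sz);
    usize::from_ne_bytes(out)
}

fn main() {
    // load_sz = 5 = 4 + 1: one 4-byte copy at offset 0, then one byte at offset 4.
    let bytes = [1u8, 2, 3, 4, 5, 6, 7, 8];
    let word = word_from_prefix(&bytes, 5);
    assert_eq!(&word.to_ne_bytes()[..], &[1u8, 2, 3, 4, 5, 0, 0, 0][..]);
}

Factoring the old `load_prefix!` macro into the generic `load_chunk_aligned` keeps a single copy of the pointer arithmetic that both partial-load helpers can share.
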
@@ -77,25 +91,19 @@ unsafe fn load_aligned_partial(src: *const usize, load_sz: usize) -> usize {
 #[inline(always)]
 unsafe fn load_aligned_end_partial(src: *const usize, load_sz: usize) -> usize {
     debug_assert!(load_sz < WORD_SIZE);
+    // We can read up to 7 bytes here, which is enough for WORD_SIZE of 8
+    // (since `load_sz < WORD_SIZE`).
+    const { assert!(WORD_SIZE <= 8) };
 
     let mut i = 0;
     let mut out = 0usize;
-    let start_shift = WORD_SIZE - load_sz;
-    macro_rules! load_prefix {
-        ($($ty:ty)+) => {$(
-            let chunk_sz = core::mem::size_of::<$ty>();
-            if (load_sz & chunk_sz) != 0 {
-                // Since we are doing the small reads first, `start_shift + i` has in the mean
-                // time become aligned to `chunk_sz`.
-                *(&raw mut out).wrapping_byte_add(start_shift + i).cast::<$ty>() = *src.wrapping_byte_add(start_shift + i).cast::<$ty>();
-                i |= chunk_sz;
-            }
-        )+};
-    }
-    // We can read up to 7 bytes here, which is enough for WORD_SIZE of 8
-    // (since `load_size < WORD_SIZE`).
-    const { assert!(WORD_SIZE <= 8) };
-    load_prefix!(u8 u16 u32);
+    // Obtain pointers pointing to the beginning of the range we want to load.
+    let src_shifted = src.wrapping_byte_add(WORD_SIZE - load_sz);
+    let out_shifted = (&raw mut out).wrapping_byte_add(WORD_SIZE - load_sz);
+    // We load in increasing order, so by the time we reach `u16` things are 2-aligned etc.
+    i = load_chunk_aligned::<u8>(src_shifted, out_shifted, load_sz, i);
+    i = load_chunk_aligned::<u16>(src_shifted, out_shifted, load_sz, i);
+    i = load_chunk_aligned::<u32>(src_shifted, out_shifted, load_sz, i);
     debug_assert!(i == load_sz);
     out
 }
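
A matching sketch for the end-partial case, again illustrative rather than part of the patch: because the copy starts `WORD_SIZE - load_sz` bytes into the word, the smallest chunk has to go first, and each completed chunk raises the alignment for the next one. The helper name `word_from_suffix` is invented for this example; the worked value below assumes an 8-byte word.

/// Illustrative only: build a `usize` from the last `load_sz` bytes of the word
/// stored in `src`, zero-filling the front, by visiting chunk sizes 1, 2, 4.
fn word_from_suffix(src: &[u8], load_sz: usize) -> usize {
    const WORD: usize = core::mem::size_of::<usize>();
    assert!(load_sz < WORD && src.len() >= WORD);
    // Where the wanted bytes begin inside the word (`WORD_SIZE - load_sz` in the patch).
    let start = WORD - load_sz;
    let mut out = [0u8; WORD];
    let mut offset = 0;
    for chunk_sz in [1, 2, 4] {
        if load_sz & chunk_sz != 0 {
            // All smaller chunks were consumed already, so `start + offset` is a
            // multiple of `chunk_sz`, matching the increasing-order calls above.
            out[start + offset..start + offset + chunk_sz]
                .copy_from_slice(&src[start + offset..start + offset + chunk_sz]);
            offset |= chunk_sz;
        }
    }
    assert_eq!(offset, load_sz);
    usize::from_ne_bytes(out)
}

With `load_sz = 5` and an 8-byte word, this copies one byte at index 3 and then a 4-byte chunk at index 4, leaving indices 0 through 2 zero, which mirrors the access pattern the new `load_aligned_end_partial` performs.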