Skip to content

Commit cf54e38

Browse files
committed
allocating fold with std::ops::Add::add
1 parent 20f76e0 commit cf54e38

File tree

1 file changed

+4
-7
lines changed

1 file changed

+4
-7
lines changed

crates/core_simd/examples/dot_product.rs

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -112,34 +112,31 @@ pub fn dot_prod_simd_3(a: &[f32], b: &[f32]) -> f32 {
112112
// Finally, we present an iterator version for handling remainders in a scalar fashion at the end of the loop.
113113
// Unfortunately, this is allocating 1 `XMM` register on the order of `~len(a)` - we'll see how we can get around it in the
114114
// next example.
115-
use std::ops::Add;
116115
pub fn dot_prod_simd_4(a: &[f32], b: &[f32]) -> f32 {
117116
let mut sum = a
118117
.array_chunks::<4>()
119118
.map(|&a| f32x4::from_array(a))
120119
.zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b)))
121120
.map(|(a, b)| a * b)
122-
.fold(f32x4::splat(0.), std::ops::Add)
121+
.fold(f32x4::splat(0.0), std::ops::Add::add)
123122
.reduce_sum();
124123
let remain = a.len() - (a.len() % 4);
125124
sum += a[remain..]
126125
.iter()
127126
.zip(&b[remain..])
128127
.map(|(a, b)| a * b)
129-
.sum();
128+
.sum::<f32>();
130129
sum
131130
}
132131

133132
// This version allocates a single `XMM` register for accumulation, and the folds don't allocate on top of that.
134133
// Notice the use of `mul_add`, which can do a multiply and an add operation per iteration.
135134
pub fn dot_prod_simd_5(a: &[f32], b: &[f32]) -> f32 {
136-
let mut sum = a
137-
.array_chunks::<4>()
135+
a.array_chunks::<4>()
138136
.map(|&a| f32x4::from_array(a))
139137
.zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b)))
140138
.fold(f32x4::splat(0.), |acc, (a, b)| acc.mul_add(a, b))
141-
.reduce_sum();
142-
sum
139+
.reduce_sum()
143140
}
144141

145142
fn main() {

0 commit comments

Comments
 (0)