
Commit db8c751

Faster SIMD alternative
1 parent 4658907 commit db8c751

2 files changed, +167 −52 lines changed


README.md

+1 −1
@@ -91,7 +91,7 @@ Performance is reasonable even on older hardware, for example a 2011 MacBook Pro
 | 19 | [Linen Layout](https://adventofcode.com/2024/day/19) | [Source](src/year2024/day19.rs) | 118 |
 | 20 | [Race Condition](https://adventofcode.com/2024/day/20) | [Source](src/year2024/day20.rs) | 1038 |
 | 21 | [Keypad Conundrum](https://adventofcode.com/2024/day/21) | [Source](src/year2024/day21.rs) | 19 |
-| 22 | [Monkey Market](https://adventofcode.com/2024/day/22) | [Source](src/year2024/day22.rs) | 1216 |
+| 22 | [Monkey Market](https://adventofcode.com/2024/day/22) | [Source](src/year2024/day22.rs) | 727 |
 | 23 | [LAN Party](https://adventofcode.com/2024/day/23) | [Source](src/year2024/day23.rs) | 43 |
 | 24 | [Crossed Wires](https://adventofcode.com/2024/day/24) | [Source](src/year2024/day24.rs) | 23 |
 | 25 | [Code Chronicle](https://adventofcode.com/2024/day/25) | [Source](src/year2024/day25.rs) | 8 |

src/year2024/day22.rs

+166 −51
@@ -17,93 +17,208 @@
 //! by 5 bits and storing in an array of 2²⁰ = 1048675 elements. Multiplication on modern
 //! processors is cheap (and several instructions can issue at once) but random memory access
 //! is expensive.
+//!
+//! A SIMD variant processes 8 hashes at a time, taking about 60% of the time of the scalar version.
+//! The bottleneck is that disjoint indices must be written in sequence reducing the amount of work
+//! that can be parallelized.
 use crate::util::parse::*;
 use crate::util::thread::*;
 use std::sync::Mutex;
 
-type Input = (usize, u16);
+type Input = (u64, u16);
 
 struct Exclusive {
-    part_one: usize,
+    part_one: u64,
     part_two: Vec<u16>,
 }
 
 pub fn parse(input: &str) -> Input {
-    let numbers: Vec<_> = input.iter_unsigned().collect();
     let mutex = Mutex::new(Exclusive { part_one: 0, part_two: vec![0; 130321] });
 
-    // Use as many cores as possible to parallelize the remaining search.
-    spawn_parallel_iterator(&numbers, |iter| worker(&mutex, iter));
+    #[cfg(not(feature = "simd"))]
+    scalar::parallel(input, &mutex);
+    #[cfg(feature = "simd")]
+    simd::parallel(input, &mutex);
 
     let Exclusive { part_one, part_two } = mutex.into_inner().unwrap();
     (part_one, *part_two.iter().max().unwrap())
 }
 
-pub fn part1(input: &Input) -> usize {
+pub fn part1(input: &Input) -> u64 {
     input.0
 }
 
 pub fn part2(input: &Input) -> u16 {
     input.1
 }
 
-fn worker(mutex: &Mutex<Exclusive>, iter: ParIter<'_, usize>) {
-    let mut part_one = 0;
-    let mut part_two = vec![0; 130321];
-    let mut seen = vec![u16::MAX; 130321];
+#[cfg(not(feature = "simd"))]
+mod scalar {
+    use super::*;
 
-    for (id, number) in iter.enumerate() {
-        let id = id as u16;
+    // Use as many cores as possible to parallelize the remaining search.
+    pub(super) fn parallel(input: &str, mutex: &Mutex<Exclusive>) {
+        let numbers: Vec<_> = input.iter_unsigned().collect();
+        spawn_parallel_iterator(&numbers, |iter| worker(mutex, iter));
+    }
 
-        let zeroth = *number;
-        let first = hash(zeroth);
-        let second = hash(first);
-        let third = hash(second);
+    fn worker(mutex: &Mutex<Exclusive>, iter: ParIter<'_, u32>) {
+        let mut part_one = 0;
+        let mut part_two = vec![0; 130321];
+        let mut seen = vec![u16::MAX; 130321];
+
+        for (id, number) in iter.enumerate() {
+            let id = id as u16;
+
+            let zeroth = *number;
+            let first = hash(zeroth);
+            let second = hash(first);
+            let third = hash(second);
+
+            let mut a;
+            let mut b = to_index(zeroth, first);
+            let mut c = to_index(first, second);
+            let mut d = to_index(second, third);
+
+            let mut number = third;
+            let mut previous = third % 10;
+
+            for _ in 3..2000 {
+                number = hash(number);
+                let price = number % 10;
+
+                // Compute index into the array.
+                (a, b, c, d) = (b, c, d, to_index(previous, price));
+                let index = (6859 * a + 361 * b + 19 * c + d) as usize;
+                previous = price;
+
+                // Only sell the first time we see a sequence.
+                // By storing the id in the array we don't need to zero every iteration which is faster.
+                if seen[index] != id {
+                    part_two[index] += price as u16;
+                    seen[index] = id;
+                }
+            }
 
-        let mut a;
-        let mut b = to_index(zeroth, first);
-        let mut c = to_index(first, second);
-        let mut d = to_index(second, third);
+            part_one += number as u64;
+        }
 
-        let mut number = third;
-        let mut previous = third % 10;
+        // Merge into global results.
+        let mut exclusive = mutex.lock().unwrap();
+        exclusive.part_one += part_one;
+        exclusive.part_two.iter_mut().zip(part_two).for_each(|(a, b)| *a += b);
+    }
+
+    /// Compute next secret number using a
+    /// [Xorshift LFSR](https://en.wikipedia.org/wiki/Linear-feedback_shift_register#Xorshift_LFSRs).
+    fn hash(mut n: u32) -> u32 {
+        n = (n ^ (n << 6)) & 0xffffff;
+        n = (n ^ (n >> 5)) & 0xffffff;
+        (n ^ (n << 11)) & 0xffffff
+    }
 
-        for _ in 3..2000 {
-            number = hash(number);
-            let price = number % 10;
+    /// Convert -9..9 to 0..18.
+    fn to_index(previous: u32, current: u32) -> u32 {
+        9 + current % 10 - previous % 10
+    }
+}
 
-            // Compute index into the array.
-            (a, b, c, d) = (b, c, d, 9 + price - previous);
-            let index = 6859 * a + 361 * b + 19 * c + d;
+#[cfg(feature = "simd")]
+mod simd {
+    use super::*;
+    use std::simd::Simd;
+    use std::simd::num::SimdUint as _;
 
-            // Only sell the first time we see a sequence.
-            // By storing the id in the array we don't need to zero every iteration which is faster.
-            if seen[index] != id {
-                part_two[index] += price as u16;
-                seen[index] = id;
+    type Vector = Simd<u32, 8>;
+
+    pub(super) fn parallel(input: &str, mutex: &Mutex<Exclusive>) {
+        let mut numbers: Vec<_> = input.iter_unsigned().collect();
+
+        // Add zero elements so that size is a multiple of 8.
+        // Zero always hashes to zero and does not contribute to score.
+        numbers.resize(numbers.len().next_multiple_of(8), 0);
+        let chunks: Vec<_> = numbers.chunks_exact(8).collect();
+
+        spawn_parallel_iterator(&chunks, |iter| worker(mutex, iter));
+    }
+
+    /// Similar to scalar version but using SIMD vectors instead.
+    /// 8 lanes is the sweet spot for performance as the bottleneck is the scalar loop writing
+    /// to disjoint indices after each step.
+    fn worker(mutex: &Mutex<Exclusive>, iter: ParIter<'_, &[u32]>) {
+        let ten = Simd::splat(10);
+        let x = Simd::splat(6859);
+        let y = Simd::splat(361);
+        let z = Simd::splat(19);
+
+        let mut part_one = 0;
+        let mut part_two = vec![0; 130321];
+
+        for slice in iter {
+            // Each lane uses a different bit to track if a sequence has been seen before.
+            let mut seen = vec![u8::MAX; 130321];
+
+            let zeroth = Simd::from_slice(slice);
+            let first = hash(zeroth);
+            let second = hash(first);
+            let third = hash(second);
+
+            let mut a;
+            let mut b = to_index(zeroth, first);
+            let mut c = to_index(first, second);
+            let mut d = to_index(second, third);
+
+            let mut number = third;
+            let mut previous = third % ten;
+
+            for _ in 3..2000 {
+                number = hash(number);
+                let prices = number % ten;
+
+                // Compute index into the array.
+                (a, b, c, d) = (b, c, d, to_index(previous, prices));
+                let indices = x * a + y * b + z * c + d;
+                previous = prices;
+
+                // Only sell the first time we see a sequence.
+                let indices = indices.to_array();
+                let prices = prices.to_array();
+
+                for i in 0..8 {
+                    let index = indices[i] as usize;
+
+                    // Avoid branching to improve speed, instead multiply by either 0 or 1,
+                    // depending if sequence has been seen before or not.
+                    let bit = (seen[index] >> i) & 1;
+                    seen[index] &= !(1 << i);
+
+                    part_two[index] += prices[i] as u16 * bit as u16;
+                }
             }
 
-            previous = price;
+            part_one += number.reduce_sum() as u64;
         }
 
-        part_one += number;
+        // Merge into global results.
+        let mut exclusive = mutex.lock().unwrap();
+        exclusive.part_one += part_one;
+        exclusive.part_two.iter_mut().zip(part_two).for_each(|(a, b)| *a += b);
     }
 
-    // Merge into global results.
-    let mut exclusive = mutex.lock().unwrap();
-    exclusive.part_one += part_one;
-    exclusive.part_two.iter_mut().zip(part_two).for_each(|(a, b)| *a += b);
-}
-
-/// Compute next secret number using a
-/// [Xorshift LFSR](https://en.wikipedia.org/wiki/Linear-feedback_shift_register#Xorshift_LFSRs).
-fn hash(mut n: usize) -> usize {
-    n = (n ^ (n << 6)) & 0xffffff;
-    n = (n ^ (n >> 5)) & 0xffffff;
-    (n ^ (n << 11)) & 0xffffff
-}
+    /// SIMD vector arguments are passed in memory so inline functions to avoid slow transfers
+    /// to and from memory.
+    #[inline]
+    fn hash(mut n: Vector) -> Vector {
+        let mask = Simd::splat(0xffffff);
+        n = (n ^ (n << 6)) & mask;
+        n = (n ^ (n >> 5)) & mask;
+        (n ^ (n << 11)) & mask
+    }
 
-/// Convert -9..9 to 0..18.
-fn to_index(previous: usize, current: usize) -> usize {
-    9 + current % 10 - previous % 10
+    #[inline]
+    fn to_index(previous: Vector, current: Vector) -> Vector {
+        let nine = Simd::splat(9);
+        let ten = Simd::splat(10);
+        nine + (current % ten) - (previous % ten)
+    }
 }
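
The module comment describes the core scalar technique: a xorshift LFSR generates each secret number, the last four price differences (each in -9..9, shifted to 0..18) are packed into a base-19 index into a 19⁴ = 130321 element array, and storing the buyer id in a `seen` array avoids re-zeroing it between buyers. Below is a minimal standalone sketch of that idea; the `sell` and `main` names are illustrative only and the sketch does not use the repository's `util` helpers.

// Sketch only: a scalar walk through one buyer's 2000 secret numbers, assuming the
// same hash, price and index scheme as the diff above. Returns the 2000th secret
// (summed for part one); bananas per difference sequence accumulate in `part_two`.
fn hash(mut n: u32) -> u32 {
    n = (n ^ (n << 6)) & 0xffffff;
    n = (n ^ (n >> 5)) & 0xffffff;
    (n ^ (n << 11)) & 0xffffff
}

/// Shift a price difference from -9..=9 into 0..=18.
fn to_index(previous: u32, current: u32) -> u32 {
    9 + current % 10 - previous % 10
}

fn sell(secret: u32, id: u16, part_two: &mut [u16], seen: &mut [u16]) -> u64 {
    let (mut a, mut b, mut c, mut d) = (0u32, 0, 0, 0);
    let mut number = secret;
    let mut previous = secret % 10;

    for round in 0..2000 {
        number = hash(number);
        let price = number % 10;

        // Slide the window of the last four differences and pack it as a base-19 index.
        (a, b, c, d) = (b, c, d, to_index(previous, price));
        let index = (6859 * a + 361 * b + 19 * c + d) as usize; // 19³, 19², 19, 1
        previous = price;

        // Sell only once the window is full and only the first time this buyer sees it.
        if round >= 3 && seen[index] != id {
            part_two[index] += price as u16;
            seen[index] = id;
        }
    }

    number as u64
}

fn main() {
    let mut part_two = vec![0u16; 130321];
    let mut seen = vec![u16::MAX; 130321];

    // A few made-up initial secrets standing in for the puzzle input.
    let part_one: u64 = [1u32, 10, 100, 2024]
        .iter()
        .enumerate()
        .map(|(id, &secret)| sell(secret, id as u16, &mut part_two, &mut seen))
        .sum();

    let most_bananas = part_two.iter().max().unwrap();
    println!("part one: {part_one}, most bananas: {most_bananas}");
}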
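
The SIMD worker replaces the scalar `if seen[index] != id` branch with branchless per-lane bookkeeping: each of the eight buyers in a chunk owns one bit of a shared `seen` byte, and the price is multiplied by that bit so repeat visits add zero. Here is a small stable-Rust sketch of just that trick, using the invented helper name `record_sale`:

// Sketch of the branchless "seen" bookkeeping from the SIMD worker, shown with plain
// integers. Lane `i` of an eight-buyer chunk owns bit `i` of the shared byte: the first
// visit reads the bit as 1 and clears it, so later visits multiply the price by 0.
fn record_sale(seen: &mut [u8], part_two: &mut [u16], index: usize, lane: usize, price: u16) {
    let bit = (seen[index] >> lane) & 1;    // 1 on the first visit, 0 afterwards
    seen[index] &= !(1u8 << lane);          // clear this lane's bit for the index
    part_two[index] += price * bit as u16;  // branchless: adds the price only once per lane
}

fn main() {
    // One shared entry per sequence index; all 8 bits start set (u8::MAX means "unseen").
    let mut seen = vec![u8::MAX; 130321];
    let mut part_two = vec![0u16; 130321];

    record_sale(&mut seen, &mut part_two, 42, 3, 7); // lane 3 sells 7 bananas at index 42
    record_sale(&mut seen, &mut part_two, 42, 3, 9); // repeat visit by lane 3 adds nothing
    record_sale(&mut seen, &mut part_two, 42, 4, 5); // a different lane still sells once

    assert_eq!(part_two[42], 12);
    println!("bananas at index 42: {}", part_two[42]);
}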
