|
17 | 17 | //! by 5 bits and storing in an array of 2²⁰ = 1048576 elements. Multiplication on modern
|
18 | 18 | //! processors is cheap (and several instructions can issue at once) but random memory access
|
19 | 19 | //! is expensive.
|
| 20 | +//! |
| 21 | +//! A SIMD variant processes 8 hashes at a time, taking about 60% of the time of the scalar version. |
| 22 | +//! The bottleneck is that disjoint indices must be written in sequence, reducing the amount of work |
| 23 | +//! that can be parallelized. |
20 | 24 | use crate::util::parse::*;
|
21 | 25 | use crate::util::thread::*;
|
22 | 26 | use std::sync::Mutex;
|
23 | 27 |
|
24 |
/// Answer pair shared by both parts: (part one sum as `u64`, part two maximum as `u16`).
type Input = (u64, u16);
25 | 29 |
|
/// Per-run results shared between worker threads, protected by a `Mutex`.
struct Exclusive {
    /// Sum of each input number's 2000th generated secret number.
    part_one: u64,
    /// Accumulated price for each of the 19⁴ = 130321 possible delta sequences.
    part_two: Vec<u16>,
}
|
30 | 34 |
|
/// Parses the input and eagerly computes both answers using all available cores.
///
/// The scalar or SIMD search implementation is selected at compile time by the
/// `simd` cargo feature.
pub fn parse(input: &str) -> Input {
    // 19⁴ = 130321 buckets, one per possible sequence of four price deltas (each in 0..19).
    let mutex = Mutex::new(Exclusive { part_one: 0, part_two: vec![0; 130321] });

    #[cfg(not(feature = "simd"))]
    scalar::parallel(input, &mutex);
    #[cfg(feature = "simd")]
    simd::parallel(input, &mutex);

    // All worker threads have joined by now, so reclaim exclusive ownership of the results.
    let Exclusive { part_one, part_two } = mutex.into_inner().unwrap();
    (part_one, *part_two.iter().max().unwrap())
}
|
41 | 46 |
|
42 |
| -pub fn part1(input: &Input) -> usize { |
| 47 | +pub fn part1(input: &Input) -> u64 { |
43 | 48 | input.0
|
44 | 49 | }
|
45 | 50 |
|
46 | 51 | pub fn part2(input: &Input) -> u16 {
|
47 | 52 | input.1
|
48 | 53 | }
|
49 | 54 |
|
#[cfg(not(feature = "simd"))]
mod scalar {
    use super::*;

    // Use as many cores as possible to parallelize the remaining search.
    pub(super) fn parallel(input: &str, mutex: &Mutex<Exclusive>) {
        let numbers: Vec<_> = input.iter_unsigned().collect();
        spawn_parallel_iterator(&numbers, |iter| worker(mutex, iter));
    }

    /// Processes a subset of the input numbers, accumulating results locally and merging
    /// them into the shared `Exclusive` state once at the end to minimize lock contention.
    fn worker(mutex: &Mutex<Exclusive>, iter: ParIter<'_, u32>) {
        let mut part_one = 0;
        let mut part_two = vec![0; 130321];
        // `u16::MAX` acts as a "never seen" sentinel.
        // NOTE(review): assumes each worker handles fewer than u16::MAX numbers so that
        // no real id collides with the sentinel — TODO confirm against input sizes.
        let mut seen = vec![u16::MAX; 130321];

        for (id, number) in iter.enumerate() {
            let id = id as u16;

            // Generate the first three hashes to seed the sliding window of four deltas.
            let zeroth = *number;
            let first = hash(zeroth);
            let second = hash(first);
            let third = hash(second);

            let mut a;
            let mut b = to_index(zeroth, first);
            let mut c = to_index(first, second);
            let mut d = to_index(second, third);

            let mut number = third;
            let mut previous = third % 10;

            // 1997 more hashes for 2000 total.
            for _ in 3..2000 {
                number = hash(number);
                let price = number % 10;

                // Compute index into the array, treating (a, b, c, d) as a base 19 number
                // (6859 = 19³, 361 = 19²).
                (a, b, c, d) = (b, c, d, to_index(previous, price));
                let index = (6859 * a + 361 * b + 19 * c + d) as usize;
                previous = price;

                // Only sell the first time we see a sequence.
                // By storing the id in the array we don't need to zero every iteration which is faster.
                if seen[index] != id {
                    part_two[index] += price as u16;
                    seen[index] = id;
                }
            }

            part_one += number as u64;
        }

        // Merge into global results.
        let mut exclusive = mutex.lock().unwrap();
        exclusive.part_one += part_one;
        exclusive.part_two.iter_mut().zip(part_two).for_each(|(a, b)| *a += b);
    }

    /// Compute next secret number using a
    /// [Xorshift LFSR](https://en.wikipedia.org/wiki/Linear-feedback_shift_register#Xorshift_LFSRs).
    fn hash(mut n: u32) -> u32 {
        n = (n ^ (n << 6)) & 0xffffff;
        n = (n ^ (n >> 5)) & 0xffffff;
        (n ^ (n << 11)) & 0xffffff
    }

    /// Convert -9..9 to 0..18.
    fn to_index(previous: u32, current: u32) -> u32 {
        9 + current % 10 - previous % 10
    }
}
#[cfg(feature = "simd")]
mod simd {
    use super::*;
    use std::simd::Simd;
    use std::simd::num::SimdUint as _;

    /// 8 × u32 lanes, one input number per lane.
    type Vector = Simd<u32, 8>;

    pub(super) fn parallel(input: &str, mutex: &Mutex<Exclusive>) {
        let mut numbers: Vec<_> = input.iter_unsigned().collect();

        // Add zero elements so that size is a multiple of 8.
        // Zero always hashes to zero and does not contribute to score.
        numbers.resize(numbers.len().next_multiple_of(8), 0);
        let chunks: Vec<_> = numbers.chunks_exact(8).collect();

        spawn_parallel_iterator(&chunks, |iter| worker(mutex, iter));
    }

    /// Similar to scalar version but using SIMD vectors instead.
    /// 8 lanes is the sweet spot for performance as the bottleneck is the scalar loop writing
    /// to disjoint indices after each step.
    fn worker(mutex: &Mutex<Exclusive>, iter: ParIter<'_, &[u32]>) {
        // Base 19 digit weights broadcast across all lanes (6859 = 19³, 361 = 19²).
        let ten = Simd::splat(10);
        let x = Simd::splat(6859);
        let y = Simd::splat(361);
        let z = Simd::splat(19);

        let mut part_one = 0;
        let mut part_two = vec![0; 130321];

        for slice in iter {
            // Each lane uses a different bit to track if a sequence has been seen before.
            let mut seen = vec![u8::MAX; 130321];

            // Seed the sliding window of four deltas with the first three hashes.
            let zeroth = Simd::from_slice(slice);
            let first = hash(zeroth);
            let second = hash(first);
            let third = hash(second);

            let mut a;
            let mut b = to_index(zeroth, first);
            let mut c = to_index(first, second);
            let mut d = to_index(second, third);

            let mut number = third;
            let mut previous = third % ten;

            // 1997 more hashes for 2000 total.
            for _ in 3..2000 {
                number = hash(number);
                let prices = number % ten;

                // Compute index into the array.
                (a, b, c, d) = (b, c, d, to_index(previous, prices));
                let indices = x * a + y * b + z * c + d;
                previous = prices;

                // Only sell the first time we see a sequence.
                let indices = indices.to_array();
                let prices = prices.to_array();

                for i in 0..8 {
                    let index = indices[i] as usize;

                    // Avoid branching to improve speed, instead multiply by either 0 or 1,
                    // depending if sequence has been seen before or not.
                    let bit = (seen[index] >> i) & 1;
                    seen[index] &= !(1 << i);

                    part_two[index] += prices[i] as u16 * bit as u16;
                }
            }

            part_one += number.reduce_sum() as u64;
        }

        // Merge into global results.
        let mut exclusive = mutex.lock().unwrap();
        exclusive.part_one += part_one;
        exclusive.part_two.iter_mut().zip(part_two).for_each(|(a, b)| *a += b);
    }

    /// SIMD vector arguments are passed in memory so inline functions to avoid slow transfers
    /// to and from memory.
    #[inline]
    fn hash(mut n: Vector) -> Vector {
        let mask = Simd::splat(0xffffff);
        n = (n ^ (n << 6)) & mask;
        n = (n ^ (n >> 5)) & mask;
        (n ^ (n << 11)) & mask
    }

    /// Convert the per-lane price delta from -9..9 to 0..18.
    #[inline]
    fn to_index(previous: Vector, current: Vector) -> Vector {
        let nine = Simd::splat(9);
        let ten = Simd::splat(10);
        nine + (current % ten) - (previous % ten)
    }
}
|
0 commit comments