Skip to content

Commit 4c5dc2f

Browse files
committed
switch the BLAKE2 implementation to blake2b_simd/blake2s_simd
This is mostly a large performance improvement. The BLAKE2b bench_10000 case improves by about 30%. This implementation also detects SIMD support at runtime, so the feature flags related to SIMD support are removed.

The only performance loss is in the bench_10 cases, where the caller repeatedly feeds input slices shorter than one block. The BLAKE2s bench_10 case is almost 20% slower. I'm not sure exactly why, but this implementation is optimized to avoid copies on long runs of input, so it may simply be doing more work up front on each call. This performance issue disappears if the inputs are a full block or longer.

The only API consequence of this change is that the undocumented with_parameter_block constructor is no longer supported. Callers who need other parameters might prefer to use the blake2b_simd/blake2s_simd APIs directly, which expose them in a safer way through a Params object.
1 parent 526cc6e commit 4c5dc2f

15 files changed

+41
-884
lines changed

blake2/Cargo.toml

+3-4
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ digest = "0.8"
1414
byte-tools = "0.3"
1515
crypto-mac = "0.7"
1616
opaque-debug = "0.2"
17+
blake2b_simd = { version = "0.5", default-features = false }
18+
blake2s_simd = { version = "0.5", default-features = false }
1719

1820
[dev-dependencies]
1921
digest = { version = "0.8", features = ["dev"] }
@@ -22,10 +24,7 @@ hex-literal = "0.1"
2224

2325
[features]
2426
default = ["std"]
25-
std = ["digest/std", "crypto-mac/std"]
26-
simd = []
27-
simd_opt = ["simd"]
28-
simd_asm = ["simd_opt"]
27+
std = ["digest/std", "crypto-mac/std", "blake2b_simd/std", "blake2s_simd/std"]
2928

3029
[badges]
3130
travis-ci = { repository = "RustCrypto/hashes" }

blake2/src/as_bytes.rs

-43
This file was deleted.

blake2/src/blake2.rs

+28-203
Original file line numberDiff line numberDiff line change
@@ -1,76 +1,26 @@
11
macro_rules! blake2_impl {
22
(
3-
$state:ident, $fix_state:ident, $word:ident, $vec:ident, $bytes:ident,
4-
$R1:expr, $R2:expr, $R3:expr, $R4:expr, $IV:expr,
5-
$vardoc:expr, $doc:expr,
3+
$state:ident, $fix_state:ident, $word:ident, $bytes:ident,
4+
$vardoc:expr, $doc:expr, $lib:ident,
65
) => {
7-
8-
use $crate::as_bytes::AsBytes;
9-
use $crate::simd::{Vector4, $vec};
10-
116
use digest::{Input, BlockInput, FixedOutput, VariableOutput, Reset};
127
use digest::InvalidOutputSize;
138
use digest::generic_array::GenericArray;
149
use digest::generic_array::typenum::Unsigned;
15-
use core::cmp;
16-
use byte_tools::{copy, zero};
10+
use byte_tools::copy;
1711
use crypto_mac::{Mac, MacResult, InvalidKeyLength};
1812

13+
use $lib::Params;
14+
use $lib::State;
15+
1916
type Output = GenericArray<u8, $bytes>;
2017

2118
#[derive(Clone)]
2219
#[doc=$vardoc]
2320
pub struct $state {
24-
m: [$word; 16],
25-
h: [$vec; 2],
26-
t: u64,
27-
n: usize,
28-
29-
h0: [$vec; 2],
30-
m0: [$word; 16],
31-
t0: u64,
32-
}
33-
34-
#[inline(always)]
35-
fn iv0() -> $vec { $vec::new($IV[0], $IV[1], $IV[2], $IV[3]) }
36-
#[inline(always)]
37-
fn iv1() -> $vec { $vec::new($IV[4], $IV[5], $IV[6], $IV[7]) }
38-
39-
#[inline(always)]
40-
fn quarter_round(v: &mut [$vec; 4], rd: u32, rb: u32, m: $vec) {
41-
v[0] = v[0].wrapping_add(v[1]).wrapping_add(m.from_le());
42-
v[3] = (v[3] ^ v[0]).rotate_right_const(rd);
43-
v[2] = v[2].wrapping_add(v[3]);
44-
v[1] = (v[1] ^ v[2]).rotate_right_const(rb);
45-
}
46-
47-
#[inline(always)]
48-
fn shuffle(v: &mut [$vec; 4]) {
49-
v[1] = v[1].shuffle_left_1();
50-
v[2] = v[2].shuffle_left_2();
51-
v[3] = v[3].shuffle_left_3();
52-
}
53-
54-
#[inline(always)]
55-
fn unshuffle(v: &mut [$vec; 4]) {
56-
v[1] = v[1].shuffle_right_1();
57-
v[2] = v[2].shuffle_right_2();
58-
v[3] = v[3].shuffle_right_3();
59-
}
60-
61-
#[inline(always)]
62-
fn round(v: &mut [$vec; 4], m: &[$word; 16], s: &[usize; 16]) {
63-
quarter_round(v, $R1, $R2, $vec::gather(m,
64-
s[ 0], s[ 2], s[ 4], s[ 6]));
65-
quarter_round(v, $R3, $R4, $vec::gather(m,
66-
s[ 1], s[ 3], s[ 5], s[ 7]));
67-
68-
shuffle(v);
69-
quarter_round(v, $R1, $R2, $vec::gather(m,
70-
s[ 8], s[10], s[12], s[14]));
71-
quarter_round(v, $R3, $R4, $vec::gather(m,
72-
s[ 9], s[11], s[13], s[15]));
73-
unshuffle(v);
21+
params: Params,
22+
state: State,
23+
output_size: usize,
7424
}
7525

7626
impl $state {
@@ -80,156 +30,33 @@ macro_rules! blake2_impl {
8030
/// make sure to compare codes in constant time! It can be done
8131
/// for example by using `subtle` crate.
8232
pub fn new_keyed(key: &[u8], output_size: usize) -> Self {
83-
let kk = key.len();
84-
assert!(kk <= $bytes::to_usize());
85-
assert!(output_size <= $bytes::to_usize());
86-
87-
let p0 = 0x0101_0000 ^ ((kk as $word) << 8) ^
88-
(output_size as $word);
89-
let h0 = [iv0() ^ $vec::new(p0, 0, 0, 0), iv1()];
90-
let mut state = $state {
91-
m: [0; 16],
92-
h: h0,
93-
t: 0,
94-
n: output_size,
95-
96-
t0: 0,
97-
m0: [0; 16],
98-
h0: h0,
99-
};
100-
101-
if kk > 0 {
102-
copy(key, state.m.as_mut_bytes());
103-
state.t = 2 * $bytes::to_u64();
104-
}
105-
106-
state.t0 = state.t;
107-
state.m0 = state.m;
108-
state
109-
}
110-
111-
#[doc(hidden)]
112-
pub fn with_parameter_block(p: &[$word; 8]) -> Self {
113-
let nn = p[0] as u8 as usize;
114-
let kk = (p[0] >> 8) as u8 as usize;
115-
assert!(nn >= 1 && nn <= $bytes::to_usize());
116-
assert!(kk <= $bytes::to_usize());
117-
118-
let h0 = [
119-
iv0() ^ $vec::new(p[0], p[1], p[2], p[3]),
120-
iv1() ^ $vec::new(p[4], p[5], p[6], p[7]),
121-
];
122-
123-
$state {
124-
m: [0; 16],
125-
h: h0,
126-
t: 0,
127-
n: nn,
128-
129-
t0: 0,
130-
m0: [0; 16],
131-
h0: h0,
33+
let mut params = Params::new();
34+
params.hash_length(output_size);
35+
params.key(key);
36+
Self {
37+
state: params.to_state(),
38+
params,
39+
output_size,
13240
}
13341
}
13442

13543
/// Updates the hashing context with more data.
13644
fn update(&mut self, data: &[u8]) {
137-
let mut rest = data;
138-
139-
let block = 2 * $bytes::to_usize();
140-
141-
let off = self.t as usize % block;
142-
if off != 0 || self.t == 0 {
143-
let len = cmp::min(block - off, rest.len());
144-
145-
let part = &rest[..len];
146-
rest = &rest[part.len()..];
147-
148-
copy(part, &mut self.m.as_mut_bytes()[off..]);
149-
self.t = self.t.checked_add(part.len() as u64)
150-
.expect("hash data length overflow");
151-
}
152-
153-
while rest.len() >= block {
154-
self.compress(0, 0);
155-
156-
let part = &rest[..block];
157-
rest = &rest[part.len()..];
158-
159-
copy(part, &mut self.m.as_mut_bytes());
160-
self.t = self.t.checked_add(part.len() as u64)
161-
.expect("hash data length overflow");
162-
}
163-
164-
let n = rest.len();
165-
if n > 0 {
166-
self.compress(0, 0);
167-
168-
copy(rest, &mut self.m.as_mut_bytes());
169-
self.t = self.t.checked_add(rest.len() as u64)
170-
.expect("hash data length overflow");
171-
}
45+
self.state.update(data);
17246
}
17347

17448
#[doc(hidden)]
17549
pub fn finalize_last_node(self) -> Output {
176-
self.finalize_with_flag(!0)
50+
self.finalize_with_last_node(true)
17751
}
17852

179-
180-
fn finalize_with_flag(mut self, f1: $word) -> Output {
181-
let off = self.t as usize % (2 * $bytes::to_usize());
182-
if off != 0 {
183-
zero(&mut self.m.as_mut_bytes()[off..]);
184-
}
185-
186-
self.compress(!0, f1);
187-
188-
let buf = [self.h[0].to_le(), self.h[1].to_le()];
189-
53+
fn finalize_with_last_node(mut self, last_node: bool) -> Output {
54+
self.state.set_last_node(last_node);
55+
let hash = self.state.finalize();
19056
let mut out = GenericArray::default();
191-
copy(buf.as_bytes(), &mut out);
57+
copy(hash.as_bytes(), &mut out);
19258
out
19359
}
194-
195-
fn compress(&mut self, f0: $word, f1: $word) {
196-
use $crate::consts::SIGMA;
197-
198-
let m = &self.m;
199-
let h = &mut self.h;
200-
201-
let t0 = self.t as $word;
202-
let t1 = match $bytes::to_u8() {
203-
64 => 0,
204-
32 => (self.t >> 32) as $word,
205-
_ => unreachable!(),
206-
};
207-
208-
let mut v = [
209-
h[0],
210-
h[1],
211-
iv0(),
212-
iv1() ^ $vec::new(t0, t1, f0, f1),
213-
];
214-
215-
round(&mut v, m, &SIGMA[0]);
216-
round(&mut v, m, &SIGMA[1]);
217-
round(&mut v, m, &SIGMA[2]);
218-
round(&mut v, m, &SIGMA[3]);
219-
round(&mut v, m, &SIGMA[4]);
220-
round(&mut v, m, &SIGMA[5]);
221-
round(&mut v, m, &SIGMA[6]);
222-
round(&mut v, m, &SIGMA[7]);
223-
round(&mut v, m, &SIGMA[8]);
224-
round(&mut v, m, &SIGMA[9]);
225-
if $bytes::to_u8() == 64 {
226-
round(&mut v, m, &SIGMA[0]);
227-
round(&mut v, m, &SIGMA[1]);
228-
}
229-
230-
h[0] = h[0] ^ (v[0] ^ v[2]);
231-
h[1] = h[1] ^ (v[1] ^ v[3]);
232-
}
23360
}
23461

23562
impl Default for $state {
@@ -255,21 +82,19 @@ macro_rules! blake2_impl {
25582
}
25683

25784
fn output_size(&self) -> usize {
258-
self.n
85+
self.output_size
25986
}
26087

26188
fn variable_result<F: FnOnce(&[u8])>(self, f: F) {
262-
let n = self.n;
263-
let res = self.finalize_with_flag(0);
89+
let n = self.output_size;
90+
let res = self.finalize_with_last_node(false);
26491
f(&res[..n]);
26592
}
26693
}
26794

26895
impl Reset for $state {
26996
fn reset(&mut self) {
270-
self.t = self.t0;
271-
self.m = self.m0;
272-
self.h = self.h0;
97+
self.state = self.params.to_state();
27398
}
27499
}
275100

@@ -304,7 +129,7 @@ macro_rules! blake2_impl {
304129
type OutputSize = $bytes;
305130

306131
fn fixed_result(self) -> Output {
307-
self.state.finalize_with_flag(0)
132+
self.state.finalize_with_last_node(false)
308133
}
309134
}
310135

@@ -339,7 +164,7 @@ macro_rules! blake2_impl {
339164
}
340165

341166
fn result(self) -> MacResult<Self::OutputSize> {
342-
MacResult::new(self.state.finalize_with_flag(0))
167+
MacResult::new(self.state.finalize_with_last_node(false))
343168
}
344169
}
345170

blake2/src/blake2b.rs

+2-3
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
use digest::generic_array::typenum::U64;
2-
use consts::BLAKE2B_IV;
32

4-
blake2_impl!(VarBlake2b, Blake2b, u64, u64x4, U64,
5-
32, 24, 16, 63, BLAKE2B_IV,
3+
blake2_impl!(VarBlake2b, Blake2b, u64, U64,
64
"Blake2b instance with a variable output.",
75
"Blake2b instance with a fixed output.",
6+
blake2b_simd,
87
);

blake2/src/blake2s.rs

+2-3
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
use digest::generic_array::typenum::U32;
2-
use consts::BLAKE2S_IV;
32

4-
blake2_impl!(VarBlake2s, Blake2s, u32, u32x4, U32,
5-
16, 12, 8, 7, BLAKE2S_IV,
3+
blake2_impl!(VarBlake2s, Blake2s, u32, U32,
64
"Blake2s instance with a variable output.",
75
"Blake2s instance with a fixed output.",
6+
blake2s_simd,
87
);

0 commit comments

Comments
 (0)