
Commit 36b34ef

kuznyechik: add Neon backend (#447)
Authored Aug 9, 2024
1 parent daac7ea, commit 36b34ef

File tree: 4 files changed, +423 -4 lines
.github/workflows/kuznyechik.yml  (+31 -4)
@@ -37,12 +37,12 @@ jobs:
           toolchain: ${{ matrix.rust }}
           targets: ${{ matrix.target }}
       - run: cargo build --target ${{ matrix.target }}
-      - run: cargo build --target ${{ matrix.target }}
-        env:
+      - env:
           RUSTFLAGS: "-Dwarnings --cfg kuznyechik_force_soft"
-      - run: cargo build --target ${{ matrix.target }}
-        env:
+        run: cargo build --target ${{ matrix.target }}
+      - env:
           RUSTFLAGS: "-Dwarnings --cfg kuznyechik_force_soft --cfg kuznyechik_compact_soft"
+        run: cargo build --target ${{ matrix.target }}
 
   minimal-versions:
     uses: RustCrypto/actions/.github/workflows/minimal-versions.yml@master
@@ -75,3 +75,30 @@ jobs:
         run: |
           cargo test
           cargo test --all-features
+
+  macos:
+    runs-on: macos-latest
+    strategy:
+      matrix:
+        rust:
+          - 1.65.0
+          - stable
+    steps:
+      - uses: actions/checkout@v4
+      - uses: RustCrypto/actions/cargo-cache@master
+      - uses: dtolnay/rust-toolchain@master
+        with:
+          toolchain: ${{ matrix.rust }}
+      - run: |
+          cargo test
+          cargo test --all-features
+      - env:
+          RUSTFLAGS: "-Dwarnings --cfg kuznyechik_force_soft"
+        run: |
+          cargo test
+          cargo test --all-features
+      - env:
+          RUSTFLAGS: "-Dwarnings --cfg kuznyechik_force_soft --cfg kuznyechik_compact_soft"
+        run: |
+          cargo test
+          cargo test --all-features
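
Note: the new macos job mirrors the existing test steps. The plain cargo test run exercises whichever backend the crate selects for the host (the new Neon code on an aarch64 runner), while the two RUSTFLAGS variants force the table-driven and compact software fallbacks. The same three configurations can be reproduced locally, for example:

    cargo test --all-features
    RUSTFLAGS="--cfg kuznyechik_force_soft" cargo test --all-features
    RUSTFLAGS="--cfg kuznyechik_force_soft --cfg kuznyechik_compact_soft" cargo test --all-features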

kuznyechik/src/lib.rs  (+7 -0)
@@ -51,6 +51,13 @@ cfg_if::cfg_if!(
     ))] {
         mod sse2;
         use sse2 as imp;
+    } else if #[cfg(all(
+        target_arch = "aarch64",
+        target_feature = "neon",
+        not(kuznyechik_force_soft),
+    ))] {
+        mod neon;
+        use neon as imp;
     } else if #[cfg(kuznyechik_compact_soft)] {
         mod compact_soft;
         use compact_soft as imp;
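
Note: NEON is part of the default target features on AArch64, so on aarch64 builds this new branch is selected ahead of the software implementations unless the kuznyechik_force_soft cfg (the one exercised by the CI changes above) is set, e.g.:

    RUSTFLAGS="--cfg kuznyechik_force_soft" cargo build --target aarch64-unknown-linux-gnu

The SSE2 branch above it still takes precedence on x86/x86_64, and the software fallbacks remain for all other targets.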

kuznyechik/src/neon/backends.rs  (new file, +322)
@@ -0,0 +1,322 @@
+use super::consts::{Table, DEC_TABLE, ENC_TABLE, RKEY_GEN};
+use crate::{
+    consts::{P, P_INV},
+    Block, Key,
+};
+use cipher::{
+    consts::{U16, U8},
+    inout::InOut,
+    typenum::Unsigned,
+    BlockBackend, BlockSizeUser, ParBlocks, ParBlocksSizeUser,
+};
+
+use core::arch::aarch64::*;
+
+pub(super) type RoundKeys = [uint8x16_t; 10];
+
+type ParBlocksSize = U8;
+
+#[rustfmt::skip]
+macro_rules! unroll_par {
+    ($var:ident, $body:block) => {
+        { let $var: usize = 0; $body; }
+        { let $var: usize = 1; $body; }
+        { let $var: usize = 2; $body; }
+        { let $var: usize = 3; $body; }
+        { let $var: usize = 4; $body; }
+        { let $var: usize = 5; $body; }
+        { let $var: usize = 6; $body; }
+        { let $var: usize = 7; $body; }
+
+    };
+}
+
+#[inline(always)]
+unsafe fn sub_bytes(block: uint8x16_t, sbox: &[u8; 256]) -> uint8x16_t {
+    let value_vector = vdupq_n_u8(64);
+
+    //Split the sbox table into four parts
+    let sbox_part1 = uint8x16x4_t(
+        vld1q_u8(&sbox[0] as *const u8),
+        vld1q_u8(&sbox[16] as *const u8),
+        vld1q_u8(&sbox[32] as *const u8),
+        vld1q_u8(&sbox[48] as *const u8),
+    );
+
+    let sbox_part2 = uint8x16x4_t(
+        vld1q_u8(&sbox[64] as *const u8),
+        vld1q_u8(&sbox[80] as *const u8),
+        vld1q_u8(&sbox[96] as *const u8),
+        vld1q_u8(&sbox[112] as *const u8),
+    );
+
+    let sbox_part3 = uint8x16x4_t(
+        vld1q_u8(&sbox[128] as *const u8),
+        vld1q_u8(&sbox[144] as *const u8),
+        vld1q_u8(&sbox[160] as *const u8),
+        vld1q_u8(&sbox[176] as *const u8),
+    );
+
+    let sbox_part4 = uint8x16x4_t(
+        vld1q_u8(&sbox[192] as *const u8),
+        vld1q_u8(&sbox[208] as *const u8),
+        vld1q_u8(&sbox[224] as *const u8),
+        vld1q_u8(&sbox[240] as *const u8),
+    );
+
+    // Indexing each part of the sbox table
+    let result1 = vqtbl4q_u8(sbox_part1, block);
+    let block_1 = vsubq_u8(block, value_vector);
+    let result2 = vqtbl4q_u8(sbox_part2, block_1);
+    let block_2 = vsubq_u8(block_1, value_vector);
+    let result3 = vqtbl4q_u8(sbox_part3, block_2);
+    let block_3 = vsubq_u8(block_2, value_vector);
+    let result4 = vqtbl4q_u8(sbox_part4, block_3);
+    // Merging results
+    let result = vorrq_u8(vorrq_u8(result1, result2), vorrq_u8(result3, result4));
+
+    result
+}
+
+#[inline(always)]
+unsafe fn transform(block: uint8x16_t, table: &Table) -> uint8x16_t {
+    macro_rules! get {
+        ($table:expr, $ind:expr, $i:expr) => {{
+            let idx = vgetq_lane_u16($ind, $i) as usize;
+            let p = &($table.0[idx]) as *const u8 as *const uint8x16_t;
+            // correct alignment of `p` is guaranteed since offset values
+            // are shifted by 4 bits left and the table is aligned to 16 bytes
+            debug_assert_eq!(p as usize % 16, 0);
+            vld1q_u8(p as *const u8)
+        }};
+    }
+
+    macro_rules! xor_get {
+        ($val:expr, $table:expr, $ind:expr, $i:expr) => {
+            $val = veorq_u8($val, get!($table, $ind, $i));
+        };
+    }
+
+    let ind = vcombine_u8(
+        vcreate_u8(0x0706050403020100),
+        vcreate_u8(0x0f0e0d0c0b0a0908),
+    );
+    let test = vzip1q_u8(block, ind);
+
+    let lind = vshlq_n_u16(vreinterpretq_u16_u8(test), 4);
+
+    let mut lt = get!(table, lind, 0);
+
+    xor_get!(lt, table, lind, 1);
+    xor_get!(lt, table, lind, 2);
+    xor_get!(lt, table, lind, 3);
+    xor_get!(lt, table, lind, 4);
+    xor_get!(lt, table, lind, 5);
+    xor_get!(lt, table, lind, 6);
+    xor_get!(lt, table, lind, 7);
+
+    let rind = vshlq_n_u16(vreinterpretq_u16_u8(vzip2q_u8(block, ind)), 4);
+
+    let mut rt = get!(table, rind, 0);
+    xor_get!(rt, table, rind, 1);
+    xor_get!(rt, table, rind, 2);
+    xor_get!(rt, table, rind, 3);
+    xor_get!(rt, table, rind, 4);
+    xor_get!(rt, table, rind, 5);
+    xor_get!(rt, table, rind, 6);
+    xor_get!(rt, table, rind, 7);
+
+    veorq_u8(lt, rt)
+}
+
+pub fn expand_enc_keys(key: &Key) -> RoundKeys {
+    macro_rules! next_const {
+        ($i:expr) => {{
+            let p = RKEY_GEN.0.as_ptr() as *const uint8x16_t;
+            // correct alignment of `p` is guaranteed since the table
+            // is aligned to 16 bytes
+            let p = p.add($i);
+            debug_assert_eq!(p as usize % 16, 0);
+            $i += 1;
+            vld1q_u8(p as *const u8)
+        }};
+    }
+
+    unsafe {
+        let mut enc_keys = [vdupq_n_u8(0); 10];
+
+        let pk: *const uint8x16_t = key.as_ptr() as *const uint8x16_t;
+        let mut k1 = vld1q_u8(pk as *const u8);
+        let mut k2 = vld1q_u8(pk.add(1) as *const u8);
+        enc_keys[0] = k1;
+        enc_keys[1] = k2;
+
+        let mut cidx = 0;
+        for i in 1..5 {
+            for _ in 0..4 {
+                let mut t = veorq_u8(k1, next_const!(cidx));
+                t = transform(t, &ENC_TABLE);
+                k2 = veorq_u8(k2, t);
+
+                let mut t = veorq_u8(k2, next_const!(cidx));
+                t = transform(t, &ENC_TABLE);
+                k1 = veorq_u8(k1, t);
+            }
+
+            enc_keys[2 * i] = k1;
+            enc_keys[2 * i + 1] = k2;
+        }
+
+        enc_keys
+    }
+}
+
+pub fn inv_enc_keys(enc_keys: &RoundKeys) -> RoundKeys {
+    unsafe {
+        let mut dec_keys = [vdupq_n_u8(0); 10];
+
+        dec_keys[0] = enc_keys[9];
+        for i in 1..9 {
+            let k = sub_bytes(enc_keys[i], &P);
+            dec_keys[9 - i] = transform(k, &DEC_TABLE);
+        }
+        dec_keys[9] = enc_keys[0];
+
+        dec_keys
+    }
+}
+
+pub(crate) struct EncBackend<'a>(pub(crate) &'a RoundKeys);
+
+impl<'a> BlockSizeUser for EncBackend<'a> {
+    type BlockSize = U16;
+}
+
+impl<'a> ParBlocksSizeUser for EncBackend<'a> {
+    type ParBlocksSize = ParBlocksSize;
+}
+
+impl<'a> BlockBackend for EncBackend<'a> {
+    #[inline]
+    fn proc_block(&mut self, block: InOut<'_, '_, Block>) {
+        let k = self.0;
+        unsafe {
+            let (in_ptr, out_ptr) = block.into_raw();
+            let mut b = vld1q_u8(in_ptr as *const u8);
+
+            for i in 0..9 {
+                b = veorq_u8(b, k[i]);
+                b = transform(b, &ENC_TABLE);
+            }
+            b = veorq_u8(b, k[9]);
+            vst1q_u8(out_ptr as *mut u8, b);
+        }
+    }
+
+    #[inline]
+    fn proc_par_blocks(&mut self, blocks: InOut<'_, '_, ParBlocks<Self>>) {
+        let k = self.0;
+        unsafe {
+            let (in_ptr, out_ptr) = blocks.into_raw();
+            let in_ptr = in_ptr as *mut uint8x16_t;
+            let out_ptr = out_ptr as *mut uint8x16_t;
+
+            let mut blocks = [vdupq_n_u8(0); ParBlocksSize::USIZE];
+            unroll_par! {
+                i, {
+                    blocks[i] = vld1q_u8(in_ptr.add(i) as *const u8);
+                }
+            };
+
+            for i in 0..9 {
+                unroll_par!(j, {
+                    let t = veorq_u8(blocks[j], k[i]);
+                    blocks[j] = transform(t, &ENC_TABLE);
+                });
+            }
+
+            unroll_par! {
+                i, {
+                    let t = veorq_u8(blocks[i], k[9]);
+                    vst1q_u8(out_ptr.add(i) as *mut u8, t);
+                }
+            };
+        }
+    }
+}
+
+pub(crate) struct DecBackend<'a>(pub(crate) &'a RoundKeys);
+
+impl<'a> BlockSizeUser for DecBackend<'a> {
+    type BlockSize = U16;
+}
+
+impl<'a> ParBlocksSizeUser for DecBackend<'a> {
+    type ParBlocksSize = ParBlocksSize;
+}
+
+impl<'a> BlockBackend for DecBackend<'a> {
+    #[inline]
+    fn proc_block(&mut self, block: InOut<'_, '_, Block>) {
+        let k = self.0;
+        unsafe {
+            let (in_ptr, out_ptr) = block.into_raw();
+            let mut b = vld1q_u8(in_ptr as *const u8);
+
+            b = veorq_u8(b, k[0]);
+
+            b = sub_bytes(b, &P);
+            b = transform(b, &DEC_TABLE);
+
+            for i in 1..9 {
+                b = transform(b, &DEC_TABLE);
+                b = veorq_u8(b, k[i]);
+            }
+            b = sub_bytes(b, &P_INV);
+            b = veorq_u8(b, k[9]);
+
+            vst1q_u8(out_ptr as *mut u8, b);
+        }
+    }
+    #[inline]
+    fn proc_par_blocks(&mut self, blocks: InOut<'_, '_, ParBlocks<Self>>) {
+        let k = self.0;
+        unsafe {
+            let (in_ptr, out_ptr) = blocks.into_raw();
+            let in_ptr = in_ptr as *mut uint8x16_t;
+            let out_ptr = out_ptr as *mut uint8x16_t;
+
+            let mut blocks = [vdupq_n_u8(0); ParBlocksSize::USIZE];
+            unroll_par! {
+                i, {
+                    blocks[i] = vld1q_u8(in_ptr.add(i) as *const u8);
+                }
+            };
+
+            unroll_par! {
+                i, {
+                    let t = veorq_u8(blocks[i], k[0]);
+                    let t = sub_bytes(t, &P);
+                    blocks[i] = transform(t, &DEC_TABLE);
+                }
+            }
+
+            for i in 1..9 {
+                unroll_par! {
+                    j, {
+                        let t = transform(blocks[j], &DEC_TABLE);
+                        blocks[j] = veorq_u8(t, k[i]);
+                    }
+                }
+            }
+
+            unroll_par! {
+                i, {
+                    let t = sub_bytes(blocks[i], &P_INV);
+                    let t2 = veorq_u8(t, k[9]);
+                    vst1q_u8(out_ptr.add(i) as *mut u8, t2);
+                }
+            }
+        }
+    }
+}
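
Note on the S-box lookup: sub_bytes above covers the full 256-entry substitution table with four vqtbl4q_u8 lookups. Each lookup spans one 64-byte quarter of the table, the indices are shifted down by 64 between lookups (vsubq_u8 with value_vector), and the TBL instruction returns zero for out-of-range indices, so the four partial results can simply be OR-ed together. A scalar model of that computation, for illustration only (not part of the commit):

/// Scalar model of the Neon sub_bytes: each vqtbl4q_u8 call covers one
/// 64-byte quarter of the S-box and yields 0 for indices outside that
/// quarter, so the four partial results can be OR-ed together.
fn sub_bytes_model(block: [u8; 16], sbox: &[u8; 256]) -> [u8; 16] {
    let mut out = [0u8; 16];
    for (o, &b) in out.iter_mut().zip(block.iter()) {
        let mut acc = 0u8;
        for part in 0u8..4 {
            // index into the current quarter, as after vsubq_u8(block, 64)
            let idx = b.wrapping_sub(64 * part);
            if idx < 64 {
                // in range: TBL returns the table byte
                acc |= sbox[64 * part as usize + idx as usize];
            }
            // out of range: TBL returns 0, contributing nothing to the OR
        }
        *o = acc;
    }
    out
}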

kuznyechik/src/neon/mod.rs  (new file, +63)
@@ -0,0 +1,63 @@
+use crate::{BlockSize, Key};
+use cipher::{BlockCipherDecrypt, BlockCipherEncrypt, BlockClosure};
+
+mod backends;
+#[path = "../fused_tables/consts.rs"]
+mod consts;
+
+use backends::{expand_enc_keys, inv_enc_keys, DecBackend, EncBackend, RoundKeys};
+
+#[derive(Clone)]
+pub(crate) struct EncDecKeys {
+    enc: RoundKeys,
+    dec: RoundKeys,
+}
+#[derive(Clone)]
+pub(crate) struct EncKeys(RoundKeys);
+#[derive(Clone)]
+pub(crate) struct DecKeys(RoundKeys);
+
+impl EncKeys {
+    pub fn new(key: &Key) -> Self {
+        Self(expand_enc_keys(key))
+    }
+}
+
+impl From<EncKeys> for EncDecKeys {
+    fn from(enc: EncKeys) -> Self {
+        Self {
+            dec: inv_enc_keys(&enc.0),
+            enc: enc.0,
+        }
+    }
+}
+
+impl From<EncKeys> for DecKeys {
+    fn from(enc: EncKeys) -> Self {
+        Self(inv_enc_keys(&enc.0))
+    }
+}
+
+impl BlockCipherEncrypt for crate::Kuznyechik {
+    fn encrypt_with_backend(&self, f: impl BlockClosure<BlockSize = BlockSize>) {
+        f.call(&mut EncBackend(&self.keys.enc));
+    }
+}
+
+impl BlockCipherDecrypt for crate::Kuznyechik {
+    fn decrypt_with_backend(&self, f: impl BlockClosure<BlockSize = BlockSize>) {
+        f.call(&mut DecBackend(&self.keys.dec));
+    }
+}
+
+impl BlockCipherEncrypt for crate::KuznyechikEnc {
+    fn encrypt_with_backend(&self, f: impl BlockClosure<BlockSize = BlockSize>) {
+        f.call(&mut EncBackend(&self.keys.0));
+    }
+}
+
+impl BlockCipherDecrypt for crate::KuznyechikDec {
+    fn decrypt_with_backend(&self, f: impl BlockClosure<BlockSize = BlockSize>) {
+        f.call(&mut DecBackend(&self.keys.0));
+    }
+}
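
These impls plug the Neon backends into the existing Kuznyechik, KuznyechikEnc, and KuznyechikDec types, so callers keep using the same block-cipher traits no matter which backend cfg_if picked. A minimal round-trip sketch, assuming the pre-release cipher trait API this diff targets; the all-zero key and block are placeholders:

use cipher::{Block, BlockCipherDecrypt, BlockCipherEncrypt, Key, KeyInit};
use kuznyechik::Kuznyechik;

fn roundtrip() {
    // All-zero 256-bit key and 128-bit block, purely for illustration.
    let key = Key::<Kuznyechik>::default();
    let cipher = Kuznyechik::new(&key);

    let mut block = Block::<Kuznyechik>::default();
    cipher.encrypt_block(&mut block);
    cipher.decrypt_block(&mut block);
    assert_eq!(block, Block::<Kuznyechik>::default());
}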

0 commit comments
