-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathK12-SIMD-simulation.py
306 lines (261 loc) · 12.4 KB
/
K12-SIMD-simulation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
# -*- coding: utf-8 -*-
# Implementation by Gilles Van Assche, hereby denoted as "the implementer".
#
# For more information, feedback or questions, please refer to our website:
# https://keccak.team/
#
# To the extent possible under law, the implementer has waived all copyright
# and related or neighboring rights to the source code in this file.
# http://creativecommons.org/publicdomain/zero/1.0/
"""
The Python code in this module illustrates how KangarooTwelve (K12) can be implemented using single-instruction-multiple-data (SIMD) instructions.
It does not provide optimized SIMD code, but instead illustrates how one can write SIMD code for K12 by simulating how SIMD works.
As a convention, it simulates SIMD registers of N×64 bits, for some positive integer N, by putting the N words of 64 bits in a list.
For instance, 256-bit SIMD instructions like AVX2 work on N=4 words of 64 bits.
SIMD operations are then applied to all the elements of the list.
K12 is described in https://keccak.team/kangarootwelve.html
In a nutshell, the input string is cut into chunks of 8KiB, called leaves, and the leaves are hashed independently.
With SIMD instructions applying to N×64-bit words, one can efficiently hash N leaves in parallel.
"""
from TurboSHAKE import ROL64, load64, store64, KeccakP1600, KeccakP1600onLanes, TurboSHAKE128
from K12 import right_encode
"""
Evaluates Keccak-p[1600, nr] on N states in parallel independently.
The input states are represented as 5×5 SIMD registers each containing N lanes.
states[x][y] represents a SIMD register that contains N words of 64 bits:
- states[x][y][0] is the lane value at (x, y) for the first state,
- states[x][y][1] is the lane value at (x, y) for the second state, etc.
Parameters
----------
N: integer
The number of states to apply Keccak-p[1600, nr] to.
It must be a strictly positive integer.
states: list of list of list of integers
The N input states as a 5×5 matrix of N lanes, with each lane is a 64-bit word in the form of an integer.
lanes[x][y][i] for 0 ≤ x,y < 5 and 0 ≤ i < N contains the lane at coordinates (x, y) of the i-th state.
nrRounds: integer
The number of rounds to perform, must be 12 for Keccak-p[1600, 12] used in K12.
Returns
-------
list of list of list of integers
The N output states as a 5×5 matrix of N lanes, represented like the "states" input parameter.
"""
def KeccakP1600timesN_SIMD(N, states, nrRounds):
R = 1
for round in range(24):
if (round + nrRounds >= 24):
# Perform all the round operations in a SIMD fashion.
# That is, perform whatever Boolean operations on all N words of 64 bits.
# This is what the "for i in range(N)" simulates.
# θ
C = [[states[x][0][i] ^ states[x][1][i] ^ states[x][2][i] ^ states[x][3][i] ^ states[x][4][i] for i in range(N)] for x in range(5)]
D = [[C[(x+4)%5][i] ^ ROL64(C[(x+1)%5][i], 1) for i in range(N)] for x in range(5)]
states = [[[states[x][y][i]^D[x][i] for i in range(N)] for y in range(5)] for x in range(5)]
# ρ and π
(x, y) = (1, 0)
current = [states[x][y][i] for i in range(N)]
for t in range(24):
(x, y) = (y, (2*x+3*y)%5)
(current, states[x][y]) = (states[x][y], [ROL64(current[i], (t+1)*(t+2)//2) for i in range(N)])
# χ
for y in range(5):
T = [states[x][y] for x in range(5)]
for x in range(5):
states[x][y] = [T[x][i] ^((~T[(x+1)%5][i]) & T[(x+2)%5][i]) for i in range(N)]
# ι
for j in range(7):
R = ((R << 1) ^ ((R >> 7)*0x71)) % 256
if (R & 2):
states[0][0] = [states[0][0][i] ^ (1 << ((1<<j)-1)) for i in range(N)]
else:
for j in range(7):
R = ((R << 1) ^ ((R >> 7)*0x71)) % 256
return states
"""
Bitwise add (XOR) data into N states in parallel.
The granularity is the lane (64 bits).
For the first state, we take the 8*laneCount bytes at data[0 : 8*laneCount] and add them to it.
For the second state, we take 8*laneCount bytes 8*laneOffset bytes further, so at data[8*laneOffset : 8*laneOffset + 8*laneCount], and add them to it.
For the third state, we do the same, but again 8*laneOffset bytes further, etc.
For K12, 8*laneOffset is 8192 so that the first state gets data from the first leaf, the second state from the second leaf, etc.
Parameters
----------
N: integer
The number of states to modify.
It must be a strictly positive integer, N ≥ 1.
states: list of list of list of integers
The N input states, as in KeccakP1600timesN_SIMD.
data: byte array
The data to add to the states.
Its size must be at least 8*((N-1)*laneOffset + laneCount) bytes.
laneCount: integer
The number of 64-bit words to add to each state.
It must be an integer between 0 and 25.
laneOffset: integer
The distance in 64-bit words between which we take bytes from data and add them to the different states.
For K12, laneOffset is 1024 (8192 bytes divided by 8 bytes), the distance between leaves.
Returns
-------
list of list of list of integers
The N output states, as in KeccakP1600timesN_SIMD.
"""
def KeccakP1600timesN_AddLanesAll(N, states, data, laneCount, laneOffset):
assert(len(data) >= 8*((N-1)*laneOffset + laneCount))
for y in range(5):
for x in range(5):
xy = x + 5*y
# Loop through the first laneCount lanes (x, y) of the states
if (xy < laneCount):
# Load N words of 64 bits in a SIMD register by taking
# - 64 bits from data[8*xy : 8*xy + 8]
# - 64 bits from data[8*xy + 8*laneOffset : 8*xy + 8*laneOffset + 8]
# - etc.
loadedData = [load64(data[8*(i*laneOffset + xy) : 8*(i*laneOffset + xy + 1)]) for i in range(N)]
# XOR the loaded data into the states at lane (x, y) in a SIMD fashion
states[x][y] = [states[x][y][i] ^ loadedData[i] for i in range(N)]
return states
# Global variables for K12
B = 8192
"""
Evaluates TurboSHAKE128 on N chunks of 8KiB (or leaves) in parallel and returns the chaining values.
Parameters
----------
N: integer
The number of leaves to evaluate in parallel.
It must be a strictly positive integer, N ≥ 1.
data: byte array
The N leaves to hash.
Its size must be at least N*B bytes.
Returns
-------
byte array
The N chaining values.
"""
def KangarooTwelve_ProcessLeaves(N, data):
assert(len(data) >= N*B)
rateInLanes = 21
rateInBytes = rateInLanes*8
# Initialize N all-zero states
A = [[[0 for i in range(N)] for y in range(5)] for x in range(5)]
# First, start with all the complete blocks of 21 lanes (= 168 bytes)
for j in range(0, B - rateInBytes, rateInBytes):
# Add 168 bytes from position j of the leaves to the states, offseted by 8KiB, that is:
# - add bytes data[j : j + 168] to the first state
# - add bytes data[8KiB + j : 8KiB + j + 168] to the second state
# - add bytes data[16KiB + j : 16KiB + j + 168] to the third state, etc.
KeccakP1600timesN_AddLanesAll(N, A, data[j:], rateInLanes, B//8)
# Apply Keccak-p[1600, 12 rounds] to all states
A = KeccakP1600timesN_SIMD(N, A, 12)
# Set the position of the last, incomplete, block of 16 lanes (= 128 bytes)
j = (B//rateInBytes)*rateInBytes
# Add 128 bytes from position j of the leaves to the states, offseted by 8KiB
KeccakP1600timesN_AddLanesAll(N, A, data[j:], (B - j)//8, B//8)
# Append the suffix 110 and the first bit of padding to all states
A[1][3] = [A[1][3][i] ^ 0x0B for i in range(N)]
# Append the second bit of padding to all states
A[0][4] = [A[0][4][i] ^ 0x8000000000000000 for i in range(N)]
# Apply Keccak-p[1600, 12 rounds] to all states
A = KeccakP1600timesN_SIMD(N, A, 12)
# Extract the first 256 bits of each state and concatenate them
CVs = bytearray().join([store64(A[0][0][i]) + store64(A[1][0][i]) + store64(A[2][0][i]) + store64(A[3][0][i]) for i in range(N)])
return CVs
"""
Evaluates K12 on the given input message and customization string, and returns the desired number of output bytes.
Parameters
----------
inputMessage: byte string or byte array
The input message.
customizationString: byte string or byte array
The customization string.
outputByteLen: integer
The desired number of output bytes.
Returns
-------
byte array
The first outputByteLen bytes of K12(inputMessage, customizationString).
"""
def KangarooTwelve(inputMessage, customizationString, outputByteLen):
c = 256
# Concatenate the input message, the customization string and the length of the latter
S = bytearray(inputMessage) + bytearray(customizationString) + right_encode(len(customizationString))
if (len(S) <= B):
# If S fits in one chunk, process the tree with only a final node
return TurboSHAKE128(S, 0x07, outputByteLen)
else:
# If S needs more than one chunk, process the tree with kangaroo hopping
CVs = bytearray()
# Process the leaves starting from offset 8KiB, as the first chunk is part of the final node
j = B
# Count the number of leaves
n = 0
# Process 8 leaves in parallel if possible
while(j + 8*B <= len(S)):
CVs = CVs + KangarooTwelve_ProcessLeaves(8, S[j:])
j = j + 8*B
n = n + 8
# Process 4 leaves in parallel if possible
while(j + 4*B <= len(S)):
CVs = CVs + KangarooTwelve_ProcessLeaves(4, S[j:])
j = j + 4*B
n = n + 4
# Process 2 leaves in parallel if possible
while(j + 2*B <= len(S)):
CVs = CVs + KangarooTwelve_ProcessLeaves(2, S[j:])
j = j + 2*B
n = n + 2
# Process the remaining leaf
while(j < len(S)):
CVs = CVs + TurboSHAKE128(S[j:j+B], 0x0B, c//8)
j = j + B
n = n + 1
# Process the final node
NodeStar = S[0:B] + bytearray([3,0,0,0,0,0,0,0]) + CVs \
+ right_encode(n) + b'\xFF\xFF'
return TurboSHAKE128(NodeStar, 0x06, outputByteLen)
# Test that KeccakP1600timesN_SIMD does what it is supposed to
def Test_KeccakP1600timesN_SIMD():
for N in range(1, 5):
print("Testing KeccakP1600timesN_SIMD for N =", N)
lanes = [[[(x+y+i+x*y*i) % (2**64) for i in range(N)] for y in range(5)] for x in range(5)]
lanes_t = [[[lanes[x][y][i] for y in range(5)] for x in range(5)] for i in range(N)]
ref_lanes_t = [KeccakP1600onLanes(lanes_t[i], 24) for i in range(N)]
ref_lanes = [[[ref_lanes_t[i][x][y] for i in range(N)] for y in range(5)] for x in range(5)]
test_lanes = KeccakP1600timesN_SIMD(N, lanes, 24)
assert(ref_lanes == test_lanes)
return
# Test that KangarooTwelve_ProcessLeaves does what it is supposed to
def Test_KangarooTwelve_ProcessLeaves():
c = 256
for N in range(1, 5):
print("Testing KangarooTwelve_ProcessLeaves for N =", N)
S = bytearray([(i%247) for i in range(B*N)])
Si = [bytearray(S[i*B:(i+1)*B]) for i in range(N)]
ref_CVs = bytearray().join([TurboSHAKE128(Si[i], 0x0B, c//8) for i in range(N)])
test_CVs = KangarooTwelve_ProcessLeaves(N, S)
assert(test_CVs == ref_CVs)
def outputHex(s):
for i in range(len(s)):
print("{0:02x}".format(s[i]), end=' ')
print()
print()
# Produce test vectors
def printK12TestVectors():
print("KangarooTwelve(M=empty, C=empty, 32 output bytes):")
outputHex(KangarooTwelve(b'', b'', 32))
print("KangarooTwelve(M=empty, C=empty, 64 output bytes):")
outputHex(KangarooTwelve(b'', b'', 64))
print("KangarooTwelve(M=empty, C=empty, 10032 output bytes), last 32 bytes:")
outputHex(KangarooTwelve(b'', b'', 10032)[10000:])
for i in range(6):
C = b''
M = bytearray([(j % 251) for j in range(17**i)])
print("KangarooTwelve(M=pattern 0x00 to 0xFA for 17^{0:d} bytes, C=empty, 32 output bytes):".format(i))
outputHex(KangarooTwelve(M, C, 32))
for i in range(4):
M = bytearray([0xFF for j in range(2**i-1)])
C = bytearray([(j % 251) for j in range(41**i)])
print("KangarooTwelve(M={0:d} times byte 0xFF, C=pattern 0x00 to 0xFA for 41^{1:d} bytes, 32 output bytes):".format(2**i-1, i))
outputHex(KangarooTwelve(M, C, 32))
Test_KeccakP1600timesN_SIMD()
Test_KangarooTwelve_ProcessLeaves()
printK12TestVectors()