crcspeed.c
/*
* Copyright (C) 2013 Mark Adler
* Copyright (C) 2019-2024 Josiah Carlson
* Originally by: crc64.c Version 1.4 16 Dec 2013 Mark Adler
* Modifications by Matt Stancliff <[email protected]>:
* - removed CRC64-specific behavior
* - added generation of lookup tables by parameters
* - removed inversion of CRC input/result
* - removed automatic initialization in favor of explicit initialization
* Modifications by Josiah Carlson <[email protected]>
* - Added case/vector/AVX/+ versions of crc combine function; see crccombine.c
* - added optional static cache
* - Modified to use 1 thread to:
* - Partition large crc blobs into 2-3 segments
* - Process the 2-3 segments in parallel
* - Merge the resulting crcs
* -> Resulting in 10-90% performance boost for data > 1 meg
* - macro-ized to reduce copy/pasta

  This software is provided 'as-is', without any express or implied
  warranty. In no event will the author be held liable for any damages
  arising from the use of this software.

  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:

  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.

  Mark Adler
*/
#include "crcspeed.h"
#include "crccombine.h"
#define CRC64_LEN_MASK UINT64_C(0x7ffffffffffffff8)
#define CRC64_REVERSED_POLY UINT64_C(0x95ac9329ac4bc9b5)
/* Fill in a CRC constants table. */
void crcspeed64little_init(crcfn64 crcfn, uint64_t table[8][256]) {
uint64_t crc;
/* generate CRCs for all single byte sequences */
for (int n = 0; n < 256; n++) {
unsigned char v = n;
table[0][n] = crcfn(0, &v, 1);
}
/* generate nested CRC table for future slice-by-8/16/24+ lookup */
for (int n = 0; n < 256; n++) {
crc = table[0][n];
for (int k = 1; k < 8; k++) {
crc = table[0][crc & 0xff] ^ (crc >> 8);
table[k][n] = crc;
}
}
#if USE_STATIC_COMBINE_CACHE
/* initialize combine cache for CRC stapling for slice-by 16/24+ */
init_combine_cache(CRC64_REVERSED_POLY, 64);
#endif
}
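/* Example (illustrative, compiled only when CRCSPEED_EXAMPLES is defined):
 * a plausible bit-at-a-time reference function with the crcfn64 shape this
 * file assumes (uint64_t fn(uint64_t, const void *, const uint64_t)), and how
 * the slice-by-8 tables would be built from it. These names are hypothetical,
 * not part of the original API. */
#ifdef CRCSPEED_EXAMPLES
static uint64_t crc64_bitwise_example(uint64_t crc, const void *in,
                                      const uint64_t len) {
    const unsigned char *p = in;
    for (uint64_t i = 0; i < len; i++) {
        crc ^= p[i];
        /* reflected form: shift right, conditionally xor the reversed poly */
        for (int k = 0; k < 8; k++)
            crc = (crc >> 1) ^ ((crc & 1) * CRC64_REVERSED_POLY);
    }
    return crc;
}

static uint64_t crc64_example_table[8][256];

static void crc64_example_init(void) {
    crcspeed64little_init(crc64_bitwise_example, crc64_example_table);
}
#endif /* CRCSPEED_EXAMPLES */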
void crcspeed16little_init(crcfn16 crcfn, uint16_t table[8][256]) {
uint16_t crc;
/* generate CRCs for all single byte sequences */
for (int n = 0; n < 256; n++) {
        unsigned char v = n; /* pass a single byte, as in the 64-bit init;
                                passing &n would read an int's first byte,
                                which is endian-dependent */
        table[0][n] = crcfn(0, &v, 1);
}
/* generate nested CRC table for future slice-by-8 lookup */
for (int n = 0; n < 256; n++) {
crc = table[0][n];
for (int k = 1; k < 8; k++) {
crc = table[0][(crc >> 8) & 0xff] ^ (crc << 8);
table[k][n] = crc;
}
}
}
/* Reverse the bytes in a 64-bit word. */
static inline uint64_t rev8(uint64_t a) {
#if defined(__GNUC__) || defined(__clang__)
return __builtin_bswap64(a);
#else
uint64_t m;
m = UINT64_C(0xff00ff00ff00ff);
a = ((a >> 8) & m) | (a & m) << 8;
m = UINT64_C(0xffff0000ffff);
a = ((a >> 16) & m) | (a & m) << 16;
return a >> 32 | a << 32;
#endif
}
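/* rev8 example: rev8(UINT64_C(0x0102030405060708)) == UINT64_C(0x0807060504030201). */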
/* This function is called once to initialize the CRC table for use on a
big-endian architecture. */
void crcspeed64big_init(crcfn64 fn, uint64_t big_table[8][256]) {
/* Create the little endian table then reverse all the entries. */
crcspeed64little_init(fn, big_table);
for (int k = 0; k < 8; k++) {
for (int n = 0; n < 256; n++) {
big_table[k][n] = rev8(big_table[k][n]);
}
}
}
void crcspeed16big_init(crcfn16 fn, uint16_t big_table[8][256]) {
/* Create the little endian table then reverse all the entries. */
crcspeed16little_init(fn, big_table);
for (int k = 0; k < 8; k++) {
for (int n = 0; n < 256; n++) {
big_table[k][n] = rev8(big_table[k][n]);
}
}
}
/* Note: doing all of our crc/next modifications *before* the crc table
 * references is a consistent speedup on all CPUs tested. So... keep these
 * macros separate.
 */
#define DO_8_1(crc, next) \
crc ^= *(uint64_t *)next; \
next += 8
#define DO_8_2(crc) \
crc = little_table[7][(uint8_t)crc] ^ \
little_table[6][(uint8_t)(crc >> 8)] ^ \
little_table[5][(uint8_t)(crc >> 16)] ^ \
little_table[4][(uint8_t)(crc >> 24)] ^ \
little_table[3][(uint8_t)(crc >> 32)] ^ \
little_table[2][(uint8_t)(crc >> 40)] ^ \
little_table[1][(uint8_t)(crc >> 48)] ^ \
little_table[0][crc >> 56]
#define CRC64_SPLIT(div) \
olen = len; \
next2 = next1 + ((len / div) & CRC64_LEN_MASK); \
len = (next2 - next1)
#define MERGE_CRC(crcn) \
crc1 = crc64_combine(crc1, crcn, next2 - next1, CRC64_REVERSED_POLY, 64)
#define MERGE_END(last, DIV) \
len = olen - ((next2 - next1) * DIV); \
next1 = last
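/* The merge step relies on the standard CRC concatenation identity, which
 * crc64_combine() (crccombine.c) appears to follow with the same argument
 * convention as zlib's crc32_combine(crc1, crc2, len2): if crc1 covers
 * segment A and crc2 covers segment B, combining advances crc1 through len(B)
 * zero bytes (conceptually a multiply by x^(8*len(B)) mod the polynomial) and
 * xors in crc2, yielding CRC(A || B). Illustrative use, with hypothetical
 * buffers partA/partB:
 *
 *   uint64_t ca = crcspeed64little(t, 0, partA, lenA);
 *   uint64_t cb = crcspeed64little(t, 0, partB, lenB);
 *   uint64_t whole = crc64_combine(ca, cb, lenB, CRC64_REVERSED_POLY, 64);
 *   // whole == crcspeed64little() over the concatenation of the two buffers
 */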
/* Tunables so we can adjust for benchmarking; these seem to be fairly
 * reasonable for Intel CPUs made since 2010. Please adjust as necessary if or
 * when your CPU has more load / execute units. We've written benchmark code
 * to help you tune your platform; see crc64Test. */
#if defined(__i386__) || defined(__x86_64__)
static size_t CRC64_TRI_CUTOFF = (2*1024);
static size_t CRC64_DUAL_CUTOFF = (128);
#else
static size_t CRC64_TRI_CUTOFF = (16*1024);
static size_t CRC64_DUAL_CUTOFF = (1024);
#endif
void set_crc64_cutoffs(size_t dual_cutoff, size_t tri_cutoff) {
CRC64_DUAL_CUTOFF = dual_cutoff;
CRC64_TRI_CUTOFF = tri_cutoff;
}
uint64_t get_cutoff(int i) {
if (i) return CRC64_TRI_CUTOFF;
return CRC64_DUAL_CUTOFF;
}
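/* Illustrative tuning calls (these exact values are hypothetical):
 *
 *   set_crc64_cutoffs(0, 0);        // dual cutoff < 1 skips splitting and
 *                                   // alignment: byte-at-a-time only
 *   set_crc64_cutoffs((size_t)-1, (size_t)-1); // never split; keep the
 *                                              // single-stream 8-byte loop
 *   set_crc64_cutoffs(128, 2*1024); // the x86 defaults above
 */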
/* Calculate a non-inverted CRC multiple bytes at a time on a little-endian
 * architecture. If you need an inverted CRC, invert *before* calling and
 * invert *after* calling.
 * 64-bit crc: processes 8/16/24 bytes at once. */
uint64_t crcspeed64little(uint64_t little_table[8][256], uint64_t crc1,
void *buf, size_t len) {
unsigned char *next1 = buf;
if (CRC64_DUAL_CUTOFF < 1) {
goto final;
}
/* process individual bytes until we reach an 8-byte aligned pointer */
while (len && ((uintptr_t)next1 & 7) != 0) {
crc1 = little_table[0][(crc1 ^ *next1++) & 0xff] ^ (crc1 >> 8);
len--;
}
if (len > CRC64_TRI_CUTOFF) {
/* 24 bytes per loop, doing 3 parallel 8 byte chunks at a time */
unsigned char *next2, *next3;
uint64_t olen, crc2=0, crc3=0;
CRC64_SPLIT(3);
/* len is now the length of the first segment, the 3rd segment possibly
* having extra bytes to clean up at the end
*/
next3 = next2 + len;
while (len >= 8) {
len -= 8;
DO_8_1(crc1, next1);
DO_8_1(crc2, next2);
DO_8_1(crc3, next3);
DO_8_2(crc1);
DO_8_2(crc2);
DO_8_2(crc3);
}
/* merge the 3 crcs */
MERGE_CRC(crc2);
MERGE_CRC(crc3);
MERGE_END(next3, 3);
} else if (len > CRC64_DUAL_CUTOFF) {
/* 16 bytes per loop, doing 2 parallel 8 byte chunks at a time */
unsigned char *next2;
uint64_t olen, crc2=0;
CRC64_SPLIT(2);
/* len is now the length of the first segment, the 2nd segment possibly
* having extra bytes to clean up at the end
*/
while (len >= 8) {
len -= 8;
DO_8_1(crc1, next1);
DO_8_1(crc2, next2);
DO_8_2(crc1);
DO_8_2(crc2);
}
/* merge the 2 crcs */
MERGE_CRC(crc2);
MERGE_END(next2, 2);
}
    /* We fall through here to handle inputs below CRC64_DUAL_CUTOFF, and any
     * trailing bytes that weren't evenly divisible by 16 or 24 above. */
/* fast processing, 8 bytes (aligned!) per loop */
while (len >= 8) {
len -= 8;
DO_8_1(crc1, next1);
DO_8_2(crc1);
}
final:
/* process remaining bytes (can't be larger than 8) */
while (len) {
crc1 = little_table[0][(crc1 ^ *next1++) & 0xff] ^ (crc1 >> 8);
len--;
}
return crc1;
}
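/* Example (illustrative, compiled only when CRCSPEED_EXAMPLES is defined):
 * the comment above leaves inversion to the caller. A conventional inverted
 * variant (initial and final xor with all ones) layers on top like this; the
 * wrapper name is hypothetical. */
#ifdef CRCSPEED_EXAMPLES
static uint64_t crc64_inverted_example(uint64_t table[8][256], uint64_t crc,
                                       void *buf, size_t len) {
    /* invert *before* calling, run the fast path, invert *after* calling */
    return ~crcspeed64little(table, ~crc, buf, len);
}
#endif /* CRCSPEED_EXAMPLES */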
/* clean up our namespace */
#undef DO_8_1
#undef DO_8_2
#undef CRC64_SPLIT
#undef MERGE_CRC
#undef MERGE_END
#undef CRC64_REVERSED_POLY
#undef CRC64_LEN_MASK
/* note: similar perf advantages can be had for long strings in crc16 using all
 * of the same optimizations as above, though this is unnecessary here. crc16 is
 * normally used to shard keys, not to hash / verify data, so it sees shorter
 * data that doesn't warrant such changes. */
uint16_t crcspeed16little(uint16_t little_table[8][256], uint16_t crc,
void *buf, size_t len) {
unsigned char *next = buf;
/* process individual bytes until we reach an 8-byte aligned pointer */
while (len && ((uintptr_t)next & 7) != 0) {
crc = little_table[0][((crc >> 8) ^ *next++) & 0xff] ^ (crc << 8);
len--;
}
/* fast middle processing, 8 bytes (aligned!) per loop */
while (len >= 8) {
uint64_t n = *(uint64_t *)next;
crc = little_table[7][(n & 0xff) ^ ((crc >> 8) & 0xff)] ^
little_table[6][((n >> 8) & 0xff) ^ (crc & 0xff)] ^
little_table[5][(n >> 16) & 0xff] ^
little_table[4][(n >> 24) & 0xff] ^
little_table[3][(n >> 32) & 0xff] ^
little_table[2][(n >> 40) & 0xff] ^
little_table[1][(n >> 48) & 0xff] ^
little_table[0][n >> 56];
next += 8;
len -= 8;
}
/* process remaining bytes (can't be larger than 8) */
while (len) {
crc = little_table[0][((crc >> 8) ^ *next++) & 0xff] ^ (crc << 8);
len--;
}
return crc;
}
/* Calculate a non-inverted CRC eight bytes at a time on a big-endian
* architecture.
*/
uint64_t crcspeed64big(uint64_t big_table[8][256], uint64_t crc, void *buf,
size_t len) {
unsigned char *next = buf;
crc = rev8(crc);
while (len && ((uintptr_t)next & 7) != 0) {
crc = big_table[0][(crc >> 56) ^ *next++] ^ (crc << 8);
len--;
}
    /* note: alignment + 2/3-way processing could probably be handled here much
       as above, using our updated DO_8_2 macro. Not included in these changes
       because, like the other authors, I don't have big-endian hardware to
       test with. */
while (len >= 8) {
crc ^= *(uint64_t *)next;
crc = big_table[0][crc & 0xff] ^
big_table[1][(crc >> 8) & 0xff] ^
big_table[2][(crc >> 16) & 0xff] ^
big_table[3][(crc >> 24) & 0xff] ^
big_table[4][(crc >> 32) & 0xff] ^
big_table[5][(crc >> 40) & 0xff] ^
big_table[6][(crc >> 48) & 0xff] ^
big_table[7][crc >> 56];
next += 8;
len -= 8;
}
while (len) {
crc = big_table[0][(crc >> 56) ^ *next++] ^ (crc << 8);
len--;
}
return rev8(crc);
}
/* WARNING: Completely untested on big endian architecture. Possibly broken. */
uint16_t crcspeed16big(uint16_t big_table[8][256], uint16_t crc_in, void *buf,
size_t len) {
unsigned char *next = buf;
uint64_t crc = crc_in;
crc = rev8(crc);
while (len && ((uintptr_t)next & 7) != 0) {
crc = big_table[0][((crc >> (56 - 8)) ^ *next++) & 0xff] ^ (crc >> 8);
len--;
}
while (len >= 8) {
uint64_t n = *(uint64_t *)next;
crc = big_table[0][(n & 0xff) ^ ((crc >> (56 - 8)) & 0xff)] ^
big_table[1][((n >> 8) & 0xff) ^ (crc & 0xff)] ^
big_table[2][(n >> 16) & 0xff] ^
big_table[3][(n >> 24) & 0xff] ^
big_table[4][(n >> 32) & 0xff] ^
big_table[5][(n >> 40) & 0xff] ^
big_table[6][(n >> 48) & 0xff] ^
big_table[7][n >> 56];
next += 8;
len -= 8;
}
while (len) {
crc = big_table[0][((crc >> (56 - 8)) ^ *next++) & 0xff] ^ (crc >> 8);
len--;
}
return rev8(crc);
}
/* Return the CRC of buf[0..len-1] with initial crc, processing eight bytes
at a time using passed-in lookup table.
This selects one of two routines depending on the endianness of
the architecture. */
uint64_t crcspeed64native(uint64_t table[8][256], uint64_t crc, void *buf,
size_t len) {
uint64_t n = 1;
return *(char *)&n ? crcspeed64little(table, crc, buf, len)
: crcspeed64big(table, crc, buf, len);
}
uint16_t crcspeed16native(uint16_t table[8][256], uint16_t crc, void *buf,
size_t len) {
uint64_t n = 1;
return *(char *)&n ? crcspeed16little(table, crc, buf, len)
: crcspeed16big(table, crc, buf, len);
}
/* Initialize CRC lookup table in architecture-dependent manner. */
void crcspeed64native_init(crcfn64 fn, uint64_t table[8][256]) {
uint64_t n = 1;
*(char *)&n ? crcspeed64little_init(fn, table)
: crcspeed64big_init(fn, table);
}
void crcspeed16native_init(crcfn16 fn, uint16_t table[8][256]) {
uint64_t n = 1;
*(char *)&n ? crcspeed16little_init(fn, table)
: crcspeed16big_init(fn, table);
}
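/* End-to-end sketch (compiled only when CRCSPEED_EXAMPLES is defined): build
 * the tables once from the bytewise reference function sketched earlier, then
 * hash a buffer through the endian-dispatching wrappers. */
#ifdef CRCSPEED_EXAMPLES
#include <stdio.h>
#include <string.h>

static void crcspeed_example_usage(void) {
    static uint64_t table[8][256];
    const char *msg = "hello, crcspeed";

    crcspeed64native_init(crc64_bitwise_example, table);
    printf("%016llx\n", (unsigned long long)crcspeed64native(
                            table, 0, (void *)msg, strlen(msg)));
}
#endif /* CRCSPEED_EXAMPLES */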