core/crypto/sha2: Use hardware SHA224/256 when available (AMD64)
Showing 5 changed files with 285 additions and 3 deletions.
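The changes below add a non-AMD64 stub and an SHA-NI implementation for AMD64; the remaining changed files are not shown on this page. As a rough illustration of what the commit enables, a caller inside the package could select the block function at runtime along these lines (a sketch only: `sha256_blocks` and the portable `sha256_transf` are assumed names, not taken from this diff):

// Hypothetical dispatch sketch, not part of this diff. Picks the SHA-NI
// block function when the CPU supports it, otherwise falls back to the
// portable software routine (assumed here to be named sha256_transf).
sha256_blocks :: proc "contextless" (ctx: ^Context_256, data: []byte) {
	if is_hardware_accelerated_256() {
		sha256_transf_hw(ctx, data)
	} else {
		sha256_transf(ctx, data)
	}
}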
@@ -0,0 +1,15 @@
#+build !amd64
package sha2

@(private = "file")
ERR_HW_NOT_SUPPORTED :: "crypto/sha2: hardware implementation unsupported"

// is_hardware_accelerated_256 returns true iff hardware accelerated
// SHA-224/SHA-256 is supported.
is_hardware_accelerated_256 :: proc "contextless" () -> bool {
	return false
}

sha256_transf_hw :: proc "contextless" (ctx: ^Context_256, data: []byte) {
	panic_contextless(ERR_HW_NOT_SUPPORTED)
}
@@ -0,0 +1,260 @@
#+build amd64
package sha2

// Based on the public domain code by Jeffrey Walton, though
// realistically, there only is one sensible way to write this
// and Intel's whitepaper covers it.
//
// See: https://github.com/noloader/SHA-Intrinsics

import "base:intrinsics"
import "core:simd"
import "core:simd/x86"
import "core:sys/info"

@(private = "file")
MASK :: x86.__m128i{0x0405060700010203, 0x0c0d0e0f08090a0b}

@(private = "file")
K_0 :: simd.u64x2{0x71374491428a2f98, 0xe9b5dba5b5c0fbcf}
@(private = "file")
K_1 :: simd.u64x2{0x59f111f13956c25b, 0xab1c5ed5923f82a4}
@(private = "file")
K_2 :: simd.u64x2{0x12835b01d807aa98, 0x550c7dc3243185be}
@(private = "file")
K_3 :: simd.u64x2{0x80deb1fe72be5d74, 0xc19bf1749bdc06a7}
@(private = "file")
K_4 :: simd.u64x2{0xefbe4786e49b69c1, 0x240ca1cc0fc19dc6}
@(private = "file")
K_5 :: simd.u64x2{0x4a7484aa2de92c6f, 0x76f988da5cb0a9dc}
@(private = "file")
K_6 :: simd.u64x2{0xa831c66d983e5152, 0xbf597fc7b00327c8}
@(private = "file")
K_7 :: simd.u64x2{0xd5a79147c6e00bf3, 0x1429296706ca6351}
@(private = "file")
K_8 :: simd.u64x2{0x2e1b213827b70a85, 0x53380d134d2c6dfc}
@(private = "file")
K_9 :: simd.u64x2{0x766a0abb650a7354, 0x92722c8581c2c92e}
@(private = "file")
K_10 :: simd.u64x2{0xa81a664ba2bfe8a1, 0xc76c51a3c24b8b70}
@(private = "file")
K_11 :: simd.u64x2{0xd6990624d192e819, 0x106aa070f40e3585}
@(private = "file")
K_12 :: simd.u64x2{0x1e376c0819a4c116, 0x34b0bcb52748774c}
@(private = "file")
K_13 :: simd.u64x2{0x4ed8aa4a391c0cb3, 0x682e6ff35b9cca4f}
@(private = "file")
K_14 :: simd.u64x2{0x78a5636f748f82ee, 0x8cc7020884c87814}
@(private = "file")
K_15 :: simd.u64x2{0xa4506ceb90befffa, 0xc67178f2bef9a3f7}

// is_hardware_accelerated_256 returns true iff hardware accelerated
// SHA-224/SHA-256 is supported.
is_hardware_accelerated_256 :: proc "contextless" () -> bool {
	features, ok := info.cpu_features.?
	if !ok {
		return false
	}

	req_features :: info.CPU_Features{
		.sse2,
		.ssse3,
		.sse41,
		.sha,
	}
	return features >= req_features
}

@(private, enable_target_feature="sse2,ssse3,sse4.1,sha")
sha256_transf_hw :: proc "contextless" (ctx: ^Context_256, data: []byte) #no_bounds_check {
	// Load the state
	tmp := intrinsics.unaligned_load((^x86.__m128i)(&ctx.h[0]))
	state_1 := intrinsics.unaligned_load((^x86.__m128i)(&ctx.h[4]))

	tmp = x86._mm_shuffle_epi32(tmp, 0xb1) // CDAB
	state_1 = x86._mm_shuffle_epi32(state_1, 0x1b) // EFGH
	state_0 := x86._mm_alignr_epi8(tmp, state_1, 8) // ABEF
	// state_1 = x86._mm_blend_epi16(state_1, tmp, 0xf0) // CDGH
	state_1 = kludge_mm_blend_epi16_0xf0(state_1, tmp)

	data := data
	for len(data) >= BLOCK_SIZE_256 {
		state_0_save, state_1_save := state_0, state_1

		// Rounds 0-3
		msg := intrinsics.unaligned_load((^x86.__m128i)(raw_data(data)))
		msg_0 := x86._mm_shuffle_epi8(msg, MASK)
		msg = x86._mm_add_epi32(msg_0, x86.__m128i(K_0))
		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
		msg = x86._mm_shuffle_epi32(msg, 0xe)
		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)

		// Rounds 4-7
		msg_1 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(data[16:])))
		msg_1 = x86._mm_shuffle_epi8(msg_1, MASK)
		msg = x86._mm_add_epi32(msg_1, x86.__m128i(K_1))
		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
		msg = x86._mm_shuffle_epi32(msg, 0xe)
		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
		msg_0 = x86._mm_sha256msg1_epu32(msg_0, msg_1)

		// Rounds 8-11
		msg_2 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(data[32:])))
		msg_2 = x86._mm_shuffle_epi8(msg_2, MASK)
		msg = x86._mm_add_epi32(msg_2, x86.__m128i(K_2))
		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
		msg = x86._mm_shuffle_epi32(msg, 0xe)
		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
		msg_1 = x86._mm_sha256msg1_epu32(msg_1, msg_2)

		// Rounds 12-15
		msg_3 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(data[48:])))
		msg_3 = x86._mm_shuffle_epi8(msg_3, MASK)
		msg = x86._mm_add_epi32(msg_3, x86.__m128i(K_3))
		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
		tmp = x86._mm_alignr_epi8(msg_3, msg_2, 4)
		msg_0 = x86._mm_add_epi32(msg_0, tmp)
		msg_0 = x86._mm_sha256msg2_epu32(msg_0, msg_3)
		msg = x86._mm_shuffle_epi32(msg, 0x0e)
		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
		msg_2 = x86._mm_sha256msg1_epu32(msg_2, msg_3)

		// Rounds 16-19
		msg = x86._mm_add_epi32(msg_0, x86.__m128i(K_4))
		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
		tmp = x86._mm_alignr_epi8(msg_0, msg_3, 4)
		msg_1 = x86._mm_add_epi32(msg_1, tmp)
		msg_1 = x86._mm_sha256msg2_epu32(msg_1, msg_0)
		msg = x86._mm_shuffle_epi32(msg, 0x0e)
		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
		msg_3 = x86._mm_sha256msg1_epu32(msg_3, msg_0)

		// Rounds 20-23
		msg = x86._mm_add_epi32(msg_1, x86.__m128i(K_5))
		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
		tmp = x86._mm_alignr_epi8(msg_1, msg_0, 4)
		msg_2 = x86._mm_add_epi32(msg_2, tmp)
		msg_2 = x86._mm_sha256msg2_epu32(msg_2, msg_1)
		msg = x86._mm_shuffle_epi32(msg, 0x0e)
		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
		msg_0 = x86._mm_sha256msg1_epu32(msg_0, msg_1)

		// Rounds 24-27
		msg = x86._mm_add_epi32(msg_2, x86.__m128i(K_6))
		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
		tmp = x86._mm_alignr_epi8(msg_2, msg_1, 4)
		msg_3 = x86._mm_add_epi32(msg_3, tmp)
		msg_3 = x86._mm_sha256msg2_epu32(msg_3, msg_2)
		msg = x86._mm_shuffle_epi32(msg, 0x0e)
		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
		msg_1 = x86._mm_sha256msg1_epu32(msg_1, msg_2)

		// Rounds 28-31
		msg = x86._mm_add_epi32(msg_3, x86.__m128i(K_7))
		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
		tmp = x86._mm_alignr_epi8(msg_3, msg_2, 4)
		msg_0 = x86._mm_add_epi32(msg_0, tmp)
		msg_0 = x86._mm_sha256msg2_epu32(msg_0, msg_3)
		msg = x86._mm_shuffle_epi32(msg, 0x0e)
		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
		msg_2 = x86._mm_sha256msg1_epu32(msg_2, msg_3)

		// Rounds 32-35
		msg = x86._mm_add_epi32(msg_0, x86.__m128i(K_8))
		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
		tmp = x86._mm_alignr_epi8(msg_0, msg_3, 4)
		msg_1 = x86._mm_add_epi32(msg_1, tmp)
		msg_1 = x86._mm_sha256msg2_epu32(msg_1, msg_0)
		msg = x86._mm_shuffle_epi32(msg, 0x0e)
		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
		msg_3 = x86._mm_sha256msg1_epu32(msg_3, msg_0)

		// Rounds 36-39
		msg = x86._mm_add_epi32(msg_1, x86.__m128i(K_9))
		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
		tmp = x86._mm_alignr_epi8(msg_1, msg_0, 4)
		msg_2 = x86._mm_add_epi32(msg_2, tmp)
		msg_2 = x86._mm_sha256msg2_epu32(msg_2, msg_1)
		msg = x86._mm_shuffle_epi32(msg, 0x0e)
		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
		msg_0 = x86._mm_sha256msg1_epu32(msg_0, msg_1)

		// Rounds 40-43
		msg = x86._mm_add_epi32(msg_2, x86.__m128i(K_10))
		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
		tmp = x86._mm_alignr_epi8(msg_2, msg_1, 4)
		msg_3 = x86._mm_add_epi32(msg_3, tmp)
		msg_3 = x86._mm_sha256msg2_epu32(msg_3, msg_2)
		msg = x86._mm_shuffle_epi32(msg, 0x0e)
		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
		msg_1 = x86._mm_sha256msg1_epu32(msg_1, msg_2)

		// Rounds 44-47
		msg = x86._mm_add_epi32(msg_3, x86.__m128i(K_11))
		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
		tmp = x86._mm_alignr_epi8(msg_3, msg_2, 4)
		msg_0 = x86._mm_add_epi32(msg_0, tmp)
		msg_0 = x86._mm_sha256msg2_epu32(msg_0, msg_3)
		msg = x86._mm_shuffle_epi32(msg, 0x0e)
		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
		msg_2 = x86._mm_sha256msg1_epu32(msg_2, msg_3)

		// Rounds 48-51
		msg = x86._mm_add_epi32(msg_0, x86.__m128i(K_12))
		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
		tmp = x86._mm_alignr_epi8(msg_0, msg_3, 4)
		msg_1 = x86._mm_add_epi32(msg_1, tmp)
		msg_1 = x86._mm_sha256msg2_epu32(msg_1, msg_0)
		msg = x86._mm_shuffle_epi32(msg, 0x0e)
		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
		msg_3 = x86._mm_sha256msg1_epu32(msg_3, msg_0)

		// Rounds 52-55
		msg = x86._mm_add_epi32(msg_1, x86.__m128i(K_13))
		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
		tmp = x86._mm_alignr_epi8(msg_1, msg_0, 4)
		msg_2 = x86._mm_add_epi32(msg_2, tmp)
		msg_2 = x86._mm_sha256msg2_epu32(msg_2, msg_1)
		msg = x86._mm_shuffle_epi32(msg, 0x0e)
		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)

		// Rounds 56-59
		msg = x86._mm_add_epi32(msg_2, x86.__m128i(K_14))
		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
		tmp = x86._mm_alignr_epi8(msg_2, msg_1, 4)
		msg_3 = x86._mm_add_epi32(msg_3, tmp)
		msg_3 = x86._mm_sha256msg2_epu32(msg_3, msg_2)
		msg = x86._mm_shuffle_epi32(msg, 0x0e)
		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)

		// Rounds 60-63
		msg = x86._mm_add_epi32(msg_3, x86.__m128i(K_15))
		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
		msg = x86._mm_shuffle_epi32(msg, 0x0e)
		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)

		state_0 = x86._mm_add_epi32(state_0, state_0_save)
		state_1 = x86._mm_add_epi32(state_1, state_1_save)

		data = data[BLOCK_SIZE_256:]
	}

	// Write back the updated state
	tmp = x86._mm_shuffle_epi32(state_0, 0x1b) // FEBA
	state_1 = x86._mm_shuffle_epi32(state_1, 0xb1) // DCHG
	// state_0 = x86._mm_blend_epi16(tmp, state_1, 0xf0) // DCBA
	state_0 = kludge_mm_blend_epi16_0xf0(tmp, state_1)
	state_1 = x86._mm_alignr_epi8(state_1, tmp, 8) // ABEF

	intrinsics.unaligned_store((^x86.__m128i)(&ctx.h[0]), state_0)
	intrinsics.unaligned_store((^x86.__m128i)(&ctx.h[4]), state_1)
}

@(private = "file") | ||
kludge_mm_blend_epi16_0xf0 :: #force_inline proc "contextless"(a, b: x86.__m128i) -> x86.__m128i { | ||
// HACK HACK HACK: LLVM got rid of `llvm.x86.sse41.pblendw`. | ||
a_ := simd.to_array(a) | ||
b_ := simd.to_array(b) | ||
return x86.__m128i{a_[0], b_[1]} | ||
} |
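For reference on the `kludge_mm_blend_epi16_0xf0` helper above: `_mm_blend_epi16` with an immediate of 0xf0 takes the four low 16-bit lanes from the first operand and the four high lanes from the second, i.e. the low 64 bits of `a` and the high 64 bits of `b`, which is exactly what the two-element construction reproduces. A small illustrative check of that equivalence (not part of this commit; it assumes a `core:testing` import and would have to sit in this file, since the helper is file-private):

@(test)
test_kludge_blend_0xf0 :: proc(t: ^testing.T) {
	// A 0xf0 blend keeps the low u64 lane of a and the high u64 lane of b.
	a := x86.__m128i{0x1111111111111111, 0x2222222222222222}
	b := x86.__m128i{0x3333333333333333, 0x4444444444444444}
	r := simd.to_array(kludge_mm_blend_epi16_0xf0(a, b))
	testing.expect(t, r == [2]i64{0x1111111111111111, 0x4444444444444444})
}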