From e586036adeeb4cc7ddfa779783912893d86787b0 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Wed, 15 Nov 2023 06:26:44 -0800 Subject: [PATCH] Add nopshufb tag (#256) Add `nopshufb` tag that should remove all code utilizing `PSHUFB` - and similar code on other platforms. # Background > None of the section below is legal advice. Seek your own legal counsel. > As stated by the [LICENSE](LICENSE) the authors will not be held liable for any use of this library. > Users are encouraged to independently verify they comply with all legal requirements. As can be seen in [recent news](https://www.datanami.com/2023/10/16/cloudera-hit-with-240-million-judgement-over-erasure-coding/) there have been lawsuits related to possible patents covering aspects of erasure coding functionality. As a possible mitigation, it is possible to use the tag `nopshufb` when compiling any code that includes this package. This will remove all inclusion and use of `PSHUFB` and its equivalents on other platforms. This is done by adding `-tags=nopshufb` to `go build` and similar commands that produce binary output. The removed code may not be infringing, and even after `-tags=nopshufb` there may still be infringing code left. --- .github/workflows/go.yml | 16 +- README.md | 15 + _gen/gen.go | 177 +- _gen/gf16.go | 15 +- _gen/gf8.go | 8 +- _gen/go.mod | 8 +- _gen/nopshufb.go | 7 + _gen/pshufb.go | 7 + galois_amd64.go | 14 +- galois_amd64.s | 86 +- galois_arm64.go | 26 +- galois_arm64.s | 27 +- galois_gen_amd64.go | 11 +- galois_gen_amd64.s | 94 +- galois_gen_none.go | 1 - galois_gen_nopshufb_amd64.go | 1164 + galois_gen_nopshufb_amd64.s | 33101 ++++++++++++++++++++++++++ galois_gen_switch_amd64.go | 4 +- galois_gen_switch_nopshufb_amd64.go | 697 + galois_noasm.go | 12 +- galois_nopshufb_amd64.go | 146 + galois_notamd64.go | 3 +- galois_ppc64le.go | 10 +- galois_ppc64le.s | 1 + go.mod | 9 +- reedsolomon.go | 4 +- xor_arm64.go | 19 + xor_arm64.s | 29 + xor_noasm.go | 7 + 29 files changed, 35503 insertions(+), 215 deletions(-) create mode 100644 _gen/nopshufb.go create mode 100644 _gen/pshufb.go create mode 100644 galois_gen_nopshufb_amd64.go create mode 100644 galois_gen_nopshufb_amd64.s create mode 100644 galois_gen_switch_nopshufb_amd64.go create mode 100644 galois_nopshufb_amd64.go create mode 100644 xor_arm64.go create mode 100644 xor_arm64.s create mode 100644 xor_noasm.go
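For quick reference, the build and test commands in the comment below are the same ones exercised by the CI changes in this patch. The small Go file that follows is only a hypothetical sketch (the package and constant names are illustrative, not part of this patch) of how a consumer could mirror the library's internal `pshufb` constant and record at compile time which variant was selected:

```go
// Hypothetical consumer-side file (package and constant names are illustrative,
// not part of this patch). Building or testing with the tag excludes the
// PSHUFB-based kernels from the final binary:
//
//	go build -tags=nopshufb .
//	go test -tags=nopshufb ./...
//
// A paired file constrained with //go:build !nopshufb would declare the same
// constant as true, mirroring _gen/pshufb.go and _gen/nopshufb.go below.

//go:build nopshufb

package mypkg

// codecMayUsePSHUFB records at compile time whether the reedsolomon
// dependency is allowed to include PSHUFB-based code paths in this build.
const codecMayUsePSHUFB = false
```

No source changes are required in consuming code; the tag only has to be passed to the final `go build` or `go test` invocation, and the library selects the matching assembly and dispatch code itself.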
diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index 5dee2e53..a8c7f2f9 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -11,7 +11,7 @@ jobs: build: strategy: matrix: - go-version: [1.18.x, 1.19.x, 1.20.x] + go-version: [1.19.x, 1.20.x, 1.21.x] os: [ubuntu-latest, macos-latest, windows-latest] env: CGO_ENABLED: 0 @@ -32,11 +32,15 @@ jobs: run: go vet ./... - name: Test - run: go test ./... + run: go test -timeout=15m ./... - name: Test Noasm run: go test -tags=noasm -short&&go test -short -no-avx512&&go test -short -no-avx512 -no-avx2&&go test -no-avx512 -no-avx2 -no-ssse3 -short + - name: Test Nopshufb + run: go test -tags=nopshufb -short&&go test -tags=nopshufb -short -no-avx512 -no-gfni&&go test -tags=nopshufb -short&&go test -tags=nopshufb -no-avx512 -no-avx2 -no-ssse3 -no-sse2 -short + + - name: Test Race env: CGO_ENABLED: 1 @@ -50,7 +54,7 @@ jobs: - name: Set up Go uses: actions/setup-go@v2 with: - go-version: 1.19.x + go-version: 1.21.x - name: Checkout code uses: actions/checkout@v2 @@ -102,13 +106,13 @@ jobs: env: GOOS: linux GOARCH: arm64 - run: go build .&&go build examples/simple-decoder.go&&go build examples/simple-encoder.go&&go build examples/stream-decoder.go&&go build examples/stream-encoder.go + run: go build .&&go build examples/simple-decoder.go&&go build examples/simple-encoder.go&&go build examples/stream-decoder.go&&go build examples/stream-encoder.go&&go build -tags=nopshufb .&&go build -tags=nopshufb examples/simple-decoder.go&&go build examples/simple-encoder.go&&go build -tags=nopshufb examples/stream-decoder.go&&go build examples/stream-encoder.go - name: Build on PPC64LE env: GOOS: linux GOARCH: ppc64le - run: go build .&&go build examples/simple-decoder.go&&go build examples/simple-encoder.go&&go build examples/stream-decoder.go&&go build examples/stream-encoder.go + run: go build .&&go build examples/simple-decoder.go&&go build examples/simple-encoder.go&&go build examples/stream-decoder.go&&go build examples/stream-encoder.go && go build -tags=nopshufb .&&go build -tags=nopshufb examples/simple-decoder.go&&go build examples/simple-encoder.go&&go build -tags=nopshufb examples/stream-decoder.go&&go build examples/stream-encoder.go generate: runs-on: ubuntu-latest @@ -116,7 +120,7 @@ jobs: - name: Set up Go uses: actions/setup-go@v2 with: - go-version: 1.19.x + go-version: 1.21.x - name: Checkout code uses: actions/checkout@v2 diff --git a/README.md b/README.md index e9c148ff..bdcb9e78 100644 --- a/README.md +++ b/README.md @@ -534,6 +534,21 @@ BenchmarkGaloisXor128K-160 862.02 7905.00 9.17x BenchmarkGaloisXor1M-160 784.60 6296.65 8.03x ``` +# Legal + +> None of the section below is legal advice. Seek your own legal counsel. +> As stated by the [LICENSE](LICENSE) the authors will not be held liable for any use of this library. +> Users are encouraged to independently verify they comply with all legal requirements. + +As can be seen in [recent news](https://www.datanami.com/2023/10/16/cloudera-hit-with-240-million-judgement-over-erasure-coding/) +there have been lawsuits related to possible patents covering aspects of erasure coding functionality. + +As a possible mitigation, it is possible to use the tag `nopshufb` when compiling any code that includes this package. +This will remove all inclusion and use of `PSHUFB` and its equivalents on other platforms. + +This is done by adding `-tags=nopshufb` to `go build` and similar commands that produce binary output. + +The removed code may not be infringing, and even after `-tags=nopshufb` there may still be infringing code left. # Links * [Backblaze Open Sources Reed-Solomon Erasure Coding Source Code](https://www.backblaze.com/blog/reed-solomon/). diff --git a/_gen/gen.go b/_gen/gen.go index b2f25077..30b71780 100644 --- a/_gen/gen.go +++ b/_gen/gen.go @@ -1,8 +1,12 @@ //go:build generate -// +build generate // Copyright 2022+, Klaus Post. See LICENSE for details. +//go:generate go run -tags=generate,nopshufb .
-out ../galois_gen_nopshufb_amd64.s -stubs ../galois_gen_nopshufb_amd64.go -pkg=reedsolomon +//go:generate go fmt ../galois_gen_switch_nopshufb_amd64.go +//go:generate go fmt ../galois_gen_nopshufb_amd64.go +//go:generate go run cleanup.go ../galois_gen_nopshufb_amd64.s + //go:generate go run -tags=generate . -out ../galois_gen_amd64.s -stubs ../galois_gen_amd64.go -pkg=reedsolomon //go:generate go fmt ../galois_gen_switch_amd64.go //go:generate go fmt ../galois_gen_amd64.go @@ -41,6 +45,11 @@ func main() { Constraint(buildtags.Not("appengine").ToConstraint()) Constraint(buildtags.Not("noasm").ToConstraint()) Constraint(buildtags.Not("nogen").ToConstraint()) + if pshufb { + Constraint(buildtags.Not("nopshufb").ToConstraint()) + } else { + Constraint(buildtags.Opt("nopshufb").ToConstraint()) + } Constraint(buildtags.Term("gc").ToConstraint()) TEXT("_dummy_", 0, "func()") @@ -54,20 +63,31 @@ func main() { Comment("#endif") RET() + genXor() const perLoopBits = 6 const perLoop = 1 << perLoopBits for i := 1; i <= inputMax; i++ { for j := 1; j <= outputMax; j++ { - genMulAvx2(fmt.Sprintf("mulAvxTwo_%dx%d", i, j), i, j, false) - genMulAvx2Sixty64(fmt.Sprintf("mulAvxTwo_%dx%d_64", i, j), i, j, false) + if pshufb { + genMulAvx2(fmt.Sprintf("mulAvxTwo_%dx%d", i, j), i, j, false) + genMulAvx2Sixty64(fmt.Sprintf("mulAvxTwo_%dx%d_64", i, j), i, j, false) + } genMulAvx512GFNI(fmt.Sprintf("mulGFNI_%dx%d_64", i, j), i, j, false) genMulAvx512GFNI(fmt.Sprintf("mulGFNI_%dx%d_64Xor", i, j), i, j, true) - genMulAvx2(fmt.Sprintf("mulAvxTwo_%dx%dXor", i, j), i, j, true) - genMulAvx2Sixty64(fmt.Sprintf("mulAvxTwo_%dx%d_64Xor", i, j), i, j, true) + if pshufb { + genMulAvx2(fmt.Sprintf("mulAvxTwo_%dx%dXor", i, j), i, j, true) + genMulAvx2Sixty64(fmt.Sprintf("mulAvxTwo_%dx%d_64Xor", i, j), i, j, true) + } } } - f, err := os.Create("../galois_gen_switch_amd64.go") + name := "../galois_gen_switch_amd64.go" + tag := "// +build !nopshufb\n" + if !pshufb { + name = "../galois_gen_switch_nopshufb_amd64.go" + tag = "// +build nopshufb\n" + } + f, err := os.Create(name) if err != nil { panic(err) } @@ -79,7 +99,8 @@ func main() { // +build !appengine // +build !noasm // +build gc -// +build !nogen +// +build !nogen +` + tag + ` package reedsolomon @@ -88,7 +109,6 @@ import ( ) `) - w.WriteString(fmt.Sprintf(`const ( avx2CodeGen = true maxAvx2Inputs = %d @@ -96,24 +116,31 @@ maxAvx2Outputs = %d minAvx2Size = %d avxSizeMask = maxInt - (minAvx2Size-1) )`, inputMax, outputMax, perLoop)) - w.WriteString(` + + if !pshufb { + w.WriteString("\n\nfunc galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { panic(`no pshufb`)}\n") + w.WriteString("func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int { panic(`no pshufb`)}\n") + } + + if pshufb { + w.WriteString(` func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { n := (stop-start) & avxSizeMask `) - w.WriteString(`switch len(in) { + w.WriteString(`switch len(in) { `) - for in, defs := range switchDefs[:] { - w.WriteString(fmt.Sprintf(" case %d:\n switch len(out) {\n", in+1)) - for out, def := range defs[:] { - w.WriteString(fmt.Sprintf(" case %d:\n", out+1)) - w.WriteString(def) + for in, defs := range switchDefs[:] { + w.WriteString(fmt.Sprintf(" case %d:\n switch len(out) {\n", in+1)) + for out, def := range defs[:] { + w.WriteString(fmt.Sprintf(" case %d:\n", out+1)) + w.WriteString(def) + } + w.WriteString("}\n") } - w.WriteString("}\n") - } - w.WriteString(`} + w.WriteString(`} panic(fmt.Sprintf("unhandled 
size: %dx%d", len(in), len(out))) } @@ -122,20 +149,21 @@ func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int { `) - w.WriteString(`switch len(in) { + w.WriteString(`switch len(in) { `) - for in, defs := range switchDefsX[:] { - w.WriteString(fmt.Sprintf(" case %d:\n switch len(out) {\n", in+1)) - for out, def := range defs[:] { - w.WriteString(fmt.Sprintf(" case %d:\n", out+1)) - w.WriteString(def) + for in, defs := range switchDefsX[:] { + w.WriteString(fmt.Sprintf(" case %d:\n switch len(out) {\n", in+1)) + for out, def := range defs[:] { + w.WriteString(fmt.Sprintf(" case %d:\n", out+1)) + w.WriteString(def) + } + w.WriteString("}\n") } - w.WriteString("}\n") - } - w.WriteString(`} + w.WriteString(`} panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) } `) + } w.WriteString(` @@ -930,3 +958,98 @@ func genMulAvx512GFNI(name string, inputs int, outputs int, xor bool) { Label(name + "_end") RET() } + +func genXor() { + // SSE 2 + { + Comment("sSE2XorSlice will XOR in with out and store in out.") + Comment("Processes 16 bytes/loop.") + TEXT("sSE2XorSlice", 0, fmt.Sprintf("func(in, out []byte)")) + Pragma("noescape") + src := Load(Param("in").Base(), GP64()) + dst := Load(Param("out").Base(), GP64()) + length := Load(Param("in").Len(), GP64()) + SHRQ(U8(4), length) + srcX, dstX := XMM(), XMM() + JZ(LabelRef("end")) + Label("loop") + MOVOU(Mem{Base: src}, srcX) + MOVOU(Mem{Base: dst}, dstX) + PXOR(srcX, dstX) + MOVOU(dstX, Mem{Base: dst}) + ADDQ(U8(16), src) + ADDQ(U8(16), dst) + DECQ(length) + JNZ(LabelRef("loop")) + Label("end") + RET() + } + + // SSE2 64 bytes + { + Comment("sSE2XorSlice_64 will XOR in with out and store in out.") + Comment("Processes 64 bytes/loop.") + TEXT("sSE2XorSlice_64", 0, fmt.Sprintf("func(in, out []byte)")) + Pragma("noescape") + src := Load(Param("in").Base(), GP64()) + dst := Load(Param("out").Base(), GP64()) + length := Load(Param("in").Len(), GP64()) + SHRQ(U8(6), length) + var srcX, dstX [4]reg.VecVirtual + for i := range srcX { + srcX[i], dstX[i] = XMM(), XMM() + } + JZ(LabelRef("end")) + Label("loop") + for i := range srcX { + MOVOU(Mem{Base: src, Disp: 16 * i}, srcX[i]) + } + for i := range srcX { + MOVOU(Mem{Base: dst, Disp: 16 * i}, dstX[i]) + } + for i := range srcX { + PXOR(srcX[i], dstX[i]) + } + for i := range srcX { + MOVOU(dstX[i], Mem{Base: dst, Disp: 16 * i}) + } + ADDQ(U8(64), src) + ADDQ(U8(64), dst) + DECQ(length) + JNZ(LabelRef("loop")) + Label("end") + RET() + } + //AVX 2 + { + Comment("avx2XorSlice_64 will XOR in with out and store in out.") + Comment("Processes 64 bytes/loop.") + TEXT("avx2XorSlice_64", 0, fmt.Sprintf("func(in, out []byte)")) + Pragma("noescape") + src := Load(Param("in").Base(), GP64()) + dst := Load(Param("out").Base(), GP64()) + length := Load(Param("in").Len(), GP64()) + SHRQ(U8(6), length) + srcX, dstX := YMM(), YMM() + srcX2, dstX2 := YMM(), YMM() + JZ(LabelRef("end")) + + Label("loop") + VMOVDQU(Mem{Base: src}, srcX) + VMOVDQU(Mem{Base: src, Disp: 32}, srcX2) + VMOVDQU(Mem{Base: dst}, dstX) + VMOVDQU(Mem{Base: dst, Disp: 32}, dstX2) + VPXOR(srcX, dstX, dstX) + VPXOR(srcX2, dstX2, dstX2) + VMOVDQU(dstX, Mem{Base: dst}) + VMOVDQU(dstX2, Mem{Base: dst, Disp: 32}) + ADDQ(U8(64), src) + ADDQ(U8(64), dst) + DECQ(length) + JNZ(LabelRef("loop")) + + Label("end") + VZEROUPPER() + RET() + } +} diff --git a/_gen/gf16.go b/_gen/gf16.go index 1cbea53b..1dbdabac 100644 --- a/_gen/gf16.go +++ b/_gen/gf16.go @@ -77,7 +77,7 @@ func genGF16() { var ctx gf16ctx // Ported from static void 
IFFT_DIT2 // https://github.com/catid/leopard/blob/master/LeopardFF16.cpp#L629 - { + if pshufb { TEXT("ifftDIT2_avx2", attr.NOSPLIT, fmt.Sprintf("func(x, y []byte, table *[8*16]uint8)")) Pragma("noescape") tablePtr := Load(Param("table"), GP64()) @@ -120,7 +120,7 @@ func genGF16() { VZEROUPPER() RET() } - { + if pshufb { TEXT("fftDIT2_avx2", attr.NOSPLIT, fmt.Sprintf("func(x, y []byte, table *[8*16]uint8)")) Pragma("noescape") tablePtr := Load(Param("table"), GP64()) @@ -173,7 +173,7 @@ func genGF16() { RET() } - { + if pshufb { TEXT("mulgf16_avx2", attr.NOSPLIT, fmt.Sprintf("func(x, y []byte, table *[8*16]uint8)")) Pragma("noescape") tablePtr := Load(Param("table"), GP64()) @@ -213,6 +213,9 @@ func genGF16() { RET() } for _, avx512 := range []bool{true, false} { + if !pshufb { + continue + } x := [8]int{} for skipMask := range x[:] { // AVX-512 only uses more registers for tables. @@ -562,7 +565,7 @@ func genGF16() { // SSSE3: ctx.avx512 = false - { + if pshufb { TEXT("ifftDIT2_ssse3", attr.NOSPLIT, fmt.Sprintf("func(x, y []byte, table *[8*16]uint8)")) Pragma("noescape") tablePtr := Load(Param("table"), GP64()) @@ -613,7 +616,7 @@ func genGF16() { RET() } - { + if pshufb { TEXT("fftDIT2_ssse3", attr.NOSPLIT, fmt.Sprintf("func(x, y []byte, table *[8*16]uint8)")) Pragma("noescape") tablePtr := Load(Param("table"), GP64()) @@ -671,7 +674,7 @@ func genGF16() { RET() } - { + if pshufb { TEXT("mulgf16_ssse3", attr.NOSPLIT, fmt.Sprintf("func(x, y []byte, table *[8*16]uint8)")) Pragma("noescape") tablePtr := Load(Param("table"), GP64()) diff --git a/_gen/gf8.go b/_gen/gf8.go index d177a55e..d774749d 100644 --- a/_gen/gf8.go +++ b/_gen/gf8.go @@ -23,9 +23,10 @@ func genGF8() { var ctx gf8ctx // Ported from static void IFFT_DIT2 // https://github.com/catid/leopard/blob/master/LeopardFF8.cpp#L599 - if true { + if pshufb { TEXT("ifftDIT28_avx2", attr.NOSPLIT, fmt.Sprintf("func(x, y []byte, table *[2*16]uint8)")) Pragma("noescape") + tablePtr := Load(Param("table"), GP64()) var tables table256 tables.Lo, tables.Hi = YMM(), YMM() @@ -72,7 +73,7 @@ func genGF8() { RET() } // https://github.com/catid/leopard/blob/master/LeopardFF8.cpp#L1323 - if true { + if pshufb { TEXT("fftDIT28_avx2", attr.NOSPLIT, fmt.Sprintf("func(x, y []byte, table *[2*16]uint8)")) Pragma("noescape") tablePtr := Load(Param("table"), GP64()) @@ -119,6 +120,9 @@ func genGF8() { x := [8]int{} for skipMask := range x[:] { + if !pshufb { + break + } { var suffix = "avx2_" + fmt.Sprint(skipMask) TEXT("ifftDIT48_"+suffix, attr.NOSPLIT, fmt.Sprintf("func(work [][]byte, dist int, t01, t23, t02 *[2*16]uint8)")) diff --git a/_gen/go.mod b/_gen/go.mod index 51857275..f6d1dca4 100644 --- a/_gen/go.mod +++ b/_gen/go.mod @@ -1,8 +1,14 @@ module github.com/klauspost/reedsolomon/_gen -go 1.16 +go 1.19 require ( github.com/klauspost/asmfmt v1.3.1 github.com/mmcloughlin/avo v0.5.1-0.20221128045730-bf1d05562091 ) + +require ( + golang.org/x/mod v0.6.0 // indirect + golang.org/x/sys v0.1.0 // indirect + golang.org/x/tools v0.2.0 // indirect +) diff --git a/_gen/nopshufb.go b/_gen/nopshufb.go new file mode 100644 index 00000000..ff847ce2 --- /dev/null +++ b/_gen/nopshufb.go @@ -0,0 +1,7 @@ +//go:build nopshufb + +// Copyright 2023+, Klaus Post. See LICENSE for details. + +package main + +const pshufb = false diff --git a/_gen/pshufb.go b/_gen/pshufb.go new file mode 100644 index 00000000..1ec4eb3c --- /dev/null +++ b/_gen/pshufb.go @@ -0,0 +1,7 @@ +//go:build !nopshufb + +// Copyright 2023+, Klaus Post. See LICENSE for details. 
+ +package main + +const pshufb = true diff --git a/galois_amd64.go b/galois_amd64.go index 9f84276b..c7ab3663 100644 --- a/galois_amd64.go +++ b/galois_amd64.go @@ -1,10 +1,11 @@ -//go:build !noasm && !appengine && !gccgo -// +build !noasm,!appengine,!gccgo +//go:build !noasm && !appengine && !gccgo && !nopshufb // Copyright 2015, Klaus Post, see LICENSE for details. package reedsolomon +const pshufb = true + //go:noescape func galMulSSSE3(low, high, in, out []byte) @@ -17,21 +18,12 @@ func galMulAVX2Xor(low, high, in, out []byte) //go:noescape func galMulAVX2(low, high, in, out []byte) -//go:noescape -func sSE2XorSlice(in, out []byte) - //go:noescape func galMulAVX2Xor_64(low, high, in, out []byte) //go:noescape func galMulAVX2_64(low, high, in, out []byte) -//go:noescape -func sSE2XorSlice_64(in, out []byte) - -//go:noescape -func avx2XorSlice_64(in, out []byte) - // This is what the assembler routines do in blocks of 16 bytes: /* func galMulSSSE3(low, high, in, out []byte) { diff --git a/galois_amd64.s b/galois_amd64.s index 3e97c7c1..18e08c31 100644 --- a/galois_amd64.s +++ b/galois_amd64.s @@ -1,6 +1,7 @@ //+build !noasm //+build !appengine //+build !gccgo +//+build !nopshufb // Copyright 2015, Klaus Post, see LICENSE for details. @@ -215,28 +216,6 @@ done_avx2: VZEROUPPER RET -// func sSE2XorSlice(in, out []byte) -TEXT ·sSE2XorSlice(SB), 7, $0 - MOVQ in+0(FP), SI // SI: &in - MOVQ in_len+8(FP), R9 // R9: len(in) - MOVQ out+24(FP), DX // DX: &out - SHRQ $4, R9 // len(in) / 16 - CMPQ R9, $0 - JEQ done_xor_sse2 - -loopback_xor_sse2: - MOVOU (SI), X0 // in[x] - MOVOU (DX), X1 // out[x] - PXOR X0, X1 - MOVOU X1, (DX) - ADDQ $16, SI // in+=16 - ADDQ $16, DX // out+=16 - SUBQ $1, R9 - JNZ loopback_xor_sse2 - -done_xor_sse2: - RET - // func galMulAVX2Xor_64(low, high, in, out []byte) TEXT ·galMulAVX2Xor_64(SB), 7, $0 MOVQ low+0(FP), SI // SI: &low @@ -329,66 +308,3 @@ loopback_avx2_64: done_avx2_64: VZEROUPPER RET - -// func sSE2XorSlice_64(in, out []byte) -TEXT ·sSE2XorSlice_64(SB), 7, $0 - MOVQ in+0(FP), SI // SI: &in - MOVQ in_len+8(FP), R9 // R9: len(in) - MOVQ out+24(FP), DX // DX: &out - SHRQ $6, R9 // len(in) / 64 - CMPQ R9, $0 - JEQ done_xor_sse2_64 - -loopback_xor_sse2_64: - MOVOU (SI), X0 // in[x] - MOVOU 16(SI), X2 // in[x] - MOVOU 32(SI), X4 // in[x] - MOVOU 48(SI), X6 // in[x] - MOVOU (DX), X1 // out[x] - MOVOU 16(DX), X3 // out[x] - MOVOU 32(DX), X5 // out[x] - MOVOU 48(DX), X7 // out[x] - PXOR X0, X1 - PXOR X2, X3 - PXOR X4, X5 - PXOR X6, X7 - MOVOU X1, (DX) - MOVOU X3, 16(DX) - MOVOU X5, 32(DX) - MOVOU X7, 48(DX) - ADDQ $64, SI // in+=64 - ADDQ $64, DX // out+=64 - SUBQ $1, R9 - JNZ loopback_xor_sse2_64 - -done_xor_sse2_64: - RET - -// func avx2XorSlice_64(in, out []byte) -TEXT ·avx2XorSlice_64(SB), 7, $0 - MOVQ in+0(FP), SI // SI: &in - MOVQ in_len+8(FP), R9 // R9: len(in) - MOVQ out+24(FP), DX // DX: &out - SHRQ $6, R9 // len(in) / 64 - CMPQ R9, $0 - JEQ done_xor_avx2_64 - -loopback_xor_avx2_64: - VMOVDQU (SI), Y0 - VMOVDQU 32(SI), Y2 - VMOVDQU (DX), Y1 - VMOVDQU 32(DX), Y3 - VPXOR Y0, Y1, Y1 - VPXOR Y2, Y3, Y3 - VMOVDQU Y1, (DX) - VMOVDQU Y3, 32(DX) - - ADDQ $64, SI // in+=64 - ADDQ $64, DX // out+=64 - SUBQ $1, R9 - JNZ loopback_xor_avx2_64 - VZEROUPPER - -done_xor_avx2_64: - - RET diff --git a/galois_arm64.go b/galois_arm64.go index 9ab27941..8ef402bf 100644 --- a/galois_arm64.go +++ b/galois_arm64.go @@ -1,20 +1,18 @@ -//go:build !noasm && !appengine && !gccgo -// +build !noasm,!appengine,!gccgo +//go:build !noasm && !appengine && !gccgo && !nopshufb // Copyright 2015, 
Klaus Post, see LICENSE for details. // Copyright 2017, Minio, Inc. package reedsolomon +const pshufb = true + //go:noescape func galMulNEON(low, high, in, out []byte) //go:noescape func galMulXorNEON(low, high, in, out []byte) -//go:noescape -func galXorNEON(in, out []byte) - func galMulSlice(c byte, in, out []byte, o *options) { if c == 1 { copy(out, in) @@ -51,20 +49,6 @@ func galMulSliceXor(c byte, in, out []byte, o *options) { } } -// simple slice xor -func sliceXor(in, out []byte, o *options) { - - galXorNEON(in, out) - done := (len(in) >> 5) << 5 - - remain := len(in) - done - if remain > 0 { - for i := done; i < len(in); i++ { - out[i] ^= in[i] - } - } -} - // 4-way butterfly func ifftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) { ifftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o) @@ -90,7 +74,7 @@ func fftDIT2(x, y []byte, log_m ffe, o *options) { // Reference version: refMulAdd(x, y, log_m) // 64 byte aligned, always full. - galXorNEON(x, y) + xorSliceNEON(x, y) } // 2-way butterfly forward @@ -103,7 +87,7 @@ func fftDIT28(x, y []byte, log_m ffe8, o *options) { // 2-way butterfly func ifftDIT2(x, y []byte, log_m ffe, o *options) { // 64 byte aligned, always full. - galXorNEON(x, y) + xorSliceNEON(x, y) // Reference version: refMulAdd(x, y, log_m) } diff --git a/galois_arm64.s b/galois_arm64.s index 3ae32372..772dfac9 100644 --- a/galois_arm64.s +++ b/galois_arm64.s @@ -1,6 +1,7 @@ //+build !noasm //+build !appengine //+build !gccgo +//+build !nopshufb // Copyright 2015, Klaus Post, see LICENSE for details. // Copyright 2017, Minio, Inc. @@ -99,29 +100,3 @@ loopXor: completeXor: RET - -// func galXorNEON(in, out []byte) -TEXT ·galXorNEON(SB), 7, $0 - MOVD in_base+0(FP), R1 - MOVD in_len+8(FP), R2 // length of message - MOVD out_base+24(FP), R5 - SUBS $32, R2 - BMI completeXor - -loopXor: - // Main loop - VLD1.P 32(R1), [V0.B16, V1.B16] - VLD1 (R5), [V20.B16, V21.B16] - - VEOR V20.B16, V0.B16, V4.B16 - VEOR V21.B16, V1.B16, V5.B16 - - // Store result - VST1.P [V4.D2, V5.D2], 32(R5) - - SUBS $32, R2 - BPL loopXor - -completeXor: - RET - diff --git a/galois_gen_amd64.go b/galois_gen_amd64.go index 5f53c3b4..43184349 100644 --- a/galois_gen_amd64.go +++ b/galois_gen_amd64.go @@ -1,11 +1,20 @@ // Code generated by command: go run gen.go -out ../galois_gen_amd64.s -stubs ../galois_gen_amd64.go -pkg=reedsolomon. DO NOT EDIT. -//go:build !appengine && !noasm && !nogen && gc +//go:build !appengine && !noasm && !nogen && !nopshufb && gc package reedsolomon func _dummy_() +//go:noescape +func sSE2XorSlice(in []byte, out []byte) + +//go:noescape +func sSE2XorSlice_64(in []byte, out []byte) + +//go:noescape +func avx2XorSlice_64(in []byte, out []byte) + // mulAvxTwo_1x1 takes 1 inputs and produces 1 outputs. // The output is initialized to 0. // diff --git a/galois_gen_amd64.s b/galois_gen_amd64.s index 3a2acace..b3d0d998 100644 --- a/galois_gen_amd64.s +++ b/galois_gen_amd64.s @@ -1,6 +1,6 @@ // Code generated by command: go run gen.go -out ../galois_gen_amd64.s -stubs ../galois_gen_amd64.go -pkg=reedsolomon. DO NOT EDIT. -//go:build !appengine && !noasm && !nogen && gc +//go:build !appengine && !noasm && !nogen && !nopshufb && gc #include "textflag.h" @@ -18,6 +18,98 @@ TEXT ·_dummy_(SB), $0 #endif RET +// sSE2XorSlice will XOR in with out and store in out. +// Processes 16 bytes/loop. 
+ +// func sSE2XorSlice(in []byte, out []byte) +// Requires: SSE2 +TEXT ·sSE2XorSlice(SB), $0-48 + MOVQ in_base+0(FP), AX + MOVQ out_base+24(FP), CX + MOVQ in_len+8(FP), DX + SHRQ $0x04, DX + JZ end + +loop: + MOVOU (AX), X0 + MOVOU (CX), X1 + PXOR X0, X1 + MOVOU X1, (CX) + ADDQ $0x10, AX + ADDQ $0x10, CX + DECQ DX + JNZ loop + +end: + RET + +// sSE2XorSlice_64 will XOR in with out and store in out. +// Processes 64 bytes/loop. + +// func sSE2XorSlice_64(in []byte, out []byte) +// Requires: SSE2 +TEXT ·sSE2XorSlice_64(SB), $0-48 + MOVQ in_base+0(FP), AX + MOVQ out_base+24(FP), CX + MOVQ in_len+8(FP), DX + SHRQ $0x06, DX + JZ end + +loop: + MOVOU (AX), X0 + MOVOU 16(AX), X2 + MOVOU 32(AX), X4 + MOVOU 48(AX), X6 + MOVOU (CX), X1 + MOVOU 16(CX), X3 + MOVOU 32(CX), X5 + MOVOU 48(CX), X7 + PXOR X0, X1 + PXOR X2, X3 + PXOR X4, X5 + PXOR X6, X7 + MOVOU X1, (CX) + MOVOU X3, 16(CX) + MOVOU X5, 32(CX) + MOVOU X7, 48(CX) + ADDQ $0x40, AX + ADDQ $0x40, CX + DECQ DX + JNZ loop + +end: + RET + +// avx2XorSlice_64 will XOR in with out and store in out. +// Processes 64 bytes/loop. + +// func avx2XorSlice_64(in []byte, out []byte) +// Requires: AVX, AVX2 +TEXT ·avx2XorSlice_64(SB), $0-48 + MOVQ in_base+0(FP), AX + MOVQ out_base+24(FP), CX + MOVQ in_len+8(FP), DX + SHRQ $0x06, DX + JZ end + +loop: + VMOVDQU (AX), Y0 + VMOVDQU 32(AX), Y2 + VMOVDQU (CX), Y1 + VMOVDQU 32(CX), Y3 + VPXOR Y0, Y1, Y1 + VPXOR Y2, Y3, Y3 + VMOVDQU Y1, (CX) + VMOVDQU Y3, 32(CX) + ADDQ $0x40, AX + ADDQ $0x40, CX + DECQ DX + JNZ loop + +end: + VZEROUPPER + RET + // func mulAvxTwo_1x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x1(SB), NOSPLIT, $0-88 diff --git a/galois_gen_none.go b/galois_gen_none.go index 11929e68..02c3cf49 100644 --- a/galois_gen_none.go +++ b/galois_gen_none.go @@ -1,5 +1,4 @@ //go:build !amd64 || noasm || appengine || gccgo || nogen -// +build !amd64 noasm appengine gccgo nogen package reedsolomon diff --git a/galois_gen_nopshufb_amd64.go b/galois_gen_nopshufb_amd64.go new file mode 100644 index 00000000..b07f3f34 --- /dev/null +++ b/galois_gen_nopshufb_amd64.go @@ -0,0 +1,1164 @@ +// Code generated by command: go run gen.go -out ../galois_gen_nopshufb_amd64.s -stubs ../galois_gen_nopshufb_amd64.go -pkg=reedsolomon. DO NOT EDIT. + +//go:build !appengine && !noasm && !nogen && nopshufb && gc + +package reedsolomon + +func _dummy_() + +//go:noescape +func sSE2XorSlice(in []byte, out []byte) + +//go:noescape +func sSE2XorSlice_64(in []byte, out []byte) + +//go:noescape +func avx2XorSlice_64(in []byte, out []byte) + +// mulGFNI_1x1_64 takes 1 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x1_64Xor takes 1 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_1x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x2_64 takes 1 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x2_64Xor takes 1 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_1x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x3_64 takes 1 inputs and produces 3 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulGFNI_1x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x3_64Xor takes 1 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_1x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x4_64 takes 1 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x4_64Xor takes 1 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_1x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x5_64 takes 1 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x5_64Xor takes 1 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_1x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x6_64 takes 1 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x6_64Xor takes 1 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_1x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x7_64 takes 1 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x7_64Xor takes 1 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_1x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x8_64 takes 1 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x8_64Xor takes 1 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_1x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x9_64 takes 1 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x9_64Xor takes 1 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_1x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x10_64 takes 1 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x10_64Xor takes 1 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_1x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x1_64 takes 2 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x1_64Xor takes 2 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_2x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x2_64 takes 2 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x2_64Xor takes 2 inputs and produces 2 outputs. 
+// +//go:noescape +func mulGFNI_2x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x3_64 takes 2 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x3_64Xor takes 2 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_2x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x4_64 takes 2 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x4_64Xor takes 2 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_2x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x5_64 takes 2 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x5_64Xor takes 2 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_2x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x6_64 takes 2 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x6_64Xor takes 2 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_2x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x7_64 takes 2 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x7_64Xor takes 2 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_2x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x8_64 takes 2 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x8_64Xor takes 2 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_2x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x9_64 takes 2 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x9_64Xor takes 2 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_2x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x10_64 takes 2 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x10_64Xor takes 2 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_2x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x1_64 takes 3 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x1_64Xor takes 3 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_3x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x2_64 takes 3 inputs and produces 2 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulGFNI_3x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x2_64Xor takes 3 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_3x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x3_64 takes 3 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x3_64Xor takes 3 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_3x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x4_64 takes 3 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x4_64Xor takes 3 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_3x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x5_64 takes 3 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x5_64Xor takes 3 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_3x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x6_64 takes 3 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x6_64Xor takes 3 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_3x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x7_64 takes 3 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x7_64Xor takes 3 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_3x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x8_64 takes 3 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x8_64Xor takes 3 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_3x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x9_64 takes 3 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x9_64Xor takes 3 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_3x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x10_64 takes 3 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x10_64Xor takes 3 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_3x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x1_64 takes 4 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x1_64Xor takes 4 inputs and produces 1 outputs. 
+// +//go:noescape +func mulGFNI_4x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x2_64 takes 4 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x2_64Xor takes 4 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_4x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x3_64 takes 4 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x3_64Xor takes 4 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_4x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x4_64 takes 4 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x4_64Xor takes 4 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_4x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x5_64 takes 4 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x5_64Xor takes 4 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_4x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x6_64 takes 4 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x6_64Xor takes 4 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_4x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x7_64 takes 4 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x7_64Xor takes 4 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_4x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x8_64 takes 4 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x8_64Xor takes 4 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_4x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x9_64 takes 4 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x9_64Xor takes 4 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_4x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x10_64 takes 4 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x10_64Xor takes 4 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_4x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x1_64 takes 5 inputs and produces 1 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulGFNI_5x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x1_64Xor takes 5 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_5x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x2_64 takes 5 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x2_64Xor takes 5 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_5x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x3_64 takes 5 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x3_64Xor takes 5 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_5x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x4_64 takes 5 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x4_64Xor takes 5 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_5x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x5_64 takes 5 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x5_64Xor takes 5 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_5x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x6_64 takes 5 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x6_64Xor takes 5 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_5x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x7_64 takes 5 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x7_64Xor takes 5 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_5x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x8_64 takes 5 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x8_64Xor takes 5 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_5x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x9_64 takes 5 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x9_64Xor takes 5 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_5x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x10_64 takes 5 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x10_64Xor takes 5 inputs and produces 10 outputs. 
+// +//go:noescape +func mulGFNI_5x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x1_64 takes 6 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x1_64Xor takes 6 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_6x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x2_64 takes 6 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x2_64Xor takes 6 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_6x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x3_64 takes 6 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x3_64Xor takes 6 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_6x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x4_64 takes 6 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x4_64Xor takes 6 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_6x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x5_64 takes 6 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x5_64Xor takes 6 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_6x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x6_64 takes 6 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x6_64Xor takes 6 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_6x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x7_64 takes 6 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x7_64Xor takes 6 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_6x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x8_64 takes 6 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x8_64Xor takes 6 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_6x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x9_64 takes 6 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x9_64Xor takes 6 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_6x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x10_64 takes 6 inputs and produces 10 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulGFNI_6x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x10_64Xor takes 6 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_6x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x1_64 takes 7 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x1_64Xor takes 7 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_7x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x2_64 takes 7 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x2_64Xor takes 7 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_7x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x3_64 takes 7 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x3_64Xor takes 7 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_7x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x4_64 takes 7 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x4_64Xor takes 7 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_7x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x5_64 takes 7 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x5_64Xor takes 7 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_7x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x6_64 takes 7 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x6_64Xor takes 7 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_7x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x7_64 takes 7 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x7_64Xor takes 7 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_7x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x8_64 takes 7 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x8_64Xor takes 7 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_7x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x9_64 takes 7 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x9_64Xor takes 7 inputs and produces 9 outputs. 
+// +//go:noescape +func mulGFNI_7x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x10_64 takes 7 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x10_64Xor takes 7 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_7x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x1_64 takes 8 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x1_64Xor takes 8 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_8x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x2_64 takes 8 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x2_64Xor takes 8 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_8x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x3_64 takes 8 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x3_64Xor takes 8 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_8x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x4_64 takes 8 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x4_64Xor takes 8 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_8x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x5_64 takes 8 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x5_64Xor takes 8 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_8x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x6_64 takes 8 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x6_64Xor takes 8 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_8x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x7_64 takes 8 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x7_64Xor takes 8 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_8x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x8_64 takes 8 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x8_64Xor takes 8 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_8x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x9_64 takes 8 inputs and produces 9 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulGFNI_8x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x9_64Xor takes 8 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_8x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x10_64 takes 8 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x10_64Xor takes 8 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_8x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x1_64 takes 9 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x1_64Xor takes 9 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_9x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x2_64 takes 9 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x2_64Xor takes 9 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_9x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x3_64 takes 9 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x3_64Xor takes 9 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_9x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x4_64 takes 9 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x4_64Xor takes 9 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_9x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x5_64 takes 9 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x5_64Xor takes 9 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_9x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x6_64 takes 9 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x6_64Xor takes 9 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_9x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x7_64 takes 9 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x7_64Xor takes 9 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_9x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x8_64 takes 9 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x8_64Xor takes 9 inputs and produces 8 outputs. 
+// +//go:noescape +func mulGFNI_9x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x9_64 takes 9 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x9_64Xor takes 9 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_9x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x10_64 takes 9 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x10_64Xor takes 9 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_9x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x1_64 takes 10 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x1_64Xor takes 10 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_10x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x2_64 takes 10 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x2_64Xor takes 10 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_10x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x3_64 takes 10 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x3_64Xor takes 10 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_10x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x4_64 takes 10 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x4_64Xor takes 10 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_10x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x5_64 takes 10 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x5_64Xor takes 10 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_10x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x6_64 takes 10 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x6_64Xor takes 10 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_10x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x7_64 takes 10 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x7_64Xor takes 10 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_10x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x8_64 takes 10 inputs and produces 8 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulGFNI_10x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x8_64Xor takes 10 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_10x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x9_64 takes 10 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x9_64Xor takes 10 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_10x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x10_64 takes 10 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x10_64Xor takes 10 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_10x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func ifftDIT48_gfni_0(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func fftDIT48_gfni_0(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func ifftDIT48_gfni_1(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func fftDIT48_gfni_1(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func ifftDIT48_gfni_2(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func fftDIT48_gfni_2(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func ifftDIT48_gfni_3(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func fftDIT48_gfni_3(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func ifftDIT48_gfni_4(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func fftDIT48_gfni_4(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func ifftDIT48_gfni_5(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func fftDIT48_gfni_5(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func ifftDIT48_gfni_6(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func fftDIT48_gfni_6(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func ifftDIT48_gfni_7(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func fftDIT48_gfni_7(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) diff --git a/galois_gen_nopshufb_amd64.s b/galois_gen_nopshufb_amd64.s new file mode 100644 index 00000000..574dfe9b --- /dev/null +++ b/galois_gen_nopshufb_amd64.s @@ -0,0 +1,33101 @@ +// Code generated by command: go run gen.go -out ../galois_gen_nopshufb_amd64.s -stubs ../galois_gen_nopshufb_amd64.go -pkg=reedsolomon. DO NOT EDIT. + +//go:build !appengine && !noasm && !nogen && nopshufb && gc + +#include "textflag.h" + +// func _dummy_() +TEXT ·_dummy_(SB), $0 +#ifdef GOAMD64_v4 +#define XOR3WAY(ignore, a, b, dst) \ + VPTERNLOGD $0x96, a, b, dst + +#else +#define XOR3WAY(ignore, a, b, dst) \ + VPXOR a, dst, dst \ + VPXOR b, dst, dst + +#endif + RET + +// sSE2XorSlice will XOR in with out and store in out. +// Processes 16 bytes/loop. 
+ +// func sSE2XorSlice(in []byte, out []byte) +// Requires: SSE2 +TEXT ·sSE2XorSlice(SB), $0-48 + MOVQ in_base+0(FP), AX + MOVQ out_base+24(FP), CX + MOVQ in_len+8(FP), DX + SHRQ $0x04, DX + JZ end + +loop: + MOVOU (AX), X0 + MOVOU (CX), X1 + PXOR X0, X1 + MOVOU X1, (CX) + ADDQ $0x10, AX + ADDQ $0x10, CX + DECQ DX + JNZ loop + +end: + RET + +// sSE2XorSlice_64 will XOR in with out and store in out. +// Processes 64 bytes/loop. + +// func sSE2XorSlice_64(in []byte, out []byte) +// Requires: SSE2 +TEXT ·sSE2XorSlice_64(SB), $0-48 + MOVQ in_base+0(FP), AX + MOVQ out_base+24(FP), CX + MOVQ in_len+8(FP), DX + SHRQ $0x06, DX + JZ end + +loop: + MOVOU (AX), X0 + MOVOU 16(AX), X2 + MOVOU 32(AX), X4 + MOVOU 48(AX), X6 + MOVOU (CX), X1 + MOVOU 16(CX), X3 + MOVOU 32(CX), X5 + MOVOU 48(CX), X7 + PXOR X0, X1 + PXOR X2, X3 + PXOR X4, X5 + PXOR X6, X7 + MOVOU X1, (CX) + MOVOU X3, 16(CX) + MOVOU X5, 32(CX) + MOVOU X7, 48(CX) + ADDQ $0x40, AX + ADDQ $0x40, CX + DECQ DX + JNZ loop + +end: + RET + +// avx2XorSlice_64 will XOR in with out and store in out. +// Processes 64 bytes/loop. + +// func avx2XorSlice_64(in []byte, out []byte) +// Requires: AVX, AVX2 +TEXT ·avx2XorSlice_64(SB), $0-48 + MOVQ in_base+0(FP), AX + MOVQ out_base+24(FP), CX + MOVQ in_len+8(FP), DX + SHRQ $0x06, DX + JZ end + +loop: + VMOVDQU (AX), Y0 + VMOVDQU 32(AX), Y2 + VMOVDQU (CX), Y1 + VMOVDQU 32(CX), Y3 + VPXOR Y0, Y1, Y1 + VPXOR Y2, Y3, Y3 + VMOVDQU Y1, (CX) + VMOVDQU Y3, 32(CX) + ADDQ $0x40, AX + ADDQ $0x40, CX + DECQ DX + JNZ loop + +end: + VZEROUPPER + RET + +// func mulGFNI_1x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 4 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x1_64_end + VBROADCASTF32X2 (CX), Z0 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + MOVQ start+72(FP), BX + + // Add start offset to output + ADDQ BX, DX + + // Add start offset to input + ADDQ BX, CX + +mulGFNI_1x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (CX), Z1 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z1, Z1 + + // Store 1 outputs + VMOVDQU64 Z1, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x1_64_loop + VZEROUPPER + +mulGFNI_1x1_64_end: + RET + +// func mulGFNI_1x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 4 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + MOVQ start+72(FP), BX + + // Add start offset to output + ADDQ BX, DX + + // Add start offset to input + ADDQ BX, CX + +mulGFNI_1x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (DX), Z1 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (CX), Z2 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z2, Z2 + VXORPD Z1, Z2, Z1 + + // Store 1 outputs + VMOVDQU64 Z1, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x1_64Xor_loop + VZEROUPPER + +mulGFNI_1x1_64Xor_end: + RET + 
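[Editor's aside, not part of the generated files.] For orientation, the sketch below is a plain-Go approximation of what the helpers above compute: `sSE2XorSlice`/`avx2XorSlice_64` XOR one slice into another (16 or 64 bytes per assembly loop iteration), and each `mulGFNI_*_64` kernel applies one 8x8 bit-matrix per input/output pair (a `uint64` consumed by `VGF2P8AFFINEQB`) to every input byte, either overwriting the output or, in the `Xor` variants, XOR-accumulating into it. The bit ordering in `gf2p8Affine` follows the published GF2P8AFFINEQB pseudocode, and `mulGFNI1x1Ref` is a hypothetical name introduced here for illustration; treat the whole block as a hedged reference, not as the library's implementation.

```go
package main

import "fmt"

// xorSlice mirrors sSE2XorSlice / avx2XorSlice_64: out ^= in, byte by byte.
// Assumes len(out) >= len(in).
func xorSlice(in, out []byte) {
	for i := range in {
		out[i] ^= in[i]
	}
}

// gf2p8Affine applies one 8x8 bit-matrix, packed into a uint64, to a byte
// (GF2P8AFFINEQB with imm8 = 0): result bit b is the parity of
// (matrix byte 7-b AND x). Ordering per the instruction's pseudocode.
func gf2p8Affine(m uint64, x byte) byte {
	var out byte
	for b := uint(0); b < 8; b++ {
		v := byte(m>>(8*(7-b))) & x
		v ^= v >> 4 // fold down to the parity bit
		v ^= v >> 2
		v ^= v >> 1
		out |= (v & 1) << b
	}
	return out
}

// mulGFNI1x1Ref is a scalar stand-in for mulGFNI_1x1_64 (xorOut=false) and
// mulGFNI_1x1_64Xor (xorOut=true): one input shard, one output shard,
// n bytes starting at offset start. The real kernels do the same work
// 64 bytes at a time in ZMM registers.
func mulGFNI1x1Ref(matrix []uint64, in, out [][]byte, start, n int, xorOut bool) {
	for i := start; i < start+n; i++ {
		v := gf2p8Affine(matrix[0], in[0][i])
		if xorOut {
			out[0][i] ^= v
		} else {
			out[0][i] = v
		}
	}
}

func main() {
	in := [][]byte{{1, 2, 3, 4}}
	out := [][]byte{make([]byte, 4)}
	// 0x0102040810204080 is the identity matrix for this bit ordering.
	mulGFNI1x1Ref([]uint64{0x0102040810204080}, in, out, 0, 4, false)
	fmt.Println(out[0]) // expected: [1 2 3 4]
}
```

The multi-output kernels that follow (`mulGFNI_1x2_64`, `mulGFNI_2x2_64`, and so on) repeat this same pattern with one table per input/output pair, XOR-accumulating the contributions of each additional input shard.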
+// func mulGFNI_1x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + ADDQ SI, DX + + // Add start offset to input + ADDQ SI, CX + +mulGFNI_1x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (CX), Z3 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z3, Z2 + VGF2P8AFFINEQB $0x00, Z1, Z3, Z3 + + // Store 2 outputs + VMOVDQU64 Z2, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z3, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x2_64_loop + VZEROUPPER + +mulGFNI_1x2_64_end: + RET + +// func mulGFNI_1x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + ADDQ SI, DX + + // Add start offset to input + ADDQ SI, CX + +mulGFNI_1x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (BX), Z2 + VMOVDQU64 (DX), Z3 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (CX), Z4 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z4, Z5 + VXORPD Z2, Z5, Z2 + VGF2P8AFFINEQB $0x00, Z1, Z4, Z5 + VXORPD Z3, Z5, Z3 + + // Store 2 outputs + VMOVDQU64 Z2, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z3, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x2_64Xor_loop + VZEROUPPER + +mulGFNI_1x2_64Xor_end: + RET + +// func mulGFNI_1x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, BX + ADDQ DI, SI + ADDQ DI, DX + + // Add start offset to input + ADDQ DI, CX + +mulGFNI_1x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (CX), Z5 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z5, Z3 + VGF2P8AFFINEQB $0x00, Z1, Z5, Z4 + VGF2P8AFFINEQB $0x00, Z2, Z5, Z5 + + // Store 3 outputs + VMOVDQU64 Z3, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z4, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z5, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x3_64_loop + VZEROUPPER + +mulGFNI_1x3_64_end: + RET + +// 
func mulGFNI_1x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, BX + ADDQ DI, SI + ADDQ DI, DX + + // Add start offset to input + ADDQ DI, CX + +mulGFNI_1x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (BX), Z3 + VMOVDQU64 (SI), Z4 + VMOVDQU64 (DX), Z5 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (CX), Z6 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z6, Z7 + VXORPD Z3, Z7, Z3 + VGF2P8AFFINEQB $0x00, Z1, Z6, Z7 + VXORPD Z4, Z7, Z4 + VGF2P8AFFINEQB $0x00, Z2, Z6, Z7 + VXORPD Z5, Z7, Z5 + + // Store 3 outputs + VMOVDQU64 Z3, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z4, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z5, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x3_64Xor_loop + VZEROUPPER + +mulGFNI_1x3_64Xor_end: + RET + +// func mulGFNI_1x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x4_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, DX + + // Add start offset to input + ADDQ R8, CX + +mulGFNI_1x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (CX), Z7 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z7, Z4 + VGF2P8AFFINEQB $0x00, Z1, Z7, Z5 + VGF2P8AFFINEQB $0x00, Z2, Z7, Z6 + VGF2P8AFFINEQB $0x00, Z3, Z7, Z7 + + // Store 4 outputs + VMOVDQU64 Z4, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z5, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z6, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z7, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x4_64_loop + VZEROUPPER + +mulGFNI_1x4_64_end: + RET + +// func mulGFNI_1x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x4_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, DX + + // Add 
start offset to input + ADDQ R8, CX + +mulGFNI_1x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (BX), Z4 + VMOVDQU64 (SI), Z5 + VMOVDQU64 (DI), Z6 + VMOVDQU64 (DX), Z7 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (CX), Z8 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z8, Z9 + VXORPD Z4, Z9, Z4 + VGF2P8AFFINEQB $0x00, Z1, Z8, Z9 + VXORPD Z5, Z9, Z5 + VGF2P8AFFINEQB $0x00, Z2, Z8, Z9 + VXORPD Z6, Z9, Z6 + VGF2P8AFFINEQB $0x00, Z3, Z8, Z9 + VXORPD Z7, Z9, Z7 + + // Store 4 outputs + VMOVDQU64 Z4, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z5, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z6, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z7, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x4_64Xor_loop + VZEROUPPER + +mulGFNI_1x4_64Xor_end: + RET + +// func mulGFNI_1x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x5_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, DX + + // Add start offset to input + ADDQ R9, CX + +mulGFNI_1x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (CX), Z9 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z9, Z5 + VGF2P8AFFINEQB $0x00, Z1, Z9, Z6 + VGF2P8AFFINEQB $0x00, Z2, Z9, Z7 + VGF2P8AFFINEQB $0x00, Z3, Z9, Z8 + VGF2P8AFFINEQB $0x00, Z4, Z9, Z9 + + // Store 5 outputs + VMOVDQU64 Z5, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z6, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z7, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z8, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z9, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x5_64_loop + VZEROUPPER + +mulGFNI_1x5_64_end: + RET + +// func mulGFNI_1x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x5_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, DX + + // Add start offset to input + ADDQ R9, CX + +mulGFNI_1x5_64Xor_loop: + // Load 5 outputs + VMOVDQU64 (BX), Z5 + VMOVDQU64 (SI), Z6 + VMOVDQU64 (DI), Z7 + VMOVDQU64 (R8), Z8 + VMOVDQU64 (DX), Z9 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (CX), Z10 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z10, Z11 + VXORPD Z5, Z11, Z5 + VGF2P8AFFINEQB $0x00, Z1, Z10, Z11 + VXORPD Z6, Z11, Z6 + VGF2P8AFFINEQB 
$0x00, Z2, Z10, Z11 + VXORPD Z7, Z11, Z7 + VGF2P8AFFINEQB $0x00, Z3, Z10, Z11 + VXORPD Z8, Z11, Z8 + VGF2P8AFFINEQB $0x00, Z4, Z10, Z11 + VXORPD Z9, Z11, Z9 + + // Store 5 outputs + VMOVDQU64 Z5, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z6, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z7, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z8, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z9, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x5_64Xor_loop + VZEROUPPER + +mulGFNI_1x5_64Xor_end: + RET + +// func mulGFNI_1x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x6_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DX + + // Add start offset to input + ADDQ R10, CX + +mulGFNI_1x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (CX), Z11 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z11, Z6 + VGF2P8AFFINEQB $0x00, Z1, Z11, Z7 + VGF2P8AFFINEQB $0x00, Z2, Z11, Z8 + VGF2P8AFFINEQB $0x00, Z3, Z11, Z9 + VGF2P8AFFINEQB $0x00, Z4, Z11, Z10 + VGF2P8AFFINEQB $0x00, Z5, Z11, Z11 + + // Store 6 outputs + VMOVDQU64 Z6, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z7, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z8, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z9, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z10, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z11, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x6_64_loop + VZEROUPPER + +mulGFNI_1x6_64_end: + RET + +// func mulGFNI_1x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x6_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DX + + // Add start offset to input + ADDQ R10, CX + +mulGFNI_1x6_64Xor_loop: + // Load 6 outputs + VMOVDQU64 (BX), Z6 + VMOVDQU64 (SI), Z7 + VMOVDQU64 (DI), Z8 + VMOVDQU64 (R8), Z9 + VMOVDQU64 (R9), Z10 + VMOVDQU64 (DX), Z11 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (CX), Z12 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z12, Z13 + VXORPD Z6, Z13, Z6 + VGF2P8AFFINEQB $0x00, Z1, Z12, Z13 + VXORPD Z7, Z13, Z7 + VGF2P8AFFINEQB $0x00, Z2, Z12, Z13 + VXORPD Z8, Z13, Z8 + 
VGF2P8AFFINEQB $0x00, Z3, Z12, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z4, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z5, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Store 6 outputs + VMOVDQU64 Z6, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z7, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z8, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z9, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z10, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z11, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x6_64Xor_loop + VZEROUPPER + +mulGFNI_1x6_64Xor_end: + RET + +// func mulGFNI_1x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x7_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DX + + // Add start offset to input + ADDQ R11, CX + +mulGFNI_1x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (CX), Z13 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z13, Z7 + VGF2P8AFFINEQB $0x00, Z1, Z13, Z8 + VGF2P8AFFINEQB $0x00, Z2, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z3, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z4, Z13, Z11 + VGF2P8AFFINEQB $0x00, Z5, Z13, Z12 + VGF2P8AFFINEQB $0x00, Z6, Z13, Z13 + + // Store 7 outputs + VMOVDQU64 Z7, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z8, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z9, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z10, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z11, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z12, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z13, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x7_64_loop + VZEROUPPER + +mulGFNI_1x7_64_end: + RET + +// func mulGFNI_1x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x7_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DX + + // Add start offset to input + ADDQ R11, CX + +mulGFNI_1x7_64Xor_loop: + // Load 7 outputs + VMOVDQU64 (BX), Z7 + VMOVDQU64 (SI), Z8 + VMOVDQU64 (DI), Z9 + VMOVDQU64 (R8), Z10 + VMOVDQU64 (R9), Z11 + VMOVDQU64 (R10), Z12 + 
VMOVDQU64 (DX), Z13 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (CX), Z14 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z14, Z15 + VXORPD Z7, Z15, Z7 + VGF2P8AFFINEQB $0x00, Z1, Z14, Z15 + VXORPD Z8, Z15, Z8 + VGF2P8AFFINEQB $0x00, Z2, Z14, Z15 + VXORPD Z9, Z15, Z9 + VGF2P8AFFINEQB $0x00, Z3, Z14, Z15 + VXORPD Z10, Z15, Z10 + VGF2P8AFFINEQB $0x00, Z4, Z14, Z15 + VXORPD Z11, Z15, Z11 + VGF2P8AFFINEQB $0x00, Z5, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z6, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Store 7 outputs + VMOVDQU64 Z7, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z8, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z9, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z10, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z11, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z12, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z13, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x7_64Xor_loop + VZEROUPPER + +mulGFNI_1x7_64Xor_end: + RET + +// func mulGFNI_1x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x8_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x8_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, DX + + // Add start offset to input + ADDQ R12, CX + +mulGFNI_1x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (CX), Z15 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z15, Z8 + VGF2P8AFFINEQB $0x00, Z1, Z15, Z9 + VGF2P8AFFINEQB $0x00, Z2, Z15, Z10 + VGF2P8AFFINEQB $0x00, Z3, Z15, Z11 + VGF2P8AFFINEQB $0x00, Z4, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z5, Z15, Z13 + VGF2P8AFFINEQB $0x00, Z6, Z15, Z14 + VGF2P8AFFINEQB $0x00, Z7, Z15, Z15 + + // Store 8 outputs + VMOVDQU64 Z8, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z9, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z10, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z11, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z12, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z13, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z14, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z15, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x8_64_loop + VZEROUPPER + +mulGFNI_1x8_64_end: + RET + +// func mulGFNI_1x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x8_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + 
MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, DX + + // Add start offset to input + ADDQ R12, CX + +mulGFNI_1x8_64Xor_loop: + // Load 8 outputs + VMOVDQU64 (BX), Z8 + VMOVDQU64 (SI), Z9 + VMOVDQU64 (DI), Z10 + VMOVDQU64 (R8), Z11 + VMOVDQU64 (R9), Z12 + VMOVDQU64 (R10), Z13 + VMOVDQU64 (R11), Z14 + VMOVDQU64 (DX), Z15 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (CX), Z16 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z16, Z17 + VXORPD Z8, Z17, Z8 + VGF2P8AFFINEQB $0x00, Z1, Z16, Z17 + VXORPD Z9, Z17, Z9 + VGF2P8AFFINEQB $0x00, Z2, Z16, Z17 + VXORPD Z10, Z17, Z10 + VGF2P8AFFINEQB $0x00, Z3, Z16, Z17 + VXORPD Z11, Z17, Z11 + VGF2P8AFFINEQB $0x00, Z4, Z16, Z17 + VXORPD Z12, Z17, Z12 + VGF2P8AFFINEQB $0x00, Z5, Z16, Z17 + VXORPD Z13, Z17, Z13 + VGF2P8AFFINEQB $0x00, Z6, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z7, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Store 8 outputs + VMOVDQU64 Z8, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z9, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z10, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z11, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z12, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z13, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z14, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z15, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x8_64Xor_loop + VZEROUPPER + +mulGFNI_1x8_64Xor_end: + RET + +// func mulGFNI_1x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x9_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DX + + // Add start offset to input + ADDQ R13, CX + +mulGFNI_1x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (CX), Z17 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z17, Z9 + VGF2P8AFFINEQB $0x00, Z1, Z17, Z10 + VGF2P8AFFINEQB $0x00, Z2, Z17, Z11 + VGF2P8AFFINEQB $0x00, Z3, Z17, Z12 + VGF2P8AFFINEQB $0x00, Z4, Z17, Z13 + VGF2P8AFFINEQB $0x00, Z5, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z6, Z17, Z15 + VGF2P8AFFINEQB $0x00, Z7, Z17, Z16 + VGF2P8AFFINEQB $0x00, Z8, Z17, Z17 + + // Store 9 outputs + VMOVDQU64 Z9, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z10, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z11, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z12, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z13, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z14, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z15, 
(R11) + ADDQ $0x40, R11 + VMOVDQU64 Z16, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z17, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x9_64_loop + VZEROUPPER + +mulGFNI_1x9_64_end: + RET + +// func mulGFNI_1x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x9_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DX + + // Add start offset to input + ADDQ R13, CX + +mulGFNI_1x9_64Xor_loop: + // Load 9 outputs + VMOVDQU64 (BX), Z9 + VMOVDQU64 (SI), Z10 + VMOVDQU64 (DI), Z11 + VMOVDQU64 (R8), Z12 + VMOVDQU64 (R9), Z13 + VMOVDQU64 (R10), Z14 + VMOVDQU64 (R11), Z15 + VMOVDQU64 (R12), Z16 + VMOVDQU64 (DX), Z17 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (CX), Z18 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z18, Z19 + VXORPD Z9, Z19, Z9 + VGF2P8AFFINEQB $0x00, Z1, Z18, Z19 + VXORPD Z10, Z19, Z10 + VGF2P8AFFINEQB $0x00, Z2, Z18, Z19 + VXORPD Z11, Z19, Z11 + VGF2P8AFFINEQB $0x00, Z3, Z18, Z19 + VXORPD Z12, Z19, Z12 + VGF2P8AFFINEQB $0x00, Z4, Z18, Z19 + VXORPD Z13, Z19, Z13 + VGF2P8AFFINEQB $0x00, Z5, Z18, Z19 + VXORPD Z14, Z19, Z14 + VGF2P8AFFINEQB $0x00, Z6, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z7, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z8, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Store 9 outputs + VMOVDQU64 Z9, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z10, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z11, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z12, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z13, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z14, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z15, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z16, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z17, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x9_64Xor_loop + VZEROUPPER + +mulGFNI_1x9_64Xor_end: + RET + +// func mulGFNI_1x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x10_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + 
MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, DX + + // Add start offset to input + ADDQ R14, CX + +mulGFNI_1x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (CX), Z19 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z19, Z10 + VGF2P8AFFINEQB $0x00, Z1, Z19, Z11 + VGF2P8AFFINEQB $0x00, Z2, Z19, Z12 + VGF2P8AFFINEQB $0x00, Z3, Z19, Z13 + VGF2P8AFFINEQB $0x00, Z4, Z19, Z14 + VGF2P8AFFINEQB $0x00, Z5, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z6, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z7, Z19, Z17 + VGF2P8AFFINEQB $0x00, Z8, Z19, Z18 + VGF2P8AFFINEQB $0x00, Z9, Z19, Z19 + + // Store 10 outputs + VMOVDQU64 Z10, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z11, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z12, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z13, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z14, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z15, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z16, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z17, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z18, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z19, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x10_64_loop + VZEROUPPER + +mulGFNI_1x10_64_end: + RET + +// func mulGFNI_1x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x10_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, DX + + // Add start offset to input + ADDQ R14, CX + +mulGFNI_1x10_64Xor_loop: + // Load 10 outputs + VMOVDQU64 (BX), Z10 + VMOVDQU64 (SI), Z11 + VMOVDQU64 (DI), Z12 + VMOVDQU64 (R8), Z13 + VMOVDQU64 (R9), Z14 + VMOVDQU64 (R10), Z15 + VMOVDQU64 (R11), Z16 + VMOVDQU64 (R12), Z17 + VMOVDQU64 (R13), Z18 + VMOVDQU64 (DX), Z19 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (CX), Z20 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z20, Z21 + VXORPD Z10, Z21, Z10 + VGF2P8AFFINEQB $0x00, Z1, Z20, Z21 + VXORPD Z11, Z21, Z11 + VGF2P8AFFINEQB $0x00, Z2, Z20, Z21 + VXORPD Z12, Z21, Z12 + VGF2P8AFFINEQB $0x00, Z3, Z20, Z21 + VXORPD Z13, Z21, Z13 + VGF2P8AFFINEQB $0x00, Z4, Z20, Z21 + VXORPD Z14, Z21, Z14 + VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 + VXORPD Z15, Z21, Z15 + VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 + 
VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Store 10 outputs + VMOVDQU64 Z10, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z11, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z12, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z13, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z14, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z15, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z16, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z17, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z18, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z19, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x10_64Xor_loop + VZEROUPPER + +mulGFNI_1x10_64Xor_end: + RET + +// func mulGFNI_2x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 5 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), BX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + + // Add start offset to input + ADDQ SI, DX + ADDQ SI, CX + +mulGFNI_2x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z3 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z3, Z2 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (CX), Z3 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z1, Z3, Z3 + VXORPD Z2, Z3, Z2 + + // Store 1 outputs + VMOVDQU64 Z2, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x1_64_loop + VZEROUPPER + +mulGFNI_2x1_64_end: + RET + +// func mulGFNI_2x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 5 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), BX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + + // Add start offset to input + ADDQ SI, DX + ADDQ SI, CX + +mulGFNI_2x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (BX), Z2 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z3 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z3, Z3 + VXORPD Z2, Z3, Z2 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (CX), Z3 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z1, Z3, Z3 + VXORPD Z2, Z3, Z2 + + // Store 1 outputs + VMOVDQU64 Z2, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x1_64Xor_loop + VZEROUPPER + +mulGFNI_2x1_64Xor_end: + RET + +// func mulGFNI_2x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + 
MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), BX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + ADDQ DI, BX + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, CX + +mulGFNI_2x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z6 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z6, Z4 + VGF2P8AFFINEQB $0x00, Z1, Z6, Z5 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (CX), Z6 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z2, Z6, Z7 + VXORPD Z4, Z7, Z4 + VGF2P8AFFINEQB $0x00, Z3, Z6, Z7 + VXORPD Z5, Z7, Z5 + + // Store 2 outputs + VMOVDQU64 Z4, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z5, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x2_64_loop + VZEROUPPER + +mulGFNI_2x2_64_end: + RET + +// func mulGFNI_2x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), BX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + ADDQ DI, BX + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, CX + +mulGFNI_2x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (SI), Z4 + VMOVDQU64 (BX), Z5 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z6 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z6, Z7 + VXORPD Z4, Z7, Z4 + VGF2P8AFFINEQB $0x00, Z1, Z6, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (CX), Z6 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z2, Z6, Z7 + VXORPD Z4, Z7, Z4 + VGF2P8AFFINEQB $0x00, Z3, Z6, Z7 + VXORPD Z5, Z7, Z5 + + // Store 2 outputs + VMOVDQU64 Z4, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z5, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x2_64Xor_loop + VZEROUPPER + +mulGFNI_2x2_64Xor_end: + RET + +// func mulGFNI_2x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), BX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, BX + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, CX + +mulGFNI_2x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z9 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z9, Z6 + VGF2P8AFFINEQB $0x00, Z1, Z9, Z7 + VGF2P8AFFINEQB $0x00, Z2, Z9, Z8 + + // Load and process 64 bytes 
from input 1 to 3 outputs + VMOVDQU64 (CX), Z9 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z3, Z9, Z10 + VXORPD Z6, Z10, Z6 + VGF2P8AFFINEQB $0x00, Z4, Z9, Z10 + VXORPD Z7, Z10, Z7 + VGF2P8AFFINEQB $0x00, Z5, Z9, Z10 + VXORPD Z8, Z10, Z8 + + // Store 3 outputs + VMOVDQU64 Z6, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z7, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z8, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x3_64_loop + VZEROUPPER + +mulGFNI_2x3_64_end: + RET + +// func mulGFNI_2x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), BX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, BX + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, CX + +mulGFNI_2x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (SI), Z6 + VMOVDQU64 (DI), Z7 + VMOVDQU64 (BX), Z8 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z9 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z9, Z10 + VXORPD Z6, Z10, Z6 + VGF2P8AFFINEQB $0x00, Z1, Z9, Z10 + VXORPD Z7, Z10, Z7 + VGF2P8AFFINEQB $0x00, Z2, Z9, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (CX), Z9 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z3, Z9, Z10 + VXORPD Z6, Z10, Z6 + VGF2P8AFFINEQB $0x00, Z4, Z9, Z10 + VXORPD Z7, Z10, Z7 + VGF2P8AFFINEQB $0x00, Z5, Z9, Z10 + VXORPD Z8, Z10, Z8 + + // Store 3 outputs + VMOVDQU64 Z6, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z7, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z8, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x3_64Xor_loop + VZEROUPPER + +mulGFNI_2x3_64Xor_end: + RET + +// func mulGFNI_2x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x4_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), BX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, BX + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, CX + +mulGFNI_2x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z12 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z12, Z8 + VGF2P8AFFINEQB $0x00, Z1, Z12, Z9 + VGF2P8AFFINEQB $0x00, Z2, Z12, Z10 + VGF2P8AFFINEQB $0x00, Z3, Z12, Z11 + + // Load and process 64 bytes from input 1 
to 4 outputs + VMOVDQU64 (CX), Z12 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z4, Z12, Z13 + VXORPD Z8, Z13, Z8 + VGF2P8AFFINEQB $0x00, Z5, Z12, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z6, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z7, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Store 4 outputs + VMOVDQU64 Z8, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z9, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z10, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z11, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x4_64_loop + VZEROUPPER + +mulGFNI_2x4_64_end: + RET + +// func mulGFNI_2x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x4_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), BX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, BX + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, CX + +mulGFNI_2x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (SI), Z8 + VMOVDQU64 (DI), Z9 + VMOVDQU64 (R8), Z10 + VMOVDQU64 (BX), Z11 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z12 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z12, Z13 + VXORPD Z8, Z13, Z8 + VGF2P8AFFINEQB $0x00, Z1, Z12, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z2, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z3, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (CX), Z12 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z4, Z12, Z13 + VXORPD Z8, Z13, Z8 + VGF2P8AFFINEQB $0x00, Z5, Z12, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z6, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z7, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Store 4 outputs + VMOVDQU64 Z8, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z9, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z10, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z11, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x4_64Xor_loop + VZEROUPPER + +mulGFNI_2x4_64Xor_end: + RET + +// func mulGFNI_2x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x5_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), BX + MOVQ start+72(FP), R10 + + // 
Add start offset to output + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, BX + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, CX + +mulGFNI_2x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z15 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z15, Z10 + VGF2P8AFFINEQB $0x00, Z1, Z15, Z11 + VGF2P8AFFINEQB $0x00, Z2, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z3, Z15, Z13 + VGF2P8AFFINEQB $0x00, Z4, Z15, Z14 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (CX), Z15 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z5, Z15, Z16 + VXORPD Z10, Z16, Z10 + VGF2P8AFFINEQB $0x00, Z6, Z15, Z16 + VXORPD Z11, Z16, Z11 + VGF2P8AFFINEQB $0x00, Z7, Z15, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z8, Z15, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z9, Z15, Z16 + VXORPD Z14, Z16, Z14 + + // Store 5 outputs + VMOVDQU64 Z10, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z11, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z12, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z13, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z14, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x5_64_loop + VZEROUPPER + +mulGFNI_2x5_64_end: + RET + +// func mulGFNI_2x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x5_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), BX + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, BX + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, CX + +mulGFNI_2x5_64Xor_loop: + // Load 5 outputs + VMOVDQU64 (SI), Z10 + VMOVDQU64 (DI), Z11 + VMOVDQU64 (R8), Z12 + VMOVDQU64 (R9), Z13 + VMOVDQU64 (BX), Z14 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z15 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z15, Z16 + VXORPD Z10, Z16, Z10 + VGF2P8AFFINEQB $0x00, Z1, Z15, Z16 + VXORPD Z11, Z16, Z11 + VGF2P8AFFINEQB $0x00, Z2, Z15, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z3, Z15, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z4, Z15, Z16 + VXORPD Z14, Z16, Z14 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (CX), Z15 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z5, Z15, Z16 + VXORPD Z10, Z16, Z10 + VGF2P8AFFINEQB $0x00, Z6, Z15, Z16 + VXORPD Z11, Z16, Z11 + VGF2P8AFFINEQB $0x00, Z7, Z15, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z8, Z15, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z9, Z15, Z16 + VXORPD Z14, Z16, Z14 + + // Store 5 outputs + VMOVDQU64 Z10, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z11, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z12, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z13, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z14, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x5_64Xor_loop + VZEROUPPER + +mulGFNI_2x5_64Xor_end: + 
RET + +// func mulGFNI_2x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x6_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), BX + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, BX + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, CX + +mulGFNI_2x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z18 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z18, Z12 + VGF2P8AFFINEQB $0x00, Z1, Z18, Z13 + VGF2P8AFFINEQB $0x00, Z2, Z18, Z14 + VGF2P8AFFINEQB $0x00, Z3, Z18, Z15 + VGF2P8AFFINEQB $0x00, Z4, Z18, Z16 + VGF2P8AFFINEQB $0x00, Z5, Z18, Z17 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (CX), Z18 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z6, Z18, Z19 + VXORPD Z12, Z19, Z12 + VGF2P8AFFINEQB $0x00, Z7, Z18, Z19 + VXORPD Z13, Z19, Z13 + VGF2P8AFFINEQB $0x00, Z8, Z18, Z19 + VXORPD Z14, Z19, Z14 + VGF2P8AFFINEQB $0x00, Z9, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z10, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z11, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Store 6 outputs + VMOVDQU64 Z12, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z13, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z14, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z15, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z16, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z17, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x6_64_loop + VZEROUPPER + +mulGFNI_2x6_64_end: + RET + +// func mulGFNI_2x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x6_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), BX + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, BX + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, CX + +mulGFNI_2x6_64Xor_loop: + // Load 
6 outputs + VMOVDQU64 (SI), Z12 + VMOVDQU64 (DI), Z13 + VMOVDQU64 (R8), Z14 + VMOVDQU64 (R9), Z15 + VMOVDQU64 (R10), Z16 + VMOVDQU64 (BX), Z17 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z18 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z18, Z19 + VXORPD Z12, Z19, Z12 + VGF2P8AFFINEQB $0x00, Z1, Z18, Z19 + VXORPD Z13, Z19, Z13 + VGF2P8AFFINEQB $0x00, Z2, Z18, Z19 + VXORPD Z14, Z19, Z14 + VGF2P8AFFINEQB $0x00, Z3, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z4, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z5, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (CX), Z18 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z6, Z18, Z19 + VXORPD Z12, Z19, Z12 + VGF2P8AFFINEQB $0x00, Z7, Z18, Z19 + VXORPD Z13, Z19, Z13 + VGF2P8AFFINEQB $0x00, Z8, Z18, Z19 + VXORPD Z14, Z19, Z14 + VGF2P8AFFINEQB $0x00, Z9, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z10, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z11, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Store 6 outputs + VMOVDQU64 Z12, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z13, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z14, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z15, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z16, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z17, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x6_64Xor_loop + VZEROUPPER + +mulGFNI_2x6_64Xor_end: + RET + +// func mulGFNI_2x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x7_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), BX + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, BX + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, CX + +mulGFNI_2x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (DX), Z21 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z21, Z14 + VGF2P8AFFINEQB $0x00, Z1, Z21, Z15 + VGF2P8AFFINEQB $0x00, Z2, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z3, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z4, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z5, Z21, Z19 + VGF2P8AFFINEQB $0x00, Z6, Z21, Z20 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (CX), Z21 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z7, Z21, Z22 + VXORPD Z14, Z22, Z14 + VGF2P8AFFINEQB $0x00, Z8, Z21, Z22 + VXORPD Z15, Z22, Z15 + VGF2P8AFFINEQB $0x00, Z9, Z21, Z22 + VXORPD Z16, Z22, Z16 + VGF2P8AFFINEQB $0x00, Z10, Z21, Z22 + VXORPD Z17, Z22, Z17 + VGF2P8AFFINEQB $0x00, Z11, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z12, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB 
$0x00, Z13, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Store 7 outputs + VMOVDQU64 Z14, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z15, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z16, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z17, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z18, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z19, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z20, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x7_64_loop + VZEROUPPER + +mulGFNI_2x7_64_end: + RET + +// func mulGFNI_2x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x7_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), BX + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, BX + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, CX + +mulGFNI_2x7_64Xor_loop: + // Load 7 outputs + VMOVDQU64 (SI), Z14 + VMOVDQU64 (DI), Z15 + VMOVDQU64 (R8), Z16 + VMOVDQU64 (R9), Z17 + VMOVDQU64 (R10), Z18 + VMOVDQU64 (R11), Z19 + VMOVDQU64 (BX), Z20 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (DX), Z21 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z21, Z22 + VXORPD Z14, Z22, Z14 + VGF2P8AFFINEQB $0x00, Z1, Z21, Z22 + VXORPD Z15, Z22, Z15 + VGF2P8AFFINEQB $0x00, Z2, Z21, Z22 + VXORPD Z16, Z22, Z16 + VGF2P8AFFINEQB $0x00, Z3, Z21, Z22 + VXORPD Z17, Z22, Z17 + VGF2P8AFFINEQB $0x00, Z4, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z5, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z6, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (CX), Z21 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z7, Z21, Z22 + VXORPD Z14, Z22, Z14 + VGF2P8AFFINEQB $0x00, Z8, Z21, Z22 + VXORPD Z15, Z22, Z15 + VGF2P8AFFINEQB $0x00, Z9, Z21, Z22 + VXORPD Z16, Z22, Z16 + VGF2P8AFFINEQB $0x00, Z10, Z21, Z22 + VXORPD Z17, Z22, Z17 + VGF2P8AFFINEQB $0x00, Z11, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z12, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z13, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Store 7 outputs + VMOVDQU64 Z14, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z15, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z16, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z17, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z18, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z19, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z20, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x7_64Xor_loop + VZEROUPPER + +mulGFNI_2x7_64Xor_end: + RET + +// func mulGFNI_2x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x8_64(SB), $0-88 + // 
Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x8_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), BX + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, BX + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, CX + +mulGFNI_2x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (DX), Z24 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z24, Z16 + VGF2P8AFFINEQB $0x00, Z1, Z24, Z17 + VGF2P8AFFINEQB $0x00, Z2, Z24, Z18 + VGF2P8AFFINEQB $0x00, Z3, Z24, Z19 + VGF2P8AFFINEQB $0x00, Z4, Z24, Z20 + VGF2P8AFFINEQB $0x00, Z5, Z24, Z21 + VGF2P8AFFINEQB $0x00, Z6, Z24, Z22 + VGF2P8AFFINEQB $0x00, Z7, Z24, Z23 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (CX), Z24 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z8, Z24, Z25 + VXORPD Z16, Z25, Z16 + VGF2P8AFFINEQB $0x00, Z9, Z24, Z25 + VXORPD Z17, Z25, Z17 + VGF2P8AFFINEQB $0x00, Z10, Z24, Z25 + VXORPD Z18, Z25, Z18 + VGF2P8AFFINEQB $0x00, Z11, Z24, Z25 + VXORPD Z19, Z25, Z19 + VGF2P8AFFINEQB $0x00, Z12, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z13, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z14, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z15, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Store 8 outputs + VMOVDQU64 Z16, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z17, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z18, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z19, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z20, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z21, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z22, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z23, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x8_64_loop + VZEROUPPER + +mulGFNI_2x8_64_end: + RET + +// func mulGFNI_2x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x8_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + 
MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), BX + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, BX + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, CX + +mulGFNI_2x8_64Xor_loop: + // Load 8 outputs + VMOVDQU64 (SI), Z16 + VMOVDQU64 (DI), Z17 + VMOVDQU64 (R8), Z18 + VMOVDQU64 (R9), Z19 + VMOVDQU64 (R10), Z20 + VMOVDQU64 (R11), Z21 + VMOVDQU64 (R12), Z22 + VMOVDQU64 (BX), Z23 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (DX), Z24 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z24, Z25 + VXORPD Z16, Z25, Z16 + VGF2P8AFFINEQB $0x00, Z1, Z24, Z25 + VXORPD Z17, Z25, Z17 + VGF2P8AFFINEQB $0x00, Z2, Z24, Z25 + VXORPD Z18, Z25, Z18 + VGF2P8AFFINEQB $0x00, Z3, Z24, Z25 + VXORPD Z19, Z25, Z19 + VGF2P8AFFINEQB $0x00, Z4, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z5, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z6, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z7, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (CX), Z24 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z8, Z24, Z25 + VXORPD Z16, Z25, Z16 + VGF2P8AFFINEQB $0x00, Z9, Z24, Z25 + VXORPD Z17, Z25, Z17 + VGF2P8AFFINEQB $0x00, Z10, Z24, Z25 + VXORPD Z18, Z25, Z18 + VGF2P8AFFINEQB $0x00, Z11, Z24, Z25 + VXORPD Z19, Z25, Z19 + VGF2P8AFFINEQB $0x00, Z12, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z13, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z14, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z15, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Store 8 outputs + VMOVDQU64 Z16, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z17, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z18, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z19, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z20, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z21, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z22, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z23, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x8_64Xor_loop + VZEROUPPER + +mulGFNI_2x8_64Xor_end: + RET + +// func mulGFNI_2x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x9_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), BX + MOVQ start+72(FP), R14 + + // Add start offset 
to output + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, BX + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, CX + +mulGFNI_2x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (DX), Z27 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z27, Z18 + VGF2P8AFFINEQB $0x00, Z1, Z27, Z19 + VGF2P8AFFINEQB $0x00, Z2, Z27, Z20 + VGF2P8AFFINEQB $0x00, Z3, Z27, Z21 + VGF2P8AFFINEQB $0x00, Z4, Z27, Z22 + VGF2P8AFFINEQB $0x00, Z5, Z27, Z23 + VGF2P8AFFINEQB $0x00, Z6, Z27, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z27, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z27, Z26 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (CX), Z27 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z9, Z27, Z28 + VXORPD Z18, Z28, Z18 + VGF2P8AFFINEQB $0x00, Z10, Z27, Z28 + VXORPD Z19, Z28, Z19 + VGF2P8AFFINEQB $0x00, Z11, Z27, Z28 + VXORPD Z20, Z28, Z20 + VGF2P8AFFINEQB $0x00, Z12, Z27, Z28 + VXORPD Z21, Z28, Z21 + VGF2P8AFFINEQB $0x00, Z13, Z27, Z28 + VXORPD Z22, Z28, Z22 + VGF2P8AFFINEQB $0x00, Z14, Z27, Z28 + VXORPD Z23, Z28, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Store 9 outputs + VMOVDQU64 Z18, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z19, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z20, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z21, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z22, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z23, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z24, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z25, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z26, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x9_64_loop + VZEROUPPER + +mulGFNI_2x9_64_end: + RET + +// func mulGFNI_2x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x9_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), BX + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, BX + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, CX + +mulGFNI_2x9_64Xor_loop: + // Load 9 outputs + VMOVDQU64 (SI), Z18 + VMOVDQU64 (DI), Z19 + VMOVDQU64 (R8), Z20 + VMOVDQU64 (R9), Z21 + VMOVDQU64 (R10), Z22 + VMOVDQU64 (R11), Z23 + VMOVDQU64 (R12), Z24 + VMOVDQU64 (R13), Z25 + VMOVDQU64 (BX), Z26 + + // Load and process 64 bytes 
from input 0 to 9 outputs + VMOVDQU64 (DX), Z27 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z27, Z28 + VXORPD Z18, Z28, Z18 + VGF2P8AFFINEQB $0x00, Z1, Z27, Z28 + VXORPD Z19, Z28, Z19 + VGF2P8AFFINEQB $0x00, Z2, Z27, Z28 + VXORPD Z20, Z28, Z20 + VGF2P8AFFINEQB $0x00, Z3, Z27, Z28 + VXORPD Z21, Z28, Z21 + VGF2P8AFFINEQB $0x00, Z4, Z27, Z28 + VXORPD Z22, Z28, Z22 + VGF2P8AFFINEQB $0x00, Z5, Z27, Z28 + VXORPD Z23, Z28, Z23 + VGF2P8AFFINEQB $0x00, Z6, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (CX), Z27 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z9, Z27, Z28 + VXORPD Z18, Z28, Z18 + VGF2P8AFFINEQB $0x00, Z10, Z27, Z28 + VXORPD Z19, Z28, Z19 + VGF2P8AFFINEQB $0x00, Z11, Z27, Z28 + VXORPD Z20, Z28, Z20 + VGF2P8AFFINEQB $0x00, Z12, Z27, Z28 + VXORPD Z21, Z28, Z21 + VGF2P8AFFINEQB $0x00, Z13, Z27, Z28 + VXORPD Z22, Z28, Z22 + VGF2P8AFFINEQB $0x00, Z14, Z27, Z28 + VXORPD Z23, Z28, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Store 9 outputs + VMOVDQU64 Z18, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z19, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z20, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z21, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z22, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z23, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z24, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z25, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z26, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x9_64Xor_loop + VZEROUPPER + +mulGFNI_2x9_64Xor_end: + RET + +// func mulGFNI_2x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x10_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), R14 + MOVQ 216(BX), BX + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, BX + + // Add start offset to input + ADDQ R15, DX + ADDQ R15, CX + +mulGFNI_2x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 + 
VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + VMOVDQU64 Z20, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z21, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z22, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z23, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x10_64_loop + VZEROUPPER + +mulGFNI_2x10_64_end: + RET + +// func mulGFNI_2x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x10_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), R14 + MOVQ 216(BX), BX + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, BX + + // Add start offset to input + ADDQ R15, DX + ADDQ R15, CX + +mulGFNI_2x10_64Xor_loop: + // Load 10 outputs + VMOVDQU64 (SI), Z20 + VMOVDQU64 (DI), Z21 + VMOVDQU64 (R8), Z22 + VMOVDQU64 (R9), Z23 + VMOVDQU64 (R10), Z24 + VMOVDQU64 (R11), Z25 + VMOVDQU64 (R12), Z26 + VMOVDQU64 (R13), Z27 + VMOVDQU64 (R14), Z28 + VMOVDQU64 (BX), Z29 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z21, 
Z31, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + VMOVDQU64 Z20, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z21, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z22, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z23, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x10_64Xor_loop + VZEROUPPER + +mulGFNI_2x10_64Xor_end: + RET + +// func mulGFNI_3x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), SI + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, BX + ADDQ DI, CX + +mulGFNI_3x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z4 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z4, Z3 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z4 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z4, Z4 + VXORPD Z3, Z4, Z3 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (CX), Z4 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z2, Z4, Z4 + VXORPD Z3, Z4, Z3 + + // Store 1 outputs + VMOVDQU64 Z3, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x1_64_loop + VZEROUPPER + +mulGFNI_3x1_64_end: + RET + +// func mulGFNI_3x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + 
VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), SI + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, BX + ADDQ DI, CX + +mulGFNI_3x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (SI), Z3 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z4 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z4, Z4 + VXORPD Z3, Z4, Z3 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z4 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z4, Z4 + VXORPD Z3, Z4, Z3 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (CX), Z4 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z2, Z4, Z4 + VXORPD Z3, Z4, Z3 + + // Store 1 outputs + VMOVDQU64 Z3, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x1_64Xor_loop + VZEROUPPER + +mulGFNI_3x1_64Xor_end: + RET + +// func mulGFNI_3x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), SI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + ADDQ R8, SI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, CX + +mulGFNI_3x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z8 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z8, Z6 + VGF2P8AFFINEQB $0x00, Z1, Z8, Z7 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z8 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z8, Z9 + VXORPD Z6, Z9, Z6 + VGF2P8AFFINEQB $0x00, Z3, Z8, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (CX), Z8 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z4, Z8, Z9 + VXORPD Z6, Z9, Z6 + VGF2P8AFFINEQB $0x00, Z5, Z8, Z9 + VXORPD Z7, Z9, Z7 + + // Store 2 outputs + VMOVDQU64 Z6, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z7, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x2_64_loop + VZEROUPPER + +mulGFNI_3x2_64_end: + RET + +// func mulGFNI_3x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), SI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + ADDQ 
R8, SI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, CX + +mulGFNI_3x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (DI), Z6 + VMOVDQU64 (SI), Z7 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z8 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z8, Z9 + VXORPD Z6, Z9, Z6 + VGF2P8AFFINEQB $0x00, Z1, Z8, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z8 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z8, Z9 + VXORPD Z6, Z9, Z6 + VGF2P8AFFINEQB $0x00, Z3, Z8, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (CX), Z8 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z4, Z8, Z9 + VXORPD Z6, Z9, Z6 + VGF2P8AFFINEQB $0x00, Z5, Z8, Z9 + VXORPD Z7, Z9, Z7 + + // Store 2 outputs + VMOVDQU64 Z6, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z7, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x2_64Xor_loop + VZEROUPPER + +mulGFNI_3x2_64Xor_end: + RET + +// func mulGFNI_3x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), SI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, SI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, CX + +mulGFNI_3x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z12 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z12, Z9 + VGF2P8AFFINEQB $0x00, Z1, Z12, Z10 + VGF2P8AFFINEQB $0x00, Z2, Z12, Z11 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z12 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z12, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z4, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z5, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (CX), Z12 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z6, Z12, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z7, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z8, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Store 3 outputs + VMOVDQU64 Z9, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z10, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z11, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x3_64_loop + VZEROUPPER + +mulGFNI_3x3_64_end: + RET + +// func mulGFNI_3x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + 
VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), SI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, SI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, CX + +mulGFNI_3x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (DI), Z9 + VMOVDQU64 (R8), Z10 + VMOVDQU64 (SI), Z11 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z12 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z12, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z1, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z2, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z12 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z12, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z4, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z5, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (CX), Z12 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z6, Z12, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z7, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z8, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Store 3 outputs + VMOVDQU64 Z9, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z10, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z11, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x3_64Xor_loop + VZEROUPPER + +mulGFNI_3x3_64Xor_end: + RET + +// func mulGFNI_3x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x4_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), SI + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, SI + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, CX + +mulGFNI_3x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z16 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z1, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z2, Z16, Z14 + VGF2P8AFFINEQB $0x00, Z3, Z16, Z15 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z16 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z4, Z16, Z17 + VXORPD Z12, Z17, Z12 + VGF2P8AFFINEQB $0x00, Z5, Z16, Z17 + VXORPD Z13, Z17, Z13 + VGF2P8AFFINEQB $0x00, Z6, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z7, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (CX), Z16 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, 
Z8, Z16, Z17 + VXORPD Z12, Z17, Z12 + VGF2P8AFFINEQB $0x00, Z9, Z16, Z17 + VXORPD Z13, Z17, Z13 + VGF2P8AFFINEQB $0x00, Z10, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z11, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Store 4 outputs + VMOVDQU64 Z12, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z13, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z14, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z15, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x4_64_loop + VZEROUPPER + +mulGFNI_3x4_64_end: + RET + +// func mulGFNI_3x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x4_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), SI + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, SI + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, CX + +mulGFNI_3x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (DI), Z12 + VMOVDQU64 (R8), Z13 + VMOVDQU64 (R9), Z14 + VMOVDQU64 (SI), Z15 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z16 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z16, Z17 + VXORPD Z12, Z17, Z12 + VGF2P8AFFINEQB $0x00, Z1, Z16, Z17 + VXORPD Z13, Z17, Z13 + VGF2P8AFFINEQB $0x00, Z2, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z3, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z16 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z4, Z16, Z17 + VXORPD Z12, Z17, Z12 + VGF2P8AFFINEQB $0x00, Z5, Z16, Z17 + VXORPD Z13, Z17, Z13 + VGF2P8AFFINEQB $0x00, Z6, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z7, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (CX), Z16 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z8, Z16, Z17 + VXORPD Z12, Z17, Z12 + VGF2P8AFFINEQB $0x00, Z9, Z16, Z17 + VXORPD Z13, Z17, Z13 + VGF2P8AFFINEQB $0x00, Z10, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z11, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Store 4 outputs + VMOVDQU64 Z12, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z13, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z14, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z15, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x4_64Xor_loop + VZEROUPPER + +mulGFNI_3x4_64Xor_end: + RET + +// func mulGFNI_3x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x5_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), 
Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), SI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, SI + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, CX + +mulGFNI_3x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z20 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z20, Z15 + VGF2P8AFFINEQB $0x00, Z1, Z20, Z16 + VGF2P8AFFINEQB $0x00, Z2, Z20, Z17 + VGF2P8AFFINEQB $0x00, Z3, Z20, Z18 + VGF2P8AFFINEQB $0x00, Z4, Z20, Z19 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (BX), Z20 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 + VXORPD Z15, Z21, Z15 + VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (CX), Z20 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z10, Z20, Z21 + VXORPD Z15, Z21, Z15 + VGF2P8AFFINEQB $0x00, Z11, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z12, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z13, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z14, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Store 5 outputs + VMOVDQU64 Z15, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z16, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z17, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z18, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z19, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x5_64_loop + VZEROUPPER + +mulGFNI_3x5_64_end: + RET + +// func mulGFNI_3x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x5_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), SI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, SI + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, CX + +mulGFNI_3x5_64Xor_loop: + // Load 5 outputs + VMOVDQU64 
(DI), Z15 + VMOVDQU64 (R8), Z16 + VMOVDQU64 (R9), Z17 + VMOVDQU64 (R10), Z18 + VMOVDQU64 (SI), Z19 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z20 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z20, Z21 + VXORPD Z15, Z21, Z15 + VGF2P8AFFINEQB $0x00, Z1, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z2, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z3, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z4, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (BX), Z20 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 + VXORPD Z15, Z21, Z15 + VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (CX), Z20 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z10, Z20, Z21 + VXORPD Z15, Z21, Z15 + VGF2P8AFFINEQB $0x00, Z11, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z12, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z13, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z14, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Store 5 outputs + VMOVDQU64 Z15, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z16, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z17, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z18, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z19, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x5_64Xor_loop + VZEROUPPER + +mulGFNI_3x5_64Xor_end: + RET + +// func mulGFNI_3x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x6_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), SI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, SI + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, CX + +mulGFNI_3x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z24 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z24, Z18 + VGF2P8AFFINEQB $0x00, Z1, Z24, Z19 + VGF2P8AFFINEQB $0x00, Z2, Z24, Z20 + VGF2P8AFFINEQB $0x00, Z3, Z24, Z21 + VGF2P8AFFINEQB $0x00, Z4, Z24, Z22 + VGF2P8AFFINEQB $0x00, Z5, Z24, Z23 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (BX), Z24 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z6, Z24, Z25 + VXORPD Z18, Z25, Z18 + VGF2P8AFFINEQB $0x00, Z7, Z24, 
Z25 + VXORPD Z19, Z25, Z19 + VGF2P8AFFINEQB $0x00, Z8, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z9, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (CX), Z24 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z12, Z24, Z25 + VXORPD Z18, Z25, Z18 + VGF2P8AFFINEQB $0x00, Z13, Z24, Z25 + VXORPD Z19, Z25, Z19 + VGF2P8AFFINEQB $0x00, Z14, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z15, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z16, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Store 6 outputs + VMOVDQU64 Z18, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z19, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z20, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z21, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z22, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z23, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x6_64_loop + VZEROUPPER + +mulGFNI_3x6_64_end: + RET + +// func mulGFNI_3x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x6_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), SI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, SI + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, CX + +mulGFNI_3x6_64Xor_loop: + // Load 6 outputs + VMOVDQU64 (DI), Z18 + VMOVDQU64 (R8), Z19 + VMOVDQU64 (R9), Z20 + VMOVDQU64 (R10), Z21 + VMOVDQU64 (R11), Z22 + VMOVDQU64 (SI), Z23 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z24 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z24, Z25 + VXORPD Z18, Z25, Z18 + VGF2P8AFFINEQB $0x00, Z1, Z24, Z25 + VXORPD Z19, Z25, Z19 + VGF2P8AFFINEQB $0x00, Z2, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z3, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z4, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z5, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (BX), Z24 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z6, Z24, Z25 + VXORPD Z18, Z25, Z18 + VGF2P8AFFINEQB $0x00, Z7, Z24, Z25 + VXORPD Z19, Z25, Z19 + VGF2P8AFFINEQB $0x00, Z8, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z9, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB 
$0x00, Z11, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (CX), Z24 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z12, Z24, Z25 + VXORPD Z18, Z25, Z18 + VGF2P8AFFINEQB $0x00, Z13, Z24, Z25 + VXORPD Z19, Z25, Z19 + VGF2P8AFFINEQB $0x00, Z14, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z15, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z16, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Store 6 outputs + VMOVDQU64 Z18, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z19, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z20, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z21, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z22, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z23, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x6_64Xor_loop + VZEROUPPER + +mulGFNI_3x6_64Xor_end: + RET + +// func mulGFNI_3x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x7_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), SI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, SI + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, CX + +mulGFNI_3x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (DX), Z28 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z28, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z28, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z28, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z28, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z28, Z27 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (BX), Z28 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z7, Z28, Z29 + VXORPD Z21, Z29, Z21 + VGF2P8AFFINEQB $0x00, Z8, Z28, Z29 + VXORPD Z22, Z29, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z28, Z29 + VXORPD Z23, Z29, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (CX), Z28 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z14, Z28, Z29 + VXORPD Z21, Z29, Z21 + VGF2P8AFFINEQB $0x00, Z15, Z28, Z29 + VXORPD Z22, Z29, Z22 + VGF2P8AFFINEQB $0x00, Z16, Z28, 
Z29 + VXORPD Z23, Z29, Z23 + VGF2P8AFFINEQB $0x00, Z17, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z18, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z19, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z20, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Store 7 outputs + VMOVDQU64 Z21, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z22, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z23, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x7_64_loop + VZEROUPPER + +mulGFNI_3x7_64_end: + RET + +// func mulGFNI_3x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x7_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), SI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, SI + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, CX + +mulGFNI_3x7_64Xor_loop: + // Load 7 outputs + VMOVDQU64 (DI), Z21 + VMOVDQU64 (R8), Z22 + VMOVDQU64 (R9), Z23 + VMOVDQU64 (R10), Z24 + VMOVDQU64 (R11), Z25 + VMOVDQU64 (R12), Z26 + VMOVDQU64 (SI), Z27 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (DX), Z28 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z28, Z29 + VXORPD Z21, Z29, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z28, Z29 + VXORPD Z22, Z29, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z28, Z29 + VXORPD Z23, Z29, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (BX), Z28 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z7, Z28, Z29 + VXORPD Z21, Z29, Z21 + VGF2P8AFFINEQB $0x00, Z8, Z28, Z29 + VXORPD Z22, Z29, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z28, Z29 + VXORPD Z23, Z29, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (CX), Z28 + ADDQ $0x40, CX + 
VGF2P8AFFINEQB $0x00, Z14, Z28, Z29 + VXORPD Z21, Z29, Z21 + VGF2P8AFFINEQB $0x00, Z15, Z28, Z29 + VXORPD Z22, Z29, Z22 + VGF2P8AFFINEQB $0x00, Z16, Z28, Z29 + VXORPD Z23, Z29, Z23 + VGF2P8AFFINEQB $0x00, Z17, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z18, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z19, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z20, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Store 7 outputs + VMOVDQU64 Z21, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z22, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z23, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x7_64Xor_loop + VZEROUPPER + +mulGFNI_3x7_64Xor_end: + RET + +// func mulGFNI_3x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x8_64(SB), $0-88 + // Loading 22 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x8_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), DI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, DI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DX + +mulGFNI_3x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, 
Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + VMOVDQU64 Z22, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z23, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x8_64_loop + VZEROUPPER + +mulGFNI_3x8_64_end: + RET + +// func mulGFNI_3x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x8_64Xor(SB), $0-88 + // Loading 22 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), DI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, DI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DX + +mulGFNI_3x8_64Xor_loop: + // Load 8 outputs + VMOVDQU64 (R8), Z22 + VMOVDQU64 (R9), Z23 + VMOVDQU64 (R10), Z24 + VMOVDQU64 (R11), Z25 + VMOVDQU64 (R12), Z26 + VMOVDQU64 (R13), Z27 + VMOVDQU64 (R14), Z28 + VMOVDQU64 (DI), Z29 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, 
Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + VMOVDQU64 Z22, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z23, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x8_64Xor_loop + VZEROUPPER + +mulGFNI_3x8_64Xor_end: + RET + +// func mulGFNI_3x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x9_64(SB), $8-88 + // Loading 21 of 27 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DX + +mulGFNI_3x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + 
ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + VMOVDQU64 Z21, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z22, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x9_64_loop + VZEROUPPER + +mulGFNI_3x9_64_end: + RET + +// func mulGFNI_3x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x9_64Xor(SB), $8-88 + // Loading 21 of 27 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DX + +mulGFNI_3x9_64Xor_loop: + // Load 9 outputs + VMOVDQU64 (R8), Z21 + VMOVDQU64 (R9), Z22 + VMOVDQU64 (R10), Z23 + VMOVDQU64 (R11), Z24 + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 
+ VMOVDQU64 (R15), Z28 + VMOVDQU64 (DI), Z29 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + VMOVDQU64 Z21, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z22, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x9_64Xor_loop + VZEROUPPER + +mulGFNI_3x9_64Xor_end: + RET + +// func mulGFNI_3x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x10_64(SB), $8-88 + // Loading 20 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), AX + MOVQ 
(AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), AX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_3x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + VMOVDQU64 Z20, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z21, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z22, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_3x10_64_loop + VZEROUPPER + +mulGFNI_3x10_64_end: + RET + +// func mulGFNI_3x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x10_64Xor(SB), $8-88 + // Loading 20 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX 
+ JZ mulGFNI_3x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), AX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_3x10_64Xor_loop: + // Load 10 outputs + VMOVDQU64 (DI), Z20 + VMOVDQU64 (R8), Z21 + VMOVDQU64 (R9), Z22 + VMOVDQU64 (R10), Z23 + VMOVDQU64 (R11), Z24 + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (SI), Z29 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + 
VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + VMOVDQU64 Z20, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z21, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z22, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_3x10_64Xor_loop + VZEROUPPER + +mulGFNI_3x10_64Xor_end: + RET + +// func mulGFNI_4x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 7 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), DI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, CX + +mulGFNI_4x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z5 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z5, Z4 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z5 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z5, Z5 + VXORPD Z4, Z5, Z4 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z5 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z5, Z5 + VXORPD Z4, Z5, Z4 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (CX), Z5 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z3, Z5, Z5 + VXORPD Z4, Z5, Z4 + + // Store 1 outputs + VMOVDQU64 Z4, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x1_64_loop + VZEROUPPER + +mulGFNI_4x1_64_end: + RET + +// func mulGFNI_4x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 7 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), DI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, CX + +mulGFNI_4x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (DI), Z4 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z5 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z5, Z5 + VXORPD Z4, Z5, Z4 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z5 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, 
Z5, Z5 + VXORPD Z4, Z5, Z4 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z5 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z5, Z5 + VXORPD Z4, Z5, Z4 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (CX), Z5 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z3, Z5, Z5 + VXORPD Z4, Z5, Z4 + + // Store 1 outputs + VMOVDQU64 Z4, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x1_64Xor_loop + VZEROUPPER + +mulGFNI_4x1_64Xor_end: + RET + +// func mulGFNI_4x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), DI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + ADDQ R9, DI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, CX + +mulGFNI_4x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z10 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z10, Z8 + VGF2P8AFFINEQB $0x00, Z1, Z10, Z9 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z10 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z10, Z11 + VXORPD Z8, Z11, Z8 + VGF2P8AFFINEQB $0x00, Z3, Z10, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z10 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z10, Z11 + VXORPD Z8, Z11, Z8 + VGF2P8AFFINEQB $0x00, Z5, Z10, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (CX), Z10 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z6, Z10, Z11 + VXORPD Z8, Z11, Z8 + VGF2P8AFFINEQB $0x00, Z7, Z10, Z11 + VXORPD Z9, Z11, Z9 + + // Store 2 outputs + VMOVDQU64 Z8, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z9, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x2_64_loop + VZEROUPPER + +mulGFNI_4x2_64_end: + RET + +// func mulGFNI_4x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), DI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + ADDQ R9, DI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, CX + +mulGFNI_4x2_64Xor_loop: + // Load 2 outputs + 
+	VMOVDQU64 (R8), Z8
+	VMOVDQU64 (DI), Z9
+
+	// Load and process 64 bytes from input 0 to 2 outputs
+	VMOVDQU64 (DX), Z10
+	ADDQ $0x40, DX
+	VGF2P8AFFINEQB $0x00, Z0, Z10, Z11
+	VXORPD Z8, Z11, Z8
+	VGF2P8AFFINEQB $0x00, Z1, Z10, Z11
+	VXORPD Z9, Z11, Z9
+
+	// Load and process 64 bytes from input 1 to 2 outputs
+	VMOVDQU64 (BX), Z10
+	ADDQ $0x40, BX
+	VGF2P8AFFINEQB $0x00, Z2, Z10, Z11
+	VXORPD Z8, Z11, Z8
+	VGF2P8AFFINEQB $0x00, Z3, Z10, Z11
+	VXORPD Z9, Z11, Z9
+
+	// Load and process 64 bytes from input 2 to 2 outputs
+	VMOVDQU64 (SI), Z10
+	ADDQ $0x40, SI
+	VGF2P8AFFINEQB $0x00, Z4, Z10, Z11
+	VXORPD Z8, Z11, Z8
+	VGF2P8AFFINEQB $0x00, Z5, Z10, Z11
+	VXORPD Z9, Z11, Z9
+
+	// Load and process 64 bytes from input 3 to 2 outputs
+	VMOVDQU64 (CX), Z10
+	ADDQ $0x40, CX
+	VGF2P8AFFINEQB $0x00, Z6, Z10, Z11
+	VXORPD Z8, Z11, Z8
+	VGF2P8AFFINEQB $0x00, Z7, Z10, Z11
+	VXORPD Z9, Z11, Z9
+
+	// Store 2 outputs
+	VMOVDQU64 Z8, (R8)
+	ADDQ $0x40, R8
+	VMOVDQU64 Z9, (DI)
+	ADDQ $0x40, DI
+
+	// Prepare for next loop
+	DECQ AX
+	JNZ mulGFNI_4x2_64Xor_loop
+	VZEROUPPER
+
+mulGFNI_4x2_64Xor_end:
+	RET
+
+// func mulGFNI_4x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x3_64(SB), $0-88
+	// Loading all tables to registers
+	// Destination kept in GP registers
+	// Full registers estimated 17 YMM used
+	MOVQ n+80(FP), AX
+	MOVQ matrix_base+0(FP), CX
+	SHRQ $0x06, AX
+	TESTQ AX, AX
+	JZ mulGFNI_4x3_64_end
+	VBROADCASTF32X2 (CX), Z0
+	VBROADCASTF32X2 8(CX), Z1
+	VBROADCASTF32X2 16(CX), Z2
+	VBROADCASTF32X2 24(CX), Z3
+	VBROADCASTF32X2 32(CX), Z4
+	VBROADCASTF32X2 40(CX), Z5
+	VBROADCASTF32X2 48(CX), Z6
+	VBROADCASTF32X2 56(CX), Z7
+	VBROADCASTF32X2 64(CX), Z8
+	VBROADCASTF32X2 72(CX), Z9
+	VBROADCASTF32X2 80(CX), Z10
+	VBROADCASTF32X2 88(CX), Z11
+	MOVQ in_base+24(FP), CX
+	MOVQ (CX), DX
+	MOVQ 24(CX), BX
+	MOVQ 48(CX), SI
+	MOVQ 72(CX), CX
+	MOVQ out_base+48(FP), DI
+	MOVQ out_base+48(FP), DI
+	MOVQ (DI), R8
+	MOVQ 24(DI), R9
+	MOVQ 48(DI), DI
+	MOVQ start+72(FP), R10
+
+	// Add start offset to output
+	ADDQ R10, R8
+	ADDQ R10, R9
+	ADDQ R10, DI
+
+	// Add start offset to input
+	ADDQ R10, DX
+	ADDQ R10, BX
+	ADDQ R10, SI
+	ADDQ R10, CX
+
+mulGFNI_4x3_64_loop:
+	// Load and process 64 bytes from input 0 to 3 outputs
+	VMOVDQU64 (DX), Z15
+	ADDQ $0x40, DX
+	VGF2P8AFFINEQB $0x00, Z0, Z15, Z12
+	VGF2P8AFFINEQB $0x00, Z1, Z15, Z13
+	VGF2P8AFFINEQB $0x00, Z2, Z15, Z14
+
+	// Load and process 64 bytes from input 1 to 3 outputs
+	VMOVDQU64 (BX), Z15
+	ADDQ $0x40, BX
+	VGF2P8AFFINEQB $0x00, Z3, Z15, Z16
+	VXORPD Z12, Z16, Z12
+	VGF2P8AFFINEQB $0x00, Z4, Z15, Z16
+	VXORPD Z13, Z16, Z13
+	VGF2P8AFFINEQB $0x00, Z5, Z15, Z16
+	VXORPD Z14, Z16, Z14
+
+	// Load and process 64 bytes from input 2 to 3 outputs
+	VMOVDQU64 (SI), Z15
+	ADDQ $0x40, SI
+	VGF2P8AFFINEQB $0x00, Z6, Z15, Z16
+	VXORPD Z12, Z16, Z12
+	VGF2P8AFFINEQB $0x00, Z7, Z15, Z16
+	VXORPD Z13, Z16, Z13
+	VGF2P8AFFINEQB $0x00, Z8, Z15, Z16
+	VXORPD Z14, Z16, Z14
+
+	// Load and process 64 bytes from input 3 to 3 outputs
+	VMOVDQU64 (CX), Z15
+	ADDQ $0x40, CX
+	VGF2P8AFFINEQB $0x00, Z9, Z15, Z16
+	VXORPD Z12, Z16, Z12
+	VGF2P8AFFINEQB $0x00, Z10, Z15, Z16
+	VXORPD Z13, Z16, Z13
+	VGF2P8AFFINEQB $0x00, Z11, Z15, Z16
+	VXORPD Z14, Z16, Z14
+
+	// Store 3 outputs
+	VMOVDQU64 Z12, (R8)
+	ADDQ $0x40, R8
+	VMOVDQU64 Z13, (R9)
+	ADDQ $0x40, R9
+	VMOVDQU64 Z14, (DI)
+	ADDQ $0x40, DI
+
+	// Prepare for next loop
+	DECQ AX
+	JNZ mulGFNI_4x3_64_loop
+	VZEROUPPER
+
+mulGFNI_4x3_64_end:
+	RET
+ +// func mulGFNI_4x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), DI + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DI + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, CX + +mulGFNI_4x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (R8), Z12 + VMOVDQU64 (R9), Z13 + VMOVDQU64 (DI), Z14 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z15 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z15, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z1, Z15, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z2, Z15, Z16 + VXORPD Z14, Z16, Z14 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z15 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z15, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z4, Z15, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z5, Z15, Z16 + VXORPD Z14, Z16, Z14 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z15 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z15, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z7, Z15, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z8, Z15, Z16 + VXORPD Z14, Z16, Z14 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (CX), Z15 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z9, Z15, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z10, Z15, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z11, Z15, Z16 + VXORPD Z14, Z16, Z14 + + // Store 3 outputs + VMOVDQU64 Z12, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z13, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z14, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x3_64Xor_loop + VZEROUPPER + +mulGFNI_4x3_64Xor_end: + RET + +// func mulGFNI_4x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x4_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI 
+ MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), DI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DI + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, CX + +mulGFNI_4x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z20 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z20, Z16 + VGF2P8AFFINEQB $0x00, Z1, Z20, Z17 + VGF2P8AFFINEQB $0x00, Z2, Z20, Z18 + VGF2P8AFFINEQB $0x00, Z3, Z20, Z19 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z20 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z4, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (SI), Z20 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z10, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z11, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (CX), Z20 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z12, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z13, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z14, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z15, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Store 4 outputs + VMOVDQU64 Z16, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z17, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z18, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z19, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x4_64_loop + VZEROUPPER + +mulGFNI_4x4_64_end: + RET + +// func mulGFNI_4x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x4_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), DI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DI + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, CX + +mulGFNI_4x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (R8), Z16 + VMOVDQU64 (R9), Z17 + VMOVDQU64 (R10), Z18 + VMOVDQU64 (DI), Z19 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z20 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z1, 
Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z2, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z3, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z20 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z4, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (SI), Z20 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z10, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z11, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (CX), Z20 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z12, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z13, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z14, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z15, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Store 4 outputs + VMOVDQU64 Z16, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z17, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z18, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z19, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x4_64Xor_loop + VZEROUPPER + +mulGFNI_4x4_64Xor_end: + RET + +// func mulGFNI_4x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x5_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 27 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), DI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, DI + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, CX + +mulGFNI_4x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z25 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z25, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z25, Z24 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (BX), Z25 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z5, Z25, Z26 + VXORPD Z20, Z26, Z20 + VGF2P8AFFINEQB $0x00, Z6, Z25, Z26 + VXORPD Z21, Z26, Z21 + VGF2P8AFFINEQB $0x00, Z7, Z25, Z26 + VXORPD Z22, Z26, Z22 + VGF2P8AFFINEQB $0x00, Z8, Z25, Z26 + VXORPD Z23, Z26, 
Z23 + VGF2P8AFFINEQB $0x00, Z9, Z25, Z26 + VXORPD Z24, Z26, Z24 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (SI), Z25 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z25, Z26 + VXORPD Z20, Z26, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z25, Z26 + VXORPD Z21, Z26, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z25, Z26 + VXORPD Z22, Z26, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z25, Z26 + VXORPD Z23, Z26, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z25, Z26 + VXORPD Z24, Z26, Z24 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (CX), Z25 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z15, Z25, Z26 + VXORPD Z20, Z26, Z20 + VGF2P8AFFINEQB $0x00, Z16, Z25, Z26 + VXORPD Z21, Z26, Z21 + VGF2P8AFFINEQB $0x00, Z17, Z25, Z26 + VXORPD Z22, Z26, Z22 + VGF2P8AFFINEQB $0x00, Z18, Z25, Z26 + VXORPD Z23, Z26, Z23 + VGF2P8AFFINEQB $0x00, Z19, Z25, Z26 + VXORPD Z24, Z26, Z24 + + // Store 5 outputs + VMOVDQU64 Z20, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z21, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z22, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z23, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z24, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x5_64_loop + VZEROUPPER + +mulGFNI_4x5_64_end: + RET + +// func mulGFNI_4x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x5_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 27 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), DI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, DI + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, CX + +mulGFNI_4x5_64Xor_loop: + // Load 5 outputs + VMOVDQU64 (R8), Z20 + VMOVDQU64 (R9), Z21 + VMOVDQU64 (R10), Z22 + VMOVDQU64 (R11), Z23 + VMOVDQU64 (DI), Z24 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z25 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z25, Z26 + VXORPD Z20, Z26, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z25, Z26 + VXORPD Z21, Z26, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z25, Z26 + VXORPD Z22, Z26, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z25, Z26 + VXORPD Z23, Z26, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z25, Z26 + VXORPD Z24, Z26, Z24 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (BX), Z25 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z5, Z25, Z26 + VXORPD Z20, Z26, Z20 + VGF2P8AFFINEQB $0x00, Z6, Z25, Z26 + VXORPD Z21, Z26, Z21 + VGF2P8AFFINEQB $0x00, Z7, Z25, Z26 + VXORPD Z22, Z26, Z22 + VGF2P8AFFINEQB $0x00, Z8, Z25, Z26 + VXORPD Z23, Z26, Z23 + 
VGF2P8AFFINEQB $0x00, Z9, Z25, Z26 + VXORPD Z24, Z26, Z24 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (SI), Z25 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z25, Z26 + VXORPD Z20, Z26, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z25, Z26 + VXORPD Z21, Z26, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z25, Z26 + VXORPD Z22, Z26, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z25, Z26 + VXORPD Z23, Z26, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z25, Z26 + VXORPD Z24, Z26, Z24 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (CX), Z25 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z15, Z25, Z26 + VXORPD Z20, Z26, Z20 + VGF2P8AFFINEQB $0x00, Z16, Z25, Z26 + VXORPD Z21, Z26, Z21 + VGF2P8AFFINEQB $0x00, Z17, Z25, Z26 + VXORPD Z22, Z26, Z22 + VGF2P8AFFINEQB $0x00, Z18, Z25, Z26 + VXORPD Z23, Z26, Z23 + VGF2P8AFFINEQB $0x00, Z19, Z25, Z26 + VXORPD Z24, Z26, Z24 + + // Store 5 outputs + VMOVDQU64 Z20, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z21, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z22, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z23, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z24, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x5_64Xor_loop + VZEROUPPER + +mulGFNI_4x5_64Xor_end: + RET + +// func mulGFNI_4x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x6_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), DI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DI + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, CX + +mulGFNI_4x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + 
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + VMOVDQU64 Z24, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z25, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z26, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z27, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z28, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z29, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x6_64_loop + VZEROUPPER + +mulGFNI_4x6_64_end: + RET + +// func mulGFNI_4x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x6_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), DI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DI + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, CX + +mulGFNI_4x6_64Xor_loop: + // Load 6 outputs + VMOVDQU64 (R8), Z24 + VMOVDQU64 (R9), Z25 + VMOVDQU64 (R10), Z26 + VMOVDQU64 (R11), Z27 + VMOVDQU64 (R12), Z28 + VMOVDQU64 (DI), Z29 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + VMOVDQU64 Z24, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z25, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z26, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z27, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z28, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z29, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x6_64Xor_loop + VZEROUPPER + +mulGFNI_4x6_64Xor_end: + RET + +// func mulGFNI_4x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x7_64(SB), $0-88 + // Loading 23 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R8 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R8 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, DX + +mulGFNI_4x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ 
$0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + VMOVDQU64 Z23, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x7_64_loop + VZEROUPPER + +mulGFNI_4x7_64_end: + RET + +// func mulGFNI_4x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x7_64Xor(SB), $0-88 + // Loading 23 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + 
MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R8 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R8 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, DX + +mulGFNI_4x7_64Xor_loop: + // Load 7 outputs + VMOVDQU64 (R9), Z23 + VMOVDQU64 (R10), Z24 + VMOVDQU64 (R11), Z25 + VMOVDQU64 (R12), Z26 + VMOVDQU64 (R13), Z27 + VMOVDQU64 (R14), Z28 + VMOVDQU64 (R8), Z29 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + VMOVDQU64 Z23, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x7_64Xor_loop + VZEROUPPER + +mulGFNI_4x7_64Xor_end: + RET + +// func mulGFNI_4x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x8_64(SB), $8-88 + // Loading 22 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x8_64_end + VBROADCASTF32X2 (CX), Z0 + 
VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, DX + +mulGFNI_4x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + VMOVDQU64 Z22, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, 
R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x8_64_loop + VZEROUPPER + +mulGFNI_4x8_64_end: + RET + +// func mulGFNI_4x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x8_64Xor(SB), $8-88 + // Loading 22 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, DX + +mulGFNI_4x8_64Xor_loop: + // Load 8 outputs + VMOVDQU64 (R9), Z22 + VMOVDQU64 (R10), Z23 + VMOVDQU64 (R11), Z24 + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R8), Z29 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, 
Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + VMOVDQU64 Z22, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x8_64Xor_loop + VZEROUPPER + +mulGFNI_4x8_64Xor_end: + RET + +// func mulGFNI_4x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x9_64(SB), $8-88 + // Loading 21 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), AX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_4x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z25 + VGF2P8AFFINEQB 
$0x00, Z5, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + VMOVDQU64 Z21, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z22, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_4x9_64_loop + VZEROUPPER + +mulGFNI_4x9_64_end: + RET + +// func mulGFNI_4x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x9_64Xor(SB), $8-88 + // Loading 21 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + 
VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), AX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_4x9_64Xor_loop: + // Load 9 outputs + VMOVDQU64 (R8), Z21 + VMOVDQU64 (R9), Z22 + VMOVDQU64 (R10), Z23 + VMOVDQU64 (R11), Z24 + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (DI), Z29 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + 
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + VMOVDQU64 Z21, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z22, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_4x9_64Xor_loop + VZEROUPPER + +mulGFNI_4x9_64Xor_end: + RET + +// func mulGFNI_4x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x10_64(SB), $0-88 + // Loading 20 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ start+72(FP), R9 + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, DX + +mulGFNI_4x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST 
$0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R8), R10 + VMOVDQU64 Z20, (R10)(R9*1) + MOVQ 24(R8), R10 + VMOVDQU64 Z21, (R10)(R9*1) + MOVQ 48(R8), R10 + VMOVDQU64 Z22, (R10)(R9*1) + MOVQ 72(R8), R10 + VMOVDQU64 Z23, (R10)(R9*1) + MOVQ 96(R8), R10 + VMOVDQU64 Z24, (R10)(R9*1) + MOVQ 120(R8), R10 + VMOVDQU64 Z25, (R10)(R9*1) + MOVQ 144(R8), R10 + VMOVDQU64 Z26, (R10)(R9*1) + MOVQ 168(R8), R10 + VMOVDQU64 Z27, (R10)(R9*1) + MOVQ 192(R8), R10 + VMOVDQU64 Z28, (R10)(R9*1) + MOVQ 216(R8), R10 + VMOVDQU64 Z29, (R10)(R9*1) + + // Prepare for next loop + ADDQ $0x40, R9 + DECQ AX + JNZ mulGFNI_4x10_64_loop + VZEROUPPER + +mulGFNI_4x10_64_end: + RET + +// func mulGFNI_4x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x10_64Xor(SB), $0-88 + // Loading 20 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ start+72(FP), R9 + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, DX + +mulGFNI_4x10_64Xor_loop: + // Load 10 outputs + MOVQ (R8), R10 + VMOVDQU64 (R10)(R9*1), Z20 + MOVQ 24(R8), R10 + VMOVDQU64 (R10)(R9*1), Z21 + MOVQ 48(R8), R10 + VMOVDQU64 (R10)(R9*1), Z22 + MOVQ 72(R8), R10 + VMOVDQU64 (R10)(R9*1), Z23 + MOVQ 96(R8), R10 + VMOVDQU64 (R10)(R9*1), Z24 + MOVQ 120(R8), R10 + VMOVDQU64 (R10)(R9*1), Z25 + MOVQ 144(R8), R10 + VMOVDQU64 (R10)(R9*1), Z26 + MOVQ 168(R8), 
R10 + VMOVDQU64 (R10)(R9*1), Z27 + MOVQ 192(R8), R10 + VMOVDQU64 (R10)(R9*1), Z28 + MOVQ 216(R8), R10 + VMOVDQU64 (R10)(R9*1), Z29 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R8), R10 + VMOVDQU64 Z20, (R10)(R9*1) + MOVQ 24(R8), R10 + VMOVDQU64 Z21, (R10)(R9*1) + MOVQ 48(R8), R10 + VMOVDQU64 Z22, (R10)(R9*1) + MOVQ 72(R8), R10 + VMOVDQU64 Z23, (R10)(R9*1) + MOVQ 96(R8), R10 + VMOVDQU64 Z24, (R10)(R9*1) + MOVQ 120(R8), R10 + VMOVDQU64 Z25, (R10)(R9*1) + MOVQ 144(R8), R10 + VMOVDQU64 Z26, (R10)(R9*1) + MOVQ 168(R8), R10 + VMOVDQU64 Z27, (R10)(R9*1) + MOVQ 
192(R8), R10 + VMOVDQU64 Z28, (R10)(R9*1) + MOVQ 216(R8), R10 + VMOVDQU64 Z29, (R10)(R9*1) + + // Prepare for next loop + ADDQ $0x40, R9 + DECQ AX + JNZ mulGFNI_4x10_64Xor_loop + VZEROUPPER + +mulGFNI_4x10_64Xor_end: + RET + +// func mulGFNI_5x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R8 + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, CX + +mulGFNI_5x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z6 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z6, Z5 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z6 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z6, Z6 + VXORPD Z5, Z6, Z5 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z6 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z6, Z6 + VXORPD Z5, Z6, Z5 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z6 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z6, Z6 + VXORPD Z5, Z6, Z5 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (CX), Z6 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z4, Z6, Z6 + VXORPD Z5, Z6, Z5 + + // Store 1 outputs + VMOVDQU64 Z5, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x1_64_loop + VZEROUPPER + +mulGFNI_5x1_64_end: + RET + +// func mulGFNI_5x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R8 + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, CX + +mulGFNI_5x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (R8), Z5 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z6 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z6, Z6 + VXORPD Z5, Z6, Z5 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z6 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z6, Z6 + VXORPD Z5, Z6, Z5 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z6 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z6, Z6 + VXORPD Z5, Z6, Z5 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z6 + ADDQ $0x40, DI 
+ VGF2P8AFFINEQB $0x00, Z3, Z6, Z6 + VXORPD Z5, Z6, Z5 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (CX), Z6 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z4, Z6, Z6 + VXORPD Z5, Z6, Z5 + + // Store 1 outputs + VMOVDQU64 Z5, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x1_64Xor_loop + VZEROUPPER + +mulGFNI_5x1_64Xor_end: + RET + +// func mulGFNI_5x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R8 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + ADDQ R10, R8 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, CX + +mulGFNI_5x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z12 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z12, Z10 + VGF2P8AFFINEQB $0x00, Z1, Z12, Z11 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z12 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z3, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z12 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z5, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z12 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z7, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (CX), Z12 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z8, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z9, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Store 2 outputs + VMOVDQU64 Z10, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z11, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x2_64_loop + VZEROUPPER + +mulGFNI_5x2_64_end: + RET + +// func mulGFNI_5x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + 
MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R8 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + ADDQ R10, R8 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, CX + +mulGFNI_5x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (R9), Z10 + VMOVDQU64 (R8), Z11 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z12 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z1, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z12 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z3, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z12 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z5, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z12 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z7, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (CX), Z12 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z8, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z9, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Store 2 outputs + VMOVDQU64 Z10, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z11, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x2_64Xor_loop + VZEROUPPER + +mulGFNI_5x2_64Xor_end: + RET + +// func mulGFNI_5x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R8 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, R8 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, CX + +mulGFNI_5x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z18 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z18, Z15 + VGF2P8AFFINEQB $0x00, Z1, Z18, Z16 + VGF2P8AFFINEQB $0x00, Z2, Z18, Z17 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z18 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z4, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z5, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z18 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z18, Z19 
+ VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z7, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z8, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z18 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z10, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z11, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (CX), Z18 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z12, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z13, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z14, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Store 3 outputs + VMOVDQU64 Z15, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z16, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z17, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x3_64_loop + VZEROUPPER + +mulGFNI_5x3_64_end: + RET + +// func mulGFNI_5x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R8 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, R8 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, CX + +mulGFNI_5x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (R9), Z15 + VMOVDQU64 (R10), Z16 + VMOVDQU64 (R8), Z17 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z18 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z1, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z2, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z18 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z4, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z5, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z18 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z7, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z8, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z18 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z10, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z11, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (CX), Z18 + ADDQ $0x40, CX + 
VGF2P8AFFINEQB $0x00, Z12, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z13, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z14, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Store 3 outputs + VMOVDQU64 Z15, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z16, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z17, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x3_64Xor_loop + VZEROUPPER + +mulGFNI_5x3_64Xor_end: + RET + +// func mulGFNI_5x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x4_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R8 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R8 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, CX + +mulGFNI_5x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z24 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z24, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z24, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z24, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z24, Z23 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z24 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z4, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z5, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z6, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z7, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (SI), Z24 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z9, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (DI), Z24 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z13, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z14, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z15, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (CX), Z24 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z16, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z17, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z18, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z19, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Store 4 outputs + VMOVDQU64 Z20, 
(R9) + ADDQ $0x40, R9 + VMOVDQU64 Z21, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z22, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z23, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x4_64_loop + VZEROUPPER + +mulGFNI_5x4_64_end: + RET + +// func mulGFNI_5x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x4_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R8 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R8 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, CX + +mulGFNI_5x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (R9), Z20 + VMOVDQU64 (R10), Z21 + VMOVDQU64 (R11), Z22 + VMOVDQU64 (R8), Z23 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z24 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z24 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z4, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z5, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z6, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z7, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (SI), Z24 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z9, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (DI), Z24 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z13, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z14, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z15, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (CX), Z24 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z16, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z17, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z18, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z19, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // 
Store 4 outputs + VMOVDQU64 Z20, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z21, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z22, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z23, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x4_64Xor_loop + VZEROUPPER + +mulGFNI_5x4_64Xor_end: + RET + +// func mulGFNI_5x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x5_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R8 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R8 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, CX + +mulGFNI_5x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB 
$0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + VMOVDQU64 Z25, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z26, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z27, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z28, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z29, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x5_64_loop + VZEROUPPER + +mulGFNI_5x5_64_end: + RET + +// func mulGFNI_5x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x5_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R8 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R8 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, CX + +mulGFNI_5x5_64Xor_loop: + // Load 5 outputs + VMOVDQU64 (R9), Z25 + VMOVDQU64 (R10), Z26 + VMOVDQU64 (R11), Z27 + VMOVDQU64 (R12), Z28 + VMOVDQU64 (R8), Z29 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD 
Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + VMOVDQU64 Z25, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z26, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z27, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z28, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z29, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x5_64Xor_loop + VZEROUPPER + +mulGFNI_5x5_64Xor_end: + RET + +// func mulGFNI_5x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x6_64(SB), $0-88 + // Loading 24 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R9 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R9 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, DX + +mulGFNI_5x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + 
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x6_64_loop + VZEROUPPER + +mulGFNI_5x6_64_end: + RET + +// func mulGFNI_5x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x6_64Xor(SB), $0-88 + // Loading 24 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R9 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R9 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI 
+ ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, DX + +mulGFNI_5x6_64Xor_loop: + // Load 6 outputs + VMOVDQU64 (R10), Z24 + VMOVDQU64 (R11), Z25 + VMOVDQU64 (R12), Z26 + VMOVDQU64 (R13), Z27 + VMOVDQU64 (R14), Z28 + VMOVDQU64 (R9), Z29 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x6_64Xor_loop + VZEROUPPER + +mulGFNI_5x6_64Xor_end: + RET + +// func mulGFNI_5x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x7_64(SB), $8-88 + // Loading 23 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 
48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, DX + +mulGFNI_5x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 
+ VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x7_64_loop + VZEROUPPER + +mulGFNI_5x7_64_end: + RET + +// func mulGFNI_5x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x7_64Xor(SB), $8-88 + // Loading 23 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, DX + +mulGFNI_5x7_64Xor_loop: + // Load 7 outputs + VMOVDQU64 (R10), Z23 + VMOVDQU64 (R11), Z24 + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R9), Z29 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD 
Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x7_64Xor_loop + VZEROUPPER + +mulGFNI_5x7_64Xor_end: + RET + +// func mulGFNI_5x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x8_64(SB), $8-88 + // Loading 22 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x8_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), AX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, DX + ADDQ 
BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_5x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + VMOVDQU64 Z22, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_5x8_64_loop + VZEROUPPER + +mulGFNI_5x8_64_end: + RET + +// func mulGFNI_5x8_64Xor(matrix []uint64, in 
[][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x8_64Xor(SB), $8-88 + // Loading 22 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), AX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_5x8_64Xor_loop: + // Load 8 outputs + VMOVDQU64 (R9), Z22 + VMOVDQU64 (R10), Z23 + VMOVDQU64 (R11), Z24 + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R8), Z29 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + 
VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + VMOVDQU64 Z22, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_5x8_64Xor_loop + VZEROUPPER + +mulGFNI_5x8_64Xor_end: + RET + +// func mulGFNI_5x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x9_64(SB), $0-88 + // Loading 21 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulGFNI_5x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z25 + VGF2P8AFFINEQB $0x00, 
Z5, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R9), R11 + VMOVDQU64 Z21, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU64 Z22, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU64 Z23, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU64 Z24, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU64 Z25, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU64 Z26, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU64 Z27, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU64 Z28, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU64 Z29, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x40, R10 + DECQ AX + JNZ mulGFNI_5x9_64_loop + VZEROUPPER + +mulGFNI_5x9_64_end: + RET + +// func mulGFNI_5x9_64Xor(matrix 
[]uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x9_64Xor(SB), $0-88 + // Loading 21 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulGFNI_5x9_64Xor_loop: + // Load 9 outputs + MOVQ (R9), R11 + VMOVDQU64 (R11)(R10*1), Z21 + MOVQ 24(R9), R11 + VMOVDQU64 (R11)(R10*1), Z22 + MOVQ 48(R9), R11 + VMOVDQU64 (R11)(R10*1), Z23 + MOVQ 72(R9), R11 + VMOVDQU64 (R11)(R10*1), Z24 + MOVQ 96(R9), R11 + VMOVDQU64 (R11)(R10*1), Z25 + MOVQ 120(R9), R11 + VMOVDQU64 (R11)(R10*1), Z26 + MOVQ 144(R9), R11 + VMOVDQU64 (R11)(R10*1), Z27 + MOVQ 168(R9), R11 + VMOVDQU64 (R11)(R10*1), Z28 + MOVQ 192(R9), R11 + VMOVDQU64 (R11)(R10*1), Z29 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + 
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R9), R11 + VMOVDQU64 Z21, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU64 Z22, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU64 Z23, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU64 Z24, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU64 Z25, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU64 Z26, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU64 Z27, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU64 Z28, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU64 Z29, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x40, R10 + DECQ AX + JNZ mulGFNI_5x9_64Xor_loop + VZEROUPPER + +mulGFNI_5x9_64Xor_end: + RET + +// func mulGFNI_5x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x10_64(SB), $0-88 + // Loading 20 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + 
+mulGFNI_5x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 
392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R9), R11 + VMOVDQU64 Z20, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU64 Z21, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU64 Z22, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU64 Z23, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU64 Z24, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU64 Z25, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU64 Z26, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU64 Z27, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU64 Z28, (R11)(R10*1) + MOVQ 216(R9), R11 + VMOVDQU64 Z29, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x40, R10 + DECQ AX + JNZ mulGFNI_5x10_64_loop + VZEROUPPER + +mulGFNI_5x10_64_end: + RET + +// func mulGFNI_5x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x10_64Xor(SB), $0-88 + // Loading 20 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulGFNI_5x10_64Xor_loop: + // Load 10 outputs + MOVQ (R9), R11 + VMOVDQU64 (R11)(R10*1), Z20 + MOVQ 24(R9), R11 + VMOVDQU64 (R11)(R10*1), Z21 + MOVQ 48(R9), R11 + VMOVDQU64 (R11)(R10*1), Z22 + MOVQ 72(R9), R11 + VMOVDQU64 (R11)(R10*1), Z23 + MOVQ 96(R9), R11 + VMOVDQU64 (R11)(R10*1), Z24 + MOVQ 120(R9), R11 + VMOVDQU64 (R11)(R10*1), Z25 + MOVQ 144(R9), R11 + VMOVDQU64 (R11)(R10*1), Z26 + MOVQ 168(R9), R11 + VMOVDQU64 (R11)(R10*1), Z27 + MOVQ 192(R9), R11 + VMOVDQU64 (R11)(R10*1), Z28 + MOVQ 216(R9), R11 + VMOVDQU64 (R11)(R10*1), Z29 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, 
Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R9), R11 + VMOVDQU64 Z20, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU64 Z21, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU64 Z22, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU64 Z23, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU64 Z24, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU64 Z25, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU64 Z26, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU64 Z27, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU64 Z28, (R11)(R10*1) + MOVQ 216(R9), R11 + VMOVDQU64 Z29, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x40, R10 + DECQ AX + JNZ mulGFNI_5x10_64Xor_loop + VZEROUPPER + +mulGFNI_5x10_64Xor_end: + RET + +// func mulGFNI_6x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI 
+TEXT ·mulGFNI_6x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 9 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R9 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, CX + +mulGFNI_6x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z7 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z7, Z6 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z7 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z7 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z7 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z7 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (CX), Z7 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z5, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Store 1 outputs + VMOVDQU64 Z6, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x1_64_loop + VZEROUPPER + +mulGFNI_6x1_64_end: + RET + +// func mulGFNI_6x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 9 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R9 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, CX + +mulGFNI_6x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (R9), Z6 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z7 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z7 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z7 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z7 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // 
Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z7 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (CX), Z7 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z5, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Store 1 outputs + VMOVDQU64 Z6, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x1_64Xor_loop + VZEROUPPER + +mulGFNI_6x1_64Xor_end: + RET + +// func mulGFNI_6x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R9 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + ADDQ R11, R9 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, CX + +mulGFNI_6x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z14 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z14, Z12 + VGF2P8AFFINEQB $0x00, Z1, Z14, Z13 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z14 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z3, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z14 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z5, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z14 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z7, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z14 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z9, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (CX), Z14 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z10, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z11, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Store 2 outputs + VMOVDQU64 Z12, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z13, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x2_64_loop + VZEROUPPER + +mulGFNI_6x2_64_end: + RET + +// func mulGFNI_6x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x2_64Xor_end + VBROADCASTF32X2 (CX), 
Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R9 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + ADDQ R11, R9 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, CX + +mulGFNI_6x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (R10), Z12 + VMOVDQU64 (R9), Z13 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z14 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z1, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z14 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z3, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z14 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z5, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z14 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z7, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z14 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z9, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (CX), Z14 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z10, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z11, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Store 2 outputs + VMOVDQU64 Z12, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z13, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x2_64Xor_loop + VZEROUPPER + +mulGFNI_6x2_64Xor_end: + RET + +// func mulGFNI_6x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R9 + MOVQ 
start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R9 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, CX + +mulGFNI_6x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z21 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z1, Z21, Z19 + VGF2P8AFFINEQB $0x00, Z2, Z21, Z20 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z21 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z4, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z5, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z21 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z7, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z8, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z21 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z10, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z11, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z21 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z13, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z14, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (CX), Z21 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z15, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z16, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z17, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Store 3 outputs + VMOVDQU64 Z18, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z19, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z20, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x3_64_loop + VZEROUPPER + +mulGFNI_6x3_64_end: + RET + +// func mulGFNI_6x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R9 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R9 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, CX + +mulGFNI_6x3_64Xor_loop: + // Load 3 outputs + 
VMOVDQU64 (R10), Z18 + VMOVDQU64 (R11), Z19 + VMOVDQU64 (R9), Z20 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z21 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z1, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z2, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z21 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z4, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z5, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z21 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z7, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z8, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z21 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z10, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z11, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z21 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z13, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z14, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (CX), Z21 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z15, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z16, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z17, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Store 3 outputs + VMOVDQU64 Z18, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z19, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z20, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x3_64Xor_loop + VZEROUPPER + +mulGFNI_6x3_64Xor_end: + RET + +// func mulGFNI_6x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x4_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R9 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R9 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ 
R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, CX + +mulGFNI_6x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z28 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z28, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z28, Z27 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z28 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z4, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (SI), Z28 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (DI), Z28 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R8), Z28 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z16, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z17, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z18, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z19, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (CX), Z28 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z20, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z21, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z22, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z23, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Store 4 outputs + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x4_64_loop + VZEROUPPER + +mulGFNI_6x4_64_end: + RET + +// func mulGFNI_6x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x4_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 
120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R9 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R9 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, CX + +mulGFNI_6x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (R10), Z24 + VMOVDQU64 (R11), Z25 + VMOVDQU64 (R12), Z26 + VMOVDQU64 (R9), Z27 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z28 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z28 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z4, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (SI), Z28 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (DI), Z28 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R8), Z28 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z16, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z17, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z18, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z19, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (CX), Z28 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z20, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z21, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z22, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z23, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Store 4 outputs + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x4_64Xor_loop + VZEROUPPER + +mulGFNI_6x4_64Xor_end: + RET + +// func mulGFNI_6x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x5_64(SB), $0-88 + // Loading 25 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + 
VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R10 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R10 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, DX + +mulGFNI_6x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, 
R14 + VMOVDQU64 Z29, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x5_64_loop + VZEROUPPER + +mulGFNI_6x5_64_end: + RET + +// func mulGFNI_6x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x5_64Xor(SB), $0-88 + // Loading 25 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R10 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R10 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, DX + +mulGFNI_6x5_64Xor_loop: + // Load 5 outputs + VMOVDQU64 (R11), Z25 + VMOVDQU64 (R12), Z26 + VMOVDQU64 (R13), Z27 + VMOVDQU64 (R14), Z28 + VMOVDQU64 (R10), Z29 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + 
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x5_64Xor_loop + VZEROUPPER + +mulGFNI_6x5_64Xor_end: + RET + +// func mulGFNI_6x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x6_64(SB), $8-88 + // Loading 24 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, DX + +mulGFNI_6x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + 
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x6_64_loop + VZEROUPPER + +mulGFNI_6x6_64_end: + RET + +// func mulGFNI_6x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x6_64Xor(SB), $8-88 + // Loading 24 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + 
VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, DX + +mulGFNI_6x6_64Xor_loop: + // Load 6 outputs + VMOVDQU64 (R11), Z24 + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R10), Z29 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 
264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x6_64Xor_loop + VZEROUPPER + +mulGFNI_6x6_64Xor_end: + RET + +// func mulGFNI_6x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x7_64(SB), $8-88 + // Loading 23 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 51 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), AX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_6x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, 
Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_6x7_64_loop + VZEROUPPER + +mulGFNI_6x7_64_end: + RET + +// func mulGFNI_6x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x7_64Xor(SB), $8-88 + // Loading 23 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 51 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + 
MOVQ 120(AX), AX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_6x7_64Xor_loop: + // Load 7 outputs + VMOVDQU64 (R10), Z23 + VMOVDQU64 (R11), Z24 + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R9), Z29 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 7 outputs + 
VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_6x7_64Xor_loop + VZEROUPPER + +mulGFNI_6x7_64Xor_end: + RET + +// func mulGFNI_6x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x8_64(SB), $0-88 + // Loading 22 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x8_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulGFNI_6x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + 
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + MOVQ (R10), R12 + VMOVDQU64 Z22, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU64 Z23, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU64 Z24, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU64 Z25, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU64 Z26, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU64 Z27, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU64 Z28, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU64 Z29, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x40, R11 + DECQ AX + JNZ mulGFNI_6x8_64_loop + VZEROUPPER + +mulGFNI_6x8_64_end: + RET + +// func mulGFNI_6x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x8_64Xor(SB), $0-88 + // Loading 22 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), 
Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulGFNI_6x8_64Xor_loop: + // Load 8 outputs + MOVQ (R10), R12 + VMOVDQU64 (R12)(R11*1), Z22 + MOVQ 24(R10), R12 + VMOVDQU64 (R12)(R11*1), Z23 + MOVQ 48(R10), R12 + VMOVDQU64 (R12)(R11*1), Z24 + MOVQ 72(R10), R12 + VMOVDQU64 (R12)(R11*1), Z25 + MOVQ 96(R10), R12 + VMOVDQU64 (R12)(R11*1), Z26 + MOVQ 120(R10), R12 + VMOVDQU64 (R12)(R11*1), Z27 + MOVQ 144(R10), R12 + VMOVDQU64 (R12)(R11*1), Z28 + MOVQ 168(R10), R12 + VMOVDQU64 (R12)(R11*1), Z29 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + 
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + MOVQ (R10), R12 + VMOVDQU64 Z22, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU64 Z23, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU64 Z24, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU64 Z25, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU64 Z26, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU64 Z27, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU64 Z28, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU64 Z29, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x40, R11 + DECQ AX + JNZ mulGFNI_6x8_64Xor_loop + VZEROUPPER + +mulGFNI_6x8_64Xor_end: + RET + +// func mulGFNI_6x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x9_64(SB), $0-88 + // Loading 21 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulGFNI_6x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z24 + 
VGF2P8AFFINEQB $0x00, Z4, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + 
VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R10), R12 + VMOVDQU64 Z21, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU64 Z22, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU64 Z23, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU64 Z24, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU64 Z25, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU64 Z26, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU64 Z27, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU64 Z28, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU64 Z29, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x40, R11 + DECQ AX + JNZ mulGFNI_6x9_64_loop + VZEROUPPER + +mulGFNI_6x9_64_end: + RET + +// func mulGFNI_6x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x9_64Xor(SB), $0-88 + // Loading 21 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulGFNI_6x9_64Xor_loop: + // Load 9 outputs + MOVQ (R10), R12 + VMOVDQU64 (R12)(R11*1), Z21 + MOVQ 24(R10), R12 + VMOVDQU64 (R12)(R11*1), Z22 + MOVQ 48(R10), R12 + VMOVDQU64 (R12)(R11*1), Z23 + MOVQ 72(R10), R12 + VMOVDQU64 (R12)(R11*1), Z24 + MOVQ 96(R10), R12 + VMOVDQU64 (R12)(R11*1), Z25 + MOVQ 120(R10), R12 + VMOVDQU64 (R12)(R11*1), Z26 + MOVQ 144(R10), R12 + VMOVDQU64 (R12)(R11*1), Z27 + MOVQ 168(R10), R12 + VMOVDQU64 (R12)(R11*1), Z28 + MOVQ 192(R10), R12 + VMOVDQU64 (R12)(R11*1), Z29 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB 
$0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R10), R12 + VMOVDQU64 Z21, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU64 Z22, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU64 Z23, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU64 Z24, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU64 Z25, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU64 Z26, 
(R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU64 Z27, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU64 Z28, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU64 Z29, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x40, R11 + DECQ AX + JNZ mulGFNI_6x9_64Xor_loop + VZEROUPPER + +mulGFNI_6x9_64Xor_end: + RET + +// func mulGFNI_6x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x10_64(SB), $0-88 + // Loading 20 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulGFNI_6x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + 
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R10), R12 + VMOVDQU64 Z20, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU64 Z21, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU64 Z22, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU64 Z23, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU64 Z24, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU64 Z25, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU64 Z26, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU64 Z27, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU64 Z28, (R12)(R11*1) + MOVQ 216(R10), R12 + VMOVDQU64 Z29, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x40, R11 + DECQ AX + JNZ mulGFNI_6x10_64_loop + VZEROUPPER + +mulGFNI_6x10_64_end: + RET + +// func mulGFNI_6x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x10_64Xor(SB), $0-88 + // Loading 20 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + 
VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulGFNI_6x10_64Xor_loop: + // Load 10 outputs + MOVQ (R10), R12 + VMOVDQU64 (R12)(R11*1), Z20 + MOVQ 24(R10), R12 + VMOVDQU64 (R12)(R11*1), Z21 + MOVQ 48(R10), R12 + VMOVDQU64 (R12)(R11*1), Z22 + MOVQ 72(R10), R12 + VMOVDQU64 (R12)(R11*1), Z23 + MOVQ 96(R10), R12 + VMOVDQU64 (R12)(R11*1), Z24 + MOVQ 120(R10), R12 + VMOVDQU64 (R12)(R11*1), Z25 + MOVQ 144(R10), R12 + VMOVDQU64 (R12)(R11*1), Z26 + MOVQ 168(R10), R12 + VMOVDQU64 (R12)(R11*1), Z27 + MOVQ 192(R10), R12 + VMOVDQU64 (R12)(R11*1), Z28 + MOVQ 216(R10), R12 + VMOVDQU64 (R12)(R11*1), Z29 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 
224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R10), R12 + VMOVDQU64 Z20, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU64 Z21, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU64 Z22, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU64 Z23, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU64 Z24, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU64 Z25, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU64 Z26, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU64 Z27, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU64 Z28, (R12)(R11*1) + MOVQ 216(R10), R12 + VMOVDQU64 Z29, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x40, R11 + DECQ AX + JNZ mulGFNI_6x10_64Xor_loop + VZEROUPPER + +mulGFNI_6x10_64Xor_end: + RET + +// func mulGFNI_7x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x1_64_end 
+ VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R10 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, CX + +mulGFNI_7x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z8 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z8, Z7 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z8 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z8 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z8 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z8 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (R9), Z8 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z5, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU64 (CX), Z8 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z6, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Store 1 outputs + VMOVDQU64 Z7, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x1_64_loop + VZEROUPPER + +mulGFNI_7x1_64_end: + RET + +// func mulGFNI_7x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R10 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, CX + +mulGFNI_7x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (R10), Z7 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z8 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z8 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z8 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z8 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, 
Z3, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z8 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (R9), Z8 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z5, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU64 (CX), Z8 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z6, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Store 1 outputs + VMOVDQU64 Z7, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x1_64Xor_loop + VZEROUPPER + +mulGFNI_7x1_64Xor_end: + RET + +// func mulGFNI_7x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R10 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + ADDQ R12, R10 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, CX + +mulGFNI_7x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z16 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z16, Z14 + VGF2P8AFFINEQB $0x00, Z1, Z16, Z15 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z16 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z3, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z16 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z5, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z16 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z7, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z16 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z9, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (R9), Z16 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z10, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z11, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU64 (CX), Z16 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z12, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z13, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Store 2 outputs + VMOVDQU64 Z14, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z15, (R10) + ADDQ $0x40, R10 + + // 
Prepare for next loop + DECQ AX + JNZ mulGFNI_7x2_64_loop + VZEROUPPER + +mulGFNI_7x2_64_end: + RET + +// func mulGFNI_7x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R10 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + ADDQ R12, R10 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, CX + +mulGFNI_7x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (R11), Z14 + VMOVDQU64 (R10), Z15 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z16 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z1, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z16 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z3, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z16 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z5, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z16 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z7, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z16 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z9, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (R9), Z16 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z10, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z11, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU64 (CX), Z16 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z12, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z13, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Store 2 outputs + VMOVDQU64 Z14, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z15, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x2_64Xor_loop + VZEROUPPER + +mulGFNI_7x2_64Xor_end: + RET + +// func mulGFNI_7x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + 
SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R10 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R10 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, CX + +mulGFNI_7x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z24 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z24, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z24, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z24, Z23 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z24 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z4, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z5, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z24 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z7, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z8, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z24 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z24 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z13, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z14, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z24 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z15, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z16, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (CX), Z24 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z18, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Store 3 outputs + VMOVDQU64 Z21, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z22, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x3_64_loop + VZEROUPPER + +mulGFNI_7x3_64_end: + RET + +// func mulGFNI_7x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x3_64Xor(SB), $0-88 + // Loading all tables to 
registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R10 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R10 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, CX + +mulGFNI_7x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (R11), Z21 + VMOVDQU64 (R12), Z22 + VMOVDQU64 (R10), Z23 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z24 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z24 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z4, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z5, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z24 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z7, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z8, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z24 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z24 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z13, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z14, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z24 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z15, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z16, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (CX), Z24 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z18, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Store 3 outputs + VMOVDQU64 Z21, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z22, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + 
+ // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x3_64Xor_loop + VZEROUPPER + +mulGFNI_7x3_64Xor_end: + RET + +// func mulGFNI_7x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x4_64(SB), $0-88 + // Loading 26 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R11 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R11 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, DX + +mulGFNI_7x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z29 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (R10), Z30 + ADDQ 
$0x40, R10 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 4 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 4 outputs + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x4_64_loop + VZEROUPPER + +mulGFNI_7x4_64_end: + RET + +// func mulGFNI_7x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x4_64Xor(SB), $0-88 + // Loading 26 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R11 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R11 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, DX + +mulGFNI_7x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (R12), Z26 + VMOVDQU64 (R13), Z27 + VMOVDQU64 (R14), Z28 + VMOVDQU64 (R11), Z29 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + 
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 4 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 4 outputs + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x4_64Xor_loop + VZEROUPPER + +mulGFNI_7x4_64Xor_end: + RET + +// func mulGFNI_7x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x5_64(SB), $8-88 + // Loading 25 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, BX 
+ ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, DX + +mulGFNI_7x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x5_64_loop + VZEROUPPER + +mulGFNI_7x5_64_end: + RET + +// func mulGFNI_7x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x5_64Xor(SB), $8-88 + // Loading 25 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ 
matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, DX + +mulGFNI_7x5_64Xor_loop: + // Load 5 outputs + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R11), Z29 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD 
Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x5_64Xor_loop + VZEROUPPER + +mulGFNI_7x5_64Xor_end: + RET + +// func mulGFNI_7x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x6_64(SB), $8-88 + // Loading 24 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), AX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_7x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + 
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_7x6_64_loop + VZEROUPPER + +mulGFNI_7x6_64_end: + RET + +// func mulGFNI_7x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x6_64Xor(SB), $8-88 + // Loading 24 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ 
mulGFNI_7x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), AX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_7x6_64Xor_loop: + // Load 6 outputs + VMOVDQU64 (R11), Z24 + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R10), Z29 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + 
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_7x6_64Xor_loop + VZEROUPPER + +mulGFNI_7x6_64Xor_end: + RET + +// func mulGFNI_7x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x7_64(SB), $0-88 + // Loading 23 of 49 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulGFNI_7x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z26 + 
VGF2P8AFFINEQB $0x00, Z4, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + MOVQ (R11), R13 + VMOVDQU64 Z23, 
(R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU64 Z24, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU64 Z25, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU64 Z26, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU64 Z27, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU64 Z28, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU64 Z29, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x40, R12 + DECQ AX + JNZ mulGFNI_7x7_64_loop + VZEROUPPER + +mulGFNI_7x7_64_end: + RET + +// func mulGFNI_7x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x7_64Xor(SB), $0-88 + // Loading 23 of 49 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulGFNI_7x7_64Xor_loop: + // Load 7 outputs + MOVQ (R11), R13 + VMOVDQU64 (R13)(R12*1), Z23 + MOVQ 24(R11), R13 + VMOVDQU64 (R13)(R12*1), Z24 + MOVQ 48(R11), R13 + VMOVDQU64 (R13)(R12*1), Z25 + MOVQ 72(R11), R13 + VMOVDQU64 (R13)(R12*1), Z26 + MOVQ 96(R11), R13 + VMOVDQU64 (R13)(R12*1), Z27 + MOVQ 120(R11), R13 + VMOVDQU64 (R13)(R12*1), Z28 + MOVQ 144(R11), R13 + VMOVDQU64 (R13)(R12*1), Z29 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + 
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + MOVQ (R11), R13 + VMOVDQU64 Z23, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU64 Z24, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU64 Z25, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU64 Z26, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU64 Z27, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU64 Z28, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU64 Z29, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x40, R12 + DECQ AX + JNZ mulGFNI_7x7_64Xor_loop + VZEROUPPER + +mulGFNI_7x7_64Xor_end: + RET + +// func mulGFNI_7x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x8_64(SB), $0-88 + // Loading 22 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 66 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x8_64_end + VBROADCASTF32X2 (CX), Z0 
+ VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulGFNI_7x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + 
VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + MOVQ (R11), R13 + VMOVDQU64 Z22, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU64 Z23, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU64 Z24, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU64 Z25, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU64 Z26, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU64 Z27, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU64 Z28, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU64 Z29, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x40, R12 + DECQ AX + JNZ mulGFNI_7x8_64_loop + VZEROUPPER + +mulGFNI_7x8_64_end: + RET + +// func mulGFNI_7x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x8_64Xor(SB), $0-88 + // Loading 22 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 66 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input 
+ ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulGFNI_7x8_64Xor_loop: + // Load 8 outputs + MOVQ (R11), R13 + VMOVDQU64 (R13)(R12*1), Z22 + MOVQ 24(R11), R13 + VMOVDQU64 (R13)(R12*1), Z23 + MOVQ 48(R11), R13 + VMOVDQU64 (R13)(R12*1), Z24 + MOVQ 72(R11), R13 + VMOVDQU64 (R13)(R12*1), Z25 + MOVQ 96(R11), R13 + VMOVDQU64 (R13)(R12*1), Z26 + MOVQ 120(R11), R13 + VMOVDQU64 (R13)(R12*1), Z27 + MOVQ 144(R11), R13 + VMOVDQU64 (R13)(R12*1), Z28 + MOVQ 168(R11), R13 + VMOVDQU64 (R13)(R12*1), Z29 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD 
Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + MOVQ (R11), R13 + VMOVDQU64 Z22, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU64 Z23, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU64 Z24, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU64 Z25, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU64 Z26, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU64 Z27, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU64 Z28, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU64 Z29, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x40, R12 + DECQ AX + JNZ mulGFNI_7x8_64Xor_loop + VZEROUPPER + +mulGFNI_7x8_64Xor_end: + RET + +// func mulGFNI_7x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x9_64(SB), $0-88 + // Loading 21 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulGFNI_7x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z22 + VGF2P8AFFINEQB 
$0x00, Z2, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD 
Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R11), R13 + VMOVDQU64 Z21, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU64 Z22, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU64 Z23, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU64 Z24, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU64 Z25, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU64 Z26, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU64 Z27, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU64 Z28, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU64 Z29, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x40, R12 + DECQ AX + JNZ mulGFNI_7x9_64_loop + VZEROUPPER + +mulGFNI_7x9_64_end: + RET + +// func mulGFNI_7x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x9_64Xor(SB), $0-88 + // Loading 21 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulGFNI_7x9_64Xor_loop: + // Load 9 outputs + MOVQ (R11), R13 + VMOVDQU64 (R13)(R12*1), Z21 + MOVQ 24(R11), R13 + VMOVDQU64 (R13)(R12*1), Z22 + MOVQ 48(R11), R13 + VMOVDQU64 (R13)(R12*1), Z23 + MOVQ 72(R11), R13 + VMOVDQU64 (R13)(R12*1), Z24 + MOVQ 96(R11), R13 + VMOVDQU64 (R13)(R12*1), Z25 + MOVQ 120(R11), R13 + VMOVDQU64 (R13)(R12*1), Z26 + MOVQ 144(R11), R13 + VMOVDQU64 (R13)(R12*1), Z27 + MOVQ 168(R11), R13 + VMOVDQU64 (R13)(R12*1), Z28 + MOVQ 192(R11), R13 + VMOVDQU64 (R13)(R12*1), Z29 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + 
VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, 
Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R11), R13 + VMOVDQU64 Z21, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU64 Z22, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU64 Z23, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU64 Z24, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU64 Z25, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU64 Z26, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU64 Z27, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU64 Z28, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU64 Z29, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x40, R12 + DECQ AX + JNZ mulGFNI_7x9_64Xor_loop + VZEROUPPER + +mulGFNI_7x9_64Xor_end: + RET + +// func mulGFNI_7x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x10_64(SB), $0-88 + // Loading 20 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulGFNI_7x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 + 
VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z22, 
Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R11), R13 + VMOVDQU64 Z20, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU64 Z21, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU64 Z22, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU64 Z23, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU64 Z24, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU64 Z25, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU64 Z26, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU64 Z27, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU64 Z28, (R13)(R12*1) + MOVQ 216(R11), R13 + VMOVDQU64 Z29, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x40, R12 + DECQ AX + JNZ mulGFNI_7x10_64_loop + VZEROUPPER + +mulGFNI_7x10_64_end: + RET + +// func mulGFNI_7x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x10_64Xor(SB), $0-88 + // Loading 20 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulGFNI_7x10_64Xor_loop: + // Load 10 outputs + MOVQ (R11), R13 + VMOVDQU64 (R13)(R12*1), Z20 + MOVQ 24(R11), R13 + VMOVDQU64 (R13)(R12*1), Z21 + MOVQ 48(R11), R13 + VMOVDQU64 (R13)(R12*1), Z22 + MOVQ 72(R11), R13 + VMOVDQU64 
(R13)(R12*1), Z23 + MOVQ 96(R11), R13 + VMOVDQU64 (R13)(R12*1), Z24 + MOVQ 120(R11), R13 + VMOVDQU64 (R13)(R12*1), Z25 + MOVQ 144(R11), R13 + VMOVDQU64 (R13)(R12*1), Z26 + MOVQ 168(R11), R13 + VMOVDQU64 (R13)(R12*1), Z27 + MOVQ 192(R11), R13 + VMOVDQU64 (R13)(R12*1), Z28 + MOVQ 216(R11), R13 + VMOVDQU64 (R13)(R12*1), Z29 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD 
Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R11), R13 + VMOVDQU64 Z20, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU64 Z21, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU64 Z22, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU64 Z23, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU64 Z24, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU64 Z25, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU64 Z26, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU64 Z27, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU64 Z28, (R13)(R12*1) + MOVQ 216(R11), R13 + VMOVDQU64 Z29, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x40, R12 + DECQ AX + JNZ mulGFNI_7x10_64Xor_loop + VZEROUPPER + +mulGFNI_7x10_64Xor_end: + RET + +// func mulGFNI_8x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 
96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R11 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, CX + +mulGFNI_8x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z9 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z9, Z8 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z9 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z9 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z9 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z9 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (R9), Z9 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z5, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU64 (R10), Z9 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z6, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU64 (CX), Z9 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z7, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Store 1 outputs + VMOVDQU64 Z8, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x1_64_loop + VZEROUPPER + +mulGFNI_8x1_64_end: + RET + +// func mulGFNI_8x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R11 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, CX + +mulGFNI_8x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (R11), Z8 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z9 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z9 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z9 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z9 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and 
process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z9 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (R9), Z9 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z5, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU64 (R10), Z9 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z6, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU64 (CX), Z9 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z7, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Store 1 outputs + VMOVDQU64 Z8, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x1_64Xor_loop + VZEROUPPER + +mulGFNI_8x1_64Xor_end: + RET + +// func mulGFNI_8x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R11 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + ADDQ R13, R11 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, CX + +mulGFNI_8x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z18 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z18, Z16 + VGF2P8AFFINEQB $0x00, Z1, Z18, Z17 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z18 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z3, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z18 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z5, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z18 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z7, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z18 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z9, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (R9), Z18 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z10, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z11, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU64 (R10), Z18 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z12, 
Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z13, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU64 (CX), Z18 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z14, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z15, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Store 2 outputs + VMOVDQU64 Z16, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z17, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x2_64_loop + VZEROUPPER + +mulGFNI_8x2_64_end: + RET + +// func mulGFNI_8x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R11 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + ADDQ R13, R11 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, CX + +mulGFNI_8x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (R12), Z16 + VMOVDQU64 (R11), Z17 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z18 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z1, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z18 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z3, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z18 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z5, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z18 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z7, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z18 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z9, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (R9), Z18 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z10, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z11, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU64 (R10), Z18 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z12, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z13, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 
bytes from input 7 to 2 outputs + VMOVDQU64 (CX), Z18 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z14, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z15, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Store 2 outputs + VMOVDQU64 Z16, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z17, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x2_64Xor_loop + VZEROUPPER + +mulGFNI_8x2_64Xor_end: + RET + +// func mulGFNI_8x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R11 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R11 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, CX + +mulGFNI_8x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z27 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z27, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z27, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z27, Z26 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z27 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z27 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z27 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z10, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z27 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z27 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z15, Z27, Z28 + VXORPD 
Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (R10), Z27 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z18, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU64 (CX), Z27 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z21, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z22, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z23, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Store 3 outputs + VMOVDQU64 Z24, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z25, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z26, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x3_64_loop + VZEROUPPER + +mulGFNI_8x3_64_end: + RET + +// func mulGFNI_8x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R11 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R11 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, CX + +mulGFNI_8x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (R12), Z24 + VMOVDQU64 (R13), Z25 + VMOVDQU64 (R11), Z26 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z27 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z27 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z27 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z27, Z28 + VXORPD Z26, 
Z28, Z26 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z27 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z10, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z27 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z27 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z15, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (R10), Z27 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z18, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU64 (CX), Z27 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z21, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z22, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z23, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Store 3 outputs + VMOVDQU64 Z24, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z25, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z26, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x3_64Xor_loop + VZEROUPPER + +mulGFNI_8x3_64Xor_end: + RET + +// func mulGFNI_8x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x4_64(SB), $8-88 + // Loading 26 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, DX + +mulGFNI_8x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB 
$0x00, Z0, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z29 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 4 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 4 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 4 outputs + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x4_64_loop + VZEROUPPER + +mulGFNI_8x4_64_end: + RET + +// func mulGFNI_8x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x4_64Xor(SB), $8-88 + // Loading 26 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), 
Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, DX + +mulGFNI_8x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R12), Z29 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 4 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + 
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 4 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 4 outputs + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x4_64Xor_loop + VZEROUPPER + +mulGFNI_8x4_64Xor_end: + RET + +// func mulGFNI_8x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x5_64(SB), $8-88 + // Loading 25 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), AX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_8x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + 
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 5 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 5 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_8x5_64_loop + VZEROUPPER + +mulGFNI_8x5_64_end: + RET + +// func mulGFNI_8x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x5_64Xor(SB), $8-88 + // Loading 25 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + 
VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), AX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_8x5_64Xor_loop: + // Load 5 outputs + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R11), Z29 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from 
input 6 to 5 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 5 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_8x5_64Xor_loop + VZEROUPPER + +mulGFNI_8x5_64Xor_end: + RET + +// func mulGFNI_8x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x6_64(SB), $0-88 + // Loading 24 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from 
input 2 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + MOVQ (R12), R14 + VMOVDQU64 Z24, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z25, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z26, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z27, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z28, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z29, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x6_64_loop + VZEROUPPER + +mulGFNI_8x6_64_end: + RET + +// func mulGFNI_8x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT 
·mulGFNI_8x6_64Xor(SB), $0-88 + // Loading 24 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x6_64Xor_loop: + // Load 6 outputs + MOVQ (R12), R14 + VMOVDQU64 (R14)(R13*1), Z24 + MOVQ 24(R12), R14 + VMOVDQU64 (R14)(R13*1), Z25 + MOVQ 48(R12), R14 + VMOVDQU64 (R14)(R13*1), Z26 + MOVQ 72(R12), R14 + VMOVDQU64 (R14)(R13*1), Z27 + MOVQ 96(R12), R14 + VMOVDQU64 (R14)(R13*1), Z28 + MOVQ 120(R12), R14 + VMOVDQU64 (R14)(R13*1), Z29 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + 
VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + MOVQ (R12), R14 + VMOVDQU64 Z24, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z25, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z26, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z27, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z28, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z29, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x6_64Xor_loop + VZEROUPPER + +mulGFNI_8x6_64Xor_end: + RET + +// func mulGFNI_8x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x7_64(SB), $0-88 + // Loading 23 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), 
Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, 
Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 7 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + MOVQ (R12), R14 + VMOVDQU64 Z23, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z24, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z25, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z26, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z27, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z28, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU64 Z29, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x7_64_loop + VZEROUPPER + +mulGFNI_8x7_64_end: + RET + +// func mulGFNI_8x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x7_64Xor(SB), $0-88 + // Loading 23 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x7_64Xor_loop: + // Load 7 outputs + MOVQ (R12), R14 + VMOVDQU64 (R14)(R13*1), Z23 + MOVQ 24(R12), R14 + VMOVDQU64 (R14)(R13*1), Z24 + MOVQ 48(R12), R14 + VMOVDQU64 (R14)(R13*1), Z25 + MOVQ 72(R12), R14 + VMOVDQU64 (R14)(R13*1), Z26 + MOVQ 96(R12), R14 + VMOVDQU64 (R14)(R13*1), 
Z27 + MOVQ 120(R12), R14 + VMOVDQU64 (R14)(R13*1), Z28 + MOVQ 144(R12), R14 + VMOVDQU64 (R14)(R13*1), Z29 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 7 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 
336(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + MOVQ (R12), R14 + VMOVDQU64 Z23, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z24, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z25, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z26, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z27, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z28, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU64 Z29, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x7_64Xor_loop + VZEROUPPER + +mulGFNI_8x7_64Xor_end: + RET + +// func mulGFNI_8x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x8_64(SB), $0-88 + // Loading 22 of 64 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x8_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + 
VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 8 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + 
VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + MOVQ (R12), R14 + VMOVDQU64 Z22, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z23, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z24, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z25, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z26, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z27, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU64 Z28, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU64 Z29, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x8_64_loop + VZEROUPPER + +mulGFNI_8x8_64_end: + RET + +// func mulGFNI_8x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x8_64Xor(SB), $0-88 + // Loading 22 of 64 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x8_64Xor_loop: + // Load 8 outputs + MOVQ (R12), R14 + VMOVDQU64 (R14)(R13*1), Z22 + MOVQ 24(R12), R14 + VMOVDQU64 (R14)(R13*1), Z23 + MOVQ 48(R12), R14 + VMOVDQU64 (R14)(R13*1), Z24 + MOVQ 72(R12), R14 + VMOVDQU64 (R14)(R13*1), Z25 + MOVQ 96(R12), R14 + VMOVDQU64 (R14)(R13*1), Z26 + MOVQ 120(R12), R14 + VMOVDQU64 (R14)(R13*1), Z27 + MOVQ 144(R12), R14 + VMOVDQU64 (R14)(R13*1), Z28 + MOVQ 168(R12), R14 + VMOVDQU64 (R14)(R13*1), Z29 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, 
Z3, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 8 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + 
VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + MOVQ (R12), R14 + VMOVDQU64 Z22, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z23, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z24, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z25, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z26, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z27, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU64 Z28, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU64 Z29, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x8_64Xor_loop + VZEROUPPER + +mulGFNI_8x8_64Xor_end: + RET + +// func mulGFNI_8x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x9_64(SB), $0-88 + // Loading 21 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 83 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z26 + VGF2P8AFFINEQB $0x00, 
Z6, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + 
VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 9 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R12), R14 + VMOVDQU64 Z21, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z22, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z23, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z24, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z25, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z26, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU64 Z27, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU64 Z28, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU64 Z29, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x9_64_loop + VZEROUPPER + +mulGFNI_8x9_64_end: + RET + +// func mulGFNI_8x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x9_64Xor(SB), $0-88 + // Loading 21 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 83 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + 
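+	// Each iteration of this Xor variant first loads the current contents
+	// of the 9 output shards into Z21-Z29, then multiplies one 64-byte
+	// block from every input shard by the per-coefficient 8x8 bit matrices
+	// (VGF2P8AFFINEQB, using Z0-Z20 for the 21 tables kept in registers and
+	// .BCST loads from CX for the remaining coefficients) and
+	// XOR-accumulates the products before storing the outputs back.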
+mulGFNI_8x9_64Xor_loop: + // Load 9 outputs + MOVQ (R12), R14 + VMOVDQU64 (R14)(R13*1), Z21 + MOVQ 24(R12), R14 + VMOVDQU64 (R14)(R13*1), Z22 + MOVQ 48(R12), R14 + VMOVDQU64 (R14)(R13*1), Z23 + MOVQ 72(R12), R14 + VMOVDQU64 (R14)(R13*1), Z24 + MOVQ 96(R12), R14 + VMOVDQU64 (R14)(R13*1), Z25 + MOVQ 120(R12), R14 + VMOVDQU64 (R14)(R13*1), Z26 + MOVQ 144(R12), R14 + VMOVDQU64 (R14)(R13*1), Z27 + MOVQ 168(R12), R14 + VMOVDQU64 (R14)(R13*1), Z28 + MOVQ 192(R12), R14 + VMOVDQU64 (R14)(R13*1), Z29 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 
312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 9 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R12), R14 + VMOVDQU64 Z21, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z22, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z23, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z24, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z25, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z26, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU64 Z27, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU64 Z28, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU64 Z29, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x9_64Xor_loop + VZEROUPPER + +mulGFNI_8x9_64Xor_end: + RET + +// func mulGFNI_8x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x10_64(SB), $0-88 + // Loading 20 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ 
mulGFNI_8x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + 
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 10 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R12), R14 + VMOVDQU64 Z20, (R14)(R13*1) + 
MOVQ 24(R12), R14 + VMOVDQU64 Z21, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z22, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z23, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z24, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z25, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU64 Z26, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU64 Z27, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU64 Z28, (R14)(R13*1) + MOVQ 216(R12), R14 + VMOVDQU64 Z29, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x10_64_loop + VZEROUPPER + +mulGFNI_8x10_64_end: + RET + +// func mulGFNI_8x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x10_64Xor(SB), $0-88 + // Loading 20 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x10_64Xor_loop: + // Load 10 outputs + MOVQ (R12), R14 + VMOVDQU64 (R14)(R13*1), Z20 + MOVQ 24(R12), R14 + VMOVDQU64 (R14)(R13*1), Z21 + MOVQ 48(R12), R14 + VMOVDQU64 (R14)(R13*1), Z22 + MOVQ 72(R12), R14 + VMOVDQU64 (R14)(R13*1), Z23 + MOVQ 96(R12), R14 + VMOVDQU64 (R14)(R13*1), Z24 + MOVQ 120(R12), R14 + VMOVDQU64 (R14)(R13*1), Z25 + MOVQ 144(R12), R14 + VMOVDQU64 (R14)(R13*1), Z26 + MOVQ 168(R12), R14 + VMOVDQU64 (R14)(R13*1), Z27 + MOVQ 192(R12), R14 + VMOVDQU64 (R14)(R13*1), Z28 + MOVQ 216(R12), R14 + VMOVDQU64 (R14)(R13*1), Z29 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + 
VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), 
Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 10 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R12), R14 + VMOVDQU64 Z20, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z21, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z22, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z23, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z24, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z25, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU64 Z26, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU64 Z27, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU64 Z28, (R14)(R13*1) + MOVQ 216(R12), R14 + VMOVDQU64 Z29, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x10_64Xor_loop + VZEROUPPER + +mulGFNI_8x10_64Xor_end: + RET + +// func mulGFNI_9x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R12 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, CX + +mulGFNI_9x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z10 + 
ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z10, Z9 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z10 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z10 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z10 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z10 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (R9), Z10 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z5, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU64 (R10), Z10 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z6, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU64 (R11), Z10 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z7, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 8 to 1 outputs + VMOVDQU64 (CX), Z10 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z8, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Store 1 outputs + VMOVDQU64 Z9, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_9x1_64_loop + VZEROUPPER + +mulGFNI_9x1_64_end: + RET + +// func mulGFNI_9x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R12 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, CX + +mulGFNI_9x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (R12), Z9 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z10 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z10 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z10 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z10 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z10 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 5 to 1 outputs + 
VMOVDQU64 (R9), Z10 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z5, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU64 (R10), Z10 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z6, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU64 (R11), Z10 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z7, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 8 to 1 outputs + VMOVDQU64 (CX), Z10 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z8, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Store 1 outputs + VMOVDQU64 Z9, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_9x1_64Xor_loop + VZEROUPPER + +mulGFNI_9x1_64Xor_end: + RET + +// func mulGFNI_9x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R12 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + ADDQ R14, R12 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, CX + +mulGFNI_9x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z20 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z20, Z18 + VGF2P8AFFINEQB $0x00, Z1, Z20, Z19 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z20 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z3, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z20 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z20 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z20 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (R9), Z20 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z10, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z11, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 6 to 2 outputs + 
VMOVDQU64 (R10), Z20 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z12, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z13, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU64 (R11), Z20 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z14, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z15, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 8 to 2 outputs + VMOVDQU64 (CX), Z20 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z16, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z17, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Store 2 outputs + VMOVDQU64 Z18, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z19, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_9x2_64_loop + VZEROUPPER + +mulGFNI_9x2_64_end: + RET + +// func mulGFNI_9x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R12 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + ADDQ R14, R12 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, CX + +mulGFNI_9x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (R13), Z18 + VMOVDQU64 (R12), Z19 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z20 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z1, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z20 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z3, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z20 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z20 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z20 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (R9), Z20 + 
ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z10, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z11, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU64 (R10), Z20 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z12, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z13, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU64 (R11), Z20 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z14, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z15, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 8 to 2 outputs + VMOVDQU64 (CX), Z20 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z16, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z17, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Store 2 outputs + VMOVDQU64 Z18, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z19, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_9x2_64Xor_loop + VZEROUPPER + +mulGFNI_9x2_64Xor_end: + RET + +// func mulGFNI_9x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + VBROADCASTF32X2 208(CX), Z26 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R12 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R12 + + // Add start offset to input + ADDQ R15, DX + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, CX + +mulGFNI_9x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z29 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, 
Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 3 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z26, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 3 outputs + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_9x3_64_loop + VZEROUPPER + +mulGFNI_9x3_64_end: + RET + +// func mulGFNI_9x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + VBROADCASTF32X2 208(CX), Z26 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R12 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ 
R15, R13 + ADDQ R15, R14 + ADDQ R15, R12 + + // Add start offset to input + ADDQ R15, DX + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, CX + +mulGFNI_9x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (R13), Z27 + VMOVDQU64 (R14), Z28 + VMOVDQU64 (R12), Z29 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 3 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z26, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 3 outputs + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_9x3_64Xor_loop + VZEROUPPER + +mulGFNI_9x3_64Xor_end: + RET + +// func mulGFNI_9x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x4_64(SB), $8-88 + // Loading 26 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 
+ VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), AX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_9x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z29 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 4 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z26, Z31, Z26 + 
VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 4 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 4 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 4 outputs + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_9x4_64_loop + VZEROUPPER + +mulGFNI_9x4_64_end: + RET + +// func mulGFNI_9x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x4_64Xor(SB), $8-88 + // Loading 26 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), AX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_9x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R12), Z29 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, 
Z3, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 4 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 4 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 4 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 4 outputs + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_9x4_64Xor_loop + VZEROUPPER + +mulGFNI_9x4_64Xor_end: + RET + +// func mulGFNI_9x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x5_64(SB), $0-88 + // Loading 25 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x5_64_end + 
VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 
+ + // Load and process 64 bytes from input 6 to 5 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 5 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + MOVQ (R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x5_64_loop + VZEROUPPER + +mulGFNI_9x5_64_end: + RET + +// func mulGFNI_9x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x5_64Xor(SB), $0-88 + // Loading 25 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x5_64Xor_loop: + // Load 5 outputs + MOVQ (R13), R15 + VMOVDQU64 (R15)(R14*1), Z25 + MOVQ 24(R13), R15 + VMOVDQU64 (R15)(R14*1), Z26 + MOVQ 48(R13), R15 + VMOVDQU64 (R15)(R14*1), Z27 + MOVQ 72(R13), R15 + 
VMOVDQU64 (R15)(R14*1), Z28 + MOVQ 96(R13), R15 + VMOVDQU64 (R15)(R14*1), Z29 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 5 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 5 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST 
$0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + MOVQ (R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x5_64Xor_loop + VZEROUPPER + +mulGFNI_9x5_64Xor_end: + RET + +// func mulGFNI_9x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x6_64(SB), $0-88 + // Loading 24 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load 
and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 6 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + MOVQ (R13), R15 + VMOVDQU64 Z24, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x6_64_loop + VZEROUPPER + +mulGFNI_9x6_64_end: + RET + +// func mulGFNI_9x6_64Xor(matrix []uint64, in [][]byte, 
out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x6_64Xor(SB), $0-88 + // Loading 24 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x6_64Xor_loop: + // Load 6 outputs + MOVQ (R13), R15 + VMOVDQU64 (R15)(R14*1), Z24 + MOVQ 24(R13), R15 + VMOVDQU64 (R15)(R14*1), Z25 + MOVQ 48(R13), R15 + VMOVDQU64 (R15)(R14*1), Z26 + MOVQ 72(R13), R15 + VMOVDQU64 (R15)(R14*1), Z27 + MOVQ 96(R13), R15 + VMOVDQU64 (R15)(R14*1), Z28 + MOVQ 120(R13), R15 + VMOVDQU64 (R15)(R14*1), Z29 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD 
Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 6 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + MOVQ (R13), R15 + VMOVDQU64 Z24, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x6_64Xor_loop + VZEROUPPER + +mulGFNI_9x6_64Xor_end: + RET + +// func mulGFNI_9x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x7_64(SB), $0-88 + // Loading 23 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ 
mulGFNI_9x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 
256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 7 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 7 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + MOVQ (R13), R15 + VMOVDQU64 Z23, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z24, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x7_64_loop + VZEROUPPER + +mulGFNI_9x7_64_end: + RET + +// func mulGFNI_9x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x7_64Xor(SB), $0-88 + // Loading 23 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + 
VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x7_64Xor_loop: + // Load 7 outputs + MOVQ (R13), R15 + VMOVDQU64 (R15)(R14*1), Z23 + MOVQ 24(R13), R15 + VMOVDQU64 (R15)(R14*1), Z24 + MOVQ 48(R13), R15 + VMOVDQU64 (R15)(R14*1), Z25 + MOVQ 72(R13), R15 + VMOVDQU64 (R15)(R14*1), Z26 + MOVQ 96(R13), R15 + VMOVDQU64 (R15)(R14*1), Z27 + MOVQ 120(R13), R15 + VMOVDQU64 (R15)(R14*1), Z28 + MOVQ 144(R13), R15 + VMOVDQU64 (R15)(R14*1), Z29 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + 
// Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 7 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 7 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + MOVQ (R13), R15 + VMOVDQU64 Z23, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z24, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x7_64Xor_loop + VZEROUPPER + +mulGFNI_9x7_64Xor_end: + RET + +// func mulGFNI_9x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, 
AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x8_64(SB), $0-88 + // Loading 22 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x8_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + 
VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 8 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 8 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + MOVQ (R13), R15 + VMOVDQU64 Z22, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z23, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z24, 
(R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x8_64_loop + VZEROUPPER + +mulGFNI_9x8_64_end: + RET + +// func mulGFNI_9x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x8_64Xor(SB), $0-88 + // Loading 22 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x8_64Xor_loop: + // Load 8 outputs + MOVQ (R13), R15 + VMOVDQU64 (R15)(R14*1), Z22 + MOVQ 24(R13), R15 + VMOVDQU64 (R15)(R14*1), Z23 + MOVQ 48(R13), R15 + VMOVDQU64 (R15)(R14*1), Z24 + MOVQ 72(R13), R15 + VMOVDQU64 (R15)(R14*1), Z25 + MOVQ 96(R13), R15 + VMOVDQU64 (R15)(R14*1), Z26 + MOVQ 120(R13), R15 + VMOVDQU64 (R15)(R14*1), Z27 + MOVQ 144(R13), R15 + VMOVDQU64 (R15)(R14*1), Z28 + MOVQ 168(R13), R15 + VMOVDQU64 (R15)(R14*1), Z29 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs 
+ VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 8 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 8 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z25, 
Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + MOVQ (R13), R15 + VMOVDQU64 Z22, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z23, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z24, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x8_64Xor_loop + VZEROUPPER + +mulGFNI_9x8_64Xor_end: + RET + +// func mulGFNI_9x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x9_64(SB), $0-88 + // Loading 21 of 81 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD 
Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 9 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z21, Z31, 
Z21 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 9 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R13), R15 + VMOVDQU64 Z21, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z22, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z23, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z24, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x9_64_loop + VZEROUPPER + +mulGFNI_9x9_64_end: + RET + +// func mulGFNI_9x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x9_64Xor(SB), $0-88 + // Loading 21 of 81 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + 
VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x9_64Xor_loop: + // Load 9 outputs + MOVQ (R13), R15 + VMOVDQU64 (R15)(R14*1), Z21 + MOVQ 24(R13), R15 + VMOVDQU64 (R15)(R14*1), Z22 + MOVQ 48(R13), R15 + VMOVDQU64 (R15)(R14*1), Z23 + MOVQ 72(R13), R15 + VMOVDQU64 (R15)(R14*1), Z24 + MOVQ 96(R13), R15 + VMOVDQU64 (R15)(R14*1), Z25 + MOVQ 120(R13), R15 + VMOVDQU64 (R15)(R14*1), Z26 + MOVQ 144(R13), R15 + VMOVDQU64 (R15)(R14*1), Z27 + MOVQ 168(R13), R15 + VMOVDQU64 (R15)(R14*1), Z28 + MOVQ 192(R13), R15 + VMOVDQU64 (R15)(R14*1), Z29 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + 
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 9 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 9 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + 
VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R13), R15 + VMOVDQU64 Z21, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z22, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z23, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z24, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x9_64Xor_loop + VZEROUPPER + +mulGFNI_9x9_64Xor_end: + RET + +// func mulGFNI_9x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x10_64(SB), $0-88 + // Loading 20 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 102 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and 
process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 10 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z23, 
Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 10 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R13), R15 + VMOVDQU64 Z20, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z21, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z22, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z23, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z24, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 216(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x10_64_loop + VZEROUPPER + +mulGFNI_9x10_64_end: + RET + +// func mulGFNI_9x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x10_64Xor(SB), $0-88 + // Loading 20 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 102 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + 
VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x10_64Xor_loop: + // Load 10 outputs + MOVQ (R13), R15 + VMOVDQU64 (R15)(R14*1), Z20 + MOVQ 24(R13), R15 + VMOVDQU64 (R15)(R14*1), Z21 + MOVQ 48(R13), R15 + VMOVDQU64 (R15)(R14*1), Z22 + MOVQ 72(R13), R15 + VMOVDQU64 (R15)(R14*1), Z23 + MOVQ 96(R13), R15 + VMOVDQU64 (R15)(R14*1), Z24 + MOVQ 120(R13), R15 + VMOVDQU64 (R15)(R14*1), Z25 + MOVQ 144(R13), R15 + VMOVDQU64 (R15)(R14*1), Z26 + MOVQ 168(R13), R15 + VMOVDQU64 (R15)(R14*1), Z27 + MOVQ 192(R13), R15 + VMOVDQU64 (R15)(R14*1), Z28 + MOVQ 216(R13), R15 + VMOVDQU64 (R15)(R14*1), Z29 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD 
Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 10 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 10 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD 
Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R13), R15 + VMOVDQU64 Z20, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z21, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z22, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z23, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z24, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 216(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x10_64Xor_loop + VZEROUPPER + +mulGFNI_9x10_64Xor_end: + RET + +// func mulGFNI_10x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 13 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), R12 + MOVQ 216(CX), CX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R13 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, CX + +mulGFNI_10x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z11 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z11, Z10 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z11 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z11 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z11 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and 
process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z11 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (R9), Z11 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z5, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU64 (R10), Z11 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z6, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU64 (R11), Z11 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z7, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 8 to 1 outputs + VMOVDQU64 (R12), Z11 + ADDQ $0x40, R12 + VGF2P8AFFINEQB $0x00, Z8, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 9 to 1 outputs + VMOVDQU64 (CX), Z11 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z9, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Store 1 outputs + VMOVDQU64 Z10, (R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_10x1_64_loop + VZEROUPPER + +mulGFNI_10x1_64_end: + RET + +// func mulGFNI_10x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 13 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), R12 + MOVQ 216(CX), CX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R13 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, CX + +mulGFNI_10x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (R13), Z10 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z11 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z11 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z11 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z11 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z11 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (R9), Z11 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z5, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU64 (R10), Z11 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z6, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes 
from input 7 to 1 outputs + VMOVDQU64 (R11), Z11 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z7, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 8 to 1 outputs + VMOVDQU64 (R12), Z11 + ADDQ $0x40, R12 + VGF2P8AFFINEQB $0x00, Z8, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 9 to 1 outputs + VMOVDQU64 (CX), Z11 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z9, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Store 1 outputs + VMOVDQU64 Z10, (R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_10x1_64Xor_loop + VZEROUPPER + +mulGFNI_10x1_64Xor_end: + RET + +// func mulGFNI_10x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 24 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), R12 + MOVQ 216(CX), CX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R13 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R14 + ADDQ R15, R13 + + // Add start offset to input + ADDQ R15, DX + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, CX + +mulGFNI_10x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z22 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z22, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z22, Z21 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z22 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z3, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z22 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z5, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z22 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z7, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z22 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z9, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (R9), Z22 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z10, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU64 (R10), 
Z22 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z12, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z13, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU64 (R11), Z22 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z14, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z15, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 8 to 2 outputs + VMOVDQU64 (R12), Z22 + ADDQ $0x40, R12 + VGF2P8AFFINEQB $0x00, Z16, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z17, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 9 to 2 outputs + VMOVDQU64 (CX), Z22 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z18, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z19, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Store 2 outputs + VMOVDQU64 Z20, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z21, (R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_10x2_64_loop + VZEROUPPER + +mulGFNI_10x2_64_end: + RET + +// func mulGFNI_10x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 24 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), R12 + MOVQ 216(CX), CX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R13 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R14 + ADDQ R15, R13 + + // Add start offset to input + ADDQ R15, DX + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, CX + +mulGFNI_10x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (R14), Z20 + VMOVDQU64 (R13), Z21 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z22 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z22 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z3, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z22 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z5, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z22 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z7, Z22, Z23 + VXORPD Z21, 
Z23, Z21 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z22 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z9, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (R9), Z22 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z10, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU64 (R10), Z22 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z12, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z13, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU64 (R11), Z22 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z14, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z15, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 8 to 2 outputs + VMOVDQU64 (R12), Z22 + ADDQ $0x40, R12 + VGF2P8AFFINEQB $0x00, Z16, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z17, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 9 to 2 outputs + VMOVDQU64 (CX), Z22 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z18, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z19, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Store 2 outputs + VMOVDQU64 Z20, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z21, (R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_10x2_64Xor_loop + VZEROUPPER + +mulGFNI_10x2_64Xor_end: + RET + +// func mulGFNI_10x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x3_64(SB), $8-88 + // Loading 27 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 35 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + VBROADCASTF32X2 208(CX), Z26 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), R12 + MOVQ 216(AX), AX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_10x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z30 + ADDQ 
$0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z29 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 3 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z26, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 3 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 3 outputs + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_10x3_64_loop + VZEROUPPER + +mulGFNI_10x3_64_end: + RET + +// func mulGFNI_10x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x3_64Xor(SB), $8-88 + // Loading 27 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 35 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 
56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + VBROADCASTF32X2 208(CX), Z26 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), R12 + MOVQ 216(AX), AX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_10x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R13), Z29 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + 
VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 3 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z26, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 3 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 3 outputs + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_10x3_64Xor_loop + VZEROUPPER + +mulGFNI_10x3_64Xor_end: + RET + +// func mulGFNI_10x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x4_64(SB), $8-88 + // Loading 26 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 46 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z29 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 
64 bytes from input 3 to 4 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 4 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 4 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 4 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 4 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 4 outputs + MOVQ (R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x4_64_loop + VZEROUPPER + +mulGFNI_10x4_64_end: + RET + +// func mulGFNI_10x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x4_64Xor(SB), $8-88 + // Loading 26 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 46 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), 
Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x4_64Xor_loop: + // Load 4 outputs + MOVQ (R14), BP + VMOVDQU64 (BP)(R15*1), Z26 + MOVQ 24(R14), BP + VMOVDQU64 (BP)(R15*1), Z27 + MOVQ 48(R14), BP + VMOVDQU64 (BP)(R15*1), Z28 + MOVQ 72(R14), BP + VMOVDQU64 (BP)(R15*1), Z29 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 4 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 
216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 4 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 4 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 4 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 4 outputs + MOVQ (R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x4_64Xor_loop + VZEROUPPER + +mulGFNI_10x4_64Xor_end: + RET + +// func mulGFNI_10x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x5_64(SB), $8-88 + // Loading 25 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 57 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + 
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 5 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 5 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 5 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + 
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + MOVQ (R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x5_64_loop + VZEROUPPER + +mulGFNI_10x5_64_end: + RET + +// func mulGFNI_10x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x5_64Xor(SB), $8-88 + // Loading 25 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 57 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x5_64Xor_loop: + // Load 5 outputs + MOVQ (R14), BP + VMOVDQU64 (BP)(R15*1), Z25 + MOVQ 24(R14), BP + VMOVDQU64 (BP)(R15*1), Z26 + MOVQ 48(R14), BP + VMOVDQU64 (BP)(R15*1), Z27 + MOVQ 72(R14), BP + VMOVDQU64 (BP)(R15*1), Z28 + MOVQ 96(R14), BP + VMOVDQU64 (BP)(R15*1), Z29 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, 
Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 5 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 5 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 5 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + MOVQ (R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ 
mulGFNI_10x5_64Xor_loop + VZEROUPPER + +mulGFNI_10x5_64Xor_end: + RET + +// func mulGFNI_10x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x6_64(SB), $8-88 + // Loading 24 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 68 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 6 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 6 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + MOVQ (R14), BP + VMOVDQU64 Z24, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x6_64_loop + VZEROUPPER + +mulGFNI_10x6_64_end: + RET + +// func mulGFNI_10x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x6_64Xor(SB), 
$8-88 + // Loading 24 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 68 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x6_64Xor_loop: + // Load 6 outputs + MOVQ (R14), BP + VMOVDQU64 (BP)(R15*1), Z24 + MOVQ 24(R14), BP + VMOVDQU64 (BP)(R15*1), Z25 + MOVQ 48(R14), BP + VMOVDQU64 (BP)(R15*1), Z26 + MOVQ 72(R14), BP + VMOVDQU64 (BP)(R15*1), Z27 + MOVQ 96(R14), BP + VMOVDQU64 (BP)(R15*1), Z28 + MOVQ 120(R14), BP + VMOVDQU64 (BP)(R15*1), Z29 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + 
VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 6 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 6 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + MOVQ (R14), BP + VMOVDQU64 Z24, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ 
mulGFNI_10x6_64Xor_loop + VZEROUPPER + +mulGFNI_10x6_64Xor_end: + RET + +// func mulGFNI_10x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x7_64(SB), $8-88 + // Loading 23 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 79 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + 
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 7 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 7 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 7 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + 
VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + MOVQ (R14), BP + VMOVDQU64 Z23, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z24, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x7_64_loop + VZEROUPPER + +mulGFNI_10x7_64_end: + RET + +// func mulGFNI_10x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x7_64Xor(SB), $8-88 + // Loading 23 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 79 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x7_64Xor_loop: + // Load 7 outputs + MOVQ (R14), BP + VMOVDQU64 (BP)(R15*1), Z23 + MOVQ 24(R14), BP + VMOVDQU64 (BP)(R15*1), Z24 + MOVQ 48(R14), BP + VMOVDQU64 (BP)(R15*1), Z25 + MOVQ 72(R14), BP + VMOVDQU64 (BP)(R15*1), Z26 + MOVQ 96(R14), BP + VMOVDQU64 (BP)(R15*1), Z27 + MOVQ 120(R14), BP + VMOVDQU64 (BP)(R15*1), Z28 + MOVQ 144(R14), BP + VMOVDQU64 (BP)(R15*1), Z29 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load 
and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 7 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 7 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 7 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + 
VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + MOVQ (R14), BP + VMOVDQU64 Z23, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z24, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x7_64Xor_loop + VZEROUPPER + +mulGFNI_10x7_64Xor_end: + RET + +// func mulGFNI_10x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x8_64(SB), $8-88 + // Loading 22 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 90 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x8_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z29 + + // Load and process 64 bytes from input 1 
to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 8 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD 
Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 8 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 8 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + MOVQ (R14), BP + VMOVDQU64 Z22, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z23, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z24, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x8_64_loop + VZEROUPPER + +mulGFNI_10x8_64_end: + RET + +// func mulGFNI_10x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x8_64Xor(SB), $8-88 + // Loading 22 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 90 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + 
VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x8_64Xor_loop: + // Load 8 outputs + MOVQ (R14), BP + VMOVDQU64 (BP)(R15*1), Z22 + MOVQ 24(R14), BP + VMOVDQU64 (BP)(R15*1), Z23 + MOVQ 48(R14), BP + VMOVDQU64 (BP)(R15*1), Z24 + MOVQ 72(R14), BP + VMOVDQU64 (BP)(R15*1), Z25 + MOVQ 96(R14), BP + VMOVDQU64 (BP)(R15*1), Z26 + MOVQ 120(R14), BP + VMOVDQU64 (BP)(R15*1), Z27 + MOVQ 144(R14), BP + VMOVDQU64 (BP)(R15*1), Z28 + MOVQ 168(R14), BP + VMOVDQU64 (BP)(R15*1), Z29 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), 
Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 8 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 8 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 8 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD 
Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + MOVQ (R14), BP + VMOVDQU64 Z22, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z23, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z24, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x8_64Xor_loop + VZEROUPPER + +mulGFNI_10x8_64Xor_end: + RET + +// func mulGFNI_10x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x9_64(SB), $8-88 + // Loading 21 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 101 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD 
Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 9 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, 
Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 9 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 9 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R14), BP + VMOVDQU64 Z21, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z22, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z23, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z24, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x9_64_loop + VZEROUPPER + +mulGFNI_10x9_64_end: + RET + +// func mulGFNI_10x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x9_64Xor(SB), $8-88 + // Loading 21 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 101 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 
80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x9_64Xor_loop: + // Load 9 outputs + MOVQ (R14), BP + VMOVDQU64 (BP)(R15*1), Z21 + MOVQ 24(R14), BP + VMOVDQU64 (BP)(R15*1), Z22 + MOVQ 48(R14), BP + VMOVDQU64 (BP)(R15*1), Z23 + MOVQ 72(R14), BP + VMOVDQU64 (BP)(R15*1), Z24 + MOVQ 96(R14), BP + VMOVDQU64 (BP)(R15*1), Z25 + MOVQ 120(R14), BP + VMOVDQU64 (BP)(R15*1), Z26 + MOVQ 144(R14), BP + VMOVDQU64 (BP)(R15*1), Z27 + MOVQ 168(R14), BP + VMOVDQU64 (BP)(R15*1), Z28 + MOVQ 192(R14), BP + VMOVDQU64 (BP)(R15*1), Z29 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + 
VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 9 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 9 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 9 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST 
$0x00, 584(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R14), BP + VMOVDQU64 Z21, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z22, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z23, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z24, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x9_64Xor_loop + VZEROUPPER + +mulGFNI_10x9_64Xor_end: + RET + +// func mulGFNI_10x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x10_64(SB), $8-88 + // Loading 20 of 100 tables to registers + // Destination kept on stack + // Full registers estimated 112 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, 
Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + 
VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 10 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 10 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 10 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 720(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 728(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 736(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 744(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 752(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 760(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + 
VGF2P8AFFINEQB.BCST $0x00, 768(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 776(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 784(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 792(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R14), BP + VMOVDQU64 Z20, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z21, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z22, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z23, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z24, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 216(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x10_64_loop + VZEROUPPER + +mulGFNI_10x10_64_end: + RET + +// func mulGFNI_10x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x10_64Xor(SB), $8-88 + // Loading 20 of 100 tables to registers + // Destination kept on stack + // Full registers estimated 112 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x10_64Xor_loop: + // Load 10 outputs + MOVQ (R14), BP + VMOVDQU64 (BP)(R15*1), Z20 + MOVQ 24(R14), BP + VMOVDQU64 (BP)(R15*1), Z21 + MOVQ 48(R14), BP + VMOVDQU64 (BP)(R15*1), Z22 + MOVQ 72(R14), BP + VMOVDQU64 (BP)(R15*1), Z23 + MOVQ 96(R14), BP + VMOVDQU64 (BP)(R15*1), Z24 + MOVQ 120(R14), BP + VMOVDQU64 (BP)(R15*1), Z25 + MOVQ 144(R14), BP + VMOVDQU64 (BP)(R15*1), Z26 + MOVQ 168(R14), BP + VMOVDQU64 (BP)(R15*1), Z27 + MOVQ 192(R14), BP + VMOVDQU64 (BP)(R15*1), Z28 + MOVQ 216(R14), BP + VMOVDQU64 (BP)(R15*1), Z29 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, 
Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z23, Z31, 
Z23 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 10 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 10 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 10 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 720(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 728(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 736(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 744(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 752(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 760(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 768(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 776(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 784(CX), Z30, Z31 + VXORPD Z28, Z31, 
Z28 + VGF2P8AFFINEQB.BCST $0x00, 792(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R14), BP + VMOVDQU64 Z20, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z21, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z22, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z23, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z24, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 216(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x10_64Xor_loop + VZEROUPPER + +mulGFNI_10x10_64Xor_end: + RET + +// func ifftDIT48_gfni_0(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·ifftDIT48_gfni_0(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t01+32(FP), Z0 + VBROADCASTF32X2 t23+40(FP), Z1 + VBROADCASTF32X2 t02+48(FP), Z2 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z3 + VMOVDQU64 (DI), Z4 + VMOVDQU64 (R8), Z5 + VMOVDQU64 (AX), Z6 + VXORPD Z4, Z3, Z4 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z4, Z7 + VXORPD Z3, Z7, Z3 + VXORPD Z5, Z6, Z6 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z6, Z7 + VPTERNLOGD $0x96, Z7, Z3, Z5 + VXORPD Z4, Z6, Z6 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z2, Z5, Z7 + VXORPD Z3, Z7, Z3 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z2, Z6, Z7 + VXORPD Z4, Z7, Z4 + VMOVDQU64 Z3, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z4, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z5, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z6, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_gfni_0(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·fftDIT48_gfni_0(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t01+32(FP), Z0 + VBROADCASTF32X2 t23+40(FP), Z1 + VBROADCASTF32X2 t02+48(FP), Z2 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z3 + VMOVDQU64 (DI), Z4 + VMOVDQU64 (R8), Z5 + VMOVDQU64 (AX), Z6 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z2, Z5, Z7 + VXORPD Z3, Z7, Z3 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z2, Z6, Z7 + VXORPD Z4, Z7, Z4 + VXORPD Z3, Z5, Z5 + VXORPD Z4, Z6, Z6 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z4, Z7 + VXORPD Z3, Z7, Z3 + VXORPD Z4, Z3, Z4 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z6, Z7 + VXORPD Z5, Z7, Z5 + VXORPD Z5, Z6, Z6 + VMOVDQU64 Z3, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z4, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z5, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z6, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_gfni_1(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·ifftDIT48_gfni_1(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t23+40(FP), Z0 + VBROADCASTF32X2 t02+48(FP), Z1 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z2 + VMOVDQU64 (DI), Z3 + VMOVDQU64 
(R8), Z4 + VMOVDQU64 (AX), Z5 + VXORPD Z3, Z2, Z3 + VXORPD Z4, Z5, Z5 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z5, Z6 + VPTERNLOGD $0x96, Z6, Z2, Z4 + VXORPD Z3, Z5, Z5 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z4, Z6 + VXORPD Z2, Z6, Z2 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z5, Z6 + VXORPD Z3, Z6, Z3 + VMOVDQU64 Z2, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z3, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z4, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z5, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_gfni_1(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·fftDIT48_gfni_1(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t01+32(FP), Z0 + VBROADCASTF32X2 t23+40(FP), Z1 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z2 + VMOVDQU64 (DI), Z3 + VMOVDQU64 (R8), Z4 + VMOVDQU64 (AX), Z5 + VXORPD Z2, Z4, Z4 + VXORPD Z3, Z5, Z5 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z3, Z6 + VXORPD Z2, Z6, Z2 + VXORPD Z3, Z2, Z3 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z5, Z6 + VXORPD Z4, Z6, Z4 + VXORPD Z4, Z5, Z5 + VMOVDQU64 Z2, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z3, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z4, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z5, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_gfni_2(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·ifftDIT48_gfni_2(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t01+32(FP), Z0 + VBROADCASTF32X2 t02+48(FP), Z1 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z2 + VMOVDQU64 (DI), Z3 + VMOVDQU64 (R8), Z4 + VMOVDQU64 (AX), Z5 + VXORPD Z3, Z2, Z3 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z3, Z6 + VXORPD Z2, Z6, Z2 + VXORPD Z4, Z5, Z5 + VXORPD Z2, Z4, Z4 + VXORPD Z3, Z5, Z5 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z4, Z6 + VXORPD Z2, Z6, Z2 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z5, Z6 + VXORPD Z3, Z6, Z3 + VMOVDQU64 Z2, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z3, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z4, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z5, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_gfni_2(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·fftDIT48_gfni_2(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t23+40(FP), Z0 + VBROADCASTF32X2 t02+48(FP), Z1 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z2 + VMOVDQU64 (DI), Z3 + VMOVDQU64 (R8), Z4 + VMOVDQU64 (AX), Z5 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z4, Z6 + VXORPD Z2, Z6, Z2 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z5, Z6 + VXORPD Z3, Z6, Z3 + VXORPD Z2, Z4, Z4 + VXORPD Z3, Z5, Z5 + VXORPD Z3, Z2, Z3 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z5, Z6 + VXORPD Z4, Z6, Z4 + VXORPD Z4, Z5, Z5 + VMOVDQU64 Z2, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z3, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z4, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z5, (AX) + 
ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_gfni_3(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·ifftDIT48_gfni_3(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t02+48(FP), Z0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z1 + VMOVDQU64 (DI), Z2 + VMOVDQU64 (R8), Z3 + VMOVDQU64 (AX), Z4 + VXORPD Z2, Z1, Z2 + VXORPD Z3, Z4, Z4 + VXORPD Z1, Z3, Z3 + VXORPD Z2, Z4, Z4 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z3, Z5 + VXORPD Z1, Z5, Z1 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z4, Z5 + VXORPD Z2, Z5, Z2 + VMOVDQU64 Z1, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z2, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z3, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z4, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_gfni_3(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·fftDIT48_gfni_3(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t23+40(FP), Z0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z1 + VMOVDQU64 (DI), Z2 + VMOVDQU64 (R8), Z3 + VMOVDQU64 (AX), Z4 + VXORPD Z1, Z3, Z3 + VXORPD Z2, Z4, Z4 + VXORPD Z2, Z1, Z2 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z4, Z5 + VXORPD Z3, Z5, Z3 + VXORPD Z3, Z4, Z4 + VMOVDQU64 Z1, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z2, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z3, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z4, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_gfni_4(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·ifftDIT48_gfni_4(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t01+32(FP), Z0 + VBROADCASTF32X2 t23+40(FP), Z1 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z2 + VMOVDQU64 (DI), Z3 + VMOVDQU64 (R8), Z4 + VMOVDQU64 (AX), Z5 + VXORPD Z3, Z2, Z3 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z3, Z6 + VXORPD Z2, Z6, Z2 + VXORPD Z4, Z5, Z5 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z5, Z6 + VPTERNLOGD $0x96, Z6, Z2, Z4 + VXORPD Z3, Z5, Z5 + VMOVDQU64 Z2, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z3, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z4, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z5, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_gfni_4(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·fftDIT48_gfni_4(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t01+32(FP), Z0 + VBROADCASTF32X2 t02+48(FP), Z1 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z2 + VMOVDQU64 (DI), Z3 + VMOVDQU64 (R8), Z4 + VMOVDQU64 (AX), Z5 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z4, Z6 + VXORPD Z2, Z6, Z2 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z5, Z6 + VXORPD Z3, Z6, Z3 + VXORPD Z2, 
Z4, Z4 + VXORPD Z3, Z5, Z5 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z3, Z6 + VXORPD Z2, Z6, Z2 + VXORPD Z3, Z2, Z3 + VXORPD Z4, Z5, Z5 + VMOVDQU64 Z2, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z3, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z4, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z5, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_gfni_5(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·ifftDIT48_gfni_5(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t23+40(FP), Z0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z1 + VMOVDQU64 (DI), Z2 + VMOVDQU64 (R8), Z3 + VMOVDQU64 (AX), Z4 + VXORPD Z2, Z1, Z2 + VXORPD Z3, Z4, Z4 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z4, Z5 + VPTERNLOGD $0x96, Z5, Z1, Z3 + VXORPD Z2, Z4, Z4 + VMOVDQU64 Z1, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z2, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z3, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z4, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_gfni_5(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·fftDIT48_gfni_5(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t01+32(FP), Z0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z1 + VMOVDQU64 (DI), Z2 + VMOVDQU64 (R8), Z3 + VMOVDQU64 (AX), Z4 + VXORPD Z1, Z3, Z3 + VXORPD Z2, Z4, Z4 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z2, Z5 + VXORPD Z1, Z5, Z1 + VXORPD Z2, Z1, Z2 + VXORPD Z3, Z4, Z4 + VMOVDQU64 Z1, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z2, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z3, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z4, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_gfni_6(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·ifftDIT48_gfni_6(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t01+32(FP), Z0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z1 + VMOVDQU64 (DI), Z2 + VMOVDQU64 (R8), Z3 + VMOVDQU64 (AX), Z4 + VXORPD Z2, Z1, Z2 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z2, Z5 + VXORPD Z1, Z5, Z1 + VXORPD Z3, Z4, Z4 + VXORPD Z1, Z3, Z3 + VXORPD Z2, Z4, Z4 + VMOVDQU64 Z1, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z2, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z3, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z4, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_gfni_6(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·fftDIT48_gfni_6(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t02+48(FP), Z0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z1 + VMOVDQU64 (DI), Z2 + VMOVDQU64 (R8), Z3 + VMOVDQU64 (AX), Z4 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z3, Z5 + VXORPD Z1, Z5, Z1 + + // LEO_MULADD_512 + 
VGF2P8AFFINEQB $0x00, Z0, Z4, Z5 + VXORPD Z2, Z5, Z2 + VXORPD Z1, Z3, Z3 + VXORPD Z2, Z4, Z4 + VXORPD Z2, Z1, Z2 + VXORPD Z3, Z4, Z4 + VMOVDQU64 Z1, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z2, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z3, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z4, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_gfni_7(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·ifftDIT48_gfni_7(SB), NOSPLIT, $0-56 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z0 + VMOVDQU64 (DI), Z1 + VMOVDQU64 (R8), Z2 + VMOVDQU64 (AX), Z3 + VXORPD Z1, Z0, Z1 + VXORPD Z2, Z3, Z3 + VXORPD Z0, Z2, Z2 + VXORPD Z1, Z3, Z3 + VMOVDQU64 Z0, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z1, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z2, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z3, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_gfni_7(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·fftDIT48_gfni_7(SB), NOSPLIT, $0-56 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z0 + VMOVDQU64 (DI), Z1 + VMOVDQU64 (R8), Z2 + VMOVDQU64 (AX), Z3 + VXORPD Z0, Z2, Z2 + VXORPD Z1, Z3, Z3 + VXORPD Z1, Z0, Z1 + VXORPD Z2, Z3, Z3 + VMOVDQU64 Z0, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z1, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z2, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z3, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET diff --git a/galois_gen_switch_amd64.go b/galois_gen_switch_amd64.go index ffc1bb1c..28c50658 100644 --- a/galois_gen_switch_amd64.go +++ b/galois_gen_switch_amd64.go @@ -1,7 +1,7 @@ // Code generated by command: go generate gen.go. DO NOT EDIT. -//go:build !appengine && !noasm && gc && !nogen -// +build !appengine,!noasm,gc,!nogen +//go:build !appengine && !noasm && gc && !nogen && !nopshufb +// +build !appengine,!noasm,gc,!nogen,!nopshufb package reedsolomon diff --git a/galois_gen_switch_nopshufb_amd64.go b/galois_gen_switch_nopshufb_amd64.go new file mode 100644 index 00000000..888df307 --- /dev/null +++ b/galois_gen_switch_nopshufb_amd64.go @@ -0,0 +1,697 @@ +// Code generated by command: go generate gen.go. DO NOT EDIT. 
+ +//go:build !appengine && !noasm && gc && !nogen && nopshufb +// +build !appengine,!noasm,gc,!nogen,nopshufb + +package reedsolomon + +import ( + "fmt" +) + +const ( + avx2CodeGen = true + maxAvx2Inputs = 10 + maxAvx2Outputs = 10 + minAvx2Size = 64 + avxSizeMask = maxInt - (minAvx2Size - 1) +) + +func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { panic(`no pshufb`) } +func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int { panic(`no pshufb`) } + +func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { + n := (stop - start) & avxSizeMask + + switch len(in) { + case 1: + switch len(out) { + case 1: + mulGFNI_1x1_64(matrix, in, out, start, n) + return n + case 2: + mulGFNI_1x2_64(matrix, in, out, start, n) + return n + case 3: + mulGFNI_1x3_64(matrix, in, out, start, n) + return n + case 4: + mulGFNI_1x4_64(matrix, in, out, start, n) + return n + case 5: + mulGFNI_1x5_64(matrix, in, out, start, n) + return n + case 6: + mulGFNI_1x6_64(matrix, in, out, start, n) + return n + case 7: + mulGFNI_1x7_64(matrix, in, out, start, n) + return n + case 8: + mulGFNI_1x8_64(matrix, in, out, start, n) + return n + case 9: + mulGFNI_1x9_64(matrix, in, out, start, n) + return n + case 10: + mulGFNI_1x10_64(matrix, in, out, start, n) + return n + } + case 2: + switch len(out) { + case 1: + mulGFNI_2x1_64(matrix, in, out, start, n) + return n + case 2: + mulGFNI_2x2_64(matrix, in, out, start, n) + return n + case 3: + mulGFNI_2x3_64(matrix, in, out, start, n) + return n + case 4: + mulGFNI_2x4_64(matrix, in, out, start, n) + return n + case 5: + mulGFNI_2x5_64(matrix, in, out, start, n) + return n + case 6: + mulGFNI_2x6_64(matrix, in, out, start, n) + return n + case 7: + mulGFNI_2x7_64(matrix, in, out, start, n) + return n + case 8: + mulGFNI_2x8_64(matrix, in, out, start, n) + return n + case 9: + mulGFNI_2x9_64(matrix, in, out, start, n) + return n + case 10: + mulGFNI_2x10_64(matrix, in, out, start, n) + return n + } + case 3: + switch len(out) { + case 1: + mulGFNI_3x1_64(matrix, in, out, start, n) + return n + case 2: + mulGFNI_3x2_64(matrix, in, out, start, n) + return n + case 3: + mulGFNI_3x3_64(matrix, in, out, start, n) + return n + case 4: + mulGFNI_3x4_64(matrix, in, out, start, n) + return n + case 5: + mulGFNI_3x5_64(matrix, in, out, start, n) + return n + case 6: + mulGFNI_3x6_64(matrix, in, out, start, n) + return n + case 7: + mulGFNI_3x7_64(matrix, in, out, start, n) + return n + case 8: + mulGFNI_3x8_64(matrix, in, out, start, n) + return n + case 9: + mulGFNI_3x9_64(matrix, in, out, start, n) + return n + case 10: + mulGFNI_3x10_64(matrix, in, out, start, n) + return n + } + case 4: + switch len(out) { + case 1: + mulGFNI_4x1_64(matrix, in, out, start, n) + return n + case 2: + mulGFNI_4x2_64(matrix, in, out, start, n) + return n + case 3: + mulGFNI_4x3_64(matrix, in, out, start, n) + return n + case 4: + mulGFNI_4x4_64(matrix, in, out, start, n) + return n + case 5: + mulGFNI_4x5_64(matrix, in, out, start, n) + return n + case 6: + mulGFNI_4x6_64(matrix, in, out, start, n) + return n + case 7: + mulGFNI_4x7_64(matrix, in, out, start, n) + return n + case 8: + mulGFNI_4x8_64(matrix, in, out, start, n) + return n + case 9: + mulGFNI_4x9_64(matrix, in, out, start, n) + return n + case 10: + mulGFNI_4x10_64(matrix, in, out, start, n) + return n + } + case 5: + switch len(out) { + case 1: + mulGFNI_5x1_64(matrix, in, out, start, n) + return n + case 2: + mulGFNI_5x2_64(matrix, in, out, start, n) + return n + case 
3: + mulGFNI_5x3_64(matrix, in, out, start, n) + return n + case 4: + mulGFNI_5x4_64(matrix, in, out, start, n) + return n + case 5: + mulGFNI_5x5_64(matrix, in, out, start, n) + return n + case 6: + mulGFNI_5x6_64(matrix, in, out, start, n) + return n + case 7: + mulGFNI_5x7_64(matrix, in, out, start, n) + return n + case 8: + mulGFNI_5x8_64(matrix, in, out, start, n) + return n + case 9: + mulGFNI_5x9_64(matrix, in, out, start, n) + return n + case 10: + mulGFNI_5x10_64(matrix, in, out, start, n) + return n + } + case 6: + switch len(out) { + case 1: + mulGFNI_6x1_64(matrix, in, out, start, n) + return n + case 2: + mulGFNI_6x2_64(matrix, in, out, start, n) + return n + case 3: + mulGFNI_6x3_64(matrix, in, out, start, n) + return n + case 4: + mulGFNI_6x4_64(matrix, in, out, start, n) + return n + case 5: + mulGFNI_6x5_64(matrix, in, out, start, n) + return n + case 6: + mulGFNI_6x6_64(matrix, in, out, start, n) + return n + case 7: + mulGFNI_6x7_64(matrix, in, out, start, n) + return n + case 8: + mulGFNI_6x8_64(matrix, in, out, start, n) + return n + case 9: + mulGFNI_6x9_64(matrix, in, out, start, n) + return n + case 10: + mulGFNI_6x10_64(matrix, in, out, start, n) + return n + } + case 7: + switch len(out) { + case 1: + mulGFNI_7x1_64(matrix, in, out, start, n) + return n + case 2: + mulGFNI_7x2_64(matrix, in, out, start, n) + return n + case 3: + mulGFNI_7x3_64(matrix, in, out, start, n) + return n + case 4: + mulGFNI_7x4_64(matrix, in, out, start, n) + return n + case 5: + mulGFNI_7x5_64(matrix, in, out, start, n) + return n + case 6: + mulGFNI_7x6_64(matrix, in, out, start, n) + return n + case 7: + mulGFNI_7x7_64(matrix, in, out, start, n) + return n + case 8: + mulGFNI_7x8_64(matrix, in, out, start, n) + return n + case 9: + mulGFNI_7x9_64(matrix, in, out, start, n) + return n + case 10: + mulGFNI_7x10_64(matrix, in, out, start, n) + return n + } + case 8: + switch len(out) { + case 1: + mulGFNI_8x1_64(matrix, in, out, start, n) + return n + case 2: + mulGFNI_8x2_64(matrix, in, out, start, n) + return n + case 3: + mulGFNI_8x3_64(matrix, in, out, start, n) + return n + case 4: + mulGFNI_8x4_64(matrix, in, out, start, n) + return n + case 5: + mulGFNI_8x5_64(matrix, in, out, start, n) + return n + case 6: + mulGFNI_8x6_64(matrix, in, out, start, n) + return n + case 7: + mulGFNI_8x7_64(matrix, in, out, start, n) + return n + case 8: + mulGFNI_8x8_64(matrix, in, out, start, n) + return n + case 9: + mulGFNI_8x9_64(matrix, in, out, start, n) + return n + case 10: + mulGFNI_8x10_64(matrix, in, out, start, n) + return n + } + case 9: + switch len(out) { + case 1: + mulGFNI_9x1_64(matrix, in, out, start, n) + return n + case 2: + mulGFNI_9x2_64(matrix, in, out, start, n) + return n + case 3: + mulGFNI_9x3_64(matrix, in, out, start, n) + return n + case 4: + mulGFNI_9x4_64(matrix, in, out, start, n) + return n + case 5: + mulGFNI_9x5_64(matrix, in, out, start, n) + return n + case 6: + mulGFNI_9x6_64(matrix, in, out, start, n) + return n + case 7: + mulGFNI_9x7_64(matrix, in, out, start, n) + return n + case 8: + mulGFNI_9x8_64(matrix, in, out, start, n) + return n + case 9: + mulGFNI_9x9_64(matrix, in, out, start, n) + return n + case 10: + mulGFNI_9x10_64(matrix, in, out, start, n) + return n + } + case 10: + switch len(out) { + case 1: + mulGFNI_10x1_64(matrix, in, out, start, n) + return n + case 2: + mulGFNI_10x2_64(matrix, in, out, start, n) + return n + case 3: + mulGFNI_10x3_64(matrix, in, out, start, n) + return n + case 4: + mulGFNI_10x4_64(matrix, in, out, start, n) + 
return n + case 5: + mulGFNI_10x5_64(matrix, in, out, start, n) + return n + case 6: + mulGFNI_10x6_64(matrix, in, out, start, n) + return n + case 7: + mulGFNI_10x7_64(matrix, in, out, start, n) + return n + case 8: + mulGFNI_10x8_64(matrix, in, out, start, n) + return n + case 9: + mulGFNI_10x9_64(matrix, in, out, start, n) + return n + case 10: + mulGFNI_10x10_64(matrix, in, out, start, n) + return n + } + } + panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) +} + +func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int { + n := (stop - start) & avxSizeMask + + switch len(in) { + case 1: + switch len(out) { + case 1: + mulGFNI_1x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulGFNI_1x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulGFNI_1x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulGFNI_1x4_64Xor(matrix, in, out, start, n) + return n + case 5: + mulGFNI_1x5_64Xor(matrix, in, out, start, n) + return n + case 6: + mulGFNI_1x6_64Xor(matrix, in, out, start, n) + return n + case 7: + mulGFNI_1x7_64Xor(matrix, in, out, start, n) + return n + case 8: + mulGFNI_1x8_64Xor(matrix, in, out, start, n) + return n + case 9: + mulGFNI_1x9_64Xor(matrix, in, out, start, n) + return n + case 10: + mulGFNI_1x10_64Xor(matrix, in, out, start, n) + return n + } + case 2: + switch len(out) { + case 1: + mulGFNI_2x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulGFNI_2x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulGFNI_2x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulGFNI_2x4_64Xor(matrix, in, out, start, n) + return n + case 5: + mulGFNI_2x5_64Xor(matrix, in, out, start, n) + return n + case 6: + mulGFNI_2x6_64Xor(matrix, in, out, start, n) + return n + case 7: + mulGFNI_2x7_64Xor(matrix, in, out, start, n) + return n + case 8: + mulGFNI_2x8_64Xor(matrix, in, out, start, n) + return n + case 9: + mulGFNI_2x9_64Xor(matrix, in, out, start, n) + return n + case 10: + mulGFNI_2x10_64Xor(matrix, in, out, start, n) + return n + } + case 3: + switch len(out) { + case 1: + mulGFNI_3x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulGFNI_3x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulGFNI_3x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulGFNI_3x4_64Xor(matrix, in, out, start, n) + return n + case 5: + mulGFNI_3x5_64Xor(matrix, in, out, start, n) + return n + case 6: + mulGFNI_3x6_64Xor(matrix, in, out, start, n) + return n + case 7: + mulGFNI_3x7_64Xor(matrix, in, out, start, n) + return n + case 8: + mulGFNI_3x8_64Xor(matrix, in, out, start, n) + return n + case 9: + mulGFNI_3x9_64Xor(matrix, in, out, start, n) + return n + case 10: + mulGFNI_3x10_64Xor(matrix, in, out, start, n) + return n + } + case 4: + switch len(out) { + case 1: + mulGFNI_4x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulGFNI_4x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulGFNI_4x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulGFNI_4x4_64Xor(matrix, in, out, start, n) + return n + case 5: + mulGFNI_4x5_64Xor(matrix, in, out, start, n) + return n + case 6: + mulGFNI_4x6_64Xor(matrix, in, out, start, n) + return n + case 7: + mulGFNI_4x7_64Xor(matrix, in, out, start, n) + return n + case 8: + mulGFNI_4x8_64Xor(matrix, in, out, start, n) + return n + case 9: + mulGFNI_4x9_64Xor(matrix, in, out, start, n) + return n + case 10: + mulGFNI_4x10_64Xor(matrix, in, out, start, n) + return n + } + case 5: + switch len(out) { + case 1: + 
mulGFNI_5x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulGFNI_5x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulGFNI_5x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulGFNI_5x4_64Xor(matrix, in, out, start, n) + return n + case 5: + mulGFNI_5x5_64Xor(matrix, in, out, start, n) + return n + case 6: + mulGFNI_5x6_64Xor(matrix, in, out, start, n) + return n + case 7: + mulGFNI_5x7_64Xor(matrix, in, out, start, n) + return n + case 8: + mulGFNI_5x8_64Xor(matrix, in, out, start, n) + return n + case 9: + mulGFNI_5x9_64Xor(matrix, in, out, start, n) + return n + case 10: + mulGFNI_5x10_64Xor(matrix, in, out, start, n) + return n + } + case 6: + switch len(out) { + case 1: + mulGFNI_6x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulGFNI_6x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulGFNI_6x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulGFNI_6x4_64Xor(matrix, in, out, start, n) + return n + case 5: + mulGFNI_6x5_64Xor(matrix, in, out, start, n) + return n + case 6: + mulGFNI_6x6_64Xor(matrix, in, out, start, n) + return n + case 7: + mulGFNI_6x7_64Xor(matrix, in, out, start, n) + return n + case 8: + mulGFNI_6x8_64Xor(matrix, in, out, start, n) + return n + case 9: + mulGFNI_6x9_64Xor(matrix, in, out, start, n) + return n + case 10: + mulGFNI_6x10_64Xor(matrix, in, out, start, n) + return n + } + case 7: + switch len(out) { + case 1: + mulGFNI_7x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulGFNI_7x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulGFNI_7x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulGFNI_7x4_64Xor(matrix, in, out, start, n) + return n + case 5: + mulGFNI_7x5_64Xor(matrix, in, out, start, n) + return n + case 6: + mulGFNI_7x6_64Xor(matrix, in, out, start, n) + return n + case 7: + mulGFNI_7x7_64Xor(matrix, in, out, start, n) + return n + case 8: + mulGFNI_7x8_64Xor(matrix, in, out, start, n) + return n + case 9: + mulGFNI_7x9_64Xor(matrix, in, out, start, n) + return n + case 10: + mulGFNI_7x10_64Xor(matrix, in, out, start, n) + return n + } + case 8: + switch len(out) { + case 1: + mulGFNI_8x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulGFNI_8x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulGFNI_8x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulGFNI_8x4_64Xor(matrix, in, out, start, n) + return n + case 5: + mulGFNI_8x5_64Xor(matrix, in, out, start, n) + return n + case 6: + mulGFNI_8x6_64Xor(matrix, in, out, start, n) + return n + case 7: + mulGFNI_8x7_64Xor(matrix, in, out, start, n) + return n + case 8: + mulGFNI_8x8_64Xor(matrix, in, out, start, n) + return n + case 9: + mulGFNI_8x9_64Xor(matrix, in, out, start, n) + return n + case 10: + mulGFNI_8x10_64Xor(matrix, in, out, start, n) + return n + } + case 9: + switch len(out) { + case 1: + mulGFNI_9x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulGFNI_9x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulGFNI_9x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulGFNI_9x4_64Xor(matrix, in, out, start, n) + return n + case 5: + mulGFNI_9x5_64Xor(matrix, in, out, start, n) + return n + case 6: + mulGFNI_9x6_64Xor(matrix, in, out, start, n) + return n + case 7: + mulGFNI_9x7_64Xor(matrix, in, out, start, n) + return n + case 8: + mulGFNI_9x8_64Xor(matrix, in, out, start, n) + return n + case 9: + mulGFNI_9x9_64Xor(matrix, in, out, start, n) + return n + case 10: + mulGFNI_9x10_64Xor(matrix, in, out, start, n) + return n + } + case 10: + 
switch len(out) { + case 1: + mulGFNI_10x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulGFNI_10x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulGFNI_10x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulGFNI_10x4_64Xor(matrix, in, out, start, n) + return n + case 5: + mulGFNI_10x5_64Xor(matrix, in, out, start, n) + return n + case 6: + mulGFNI_10x6_64Xor(matrix, in, out, start, n) + return n + case 7: + mulGFNI_10x7_64Xor(matrix, in, out, start, n) + return n + case 8: + mulGFNI_10x8_64Xor(matrix, in, out, start, n) + return n + case 9: + mulGFNI_10x9_64Xor(matrix, in, out, start, n) + return n + case 10: + mulGFNI_10x10_64Xor(matrix, in, out, start, n) + return n + } + } + panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) +} diff --git a/galois_noasm.go b/galois_noasm.go index 9043601a..fb5a3b65 100644 --- a/galois_noasm.go +++ b/galois_noasm.go @@ -1,12 +1,11 @@ -//go:build (!amd64 || noasm || appengine || gccgo) && (!arm64 || noasm || appengine || gccgo) && (!ppc64le || noasm || appengine || gccgo) -// +build !amd64 noasm appengine gccgo -// +build !arm64 noasm appengine gccgo -// +build !ppc64le noasm appengine gccgo +//go:build (!amd64 || noasm || appengine || gccgo) && (!arm64 || noasm || appengine || gccgo || nopshufb) && (!ppc64le || noasm || appengine || gccgo || nopshufb) // Copyright 2015, Klaus Post, see LICENSE for details. package reedsolomon +const pshufb = false + func galMulSlice(c byte, in, out []byte, o *options) { out = out[:len(in)] if c == 1 { @@ -31,11 +30,6 @@ func galMulSliceXor(c byte, in, out []byte, o *options) { } } -// simple slice xor -func sliceXor(in, out []byte, o *options) { - sliceXorGo(in, out, o) -} - func init() { defaultOptions.useAVX512 = false } diff --git a/galois_nopshufb_amd64.go b/galois_nopshufb_amd64.go new file mode 100644 index 00000000..89c74e24 --- /dev/null +++ b/galois_nopshufb_amd64.go @@ -0,0 +1,146 @@ +// Copyright 2015, Klaus Post, see LICENSE for details + +//go:build nopshufb && !noasm + +package reedsolomon + +// bigSwitchover is the size where 64 bytes are processed per loop. 
+const bigSwitchover = 128
+
+const pshufb = false
+
+// simple slice xor
+func sliceXor(in, out []byte, o *options) {
+	if o.useSSE2 {
+		if len(in) >= bigSwitchover {
+			if o.useAVX2 {
+				avx2XorSlice_64(in, out)
+				done := (len(in) >> 6) << 6
+				in = in[done:]
+				out = out[done:]
+			} else {
+				sSE2XorSlice_64(in, out)
+				done := (len(in) >> 6) << 6
+				in = in[done:]
+				out = out[done:]
+			}
+		}
+		if len(in) >= 16 {
+			sSE2XorSlice(in, out)
+			done := (len(in) >> 4) << 4
+			in = in[done:]
+			out = out[done:]
+		}
+	} else {
+		sliceXorGo(in, out, o)
+		return
+	}
+	out = out[:len(in)]
+	for i := range in {
+		out[i] ^= in[i]
+	}
+}
+
+func galMulSlice(c byte, in, out []byte, o *options) {
+	out = out[:len(in)]
+	if c == 1 {
+		copy(out, in)
+		return
+	}
+	mt := mulTable[c][:256]
+	for len(in) >= 4 {
+		ii := (*[4]byte)(in)
+		oo := (*[4]byte)(out)
+		oo[0] = mt[ii[0]]
+		oo[1] = mt[ii[1]]
+		oo[2] = mt[ii[2]]
+		oo[3] = mt[ii[3]]
+		in = in[4:]
+		out = out[4:]
+	}
+	for n, input := range in {
+		out[n] = mt[input]
+	}
+}
+
+func galMulSliceXor(c byte, in, out []byte, o *options) {
+	out = out[:len(in)]
+	if c == 1 {
+		sliceXor(in, out, o)
+		return
+	}
+	mt := mulTable[c][:256]
+	for len(in) >= 4 {
+		ii := (*[4]byte)(in)
+		oo := (*[4]byte)(out)
+		oo[0] ^= mt[ii[0]]
+		oo[1] ^= mt[ii[1]]
+		oo[2] ^= mt[ii[2]]
+		oo[3] ^= mt[ii[3]]
+		in = in[4:]
+		out = out[4:]
+	}
+	for n, input := range in {
+		out[n] ^= mt[input]
+	}
+}
+
+func init() {
+	defaultOptions.useAVX512 = false
+}
+
+// 4-way butterfly
+func ifftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
+	ifftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o)
+}
+
+// 4-way butterfly
+func ifftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *options) {
+	ifftDIT4Ref8(work, dist, log_m01, log_m23, log_m02, o)
+}
+
+// 4-way butterfly
+func fftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
+	fftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o)
+}
+
+// 4-way butterfly
+func fftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *options) {
+	fftDIT4Ref8(work, dist, log_m01, log_m23, log_m02, o)
+}
+
+// 2-way butterfly forward
+func fftDIT2(x, y []byte, log_m ffe, o *options) {
+	// Reference version:
+	refMulAdd(x, y, log_m)
+	sliceXor(x, y, o)
+}
+
+// 2-way butterfly forward
+func fftDIT28(x, y []byte, log_m ffe8, o *options) {
+	// Reference version:
+	refMulAdd8(x, y, log_m)
+	sliceXor(x, y, o)
+}
+
+// 2-way butterfly inverse
+func ifftDIT2(x, y []byte, log_m ffe, o *options) {
+	// Reference version:
+	sliceXor(x, y, o)
+	refMulAdd(x, y, log_m)
+}
+
+// 2-way butterfly inverse
+func ifftDIT28(x, y []byte, log_m ffe8, o *options) {
+	// Reference version:
+	sliceXor(x, y, o)
+	refMulAdd8(x, y, log_m)
+}
+
+func mulgf16(x, y []byte, log_m ffe, o *options) {
+	refMul(x, y, log_m)
+}
+
+func mulgf8(x, y []byte, log_m ffe8, o *options) {
+	refMul8(x, y, log_m)
+}
diff --git a/galois_notamd64.go b/galois_notamd64.go
index e67905b1..f98bfed1 100644
--- a/galois_notamd64.go
+++ b/galois_notamd64.go
@@ -1,5 +1,4 @@
-//go:build !amd64 || noasm || appengine || gccgo
-// +build !amd64 noasm appengine gccgo
+//go:build !amd64 || noasm || appengine || gccgo || pshufb
 
 // Copyright 2020, Klaus Post, see LICENSE for details.
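For orientation when reading the generated GFNI kernels and the LEO_MULADD_512 blocks above: each 64-bit entry of the matrix argument is broadcast to a ZMM register (or read with the .BCST form) and applied to all 64 input bytes by VGF2P8AFFINEQB, which performs an 8x8 bit-matrix multiply over GF(2) per byte; the product is then XORed into the accumulator for that output shard. The scalar sketch below models that per-byte step. The helper names and the bit-packing convention are illustrative only; they are assumptions for this sketch and are not taken from this patch or from the hardware documentation.

package main

import (
	"fmt"
	"math/bits"
)

// affineByte applies an 8x8 bit-matrix, packed one row per byte into a uint64,
// to a single input byte over GF(2). It is a scalar stand-in for what
// VGF2P8AFFINEQB does to each of the 64 bytes in a ZMM register; the row and
// bit ordering used here is this sketch's own convention.
func affineByte(matrix uint64, b byte) byte {
	var out byte
	for row := 0; row < 8; row++ {
		rowBits := byte(matrix >> (8 * row))
		// Output bit 'row' is the parity of the input bits selected by this row.
		if bits.OnesCount8(rowBits&b)&1 == 1 {
			out |= 1 << row
		}
	}
	return out
}

// mulAddAffine mirrors the shape of the LEO_MULADD_512 blocks: transform the
// source bytes with one matrix and XOR the result into the destination.
func mulAddAffine(dst, src []byte, matrix uint64) {
	for i := range src {
		dst[i] ^= affineByte(matrix, src[i])
	}
}

func main() {
	// Identity matrix in this sketch's packing: row r selects only bit r, so
	// the transform leaves bytes unchanged and mulAddAffine becomes plain XOR.
	const identity = 0x8040201008040201
	dst := []byte{0, 0, 0}
	mulAddAffine(dst, []byte{1, 2, 3}, identity)
	fmt.Println(dst) // [1 2 3]
}

The non-Xor kernels above produce the first accumulator values directly from input 0, while the *_64Xor variants load the existing output shards first, which corresponds to the dst[i] ^= ... form used in this sketch.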
diff --git a/galois_ppc64le.go b/galois_ppc64le.go
index 8cd7b52b..c4c80351 100644
--- a/galois_ppc64le.go
+++ b/galois_ppc64le.go
@@ -1,11 +1,12 @@
-//go:build !noasm && !appengine && !gccgo
-// +build !noasm,!appengine,!gccgo
+//go:build !noasm && !appengine && !gccgo && !nopshufb
 
 // Copyright 2015, Klaus Post, see LICENSE for details.
 // Copyright 2018, Minio, Inc.
 
 package reedsolomon
 
+const pshufb = true
+
 //go:noescape
 func galMulPpc(low, high, in, out []byte)
 
@@ -66,11 +67,6 @@ func galMulSliceXor(c byte, in, out []byte, o *options) {
 	}
 }
 
-// slice galois add
-func sliceXor(in, out []byte, o *options) {
-	sliceXorGo(in, out, o)
-}
-
 // 4-way butterfly
 func ifftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
 	ifftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o)
diff --git a/galois_ppc64le.s b/galois_ppc64le.s
index 7213c61b..c585c2b6 100644
--- a/galois_ppc64le.s
+++ b/galois_ppc64le.s
@@ -1,6 +1,7 @@
 //+build !noasm
 //+build !appengine
 //+build !gccgo
+//+build !nopshufb
 
 // Copyright 2015, Klaus Post, see LICENSE for details.
 // Copyright 2018, Minio, Inc.
diff --git a/go.mod b/go.mod
index 98f3ca41..90f42515 100644
--- a/go.mod
+++ b/go.mod
@@ -1,14 +1,13 @@
 module github.com/klauspost/reedsolomon
 
-go 1.17
+go 1.18
 
 require github.com/klauspost/cpuid/v2 v2.1.1
 
 require golang.org/x/sys v0.0.0-20220704084225-05e143d24a9e // indirect
-
 retract (
-	v1.11.2 // https://github.com/klauspost/reedsolomon/pull/229
-	[v1.11.3, v1.11.5] // https://github.com/klauspost/reedsolomon/pull/238
-	v1.11.6 // https://github.com/klauspost/reedsolomon/issues/240
+	v1.11.6 // https://github.com/klauspost/reedsolomon/issues/240
+	[v1.11.3, v1.11.5] // https://github.com/klauspost/reedsolomon/pull/238
+	v1.11.2 // https://github.com/klauspost/reedsolomon/pull/229
 )
diff --git a/reedsolomon.go b/reedsolomon.go
index 3fad26f9..75ffc4ff 100644
--- a/reedsolomon.go
+++ b/reedsolomon.go
@@ -652,7 +652,7 @@ func (r *reedSolomon) EncodeIdx(dataShard []byte, idx int, parity [][]byte) erro
 		return ErrShardSize
 	}
 
-	if avx2CodeGen && len(dataShard) >= r.o.perRound && len(parity) >= avx2CodeGenMinShards && (r.o.useAVX2 || r.o.useGFNI) {
+	if avx2CodeGen && len(dataShard) >= r.o.perRound && len(parity) >= avx2CodeGenMinShards && ((pshufb && r.o.useAVX2) || r.o.useGFNI) {
 		m := make([][]byte, r.parityShards)
 		for iRow := range m {
 			m[iRow] = r.parity[iRow][idx : idx+1]
@@ -803,7 +803,7 @@ func (r *reedSolomon) Verify(shards [][]byte) (bool, error) {
 }
 
 func (r *reedSolomon) canAVX2C(byteCount int, inputs, outputs int) bool {
-	return avx2CodeGen && r.o.useAVX2 &&
+	return avx2CodeGen && pshufb && r.o.useAVX2 &&
 		byteCount >= avx2CodeGenMinSize && inputs+outputs >= avx2CodeGenMinShards &&
 		inputs <= maxAvx2Inputs && outputs <= maxAvx2Outputs
 }
diff --git a/xor_arm64.go b/xor_arm64.go
new file mode 100644
index 00000000..6f0522f8
--- /dev/null
+++ b/xor_arm64.go
@@ -0,0 +1,19 @@
+//go:build !noasm && !appengine && !gccgo
+
+package reedsolomon
+
+//go:noescape
+func xorSliceNEON(in, out []byte)
+
+// simple slice xor
+func sliceXor(in, out []byte, o *options) {
+	xorSliceNEON(in, out)
+	done := (len(in) >> 5) << 5
+
+	remain := len(in) - done
+	if remain > 0 {
+		for i := done; i < len(in); i++ {
+			out[i] ^= in[i]
+		}
+	}
+}
diff --git a/xor_arm64.s b/xor_arm64.s
new file mode 100644
index 00000000..56298731
--- /dev/null
+++ b/xor_arm64.s
@@ -0,0 +1,29 @@
+//+build !noasm
+//+build !appengine
+//+build !gccgo
+
+// func xorSliceNEON(in, out []byte)
+TEXT ·xorSliceNEON(SB), 7, $0
+	MOVD in_base+0(FP), R1
+
MOVD in_len+8(FP), R2 // length of message + MOVD out_base+24(FP), R5 + SUBS $32, R2 + BMI completeXor + +loopXor: + // Main loop + VLD1.P 32(R1), [V0.B16, V1.B16] + VLD1 (R5), [V20.B16, V21.B16] + + VEOR V20.B16, V0.B16, V4.B16 + VEOR V21.B16, V1.B16, V5.B16 + + // Store result + VST1.P [V4.D2, V5.D2], 32(R5) + + SUBS $32, R2 + BPL loopXor + +completeXor: + RET + diff --git a/xor_noasm.go b/xor_noasm.go new file mode 100644 index 00000000..d3e29f90 --- /dev/null +++ b/xor_noasm.go @@ -0,0 +1,7 @@ +//go:build noasm || gccgo || appengine || (!amd64 && !arm64) + +package reedsolomon + +func sliceXor(in, out []byte, o *options) { + sliceXorGo(in, out, o) +}
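
The new arm64 path splits the work the same way on both sides of the Go/assembly boundary: `xorSliceNEON` handles whole 32-byte blocks with `VEOR`, and the Go wrapper's scalar loop picks up whatever `(len(in) >> 5) << 5` leaves behind. A minimal pure-Go sketch of that block-plus-tail structure (illustrative names, not part of the package):

```go
package main

import "fmt"

// xorBlocksThenTail mirrors the arm64 split: whole 32-byte blocks first
// (the VEOR loop in xor_arm64.s), then a scalar tail like the remainder
// loop in xor_arm64.go's sliceXor.
func xorBlocksThenTail(in, out []byte) {
	n := len(in)
	if len(out) < n {
		n = len(out)
	}
	done := (n >> 5) << 5 // largest multiple of 32 that fits
	for i := 0; i < done; i += 32 {
		// Stand-in for one NEON iteration over a 32-byte block.
		for j := i; j < i+32; j++ {
			out[j] ^= in[j]
		}
	}
	for i := done; i < n; i++ { // scalar tail
		out[i] ^= in[i]
	}
}

func main() {
	in, out := make([]byte, 70), make([]byte, 70)
	for i := range in {
		in[i] = byte(i)
	}
	xorBlocksThenTail(in, out)
	fmt.Println(out[:4], out[64:]) // tail bytes 64..69 are XORed too
}
```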
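Since the point of the tag is to keep `PSHUFB` out of the produced binary, a rough spot check is to disassemble the reedsolomon symbols of a `-tags=nopshufb` build and count the instruction. This is only a sketch under assumptions: the binary name `rsapp` is made up, the check covers only symbols matching `reedsolomon`, and an empty count is not a guarantee that nothing questionable remains.

```go
package main

import (
	"fmt"
	"log"
	"os/exec"
	"strings"
)

// Assumes the binary was built beforehand with: go build -tags=nopshufb -o rsapp .
func main() {
	// Disassemble only symbols whose names match "reedsolomon".
	out, err := exec.Command("go", "tool", "objdump", "-s", "reedsolomon", "rsapp").Output()
	if err != nil {
		log.Fatal(err)
	}
	// VPSHUFB contains the same substring, so both forms are counted.
	n := strings.Count(strings.ToUpper(string(out)), "PSHUFB")
	fmt.Printf("PSHUFB-style instructions in reedsolomon symbols: %d\n", n)
}
```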