Skip to content

Commit

Permalink
finished x86-64 implementation of hebi_pctz
Browse files Browse the repository at this point in the history
  • Loading branch information
suiginsoft committed Oct 10, 2016
1 parent dd72fe0 commit b13c698
Show file tree
Hide file tree
Showing 8 changed files with 115 additions and 98 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ bench/libbench.a
bench/p/padd
bench/p/paddu
bench/p/pclz
bench/p/pctz
bench/p/pcmp
bench/p/pcopy
bench/p/pdivrem
Expand Down
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ BENCH_P := \
padd \
paddu \
pclz \
pctz \
pcmp \
pcopy \
pdivrem \
Expand Down
33 changes: 33 additions & 0 deletions bench/p/pctz.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
/*
* hebimath - arbitrary precision arithmetic library
* See LICENSE file for copyright and license details
*/

#include "../bench.h"

/*
 * Sink for the value returned by the benchmarked call. main() stores every
 * hebi_pctz result here; being volatile, each store is an observable side
 * effect, so the compiler cannot discard the calls in the timing loop.
 */
static volatile size_t result_sink;

/*
 * Benchmark driver for hebi_pctz.
 *
 * Allocates p.an packets, zero-fills them, sets a single bit in the last
 * 32-bit limb of the buffer, and then calls hebi_pctz p.iter times between
 * bench_start() and bench_stop(). The defaults (5000 iterations, 32768
 * packets) are handed to bench_init() together with the command line
 * (presumably so the arguments can override them -- confirm in bench.h).
 *
 * Always returns 0.
 */
int
main(int argc, char *argv[])
{
	bench_args p = { .iter = 5000, .an = 32768 };
	hebi_packet *a;
	int i;

	bench_init(argc, argv, &p);

	a = hebi_palloc(HEBI_ALLOC_DEFAULT, p.an);
	hebi_pzero(a, p.an);

	/*
	 * Put the only set bit in the topmost limb so that the scan has to
	 * walk the whole buffer. The limb index is the packet count *times*
	 * the limbs per packet (assuming HEBI_PACKET_LIMBS32 is the number of
	 * 32-bit limbs in one packet -- confirm against the library header);
	 * adding the two would place the bit near the start of the buffer and
	 * the benchmark would only measure a fraction of the input.
	 * Skip the store for an empty buffer, where the index would wrap.
	 */
	if (p.an > 0)
		a->hp_limbs32[p.an * HEBI_PACKET_LIMBS32 - 1] = 0x80000000u;

	bench_start();

	/* Store each result in the volatile sink so the calls are kept. */
	for (i = 0; i < p.iter; ++i)
		result_sink = hebi_pctz(a, p.an);

	bench_stop();

	hebi_pfree(HEBI_ALLOC_DEFAULT, a, p.an);

	return 0;
}
2 changes: 1 addition & 1 deletion config.def.x86_64.inc
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
.equ HWCAP_FMA, 1
.equ HWCAP_AVX, 1
.equ HWCAP_AVX2, 1
.equ HWCAP_BMI, 1
.equ HWCAP_BMI1, 1
.equ HWCAP_BMI2, 1
.equ HWCAP_ERMSB, 1
.equ HWCAP_ADX, 1
Expand Down
3 changes: 3 additions & 0 deletions src/hwcaps.c
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ static const struct name_hwcaps hwcapsbyname[] =
{ "ssse3", hebi_hwcap_ssse3 },
{ "sse4_1", hebi_hwcap_sse4_1 },
{ "sse4_2", hebi_hwcap_sse4_2 },
{ "sse4.1", hebi_hwcap_sse4_1 },
{ "sse4.2", hebi_hwcap_sse4_2 },
{ "aes", hebi_hwcap_aesni },
{ "aesni", hebi_hwcap_aesni },
{ "clmul", hebi_hwcap_clmul },
Expand All @@ -70,6 +72,7 @@ static const struct name_hwcaps hwcapsbyname[] =
{ "avx", hebi_hwcap_avx },
{ "avx2", hebi_hwcap_avx2 },
{ "bmi", hebi_hwcap_bmi },
{ "bmi1", hebi_hwcap_bmi },
{ "bmi2", hebi_hwcap_bmi2 },
{ "ermsb", hebi_hwcap_ermsb },
{ "adx", hebi_hwcap_adx },
Expand Down
37 changes: 17 additions & 20 deletions src/p/x86_64/pclz.s
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ MVFUNC_BEGIN pclz, avx_lzcnt
vpcmpeqd %xmm4, %xmm4, %xmm4
mov %rsi, %rcx

.p2align 4
.p2align 4,,15
1: vmovdqa -16(%rdi,%rsi), %xmm0
vmovdqa -32(%rdi,%rsi), %xmm1
vptest %xmm4, %xmm0
Expand All @@ -28,7 +28,7 @@ MVFUNC_BEGIN pclz, avx_lzcnt
lea (,%rcx,8), %rax
ret

.p2align 4,,7
.p2align 4,,15
2: vmovdqa %xmm1, %xmm0
sub $16, %rsi
3: test %rax, %rax
Expand All @@ -52,7 +52,7 @@ MVFUNC_BEGIN pclz, sse41
pcmpeqd %xmm4, %xmm4
mov %rsi, %rcx

.p2align 4
.p2align 4,,15
1: movdqa -16(%rdi,%rsi), %xmm0
movdqa -32(%rdi,%rsi), %xmm1
ptest %xmm4, %xmm0
Expand All @@ -66,7 +66,7 @@ MVFUNC_BEGIN pclz, sse41
lea (,%rcx,8), %rax
ret

.p2align 4,,7
.p2align 4,,15
2: movdqa %xmm1, %xmm0
sub $16, %rsi
3: test %rax, %rax
Expand All @@ -87,14 +87,13 @@ MVFUNC_END
.if HWCAP_SSE2
MVFUNC_BEGIN pclz, sse2

mov %rsi, %rcx
shl $5, %rsi
pxor %xmm4, %xmm4
lea -16(%rdi,%rsi), %rdi
mov %rsi, %rcx

.p2align 4
1: movdqa (%rdi), %xmm0
movdqa -16(%rdi), %xmm1
.p2align 4,,15
1: movdqa -16(%rdi,%rsi), %xmm0
movdqa -32(%rdi,%rsi), %xmm1
movdqa %xmm0, %xmm2
movdqa %xmm1, %xmm3
pcmpeqd %xmm4, %xmm2
Expand All @@ -105,26 +104,24 @@ MVFUNC_BEGIN pclz, sse2
jne 3f
cmp $0xFFFF, %edx
jne 2f
sub $32, %rdi
dec %rcx
sub $32, %rsi
jnz 1b
lea (,%rsi,8), %rax
lea (,%rcx,8), %rax
ret

.p2align 4,,7
2: add $16, %rsi
.p2align 4,,15
2: movdqa %xmm1, %xmm0
mov %edx, %eax
movdqa %xmm1, %xmm0
3: shl $5, %rcx
cmp $0xFF, %ah
sub $16, %rsi
3: cmp $0xFF, %ah
je 4f
punpckhqdq %xmm0, %xmm0
sub $8, %rsi
add $8, %rsi
4: movq %xmm0, %rax
bsr %rax, %rax
sub %rcx, %rsi
sub %rsi, %rcx
xor $63, %rax
lea 64(%rax,%rsi,8), %rax
lea 64(%rax,%rcx,8), %rax
ret

MVFUNC_END
Expand Down
132 changes: 57 additions & 75 deletions src/p/x86_64/pctz.s
Original file line number Diff line number Diff line change
@@ -1,49 +1,45 @@
# hebimath - arbitrary precision arithmetic library
# See LICENSE file for copyright and license details

# size_t hebi_pclz(const hebi_packet *r, size_t n);
# size_t
# hebi_pctz(const hebi_packet *r, size_t n);

.include "src/p/x86_64/x86_64.inc"

#------------------------------------------------------------------------------

.if HWCAP_AVX && HWCAP_LZCNT
MVFUNC_BEGIN pctz, avx_lzcnt
.if HWCAP_AVX && HWCAP_BMI1
MVFUNC_BEGIN pctz, avx_bmi1

mov %rsi, %rcx
shl $5, %rsi
xor %edx, %edx
vpcmpeqd %xmm4, %xmm4, %xmm4
lea -16(%rdi,%rsi), %rdi
jrcxz 3f

.p2align 4,,15
2: vmovdqa (%rdi), %xmm0
vmovdqa -16(%rdi), %xmm1
2: vmovdqa (%rdi,%rdx), %xmm0
vmovdqa 16(%rdi,%rdx), %xmm1
vptest %xmm4, %xmm0
vmovq %xmm0, %rax
jnz 4f
vptest %xmm4, %xmm1
jnz 6f
sub $32, %rdi
dec %rcx
vmovq %xmm1, %rax
jnz 3f
add $32, %rdx
dec %rsi
jnz 2b
3: lea (,%rsi,8), %rax
lea (,%rdx,8), %rax
ret

4: vpextrq $1, %xmm0, %rax
shl $5, %rcx
test %rax, %rax
.p2align 4,,15
3: vmovdqa %xmm1, %xmm0
add $16, %rdx
4: test %rax, %rax
jnz 5f
vmovq %xmm0, %rax
add $8, %rsi
5: lzcnt %rax, %rax
sub %rcx, %rsi
lea (%rax,%rsi,8), %rax
vpextrq $1, %xmm0, %rax
add $8, %rdx
5: tzcnt %rax, %rax
lea (%rax,%rdx,8), %rax
ret

6: add $16, %rsi
vmovdqa %xmm1, %xmm0
jmp 4b

MVFUNC_END
.endif

Expand All @@ -52,41 +48,35 @@ MVFUNC_END
.if HWCAP_SSE41
MVFUNC_BEGIN pctz, sse41

mov %rsi, %rcx
shl $5, %rsi
xor %edx, %edx
pcmpeqd %xmm4, %xmm4
lea -16(%rdi,%rsi), %rdi
jrcxz 3f

.p2align 4,,15
2: movdqa (%rdi), %xmm0
movdqa -16(%rdi), %xmm1
2: movdqa (%rdi,%rdx), %xmm0
movdqa 16(%rdi,%rdx), %xmm1
ptest %xmm4, %xmm0
movq %xmm0, %rax
jnz 4f
ptest %xmm4, %xmm1
jnz 6f
sub $32, %rdi
dec %rcx
movq %xmm1, %rax
jnz 3f
add $32, %rdx
dec %rsi
jnz 2b
3: lea (,%rsi,8), %rax
lea (,%rdx,8), %rax
ret

4: pextrq $1, %xmm0, %rax
shl $5, %rcx
test %rax, %rax
.p2align 4,,15
3: movdqa %xmm1, %xmm0
add $16, %rdx
4: test %rax, %rax
jnz 5f
movq %xmm0, %rax
add $8, %rsi
5: bsr %rax, %rax
sub %rcx, %rsi
xor $63, %rax
lea (%rax,%rsi,8), %rax
pextrq $1, %xmm0, %rax
add $8, %rdx
5: bsf %rax, %rax
lea (%rax,%rdx,8), %rax
ret

6: add $16, %rsi
movdqa %xmm1, %xmm0
jmp 4b

MVFUNC_END
.endif

Expand All @@ -95,15 +85,12 @@ MVFUNC_END
.if HWCAP_SSE2
MVFUNC_BEGIN pctz, sse2

mov %rsi, %rcx
shl $5, %rsi
xor %ecx, %ecx
pxor %xmm4, %xmm4
lea -16(%rdi,%rsi), %rdi
jrcxz 3f

.p2align 4,,15
2: movdqa (%rdi), %xmm0
movdqa -16(%rdi), %xmm1
2: movdqa (%rdi,%rcx), %xmm0
movdqa 16(%rdi,%rcx), %xmm1
movdqa %xmm0, %xmm2
movdqa %xmm1, %xmm3
pcmpeqd %xmm4, %xmm2
Expand All @@ -113,30 +100,25 @@ MVFUNC_BEGIN pctz, sse2
cmp $0xFFFF, %eax
jne 4f
cmp $0xFFFF, %edx
jne 6f
sub $32, %rdi
dec %rcx
jne 3f
add $32, %rcx
dec %rsi
jnz 2b
3: lea (,%rsi,8), %rax
lea (,%rcx,8), %rax
ret

4: shl $5, %rcx
cmp $0xFF, %ah
je 5f
3: movdqa %xmm1, %xmm0
mov %edx, %eax
add $16, %rcx
4: cmp $0xFF, %al
jne 5f
punpckhqdq %xmm0, %xmm0
sub $8, %rsi
add $8, %rcx
5: movq %xmm0, %rax
bsr %rax, %rax
sub %rcx, %rsi
xor $63, %rax
lea 64(%rax,%rsi,8), %rax
bsf %rax, %rax
lea (%rax,%rcx,8), %rax
ret

6: add $16, %rsi
mov %edx, %eax
movdqa %xmm1, %xmm0
jmp 4b

MVFUNC_END
.endif

Expand All @@ -154,12 +136,12 @@ MVFUNC_DISPATCH_BEGIN pctz
pop %rdi
pop %rsi

.if HWCAP_AVX && HWCAP_LZCNT
.if HWCAP_AVX && HWCAP_BMI1
mov %eax, %r11d
and $(hebi_hwcap_avx+hebi_hwcap_lzcnt), %r11d
cmp $(hebi_hwcap_avx+hebi_hwcap_lzcnt), %r11d
and $(hebi_hwcap_avx+hebi_hwcap_bmi1), %r11d
cmp $(hebi_hwcap_avx+hebi_hwcap_bmi1), %r11d
jne 1f
lea hebi_pctz_avx_lzcnt__(%rip), %r10
lea hebi_pctz_avx_bmi1__(%rip), %r10
BREAK
.endif

Expand Down
4 changes: 2 additions & 2 deletions src/p/x86_64/x86_64.inc
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ FUNC_END_IMPL__
.equ hebi_hwcap_fma, 0x00000800
.equ hebi_hwcap_avx, 0x00001000
.equ hebi_hwcap_avx2, 0x00002000
.equ hebi_hwcap_bmi, 0x00004000
.equ hebi_hwcap_bmi1, 0x00004000
.equ hebi_hwcap_bmi2, 0x00008000
.equ hebi_hwcap_ermsb, 0x00010000
.equ hebi_hwcap_adx, 0x00020000
Expand Down Expand Up @@ -259,7 +259,7 @@ MVFUNC_END_IMPL__
.equ HWCAP_FMA, 0
.equ HWCAP_AVX, 0
.equ HWCAP_AVX2, 0
.equ HWCAP_BMI, 0
.equ HWCAP_BMI1, 0
.equ HWCAP_BMI2, 0
.equ HWCAP_ERMSB, 0
.equ HWCAP_ADX, 0
Expand Down

0 comments on commit b13c698

Please sign in to comment.