From 0fcdf129331a47f5aba0b40bb5c412abfa51bc3a Mon Sep 17 00:00:00 2001
From: iSage
Date: Thu, 26 Apr 2018 16:07:08 +0300
Subject: [PATCH 1/2] Runtime SSE4.2 detection: Detect presence of SSE4.2 as
 early as possible and choose between SSE and non-SSE string hashing
 functions at runtime.

This allows building LuaJIT without -msse4.2 while still getting a
performance gain on supported hardware.
---
 src/lib_jit.c                 | 101 +-----------------------------
 src/lj_jit.h                  |   1 +
 src/lj_state.c                | 114 ++++++++++++++++++++++++++++++++++
 src/lj_str.c                  |  17 ++---
 src/x64/src/lj_str_hash_x64.h |  73 ++++++++++++----------
 src/x64/test/unit_test.sh     |   0
 6 files changed, 167 insertions(+), 139 deletions(-)
 mode change 100644 => 100755 src/x64/test/unit_test.sh

diff --git a/src/lib_jit.c b/src/lib_jit.c
index 8768a20cc..b6efa4147 100644
--- a/src/lib_jit.c
+++ b/src/lib_jit.c
@@ -659,114 +659,15 @@ JIT_PARAMDEF(JIT_PARAMINIT)
 };
 #endif
 
-#if LJ_TARGET_ARM && LJ_TARGET_LINUX
-#include <sys/utsname.h>
-#endif
-
-/* Arch-dependent CPU detection. */
-static uint32_t jit_cpudetect(lua_State *L)
-{
-  uint32_t flags = 0;
-#if LJ_TARGET_X86ORX64
-  uint32_t vendor[4];
-  uint32_t features[4];
-  if (lj_vm_cpuid(0, vendor) && lj_vm_cpuid(1, features)) {
-#if !LJ_HASJIT
-#define JIT_F_SSE2 2
-#endif
-    flags |= ((features[3] >> 26)&1) * JIT_F_SSE2;
-#if LJ_HASJIT
-    flags |= ((features[2] >> 0)&1) * JIT_F_SSE3;
-    flags |= ((features[2] >> 19)&1) * JIT_F_SSE4_1;
-    if (vendor[2] == 0x6c65746e) {  /* Intel. */
-      if ((features[0] & 0x0fff0ff0) == 0x000106c0)  /* Atom. */
-	flags |= JIT_F_LEA_AGU;
-    } else if (vendor[2] == 0x444d4163) {  /* AMD. */
-      uint32_t fam = (features[0] & 0x0ff00f00);
-      if (fam >= 0x00000f00)  /* K8, K10. */
-	flags |= JIT_F_PREFER_IMUL;
-    }
-    if (vendor[0] >= 7) {
-      uint32_t xfeatures[4];
-      lj_vm_cpuid(7, xfeatures);
-      flags |= ((xfeatures[1] >> 8)&1) * JIT_F_BMI2;
-    }
-#endif
-  }
-  /* Check for required instruction set support on x86 (unnecessary on x64). */
-#if LJ_TARGET_X86
-  if (!(flags & JIT_F_SSE2))
-    luaL_error(L, "CPU with SSE2 required");
-#endif
-#elif LJ_TARGET_ARM
-#if LJ_HASJIT
-  int ver = LJ_ARCH_VERSION;  /* Compile-time ARM CPU detection. */
-#if LJ_TARGET_LINUX
-  if (ver < 70) {  /* Runtime ARM CPU detection. */
-    struct utsname ut;
-    uname(&ut);
-    if (strncmp(ut.machine, "armv", 4) == 0) {
-      if (ut.machine[4] >= '7')
-	ver = 70;
-      else if (ut.machine[4] == '6')
-	ver = 60;
-    }
-  }
-#endif
-  flags |= ver >= 70 ? JIT_F_ARMV7 :
-	   ver >= 61 ? JIT_F_ARMV6T2_ :
-	   ver >= 60 ? JIT_F_ARMV6_ : 0;
-  flags |= LJ_ARCH_HASFPU == 0 ? 0 : ver >= 70 ? JIT_F_VFPV3 : JIT_F_VFPV2;
-#endif
-#elif LJ_TARGET_ARM64
-  /* No optional CPU features to detect (for now). */
-#elif LJ_TARGET_PPC
-#if LJ_HASJIT
-#if LJ_ARCH_SQRT
-  flags |= JIT_F_SQRT;
-#endif
-#if LJ_ARCH_ROUND
-  flags |= JIT_F_ROUND;
-#endif
-#endif
-#elif LJ_TARGET_MIPS
-#if LJ_HASJIT
-  /* Compile-time MIPS CPU detection. */
-#if LJ_ARCH_VERSION >= 20
-  flags |= JIT_F_MIPSXXR2;
-#endif
-  /* Runtime MIPS CPU detection. */
-#if defined(__GNUC__)
-  if (!(flags & JIT_F_MIPSXXR2)) {
-    int x;
-#ifdef __mips16
-    x = 0;  /* Runtime detection is difficult. Ensure optimal -march flags. */
-#else
-    /* On MIPS32R1 rotr is treated as srl. rotr r2,r2,1 -> srl r2,r2,1. */
-    __asm__("li $2, 1\n\t.long 0x00221042\n\tmove %0, $2" : "=r"(x) : : "$2");
-#endif
-    if (x) flags |= JIT_F_MIPSXXR2;  /* Either 0x80000000 (R2) or 0 (R1). */
-  }
-#endif
-#endif
-#else
-#error "Missing CPU detection for this architecture"
-#endif
-  UNUSED(L);
-  return flags;
-}
 
 /* Initialize JIT compiler. */
 static void jit_init(lua_State *L)
 {
-  uint32_t flags = jit_cpudetect(L);
 #if LJ_HASJIT
   jit_State *J = L2J(L);
-  J->flags = flags | JIT_F_ON | JIT_F_OPT_DEFAULT;
+  J->flags = J->flags | JIT_F_ON | JIT_F_OPT_DEFAULT;
   memcpy(J->param, jit_param_default, sizeof(J->param));
   lj_dispatch_update(G(L));
-#else
-  UNUSED(flags);
 #endif
 }
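A note on the check itself: the SSE4.2 test that this patch moves into lj_state.c reduces to CPUID leaf 1, ECX bit 20, i.e. the ((features[2] >> 20)&1) expression in _cpudetect() below. The following standalone sketch performs the same probe through GCC/Clang's <cpuid.h> helper (an assumption for illustration only; LuaJIT itself goes through its own lj_vm_cpuid instead):

#include <cpuid.h>   /* __get_cpuid(); defines bit_SSE4_2 as (1 << 20) */
#include <stdio.h>

int main(void)
{
  unsigned int eax, ebx, ecx, edx;
  /* CPUID leaf 1: feature flags. ECX bit 20 signals SSE4.2. */
  if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) && (ecx & bit_SSE4_2))
    printf("SSE4.2 supported\n");
  else
    printf("SSE4.2 not supported\n");
  return 0;
}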
diff --git a/src/lj_jit.h b/src/lj_jit.h
index 2fea3859b..ef28dacc3 100644
--- a/src/lj_jit.h
+++ b/src/lj_jit.h
@@ -20,6 +20,7 @@
 #define JIT_F_PREFER_IMUL	0x00000080
 #define JIT_F_LEA_AGU		0x00000100
 #define JIT_F_BMI2		0x00000200
+#define JIT_F_SSE4_2		0x00000400
 
 /* Names for the CPU-specific flags. Must match the order above. */
 #define JIT_F_CPU_FIRST		JIT_F_SSE2
diff --git a/src/lj_state.c b/src/lj_state.c
index 632dd07e5..40f139dc4 100644
--- a/src/lj_state.c
+++ b/src/lj_state.c
@@ -180,6 +180,106 @@ static void close_state(lua_State *L)
   g->allocf(g->allocd, G2GG(g), sizeof(GG_State), 0);
 }
 
+#if LJ_TARGET_ARM && LJ_TARGET_LINUX
+#include <sys/utsname.h>
+#endif
+
+/* Arch-dependent CPU detection. */
+static uint32_t _cpudetect(lua_State *L)
+{
+  uint32_t flags = 0;
+#if LJ_TARGET_X86ORX64
+  uint32_t vendor[4];
+  uint32_t features[4];
+  if (lj_vm_cpuid(0, vendor) && lj_vm_cpuid(1, features)) {
+#if !LJ_HASJIT
+#define JIT_F_SSE2 2
+#endif
+    flags |= ((features[3] >> 26)&1) * JIT_F_SSE2;
+#if LJ_HASJIT
+    flags |= ((features[2] >> 0)&1) * JIT_F_SSE3;
+    flags |= ((features[2] >> 19)&1) * JIT_F_SSE4_1;
+    flags |= ((features[2] >> 20)&1) * JIT_F_SSE4_2;
+    if (vendor[2] == 0x6c65746e) {  /* Intel. */
+      if ((features[0] & 0x0fff0ff0) == 0x000106c0)  /* Atom. */
+	flags |= JIT_F_LEA_AGU;
+    } else if (vendor[2] == 0x444d4163) {  /* AMD. */
+      uint32_t fam = (features[0] & 0x0ff00f00);
+      if (fam >= 0x00000f00)  /* K8, K10. */
+	flags |= JIT_F_PREFER_IMUL;
+    }
+    if (vendor[0] >= 7) {
+      uint32_t xfeatures[4];
+      lj_vm_cpuid(7, xfeatures);
+      flags |= ((xfeatures[1] >> 8)&1) * JIT_F_BMI2;
+    }
+#endif
+  }
+  /* Check for required instruction set support on x86 (unnecessary on x64). */
+#if LJ_TARGET_X86
+  if (!(flags & JIT_F_SSE2))
+    luaL_error(L, "CPU with SSE2 required");
+#endif
+#elif LJ_TARGET_ARM
+#if LJ_HASJIT
+  int ver = LJ_ARCH_VERSION;  /* Compile-time ARM CPU detection. */
+#if LJ_TARGET_LINUX
+  if (ver < 70) {  /* Runtime ARM CPU detection. */
+    struct utsname ut;
+    uname(&ut);
+    if (strncmp(ut.machine, "armv", 4) == 0) {
+      if (ut.machine[4] >= '7')
+	ver = 70;
+      else if (ut.machine[4] == '6')
+	ver = 60;
+    }
+  }
+#endif
+  flags |= ver >= 70 ? JIT_F_ARMV7 :
+	   ver >= 61 ? JIT_F_ARMV6T2_ :
+	   ver >= 60 ? JIT_F_ARMV6_ : 0;
+  flags |= LJ_ARCH_HASFPU == 0 ? 0 : ver >= 70 ? JIT_F_VFPV3 : JIT_F_VFPV2;
+#endif
+#elif LJ_TARGET_ARM64
+  /* No optional CPU features to detect (for now). */
+#elif LJ_TARGET_PPC
+#if LJ_HASJIT
+#if LJ_ARCH_SQRT
+  flags |= JIT_F_SQRT;
+#endif
+#if LJ_ARCH_ROUND
+  flags |= JIT_F_ROUND;
+#endif
+#endif
+#elif LJ_TARGET_MIPS
+#if LJ_HASJIT
+  /* Compile-time MIPS CPU detection. */
+#if LJ_ARCH_VERSION >= 20
+  flags |= JIT_F_MIPSXXR2;
+#endif
+  /* Runtime MIPS CPU detection. */
+#if defined(__GNUC__)
+  if (!(flags & JIT_F_MIPSXXR2)) {
+    int x;
+#ifdef __mips16
+    x = 0;  /* Runtime detection is difficult. Ensure optimal -march flags. */
+#else
+    /* On MIPS32R1 rotr is treated as srl. rotr r2,r2,1 -> srl r2,r2,1. */
+    __asm__("li $2, 1\n\t.long 0x00221042\n\tmove %0, $2" : "=r"(x) : : "$2");
+#endif
+    if (x) flags |= JIT_F_MIPSXXR2;  /* Either 0x80000000 (R2) or 0 (R1). */
+  }
+#endif
+#endif
+#else
+#error "Missing CPU detection for this architecture"
+#endif
+  UNUSED(L);
+  return flags;
+}
+
+extern void x64_init_random(void);
+
 #if LJ_64 && !LJ_GC64 && !(defined(LUAJIT_USE_VALGRIND) && defined(LUAJIT_USE_SYSMALLOC))
 lua_State *lj_state_newstate(lua_Alloc f, void *ud)
 #else
@@ -188,7 +288,18 @@ LUA_API lua_State *lua_newstate(lua_Alloc f, void *ud)
 {
   GG_State *GG = (GG_State *)f(ud, NULL, 0, sizeof(GG_State));
   lua_State *L = &GG->L;
+
+  /* Detect CPU features as early as possible */
+  /* and init the random table if we have SSE4.2 support. */
+  uint32_t flags = _cpudetect(L);
+
+  if (flags & JIT_F_SSE4_2)
+  {
+    x64_init_random();
+  }
+
   global_State *g = &GG->g;
+
   if (GG == NULL || !checkptrGC(GG)) return NULL;
   memset(GG, 0, sizeof(GG_State));
   L->gct = ~LJ_TTHREAD;
@@ -219,6 +330,9 @@ LUA_API lua_State *lua_newstate(lua_Alloc f, void *ud)
   g->gc.stepmul = LUAI_GCMUL;
   lj_dispatch_init((GG_State *)L);
   L->status = LUA_ERRERR+1;  /* Avoid touching the stack upon memory error. */
+
+  G2J(g)->flags = flags;  /* Copy detected flags to the JIT state. */
+
   if (lj_vm_cpcall(L, NULL, NULL, cpluaopen) != 0) {
     /* Memory allocation error: free partial state. */
     close_state(L);
diff --git a/src/lj_str.c b/src/lj_str.c
index b9469ca00..3fc04fefa 100644
--- a/src/lj_str.c
+++ b/src/lj_str.c
@@ -11,6 +11,7 @@
 #include "lj_err.h"
 #include "lj_str.h"
 #include "lj_char.h"
+#include "lj_dispatch.h"  /* for G2J */
 
 /* -- String helpers ------------------------------------------------------ */
 
@@ -165,12 +166,6 @@ lj_str_indep_hash(GCstr *str)
 {
 #include "x64/src/lj_str_hash_x64.h"
 
-#if defined(LJ_ARCH_STR_HASH)
-#define LJ_STR_HASH LJ_ARCH_STR_HASH
-#else
-#define LJ_STR_HASH lj_str_original_hash
-#endif
-
 /* Intern a string and return string object. */
 GCstr *lj_str_new(lua_State *L, const char *str, size_t lenx)
 {
@@ -187,7 +182,15 @@ GCstr *lj_str_new(lua_State *L, const char *str, size_t lenx)
     return &g->strempty;
   }
 
-  h = LJ_STR_HASH(str, lenx);
+  /* Switch between SSE and non-SSE hash branches. */
+  if ((G2J(g)->flags & JIT_F_SSE4_2))
+  {
+    h = lj_str_sse_hash(str, lenx);
+  }
+  else
+  {
+    h = lj_str_original_hash(str, lenx);
+  }
 
   /* Check if the string has already been interned. */
   o = gcref(g->strhash[h & g->strmask]);
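The runtime branch above is taken on every string interning. An alternative worth noting: hoist the JIT_F_SSE4_2 test into a function pointer selected once at state creation. A self-contained sketch of that pattern, with trivial stand-ins for lj_str_sse_hash()/lj_str_original_hash() (all names here are hypothetical, not LuaJIT API):

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

#define SKETCH_F_SSE4_2 0x00000400u  /* mirrors JIT_F_SSE4_2 */

static uint32_t hash_sse_stub(const char *s, size_t n)  { (void)s; return (uint32_t)n ^ 1u; }
static uint32_t hash_orig_stub(const char *s, size_t n) { (void)s; return (uint32_t)n ^ 2u; }

/* Selected once, right after CPU detection. */
static uint32_t (*str_hash)(const char *, size_t);

static void str_hash_init(uint32_t cpuflags)
{
  str_hash = (cpuflags & SKETCH_F_SSE4_2) ? hash_sse_stub : hash_orig_stub;
}

int main(void)
{
  str_hash_init(SKETCH_F_SSE4_2);        /* pretend SSE4.2 was detected */
  printf("%u\n", str_hash("hello", 5));  /* dispatches to the SSE stub */
  return 0;
}

The patch keeps the explicit per-call branch instead, which is cheap, well predicted, and avoids an indirect call on a hot path.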
diff --git a/src/x64/src/lj_str_hash_x64.h b/src/x64/src/lj_str_hash_x64.h
index 063f631c7..ca1ac1919 100644
--- a/src/x64/src/lj_str_hash_x64.h
+++ b/src/x64/src/lj_str_hash_x64.h
@@ -8,13 +8,12 @@
 #ifndef _LJ_STR_HASH_X64_H_
 #define _LJ_STR_HASH_X64_H_
 
-#if defined(__SSE4_2__) && defined(__x86_64) && defined(__GNUC__)
+#if defined(__x86_64) && defined(__GNUC__)
 
 #include <stdint.h>
 #include <sys/types.h>
 #include <unistd.h>
 #include <time.h>
-#include <smmintrin.h>
 
 #include "../../lj_def.h"
 
@@ -26,6 +25,24 @@
 #define srandom(seed) srand(seed)
 #endif
 
+inline __attribute__((always_inline)) uint64_t asm_crc32_u64(uint64_t crc, uint64_t value) {
+//  printf("CRC!");
+  asm("crc32q %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value));
+  return crc;
+}
+
+inline __attribute__((always_inline)) uint32_t asm_crc32_u32(uint32_t crc, uint32_t value) {
+//  printf("CRC!");
+  asm("crc32l %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value));
+  return crc;
+}
+
+inline __attribute__((always_inline)) uint32_t asm_crc32_u8(uint32_t crc, uint8_t value) {
+//  printf("CRC!");
+  asm("crc32b %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value));
+  return crc;
+}
+
 static const uint64_t* cast_uint64p(const char* str)
 {
   return (const uint64_t*)(void*)str;
@@ -48,7 +65,7 @@ static LJ_AINLINE uint32_t lj_str_hash_1_4(const char* str, uint32_t len)
   v = (v << 8) | str[len >> 1];
   v = (v << 8) | str[len - 1];
   v = (v << 8) | len;
-  return _mm_crc32_u32(0, v);
+  return asm_crc32_u32(0, v);
 #else
   uint32_t a, b, h = len;
@@ -78,9 +95,9 @@ static LJ_AINLINE uint32_t lj_str_hash_4_16(const char* str, uint32_t len)
     v2 = *cast_uint32p(str + len - 4);
   }
 
-  h = _mm_crc32_u32(0, len);
-  h = _mm_crc32_u64(h, v1);
-  h = _mm_crc32_u64(h, v2);
+  h = asm_crc32_u32(0, len);
+  h = asm_crc32_u64(h, v1);
+  h = asm_crc32_u64(h, v2);
   return h;
 }
 
@@ -90,18 +107,18 @@ static uint32_t lj_str_hash_16_128(const char* str, uint32_t len)
   uint64_t h1, h2;
   uint32_t i;
 
-  h1 = _mm_crc32_u32(0, len);
+  h1 = asm_crc32_u32(0, len);
   h2 = 0;
 
   for (i = 0; i < len - 16; i += 16) {
-    h1 += _mm_crc32_u64(h1, *cast_uint64p(str + i));
-    h2 += _mm_crc32_u64(h2, *cast_uint64p(str + i + 8));
+    h1 += asm_crc32_u64(h1, *cast_uint64p(str + i));
+    h2 += asm_crc32_u64(h2, *cast_uint64p(str + i + 8));
   };
 
-  h1 = _mm_crc32_u64(h1, *cast_uint64p(str + len - 16));
-  h2 = _mm_crc32_u64(h2, *cast_uint64p(str + len - 8));
+  h1 = asm_crc32_u64(h1, *cast_uint64p(str + len - 16));
+  h2 = asm_crc32_u64(h2, *cast_uint64p(str + len - 8));
 
-  return _mm_crc32_u32(h1, h2);
+  return asm_crc32_u32(h1, h2);
 }
 
 /* **************************************************************************
@@ -144,7 +161,7 @@ static LJ_AINLINE uint32_t log2_floor(uint32_t n)
 /* This function is to populate `random_pos` such that random_pos[i][*]
  * contains random value in the range of [2**i, 2**(i+1)). */
-static void x64_init_random(void)
+void x64_init_random(void)
 {
   int i, seed, rml;
@@ -155,8 +172,8 @@ void x64_init_random(void)
   }
 
   /* Init seed */
-  seed = _mm_crc32_u32(0, getpid());
-  seed = _mm_crc32_u32(seed, time(NULL));
+  seed = asm_crc32_u32(0, getpid());
+  seed = asm_crc32_u32(seed, time(NULL));
   srandom(seed);
 
   /* Now start to populate the random_pos[][]. */
@@ -185,11 +202,6 @@ void x64_init_random(void)
 }
 #undef POW2_MASK
 
-void __attribute__((constructor)) x64_init_random_constructor()
-{
-  x64_init_random();
-}
-
 /* Return a pre-computed random number in the range of [1**chunk_sz_order,
  * 1**(chunk_sz_order+1)).  It is "unsafe" in the sense that the return value
  * may be greater than chunk-size; it is up to the caller to make sure
@@ -216,7 +228,7 @@ static LJ_NOINLINE uint32_t lj_str_hash_128_above(const char* str,
   pos1 = get_random_pos_unsafe(chunk_sz_log2, 0);
   pos2 = get_random_pos_unsafe(chunk_sz_log2, 1);
 
-  h1 = _mm_crc32_u32(0, len);
+  h1 = asm_crc32_u32(0, len);
   h2 = 0;
 
   /* loop over 14 chunks, 2 chunks at a time */
@@ -224,29 +236,29 @@ static LJ_NOINLINE uint32_t lj_str_hash_128_above(const char* str,
        chunk_ptr += chunk_sz, i++) {
     v = *cast_uint64p(chunk_ptr + pos1);
-    h1 = _mm_crc32_u64(h1, v);
+    h1 = asm_crc32_u64(h1, v);
 
     v = *cast_uint64p(chunk_ptr + chunk_sz + pos2);
-    h2 = _mm_crc32_u64(h2, v);
+    h2 = asm_crc32_u64(h2, v);
   }
 
   /* the last two chunks */
   v = *cast_uint64p(chunk_ptr + pos1);
-  h1 = _mm_crc32_u64(h1, v);
+  h1 = asm_crc32_u64(h1, v);
 
   v = *cast_uint64p(chunk_ptr + chunk_sz - 8 - pos2);
-  h2 = _mm_crc32_u64(h2, v);
+  h2 = asm_crc32_u64(h2, v);
 
   /* process the trailing part */
-  h1 = _mm_crc32_u64(h1, *cast_uint64p(str));
-  h2 = _mm_crc32_u64(h2, *cast_uint64p(str + len - 8));
+  h1 = asm_crc32_u64(h1, *cast_uint64p(str));
+  h2 = asm_crc32_u64(h2, *cast_uint64p(str + len - 8));
 
-  h1 = _mm_crc32_u32(h1, h2);
+  h1 = asm_crc32_u32(h1, h2);
   return h1;
 }
 
 /* NOTE: the "len" should not be zero */
-static LJ_AINLINE uint32_t lj_str_hash(const char* str, size_t len)
+static LJ_AINLINE uint32_t lj_str_sse_hash(const char* str, size_t len)
 {
   if (len < 128) {
     if (len >= 16) { /* [16, 128) */
@@ -264,8 +276,5 @@ static LJ_AINLINE uint32_t lj_str_hash(const char* str, size_t len)
   return lj_str_hash_128_above(str, len);
 }
 
-#define LJ_ARCH_STR_HASH lj_str_hash
-#else
-#undef LJ_ARCH_STR_HASH
 #endif
 
 #endif /*_LJ_STR_HASH_X64_H_*/
diff --git a/src/x64/test/unit_test.sh b/src/x64/test/unit_test.sh
old mode 100644
new mode 100755

From 0ea7cb41a17c62270cd52c73df63715d88285c00 Mon Sep 17 00:00:00 2001
From: iSage
Date: Tue, 8 May 2018 12:03:42 +0300
Subject: [PATCH 2/2] Rework SSE detection patch to use GCC intrinsics

---
 src/Makefile                  |  2 ++
 src/lj_state.c                |  2 ++
 src/lj_str.c                  |  4 +++
 src/x64/src/lj_str_hash_x64.h | 61 +++++++++++++----------------
 4 files changed, 30 insertions(+), 39 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 71ca028cd..a5bac2404 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -37,6 +37,8 @@ CC= $(DEFAULT_CC)
 # unwinding are not affected -- the assembler part has frame unwind
 # information and GCC emits it where needed (x64) or with -g (see CCDEBUG).
 CCOPT= -O2 -fomit-frame-pointer
+# Only apply -msse4.2 to the target or the host minilua will fail on non-SSE4.2 hardware:
+# TARGET_CFLAGS=-msse4.2
 # Use this if you want to generate a smaller binary (but it's slower):
 #CCOPT= -Os -fomit-frame-pointer
 # Note: it's no longer recommended to use -O3 with GCC 4.x.
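This second patch drops the hand-rolled crc32 inline assembly in favor of the compiler's SSE4.2 intrinsics; both forms encode the same crc32 instruction, so the hashes are unchanged. A quick standalone check of that equivalence (a hypothetical test file, not part of the patch; build with gcc -msse4.2 on x86-64):

#include <nmmintrin.h>  /* _mm_crc32_u64, the SSE4.2 intrinsic */
#include <stdint.h>
#include <stdio.h>

/* Same inline asm as patch 1's asm_crc32_u64(). */
static uint64_t asm_crc32_u64(uint64_t crc, uint64_t value)
{
  __asm__("crc32q %[value], %[crc]" : [crc] "+r" (crc) : [value] "rm" (value));
  return crc;
}

int main(void)
{
  uint64_t v = 0x0123456789abcdefULL;
  uint64_t a = asm_crc32_u64(0, v);
  uint64_t b = _mm_crc32_u64(0, v);
  printf("asm=%llx intrinsic=%llx (%s)\n", (unsigned long long)a,
         (unsigned long long)b, a == b ? "match" : "MISMATCH");
  return a != b;
}

Using the intrinsics also lets the #if defined(__SSE4_2__) guard do double duty: without -msse4.2 the SSE branch simply compiles out, instead of sneaking crc32 instructions into a build that must run on older CPUs.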
diff --git a/src/lj_state.c b/src/lj_state.c
index 40f139dc4..04001dd14 100644
--- a/src/lj_state.c
+++ b/src/lj_state.c
@@ -293,10 +293,12 @@ LUA_API lua_State *lua_newstate(lua_Alloc f, void *ud)
   /* and init the random table if we have SSE4.2 support. */
   uint32_t flags = _cpudetect(L);
 
+#if defined(__SSE4_2__)
   if (flags & JIT_F_SSE4_2)
   {
     x64_init_random();
   }
+#endif
 
   global_State *g = &GG->g;
 
diff --git a/src/lj_str.c b/src/lj_str.c
index 3fc04fefa..b72c98e7d 100644
--- a/src/lj_str.c
+++ b/src/lj_str.c
@@ -183,6 +183,7 @@ GCstr *lj_str_new(lua_State *L, const char *str, size_t lenx)
   }
 
   /* Switch between SSE and non-SSE hash branches. */
+#if defined(__SSE4_2__)
   if ((G2J(g)->flags & JIT_F_SSE4_2))
   {
     h = lj_str_sse_hash(str, lenx);
@@ -191,6 +192,9 @@ GCstr *lj_str_new(lua_State *L, const char *str, size_t lenx)
   {
     h = lj_str_original_hash(str, lenx);
   }
+#else
+  h = lj_str_original_hash(str, lenx);
+#endif
 
   /* Check if the string has already been interned. */
   o = gcref(g->strhash[h & g->strmask]);
diff --git a/src/x64/src/lj_str_hash_x64.h b/src/x64/src/lj_str_hash_x64.h
index ca1ac1919..058d2038e 100644
--- a/src/x64/src/lj_str_hash_x64.h
+++ b/src/x64/src/lj_str_hash_x64.h
@@ -8,12 +8,13 @@
 #ifndef _LJ_STR_HASH_X64_H_
 #define _LJ_STR_HASH_X64_H_
 
-#if defined(__x86_64) && defined(__GNUC__)
+#if defined(__SSE4_2__) && defined(__x86_64) && defined(__GNUC__)
 
 #include <stdint.h>
 #include <sys/types.h>
 #include <unistd.h>
 #include <time.h>
+#include <smmintrin.h>
 
 #include "../../lj_def.h"
 
@@ -25,24 +26,6 @@
 #define srandom(seed) srand(seed)
 #endif
 
-inline __attribute__((always_inline)) uint64_t asm_crc32_u64(uint64_t crc, uint64_t value) {
-//  printf("CRC!");
-  asm("crc32q %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value));
-  return crc;
-}
-
-inline __attribute__((always_inline)) uint32_t asm_crc32_u32(uint32_t crc, uint32_t value) {
-//  printf("CRC!");
-  asm("crc32l %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value));
-  return crc;
-}
-
-inline __attribute__((always_inline)) uint32_t asm_crc32_u8(uint32_t crc, uint8_t value) {
-//  printf("CRC!");
-  asm("crc32b %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value));
-  return crc;
-}
-
 static const uint64_t* cast_uint64p(const char* str)
 {
   return (const uint64_t*)(void*)str;
@@ -65,7 +48,7 @@ static LJ_AINLINE uint32_t lj_str_hash_1_4(const char* str, uint32_t len)
   v = (v << 8) | str[len >> 1];
   v = (v << 8) | str[len - 1];
   v = (v << 8) | len;
-  return asm_crc32_u32(0, v);
+  return _mm_crc32_u32(0, v);
 #else
   uint32_t a, b, h = len;
@@ -95,9 +78,9 @@ static LJ_AINLINE uint32_t lj_str_hash_4_16(const char* str, uint32_t len)
     v2 = *cast_uint32p(str + len - 4);
   }
 
-  h = asm_crc32_u32(0, len);
-  h = asm_crc32_u64(h, v1);
-  h = asm_crc32_u64(h, v2);
+  h = _mm_crc32_u32(0, len);
+  h = _mm_crc32_u64(h, v1);
+  h = _mm_crc32_u64(h, v2);
   return h;
 }
 
@@ -107,18 +90,18 @@ static uint32_t lj_str_hash_16_128(const char* str, uint32_t len)
   uint64_t h1, h2;
   uint32_t i;
 
-  h1 = asm_crc32_u32(0, len);
+  h1 = _mm_crc32_u32(0, len);
   h2 = 0;
 
   for (i = 0; i < len - 16; i += 16) {
-    h1 += asm_crc32_u64(h1, *cast_uint64p(str + i));
-    h2 += asm_crc32_u64(h2, *cast_uint64p(str + i + 8));
+    h1 += _mm_crc32_u64(h1, *cast_uint64p(str + i));
+    h2 += _mm_crc32_u64(h2, *cast_uint64p(str + i + 8));
   };
 
-  h1 = asm_crc32_u64(h1, *cast_uint64p(str + len - 16));
-  h2 = asm_crc32_u64(h2, *cast_uint64p(str + len - 8));
+  h1 = _mm_crc32_u64(h1, *cast_uint64p(str + len - 16));
+  h2 = _mm_crc32_u64(h2, *cast_uint64p(str + len - 8));
 
-  return asm_crc32_u32(h1, h2);
+  return _mm_crc32_u32(h1, h2);
 }
 
 /*
************************************************************************** @@ -172,8 +155,8 @@ void x64_init_random(void) } /* Init seed */ - seed = asm_crc32_u32(0, getpid()); - seed = asm_crc32_u32(seed, time(NULL)); + seed = _mm_crc32_u32(0, getpid()); + seed = _mm_crc32_u32(seed, time(NULL)); srandom(seed); /* Now start to populate the random_pos[][]. */ @@ -228,7 +211,7 @@ static LJ_NOINLINE uint32_t lj_str_hash_128_above(const char* str, pos1 = get_random_pos_unsafe(chunk_sz_log2, 0); pos2 = get_random_pos_unsafe(chunk_sz_log2, 1); - h1 = asm_crc32_u32(0, len); + h1 = _mm_crc32_u32(0, len); h2 = 0; /* loop over 14 chunks, 2 chunks at a time */ @@ -236,24 +219,24 @@ static LJ_NOINLINE uint32_t lj_str_hash_128_above(const char* str, chunk_ptr += chunk_sz, i++) { v = *cast_uint64p(chunk_ptr + pos1); - h1 = asm_crc32_u64(h1, v); + h1 = _mm_crc32_u64(h1, v); v = *cast_uint64p(chunk_ptr + chunk_sz + pos2); - h2 = asm_crc32_u64(h2, v); + h2 = _mm_crc32_u64(h2, v); } /* the last two chunks */ v = *cast_uint64p(chunk_ptr + pos1); - h1 = asm_crc32_u64(h1, v); + h1 = _mm_crc32_u64(h1, v); v = *cast_uint64p(chunk_ptr + chunk_sz - 8 - pos2); - h2 = asm_crc32_u64(h2, v); + h2 = _mm_crc32_u64(h2, v); /* process the trailing part */ - h1 = asm_crc32_u64(h1, *cast_uint64p(str)); - h2 = asm_crc32_u64(h2, *cast_uint64p(str + len - 8)); + h1 = _mm_crc32_u64(h1, *cast_uint64p(str)); + h2 = _mm_crc32_u64(h2, *cast_uint64p(str + len - 8)); - h1 = asm_crc32_u32(h1, h2); + h1 = _mm_crc32_u32(h1, h2); return h1; }
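For readers who want to experiment with the hashing scheme outside of LuaJIT: the dual-lane CRC32 folding that lj_str_hash_16_128() uses (and that lj_str_hash_128_above() applies to sampled chunks) is easy to exercise standalone. A simplified rendition, assuming SSE4.2 hardware and substituting memcpy() for the cast_uint64p() loads (build with gcc -msse4.2):

#include <nmmintrin.h>  /* _mm_crc32_u32/_mm_crc32_u64 */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Two CRC32 chains advance over interleaved 8-byte words and are
** folded together at the end; valid for 16 <= len < 128, as in
** lj_str_hash_16_128() above. */
static uint32_t hash_16_128(const char *str, uint32_t len)
{
  uint64_t h1 = _mm_crc32_u32(0, len), h2 = 0, v;
  uint32_t i;
  for (i = 0; i < len - 16; i += 16) {
    memcpy(&v, str + i, 8);      h1 += _mm_crc32_u64(h1, v);
    memcpy(&v, str + i + 8, 8);  h2 += _mm_crc32_u64(h2, v);
  }
  memcpy(&v, str + len - 16, 8); h1 = _mm_crc32_u64(h1, v);
  memcpy(&v, str + len - 8, 8);  h2 = _mm_crc32_u64(h2, v);
  return _mm_crc32_u32((uint32_t)h1, (uint32_t)h2);
}

int main(void)
{
  const char *s = "the quick brown fox jumps over the lazy dog";
  printf("%08x\n", hash_16_128(s, (uint32_t)strlen(s)));
  return 0;
}

Two independent chains let the CPU overlap the crc32 instruction's multi-cycle latency, and the randomized sampling in lj_str_hash_128_above() both bounds the work for long strings and makes the interning hash harder to flood with collisions.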