diff --git a/common/cliident.cpp b/common/cliident.cpp index 9fb7f8394..74a64a28e 100644 --- a/common/cliident.cpp +++ b/common/cliident.cpp @@ -542,7 +542,7 @@ int CliIsDevelVersion(void) const char *CliGetFullVersionDescriptor(void) { - static char buffer[10+32+sizeof("v"CLIENT_VERSIONSTRING"-XXX-99071523-*dev* client for "CLIENT_OS_NAME_EXTENDED)]; + static char buffer[10+32+sizeof("v" CLIENT_VERSIONSTRING "-XXX-99071523-*dev* client for " CLIENT_OS_NAME_EXTENDED)]; struct timeval tv; tv.tv_usec = 0; tv.tv_sec = CliGetNewestModuleTime(); sprintf( buffer, "%s v" CLIENT_VERSIONSTRING "-" @@ -559,7 +559,7 @@ const char *CliGetFullVersionDescriptor(void) "%c" /* limited release or dev branch or public release */ "-%s" /* date is in bugzilla format yymmddhh */ "%s" /* "-*dev*" or "" */ - " for "CLIENT_OS_NAME_EXTENDED, + " for " CLIENT_OS_NAME_EXTENDED, utilGetAppName(), ((ConIsGUI())?('G'):('C')), ((CliIsDevelVersion())?('L'):('R')), diff --git a/common/confopt.cpp b/common/confopt.cpp index 7c9957cda..1999defbc 100644 --- a/common/confopt.cpp +++ b/common/confopt.cpp @@ -405,7 +405,7 @@ struct optionstruct conf_options[CONF_OPTION_COUNT] = { "\n" "It is possible to have the client rotate through this list, updating its\n" "buffers only once for each pass. 
To do so, 'Dialup-link detection'\n" - "and '"ADDITIONAL_BUFFLEVEL_CHECK_OPTION_NAME"' must be disabled since a buffer\n" + "and '" ADDITIONAL_BUFFLEVEL_CHECK_OPTION_NAME "' must be disabled since a buffer\n" "update (new work being made available) would otherwise cause the client\n" "to go back to the beginning of the load order.\n" /*) */,CONF_MENU_BUFF,CONF_TYPE_ASCIIZ,NULL,NULL,0,0,NULL,NULL @@ -439,13 +439,13 @@ struct optionstruct conf_options[CONF_OPTION_COUNT] = { CONF_FREQUENT_FREQUENCY , /* CONF_MENU_BUFF */ CFGTXT("Buffer-level check interval"), "0:00 (on buffer change)", /*CFGTXT(*/ - "This option determines how often '"ADDITIONAL_BUFFLEVEL_CHECK_OPTION_NAME"'\n" + "This option determines how often '" ADDITIONAL_BUFFLEVEL_CHECK_OPTION_NAME "'\n" "should be performed. (More precisely: how much time must elapse between\n" "buffer-level checks)\n" "\n" "This setting is meaningful only if one of the extensions to normal threshold\n" "management is enabled: either implicitly when 'Dialup detection options' are\n" - "active or explicitly with '"ADDITIONAL_BUFFLEVEL_CHECK_OPTION_NAME"'.\n" + "active or explicitly with '" ADDITIONAL_BUFFLEVEL_CHECK_OPTION_NAME "'.\n" "\n" "The interval specified here is in hours and minutes, and the default denotes\n" "that the client should check buffer-levels whenever it detects a change (by\n" @@ -456,13 +456,13 @@ struct optionstruct conf_options[CONF_OPTION_COUNT] = { CONF_FREQUENT_RETRY_FREQUENCY , /* CONF_MENU_BUFF */ CFGTXT("Buffer-level check retry interval"), "0:00 (no delay)", /*CFGTXT(*/ - "This option determines how often '"ADDITIONAL_BUFFLEVEL_CHECK_OPTION_NAME"'\n" + "This option determines how often '" ADDITIONAL_BUFFLEVEL_CHECK_OPTION_NAME "'\n" "should be retried after failure. 
(More precisely: how much time must elapse\n" "between buffer-level check retries)\n" "\n" "This setting is meaningful only if one of the extensions to normal threshold\n" "management is enabled: either implicitly when 'Dialup detection options' are\n" - "active or explicitly with '"ADDITIONAL_BUFFLEVEL_CHECK_OPTION_NAME"'.\n" + "active or explicitly with '" ADDITIONAL_BUFFLEVEL_CHECK_OPTION_NAME "'.\n" "\n" "The interval specified here is in hours and minutes, and the default denotes\n" "that the client should retry the buffer-level checks at most twice per minute\n" @@ -503,7 +503,7 @@ struct optionstruct conf_options[CONF_OPTION_COUNT] = { "should be used instead. If that too is unspecified, then the client will\n" "use defaults.\n" "\n" - "* See also: '"ADDITIONAL_BUFFLEVEL_CHECK_OPTION_NAME"'\n" + "* See also: '" ADDITIONAL_BUFFLEVEL_CHECK_OPTION_NAME "'\n" ,CONF_MENU_BUFF,CONF_TYPE_IARRAY,NULL,NULL,1,0xffff,NULL,NULL }, { @@ -523,7 +523,7 @@ struct optionstruct conf_options[CONF_OPTION_COUNT] = { "unprocessed packet cannot be predicted.\n" #endif "\n" - "* See also: '"ADDITIONAL_BUFFLEVEL_CHECK_OPTION_NAME"'\n" + "* See also: '" ADDITIONAL_BUFFLEVEL_CHECK_OPTION_NAME "'\n" ,CONF_MENU_BUFF,CONF_TYPE_IARRAY,NULL,NULL,0,(14*24),NULL,NULL }, diff --git a/common/core_ogr_ng.cpp b/common/core_ogr_ng.cpp index 6115aca55..8f9a8bc89 100644 --- a/common/core_ogr_ng.cpp +++ b/common/core_ogr_ng.cpp @@ -84,6 +84,7 @@ return "@(#)$Id: core_ogr_ng.cpp,v 1.47 2015/06/27 21:43:52 zebe Exp $"; } CoreDispatchTable *ogrng64_get_dispatch_table_cj1_generic(void); CoreDispatchTable *ogrng64_get_dispatch_table_cj1_sse2(void); CoreDispatchTable *ogrng64_get_dispatch_table_cj1_sse2_lzcnt(void); + CoreDispatchTable *ogrng64_get_dispatch_table_cj1_avx2(void); #elif (CLIENT_CPU == CPU_SPARC) && (SIZEOF_LONG == 8) CoreDispatchTable *ogrng64_get_dispatch_table(void); #elif (CLIENT_CPU == CPU_S390X) && (SIZEOF_LONG == 8) @@ -166,6 +167,7 @@ int InitializeCoreTable_ogr_ng(int first_time) 
ogrng64_get_dispatch_table_cj1_generic(); ogrng64_get_dispatch_table_cj1_sse2(); ogrng64_get_dispatch_table_cj1_sse2_lzcnt(); + ogrng64_get_dispatch_table_cj1_avx2(); #elif (CLIENT_CPU == CPU_S390) ogrng_get_dispatch_table(); #elif (CLIENT_CPU == CPU_S390X) @@ -225,6 +227,7 @@ const char **corenames_for_contest_ogr_ng() "cj-asm-generic", "cj-asm-sse2", "cj-asm-sse2-lzcnt", + "cj-asm-avx2", #elif (CLIENT_CPU == CPU_ARM) "FLEGE 2.0", "FLEGE 2.0 ARMv3", @@ -356,6 +359,8 @@ int apply_selcore_substitution_rules_ogr_ng(int cindex) # endif # elif (CLIENT_CPU == CPU_AMD64) unsigned feature = GetProcessorFeatureFlags(); + if (cindex == 4 && !(feature & CPU_F_AVX2)) /* Core 4 needs AVX2 */ + cindex = 2; /* If no AVX2, try SSE2 */ if (cindex == 3 && !(feature & CPU_F_LZCNT)) /* Core 3 needs LZCNT */ cindex = 2; /* If no LZCNT, try SSE2 */ if (cindex == 2 && !(feature & CPU_F_SSE2)) /* Core 2 needs SSE2 */ @@ -504,8 +509,11 @@ int selcoreGetPreselectedCoreForProject_ogr_ng() } if (cindex == -1) { + /* Assume that if AVX2 is availble it is the best choice */ + if (detected_flags & CPU_F_AVX2) + cindex = 4; /* Assume that LZCNT+SSE2 is better then plain SSE2 everywhere */ - if (detected_flags & CPU_F_LZCNT) + else if (detected_flags & CPU_F_LZCNT) cindex = 3; else if (detected_flags & CPU_F_SSE2) cindex = 2; /* sse2 core */ @@ -638,6 +646,8 @@ int selcoreSelectCore_ogr_ng(Client *client, unsigned int threadindex, unit_func.ogr = ogrng64_get_dispatch_table_cj1_sse2(); else if (coresel == 3) unit_func.ogr = ogrng64_get_dispatch_table_cj1_sse2_lzcnt(); + else if (coresel == 4) + unit_func.ogr = ogrng64_get_dispatch_table_cj1_avx2(); else { unit_func.ogr = ogrng64_get_dispatch_table(); diff --git a/common/cpucheck.cpp b/common/cpucheck.cpp index 9d5c410f9..f4a1cf782 100644 --- a/common/cpucheck.cpp +++ b/common/cpucheck.cpp @@ -1344,7 +1344,8 @@ long __GetRawProcessorID(const char **cpuname, int whattoret = 0 ) * 0x19 - Sandy Bridge Core iX-2xxx * 0x1A - Ivy Bridge Core iX-3xxx * 
0x1B - Haswell Core iX-4xxx - * 0x1C-1F + * 0x1C - Kaby Lake Core iX-7xxx + * 0x1D-1F * 0x20 - AMD Bobcat - Embedded APU * 0x21 - AMD Bulldozer - FX * 0x22 - AMD Husky - APU @@ -1488,6 +1489,7 @@ long __GetRawProcessorID(const char **cpuname, int whattoret = 0 ) { 0x0006450, 0xFFFFFF0, CPU_F_I686, 0x1B, "Core iX-4xxx (Haswell)" }, /* (#4579) */ { 0x0006460, 0xFFFFFF0, CPU_F_I686, 0x1B, "Core iX-4xxx (Haswell)" }, { 0x00065E0, 0xFFFFFF0, CPU_F_I686, 0x1B, "Core iX-6xxx (Skylake)" }, /* (#4615) */ + { 0x00069E0, 0xFFFFFF0, CPU_F_I686, 0x1C, "Core iX-7xxx (Kaby Lake)" }, { 0x0000000, 0, 0, 0, NULL } }; internalxref = &intelxref[0]; } diff --git a/common/cpucheck.h b/common/cpucheck.h index 3d4dcc5fe..dac20eb09 100644 --- a/common/cpucheck.h +++ b/common/cpucheck.h @@ -24,13 +24,13 @@ #define CPU_F_SSE (0x00002000L) #define CPU_F_SSE2 (0x00004000L) #define CPU_F_SSE3 (0x00008000L) - #define CPU_F_HYPERTHREAD (0x00010000L) /* supported and enabled */ + #define CPU_F_HYPERTHREAD (0x00010000L) /* supported and enabled */ #define CPU_F_AMD64 (0x00020000L) #define CPU_F_EM64T (0x00040000L) #define CPU_F_SSE4_1 (0x00080000L) #define CPU_F_SSE4_2 (0x00100000L) #define CPU_F_SSSE3 (0x00200000L) - #define CPU_F_LZCNT (0x00400000L) + #define CPU_F_LZCNT (0x00400000L) #define CPU_F_AVX_DISABLED (0x00800000L) /* supported but disabled (no OS support) */ #define CPU_F_AVX (0x01000000L) /* supported and enabled */ #define CPU_F_AVX2 (0x02000000L) /* supported and enabled */ diff --git a/common/mail.cpp b/common/mail.cpp index 7d73b324b..975a071d3 100644 --- a/common/mail.cpp +++ b/common/mail.cpp @@ -564,8 +564,8 @@ static int smtp_send_message_header( void * net, if (errcode == 0) //send the date { sprintf( buffer, "\r\nDate: %s" - "\r\nX-Mailer: distributed.net v"CLIENT_VERSIONSTRING - " client for "CLIENT_OS_NAME_EXTENDED, rfc822Date( buffer + 256 ) ); + "\r\nX-Mailer: distributed.net v" CLIENT_VERSIONSTRING + " client for " CLIENT_OS_NAME_EXTENDED, rfc822Date( buffer + 256 ) ); 
if ( put_smtp_line( net, buffer, strlen( buffer ) ) ) errcode = -1; } diff --git a/common/problem.h b/common/problem.h index 4a6178979..3f6d07e3a 100644 --- a/common/problem.h +++ b/common/problem.h @@ -47,10 +47,10 @@ #define CORE_MEM_ALIGNMENT 4 #endif #else - // For x86, alignment must be 8 for MMX core and 16 for SSE. - #if CORE_MEM_ALIGNMENT < 4 + // For x86, alignment must be 8 for MMX core, 16 for SSE and 32 for AVX2. + #if CORE_MEM_ALIGNMENT < 5 #undef CORE_MEM_ALIGNMENT - #define CORE_MEM_ALIGNMENT 4 + #define CORE_MEM_ALIGNMENT 5 #endif #endif #endif diff --git a/common/util.cpp b/common/util.cpp index b55b99c7e..d2a40a7bb 100644 --- a/common/util.cpp +++ b/common/util.cpp @@ -73,7 +73,7 @@ void trace_setsrc( const char *filename ) void trace_out( int indlevel, const char *format, ... ) { static int indentlevel = -1; /* uninitialized */ - const char *tracefile = "trace"EXTN_SEP"out"; + const char *tracefile = "trace" EXTN_SEP "out"; int old_errno = errno; FILE *file; va_list arglist; diff --git a/configure b/configure index d35d740f2..fbccb8dbc 100755 --- a/configure +++ b/configure @@ -659,9 +659,11 @@ add_sources() # $1=os, $2=arch, $3=custom TARGET_ADDSRCS="$TARGET_ADDSRCS $OGR/amd64/ogrng64-cj1-generic.cpp" TARGET_ADDSRCS="$TARGET_ADDSRCS $OGR/amd64/ogrng64-cj1-sse2.cpp" TARGET_ADDSRCS="$TARGET_ADDSRCS $OGR/amd64/ogrng64-cj1-sse2-lzcnt.cpp" + TARGET_ADDSRCS="$TARGET_ADDSRCS $OGR/amd64/ogrng64-cj1-avx2.cpp" TARGET_ADDNASMS="$TARGET_ADDNASMS $OGR/amd64/ogrng64-cj1-generic-asm.asm" TARGET_ADDNASMS="$TARGET_ADDNASMS $OGR/amd64/ogrng64-cj1-sse2-asm.asm" TARGET_ADDNASMS="$TARGET_ADDNASMS $OGR/amd64/ogrng64-cj1-sse2-lzcnt-asm.asm" + TARGET_ADDNASMS="$TARGET_ADDNASMS $OGR/amd64/ogrng64-cj1-avx2-asm.asm" fi if [ "$HAVE_OGR_P2" = "1" ]; then diff --git a/makefile.vc b/makefile.vc index a710678d4..c8267d0e7 100644 --- a/makefile.vc +++ b/makefile.vc @@ -136,7 +136,7 @@ ZIPEXTRAS = \ OPTS_MSVC = -nologo -D__WIN32__ \ -W4 -GR- -GA -GF -Gy \ - -Dsnprintf=_snprintf 
-DHAVE_SNPRINTF \ + -DHAVE_SNPRINTF \ -D_M_$(OPTS_M_PLAT) $(OPTS_CC_CPU) $(OPTS_CC_DEBUG) ## *** +++++++++++++++++++++++++++++++++++++++++ OPTS_LIBS = advapi32.lib user32.lib kernel32.lib gdi32.lib @@ -157,6 +157,12 @@ OPTS_RC = -d_Windows -d_M_$(OPTS_M_PLAT) # cl 16.00.xxxx = Visual Studio 2010 (VC10) # cl 17.00.xxxx = Visual Studio 2012 (VC11) # cl 18.00.xxxx = Visual Studio 2013 (VC12) +# cl 19.00.xxxx = Visual Studio 2015 (VC14) + +# snprintf needs to be defined for Visual Studio 2015 and earlier +!if ( [plat\win\msvcver.cmd] < 19 ) +OPTS_MSVC = $(OPTS_MSVC) -Dsnprintf=_snprintf +!endif !if ( [plat\win\msvcver.cmd] >= 15 ) OPTS_MSVC = $(OPTS_MSVC) -EHs-c- -GS- -wd4996 @@ -470,6 +476,8 @@ OGRNG_OBJS = \ $(OUTPUTPATH)/ogrng64-cj1-sse2-asm.obj \ $(OUTPUTPATH)/ogrng64-cj1-sse2-lzcnt.obj \ $(OUTPUTPATH)/ogrng64-cj1-sse2-lzcnt-asm.obj \ + $(OUTPUTPATH)/ogrng64-cj1-avx2.obj \ + $(OUTPUTPATH)/ogrng64-cj1-avx2-asm.obj \ $(OUTPUTPATH)/ogrng_init.obj \ $(OUTPUTPATH)/ogrng_dat.obj !elseif "$(PROCESSOR_ARCHITECTURE)" == "x86" diff --git a/ogr/amd64/ogrng64-cj1-avx2-asm.asm b/ogr/amd64/ogrng64-cj1-avx2-asm.asm new file mode 100644 index 000000000..fb489fd82 --- /dev/null +++ b/ogr/amd64/ogrng64-cj1-avx2-asm.asm @@ -0,0 +1,515 @@ +; +; Assembly core for OGR-NG, 64bit with AVX2. Based on SSE2 core (ogrng-cj1-sse2-asm.asm). 
+; $Id: ogrng64-cj1-avx2-asm.asm,v 1.0 2013/06/28 05:35:17 stream Exp $ +; +; Created by Craig Johnston (craig.johnston@dolby.com) +; +; 2017-04-17: Initial AVX2 version +; + +%ifdef __NASM_VER__ + cpu 686 +%else + cpu p3 mmx sse sse2 sse41 avx avx2 lzcnt + BITS 64 +%endif + +%ifdef __OMF__ ; Watcom and Borland compilers/linkers + [SECTION _DATA USE32 ALIGN=16 CLASS=DATA] + [SECTION _TEXT FLAT USE32 align=16 CLASS=CODE] +%else + [SECTION .data] + [SECTION .text] +%endif + + %define CHOOSE_DIST_BITS 16 ; /* number of bits to take into account */ + + ; Register renames + %define xmm_newbit xmm0 + %define ymm_newbit ymm0 ; Used only when blending + %define ymm_list ymm1 + %define ymm_comp ymm2 + %define xmm_comp xmm2 ; Used for when only the lowest 128 bits of comp is requred + %define ymm_dist ymm3 + %define xmm_dist xmm3 ; Used for when only the lowest 128 bits of dist is requred + + %define xmm_temp_s xmm4 + %define xmm_temp_ss xmm5 + + %define ymm_temp_A ymm6 + %define ymm_temp_B ymm7 + + %define xmm_zero xmm14 + %define ymm_zero ymm14 + %define xmm_one xmm15 + + + ; REGISTER - globals + ; ebx = mark + ; edi = limit + ; edx = work depth + ; ebp = stack location + ; r12 = half mark addr + ; r13 = pchoose + ; r14d = max_depth_m1 + ; r15d = nodes + + %define worksize 30h + + %define work_halfdepth rsp+00h + %define work_halfdepth2 rsp+04h + %define work_maxlen_m1 rsp+08h + %define work_stopdepth rsp+0Ch + + ; 64 bit work elements + %define work_oState rsp+10h + %define work_pnodes rsp+18h + %define work_oldrsp rsp+20h + + ; State Offsets + %define oState_max 00h + %define oState_maxdepthm1 08h + %define oState_half_depth 0Ch + %define oState_half_depth2 10h + %define oState_stopdepth 18h + %define oState_depth 1Ch + %define oState_Levels 20h + +; It's possible to put rbp (current level) a little forward and reference +; elements as 'rbp-nn' and 'rbp+nn' (with signed byte offsets) to avoid +; long command 'rbp+nnnnnnnn' (dword offset). 
+ %define rbp_shift 128 ; may be up to 128 + %define sizeof_level 128 ; (32*3+8+8*6) + %define level_list 00h + %define level_dist 20h + %define level_comp 40h + %define level_mark 60h + %define level_limit 64h + +%define cur(el, index) [rbp+level_ %+ el + ((index)*8) - rbp_shift] + + + ; Macro defining the whole body of the function + ; Parameter 1 = The Name of this block + ; Parameter 2 = The Name of the block to jump to when pushing + ; Parameter 3 = The Name of the block to jump to when popping +%macro func 3 + + align 16 +do_loop_split%1: + vmovdqa ymm_list, cur(list, 0) + vmovdqa ymm_dist, cur(dist, 0) + +for_loop%1: + + ; REGISTER - end + ; eax = inverse shift amount (location of 0) + ; ecx = shift amount (ecx - eax) + + xor rax, -1 + jz full_shift%1 + +%ifdef use_lzcnt + lzcnt rcx, rax + mov eax, 63 + sub eax, ecx + add ecx, 1 +%else + bsr rax, rax + mov ecx, 64 + sub ecx, eax ; s = ecx-bsr +%endif + + ; REGISTER - start + ; eax = inverse shift amount (location of 0) + ; ecx = shift amount (64 - eax) + + ; if ((mark += s) > limit) { + ; break; + ; } + add ebx, ecx + cmp ebx, edi ; limit (==lev->limit) + ja break_for%1 + + ; COMP_LEFT_LIST_RIGHT(lev, s); + ; !!! 
+ + ; Input + ; comp [D C B A] + ; list [D C B A] + ; newb [0 0 0 N] + ; + ; Output + ; comp >>[0 D C B] (temp_B) | <<[D C B A] (comp) + ; list >>[D C B A] (list) | <<[C B A N] (temp_A) + ; newb [X X X X] + + vmovq xmm_temp_ss, rax + vmovq xmm_temp_s, rcx + vmovdqa cur(dist, 0), ymm_dist + + ; newbit + list goes right and comp goes left + + vpsllq ymm_temp_A, ymm_list, xmm_temp_ss + vpsrlq ymm_temp_B, ymm_comp, xmm_temp_ss + vpsllq xmm_newbit, xmm_newbit, xmm_temp_ss + vpsllq ymm_comp, ymm_comp, xmm_temp_s + vpermpd ymm_temp_A, ymm_temp_A, 90h ; Reorder temp to be [C B A D] + vpblendd ymm_temp_B, ymm_temp_B, ymm_zero, 3 ; overwrite lowest quadword with 0 + vpsrlq ymm_list, ymm_list, xmm_temp_s + vpermpd ymm_temp_B, ymm_temp_B, 39h ; Reorder temp to be [0 D C B] + + ; ebx = mark + + ; if (depth == oState->maxdepthm1) { + ; goto exit; /* Ruler found */ + ; } + cmp r14d, edx + je ruler_found%1 + + vpblendd ymm_temp_A, ymm_temp_A, ymm_newbit, 3 ; overwrite lowest quadword with N + vpor ymm_list, ymm_temp_A + vpor ymm_comp, ymm_temp_B + + ; PUSH_LEVEL_UPDATE_STATE(lev); + ; !!! + ; ** LIST[lev+1] = LIST[lev] + ; ** DIST[lev+1] = (DIST[lev] | LIST[lev+1]) + ; ** COMP[lev+1] = (COMP[lev] | DIST[lev+1]) + ; ** newbit = 1; + + ; Save our loaded values + vmovdqa cur(list, 0), ymm_list + + ; ** LIST[lev+1] = LIST[lev] ; No need as we keep list in registers + ; ** DIST[lev+1] = (DIST[lev] | LIST[lev+1]) + ; ** COMP[lev+1] = (COMP[lev] | DIST[lev+1]) + + vmovdqa cur(comp, 0), ymm_comp + vpor ymm_dist, ymm_list + vpor ymm_comp, ymm_dist + +; !! delay init !! 
+; newbit = 1 + + ; lev->mark = mark; + mov [rbp+level_mark-rbp_shift], ebx + mov [rbp+level_limit-rbp_shift], edi + + ; lev++; + add rbp, sizeof_level + + ; depth++; + inc edx + + ; /* Compute the maximum position for the next level */ + ; #define choose(dist,seg) pchoose[(dist >> (SCALAR_BITS-CHOOSE_DIST_BITS)) * 32 + (seg)] + ; limit = choose(dist0, depth); + + vpextrw rax, xmm_dist, 3 ; Extract the first 16 bits from dist + shl eax, 5 + add eax, edx + movzx edi, word [r13+rax*2] + + ; if (depth > oState->half_depth && depth <= oState->half_depth2) { + ;;; if (depth > halfdepth && depth <= halfdepth2) { + cmp edx, [work_halfdepth2] + jbe continue_if_depth%1 + +skip_if_depth%1: + + vpextrq rax, xmm_comp, 0 + vmovq xmm_newbit, xmm_one; + + ; if (--nodes <= 0) { + + sub r15d, 1 + jg for_loop%2 + + ; goto exit; + jmp exit + + align 16 +continue_if_depth%1: + cmp edx, [work_halfdepth] + jbe skip_if_depth%1 + +; int temp = maxlen_m1 - oState->Levels[oState->half_depth].mark; +;; int temp = oState->max - 1 - oState->Levels[halfdepth].mark; + + mov esi, [work_maxlen_m1] + sub esi, [r12] + +; if (depth < oState->half_depth2) { + cmp edx, [work_halfdepth2] + jae update_limit_temp%1 + +; temp -= LOOKUP_FIRSTBLANK(dist0); // "33" version +;;; temp -= LOOKUP_FIRSTBLANK(dist0 & -((SCALAR)1 << 32)); + + vpextrq rcx, xmm_dist, 0 ; move upper part of dist into rcx + not rcx + +%ifdef use_lzcnt + sub esi, 1 + lzcnt rcx, rcx + sub esi, ecx +%else + mov eax, -1 + bsr rcx, rcx + cmovz ecx, eax + add esi, ecx + sub esi, 64 +%endif + +update_limit_temp%1: +; if (limit > temp) { +; limit = temp; +; } + + cmp edi, esi + cmovg edi, esi + jmp skip_if_depth%1 + + align 16 +full_shift%1: + ; else { /* s >= SCALAR_BITS */ + + ; if ((mark += SCALAR_BITS) > limit) { + ; break; + ; } + add ebx, 64 + cmp ebx, edi ; limit (==lev->limit) + ja break_for%1 + + ; COMP_LEFT_LIST_RIGHT_WORD(lev); + ; continue; + + ; COMP_LEFT_LIST_RIGHT_WORD(lev); + ; !!! 
+ + ; Input + ; comp [D C B A] + ; list [D C B A] + ; newb [0 0 0 N] + ; + ; Output + ; comp >>[0 D C B] + ; list [C B A N] + ; newb [0 0 0 0] + + vpermpd ymm_comp, ymm_comp, 39h ; Reorder to be [A D C B] + vpermpd ymm_list, ymm_list, 90h ; Reorder to be [C B A D] + + vpblendd ymm_comp, ymm_comp, ymm_zero, 192 ; overwrite highest quadword with 0 + vpblendd ymm_list, ymm_list, ymm_newbit, 3 ; overwrite lowest quadword with N + vmovq xmm_newbit, xmm_zero ; Clear newbit + + vpextrq rax, xmm_comp, 0 + jmp for_loop%1 + + align 16 +break_for%1: + + ; lev--; + sub rbp, sizeof_level + + ; depth--; + dec edx + + ; POP_LEVEL(lev); + ; !!! + vmovdqa ymm_comp, cur(comp, 0) + vmovq xmm_newbit, xmm_zero ; newbit = 0; + + ; } while (depth > oState->stopdepth); + mov ecx, [work_stopdepth] + + vpextrq rax, xmm_comp, 0 + + ; split loop header + mov ebx, [rbp+level_mark-rbp_shift] ; mark = lev->mark; + mov edi, [rbp+level_limit-rbp_shift] + + cmp ecx, edx + jb do_loop_split%3 + + vmovdqa ymm_list, cur(list, 0) + vmovdqa ymm_dist, cur(dist, 0) + jmp exit + +ruler_found%1: + vpblendd ymm_temp_A, ymm_temp_A, ymm_newbit, 3 ; overwrite lowest quadword with N + vpor ymm_list, ymm_temp_A + vpor ymm_comp, ymm_temp_B + jmp exit +%endmacro + +%macro header 1 + ; Although Linux requires less registers to save, common code + ; is simpler to manage. So save maximum amount required to work with all OS'es. + push rsi ; Windows + push rdi ; Windows + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + ; According to x64 ABI, stack must be aligned by 16 before call => + ; it'll be xxxxxxx8 after call. We've pushed EVEN number of registers above => + ; stack is still at xxxxxxx8. Subtracting ***8 will make it aligned to 16, + ; so we can save XMM registers (required for Windows only, but see above). 
+ sub rsp, 0xA8 + movdqa [rsp+0x00], xmm6 + movdqa [rsp+0x10], xmm7 + movdqa [rsp+0x20], xmm8 + movdqa [rsp+0x30], xmm9 + movdqa [rsp+0x40], xmm10 + movdqa [rsp+0x50], xmm11 + movdqa [rsp+0x60], xmm12 + movdqa [rsp+0x70], xmm13 + movdqa [rsp+0x80], xmm14 + movdqa [rsp+0x90], xmm15 + +%ifdef _WINDOWS + ; Switch to linux calling convention + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 +%endif + + ; Create work area and align it to 32 bytes + mov rcx, rsp + sub rsp, worksize + and rsp, -32 + mov [work_oldrsp], rcx + +start: + ; write the paramters in the aligned work space + mov [work_oState], rdi + mov [work_pnodes], rsi + mov r13, rdx + + mov edx, [rdi+oState_depth] + + imul eax, edx, sizeof_level + lea rbp, [rax+rdi+oState_Levels+rbp_shift] ; lev = &oState->Levels[oState->depth] + mov r15d, [rsi] ; nodes = *pnodes + + mov eax, [rdi+oState_half_depth] + mov [work_halfdepth], eax ; halfdepth = oState->half_depth + + ; get address of oState->Levels[oState->half_depth].mark + ; value of this var can be changed during crunching, but addr is const + imul eax, sizeof_level + lea r12, [rax+rdi+oState_Levels+level_mark] + + mov eax, [rdi+oState_half_depth2] + mov [work_halfdepth2], eax ; halfdepth2 = oState->half_depth2 + + mov eax, [rdi+oState_max] + dec eax + mov [work_maxlen_m1], eax ; maxlen_m1 = oState->max - 1 + + mov r14d, [rdi+oState_maxdepthm1] + + mov eax, [rdi+oState_stopdepth] + mov [work_stopdepth], eax + + ; Zero all vector registers + vzeroall + + mov eax, 1 + vmovq xmm_one, rax + + ; SETUP_TOP_STATE(lev); + ; !!! + vmovdqa ymm_comp, cur(comp, 0) + + ; int newbit = (depth < oState->maxdepthm1) ? 1 : 0; + xor eax, eax + cmp edx, r14d + setl al + vmovq xmm_newbit, rax + + ; mm0..mm3 = comp + ; mm4 = newbit + + ; split loop header + mov ebx, [rbp+level_mark-rbp_shift] ; mark = lev->mark; + mov edi, [rbp+level_limit-rbp_shift] + + vpextrq rax, xmm_comp, 0 + + jmp do_loop_split%1 +%endmacro + +%macro footer 0 +exit: + ; SAVE_FINAL_STATE(lev); + ; !!! 
+ vmovdqa cur(list, 0), ymm_list + vmovdqa cur(dist, 0), ymm_dist + vmovdqa cur(comp, 0), ymm_comp + + ; lev->mark = mark; + mov [rbp+level_mark-rbp_shift], ebx + mov [rbp+level_limit-rbp_shift], edi + + mov rbx, [work_pnodes] ; *pnodes -= nodes; + sub [rbx], r15d + + mov eax, edx ; return depth; + + mov rsp, [work_oldrsp] + movdqa xmm6, [rsp+0x00] + movdqa xmm7, [rsp+0x10] + movdqa xmm8, [rsp+0x20] + movdqa xmm9, [rsp+0x30] + movdqa xmm10, [rsp+0x40] + movdqa xmm11, [rsp+0x50] + movdqa xmm12, [rsp+0x60] + movdqa xmm13, [rsp+0x70] + movdqa xmm14, [rsp+0x80] + movdqa xmm15, [rsp+0x90] + add rsp, 0xA8 + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + emms + ret +%endmacro + +%macro body 1 + %assign max_id %1 + %assign id 1 + %rep %1 + %assign next_id id + 1 + %if next_id > max_id + %assign next_id max_id + %endif + + %assign prev_id id - 1 + %if prev_id < 1 + %assign prev_id 1 + %endif + + func id, next_id, prev_id + %assign id id + 1 + %endrep +%endmacro + +global _ogrng64_cycle_256_cj1_avx2 +global ogrng64_cycle_256_cj1_avx2 +_ogrng64_cycle_256_cj1_avx2: +ogrng64_cycle_256_cj1_avx2: + + header 5 + body 30 + footer diff --git a/ogr/amd64/ogrng64-cj1-avx2.cpp b/ogr/amd64/ogrng64-cj1-avx2.cpp new file mode 100644 index 000000000..8cace7046 --- /dev/null +++ b/ogr/amd64/ogrng64-cj1-avx2.cpp @@ -0,0 +1,54 @@ +#include "ansi/ogrng-64.h" + +#define OGROPT_HAVE_FIND_FIRST_ZERO_BIT_ASM 0 /* 0-2 - 'no' */ +#define OGROPT_ALTERNATE_CYCLE 1 /* 0/1 - 'yes' */ +#define OGR_NG_GET_DISPATCH_TABLE_FXN ogrng64_get_dispatch_table_cj1_avx2 +#define OGROPT_SPECIFIC_LEVEL_STRUCT + +/* + ** Level datas. 
+ */ +struct OgrLevel { + BMAP list[OGRNG_BITMAPS_WORDS]; + BMAP dist[OGRNG_BITMAPS_WORDS]; + BMAP comp[OGRNG_BITMAPS_WORDS]; + int mark; + int limit; + int pad0; + int pad1; + int pad2; + int pad3; + int pad4; + int pad5; +}; + +#include "ansi/ogrng_codebase.cpp" + +#include "ccoreio.h" /* CDECL */ +#include <stddef.h> /* offsetof */ + +extern "C" int CDECL ogrng64_cycle_256_cj1_avx2(struct OgrState *oState, int *pnodes, const u16* pchoose); + +static int ogr_cycle_256(struct OgrState *oState, int *pnodes, const u16* pchoose) +{ + /* Check structures layout and alignment to match assembly */ + + STATIC_ASSERT(offsetof(struct OgrState, max) == 0 ); + STATIC_ASSERT(offsetof(struct OgrState, maxdepthm1) == 8 ); + STATIC_ASSERT(offsetof(struct OgrState, half_depth) == 12); + STATIC_ASSERT(offsetof(struct OgrState, half_depth2) == 16); + STATIC_ASSERT(offsetof(struct OgrState, stopdepth) == 24); + STATIC_ASSERT(offsetof(struct OgrState, depth) == 28); + STATIC_ASSERT(offsetof(struct OgrState, Levels) == 32); + + STATIC_ASSERT(sizeof(struct OgrLevel) == 128); + STATIC_ASSERT(sizeof(oState->Levels) == 128 * OGR_MAXDEPTH); + + STATIC_ASSERT(offsetof(struct OgrLevel, list) == 0); + STATIC_ASSERT(offsetof(struct OgrLevel, dist) == 32); + STATIC_ASSERT(offsetof(struct OgrLevel, comp) == 64); + STATIC_ASSERT(offsetof(struct OgrLevel, mark) == 96); + STATIC_ASSERT(offsetof(struct OgrLevel, limit) == 100); + + return ogrng64_cycle_256_cj1_avx2(oState, pnodes, pchoose); +} diff --git a/plat/win/msvcver.cmd b/plat/win/msvcver.cmd index 6ee1bc114..31bf1e695 100755 --- a/plat/win/msvcver.cmd +++ b/plat/win/msvcver.cmd @@ -13,6 +13,7 @@ rem 15 = cl 15.00.xxxx = Visual Studio 2008 (VC9) rem 16 = cl 16.00.xxxx = Visual Studio 2010 (VC10) rem 17 = cl 17.00.xxxx = Visual Studio 2012 (VC11) rem 18 = cl 18.00.xxxx = Visual Studio 2013 (VC12) +rem 19 = cl 19.00.xxxx = Visual Studio 2015 (VC14) for %%i in (cl.exe) do if "%%~$PATH:i"=="" goto notfound diff --git a/plat/win/w32cons.cpp 
b/plat/win/w32cons.cpp index 5732e559c..dba446637 100644 --- a/plat/win/w32cons.cpp +++ b/plat/win/w32cons.cpp @@ -7629,7 +7629,7 @@ static int __w32ConOutX(const char *text, int iserr) if (!handled) { /* note the spaces around the caption! Don't let this window be "findable" */ - MessageBox(NULL,text, " "W32CLI_CONSOLE_NAME" ",MB_OK|MB_TASKMODAL + MessageBox(NULL,text, " " W32CLI_CONSOLE_NAME " ",MB_OK|MB_TASKMODAL |(iserr?MB_ICONHAND:MB_ICONINFORMATION)); } return 0; diff --git a/plat/win/w32svc.cpp b/plat/win/w32svc.cpp index b00a0a495..8bc0dfbc0 100644 --- a/plat/win/w32svc.cpp +++ b/plat/win/w32svc.cpp @@ -92,9 +92,9 @@ return "@(#)$Id: w32svc.cpp,v 1.12 2012/08/21 18:55:55 sla Exp $"; } const char *W9xSERVICEKEY = "distributed.net client"; #endif #endif /* PROXYTYPE or not */ -const char *APPDESCRIP = "distributed.net "SERVICEFOR; +const char *APPDESCRIP = "distributed.net " SERVICEFOR; -#define SERVICEMUTEX "distributed.net "SERVICEFOR" service mutex" +#define SERVICEMUTEX "distributed.net " SERVICEFOR " service mutex" /* ---------------------------------------------------------- */ @@ -529,7 +529,7 @@ int win32CliDetectRunningService(void) /* <0=err, 0=no, >0=yes */ int win32CliUninstallService(int quiet) { int retcode = -1; - const char *msg = "A distributed.net "SERVICEFOR" could not be uninstalled"; + const char *msg = "A distributed.net " SERVICEFOR " could not be uninstalled"; if (__winGetVersion() < 400) /* win16 */ {