diff --git a/common/cliident.cpp b/common/cliident.cpp index 9fb7f8394..74a64a28e 100644 --- a/common/cliident.cpp +++ b/common/cliident.cpp @@ -542,7 +542,7 @@ int CliIsDevelVersion(void) const char *CliGetFullVersionDescriptor(void) { - static char buffer[10+32+sizeof("v"CLIENT_VERSIONSTRING"-XXX-99071523-*dev* client for "CLIENT_OS_NAME_EXTENDED)]; + static char buffer[10+32+sizeof("v" CLIENT_VERSIONSTRING "-XXX-99071523-*dev* client for " CLIENT_OS_NAME_EXTENDED)]; struct timeval tv; tv.tv_usec = 0; tv.tv_sec = CliGetNewestModuleTime(); sprintf( buffer, "%s v" CLIENT_VERSIONSTRING "-" @@ -559,7 +559,7 @@ const char *CliGetFullVersionDescriptor(void) "%c" /* limited release or dev branch or public release */ "-%s" /* date is in bugzilla format yymmddhh */ "%s" /* "-*dev*" or "" */ - " for "CLIENT_OS_NAME_EXTENDED, + " for " CLIENT_OS_NAME_EXTENDED, utilGetAppName(), ((ConIsGUI())?('G'):('C')), ((CliIsDevelVersion())?('L'):('R')), diff --git a/common/confopt.cpp b/common/confopt.cpp index 7c9957cda..1999defbc 100644 --- a/common/confopt.cpp +++ b/common/confopt.cpp @@ -405,7 +405,7 @@ struct optionstruct conf_options[CONF_OPTION_COUNT] = { "\n" "It is possible to have the client rotate through this list, updating its\n" "buffers only once for each pass. 
To do so, 'Dialup-link detection'\n" - "and '"ADDITIONAL_BUFFLEVEL_CHECK_OPTION_NAME"' must be disabled since a buffer\n" + "and '" ADDITIONAL_BUFFLEVEL_CHECK_OPTION_NAME "' must be disabled since a buffer\n" "update (new work being made available) would otherwise cause the client\n" "to go back to the beginning of the load order.\n" /*) */,CONF_MENU_BUFF,CONF_TYPE_ASCIIZ,NULL,NULL,0,0,NULL,NULL @@ -439,13 +439,13 @@ struct optionstruct conf_options[CONF_OPTION_COUNT] = { CONF_FREQUENT_FREQUENCY , /* CONF_MENU_BUFF */ CFGTXT("Buffer-level check interval"), "0:00 (on buffer change)", /*CFGTXT(*/ - "This option determines how often '"ADDITIONAL_BUFFLEVEL_CHECK_OPTION_NAME"'\n" + "This option determines how often '" ADDITIONAL_BUFFLEVEL_CHECK_OPTION_NAME "'\n" "should be performed. (More precisely: how much time must elapse between\n" "buffer-level checks)\n" "\n" "This setting is meaningful only if one of the extensions to normal threshold\n" "management is enabled: either implicitly when 'Dialup detection options' are\n" - "active or explicitly with '"ADDITIONAL_BUFFLEVEL_CHECK_OPTION_NAME"'.\n" + "active or explicitly with '" ADDITIONAL_BUFFLEVEL_CHECK_OPTION_NAME "'.\n" "\n" "The interval specified here is in hours and minutes, and the default denotes\n" "that the client should check buffer-levels whenever it detects a change (by\n" @@ -456,13 +456,13 @@ struct optionstruct conf_options[CONF_OPTION_COUNT] = { CONF_FREQUENT_RETRY_FREQUENCY , /* CONF_MENU_BUFF */ CFGTXT("Buffer-level check retry interval"), "0:00 (no delay)", /*CFGTXT(*/ - "This option determines how often '"ADDITIONAL_BUFFLEVEL_CHECK_OPTION_NAME"'\n" + "This option determines how often '" ADDITIONAL_BUFFLEVEL_CHECK_OPTION_NAME "'\n" "should be retried after failure. 
(More precisely: how much time must elapse\n" "between buffer-level check retries)\n" "\n" "This setting is meaningful only if one of the extensions to normal threshold\n" "management is enabled: either implicitly when 'Dialup detection options' are\n" - "active or explicitly with '"ADDITIONAL_BUFFLEVEL_CHECK_OPTION_NAME"'.\n" + "active or explicitly with '" ADDITIONAL_BUFFLEVEL_CHECK_OPTION_NAME "'.\n" "\n" "The interval specified here is in hours and minutes, and the default denotes\n" "that the client should retry the buffer-level checks at most twice per minute\n" @@ -503,7 +503,7 @@ struct optionstruct conf_options[CONF_OPTION_COUNT] = { "should be used instead. If that too is unspecified, then the client will\n" "use defaults.\n" "\n" - "* See also: '"ADDITIONAL_BUFFLEVEL_CHECK_OPTION_NAME"'\n" + "* See also: '" ADDITIONAL_BUFFLEVEL_CHECK_OPTION_NAME "'\n" ,CONF_MENU_BUFF,CONF_TYPE_IARRAY,NULL,NULL,1,0xffff,NULL,NULL }, { @@ -523,7 +523,7 @@ struct optionstruct conf_options[CONF_OPTION_COUNT] = { "unprocessed packet cannot be predicted.\n" #endif "\n" - "* See also: '"ADDITIONAL_BUFFLEVEL_CHECK_OPTION_NAME"'\n" + "* See also: '" ADDITIONAL_BUFFLEVEL_CHECK_OPTION_NAME "'\n" ,CONF_MENU_BUFF,CONF_TYPE_IARRAY,NULL,NULL,0,(14*24),NULL,NULL }, diff --git a/common/core_ogr_ng.cpp b/common/core_ogr_ng.cpp index 6115aca55..8f9a8bc89 100644 --- a/common/core_ogr_ng.cpp +++ b/common/core_ogr_ng.cpp @@ -84,6 +84,7 @@ return "@(#)$Id: core_ogr_ng.cpp,v 1.47 2015/06/27 21:43:52 zebe Exp $"; } CoreDispatchTable *ogrng64_get_dispatch_table_cj1_generic(void); CoreDispatchTable *ogrng64_get_dispatch_table_cj1_sse2(void); CoreDispatchTable *ogrng64_get_dispatch_table_cj1_sse2_lzcnt(void); + CoreDispatchTable *ogrng64_get_dispatch_table_cj1_avx2(void); #elif (CLIENT_CPU == CPU_SPARC) && (SIZEOF_LONG == 8) CoreDispatchTable *ogrng64_get_dispatch_table(void); #elif (CLIENT_CPU == CPU_S390X) && (SIZEOF_LONG == 8) @@ -166,6 +167,7 @@ int InitializeCoreTable_ogr_ng(int first_time) 
ogrng64_get_dispatch_table_cj1_generic(); ogrng64_get_dispatch_table_cj1_sse2(); ogrng64_get_dispatch_table_cj1_sse2_lzcnt(); + ogrng64_get_dispatch_table_cj1_avx2(); #elif (CLIENT_CPU == CPU_S390) ogrng_get_dispatch_table(); #elif (CLIENT_CPU == CPU_S390X) @@ -225,6 +227,7 @@ const char **corenames_for_contest_ogr_ng() "cj-asm-generic", "cj-asm-sse2", "cj-asm-sse2-lzcnt", + "cj-asm-avx2", #elif (CLIENT_CPU == CPU_ARM) "FLEGE 2.0", "FLEGE 2.0 ARMv3", @@ -356,6 +359,8 @@ int apply_selcore_substitution_rules_ogr_ng(int cindex) # endif # elif (CLIENT_CPU == CPU_AMD64) unsigned feature = GetProcessorFeatureFlags(); + if (cindex == 4 && !(feature & CPU_F_AVX2)) /* Core 4 needs AVX2 */ + cindex = 2; /* If no AVX2, try SSE2 */ if (cindex == 3 && !(feature & CPU_F_LZCNT)) /* Core 3 needs LZCNT */ cindex = 2; /* If no LZCNT, try SSE2 */ if (cindex == 2 && !(feature & CPU_F_SSE2)) /* Core 2 needs SSE2 */ @@ -504,8 +509,11 @@ int selcoreGetPreselectedCoreForProject_ogr_ng() } if (cindex == -1) { + /* Assume that if AVX2 is availble it is the best choice */ + if (detected_flags & CPU_F_AVX2) + cindex = 4; /* Assume that LZCNT+SSE2 is better then plain SSE2 everywhere */ - if (detected_flags & CPU_F_LZCNT) + else if (detected_flags & CPU_F_LZCNT) cindex = 3; else if (detected_flags & CPU_F_SSE2) cindex = 2; /* sse2 core */ @@ -638,6 +646,8 @@ int selcoreSelectCore_ogr_ng(Client *client, unsigned int threadindex, unit_func.ogr = ogrng64_get_dispatch_table_cj1_sse2(); else if (coresel == 3) unit_func.ogr = ogrng64_get_dispatch_table_cj1_sse2_lzcnt(); + else if (coresel == 4) + unit_func.ogr = ogrng64_get_dispatch_table_cj1_avx2(); else { unit_func.ogr = ogrng64_get_dispatch_table(); diff --git a/common/cpucheck.cpp b/common/cpucheck.cpp index 9d5c410f9..f4a1cf782 100644 --- a/common/cpucheck.cpp +++ b/common/cpucheck.cpp @@ -1344,7 +1344,8 @@ long __GetRawProcessorID(const char **cpuname, int whattoret = 0 ) * 0x19 - Sandy Bridge Core iX-2xxx * 0x1A - Ivy Bridge Core iX-3xxx * 
0x1B - Haswell Core iX-4xxx - * 0x1C-1F + * 0x1C - Kaby Lake Core iX-7xxx + * 0x1D-1F * 0x20 - AMD Bobcat - Embedded APU * 0x21 - AMD Bulldozer - FX * 0x22 - AMD Husky - APU @@ -1488,6 +1489,7 @@ long __GetRawProcessorID(const char **cpuname, int whattoret = 0 ) { 0x0006450, 0xFFFFFF0, CPU_F_I686, 0x1B, "Core iX-4xxx (Haswell)" }, /* (#4579) */ { 0x0006460, 0xFFFFFF0, CPU_F_I686, 0x1B, "Core iX-4xxx (Haswell)" }, { 0x00065E0, 0xFFFFFF0, CPU_F_I686, 0x1B, "Core iX-6xxx (Skylake)" }, /* (#4615) */ + { 0x00069E0, 0xFFFFFF0, CPU_F_I686, 0x1C, "Core iX-7xxx (Kaby Lake)" }, { 0x0000000, 0, 0, 0, NULL } }; internalxref = &intelxref[0]; } diff --git a/common/cpucheck.h b/common/cpucheck.h index 3d4dcc5fe..dac20eb09 100644 --- a/common/cpucheck.h +++ b/common/cpucheck.h @@ -24,13 +24,13 @@ #define CPU_F_SSE (0x00002000L) #define CPU_F_SSE2 (0x00004000L) #define CPU_F_SSE3 (0x00008000L) - #define CPU_F_HYPERTHREAD (0x00010000L) /* supported and enabled */ + #define CPU_F_HYPERTHREAD (0x00010000L) /* supported and enabled */ #define CPU_F_AMD64 (0x00020000L) #define CPU_F_EM64T (0x00040000L) #define CPU_F_SSE4_1 (0x00080000L) #define CPU_F_SSE4_2 (0x00100000L) #define CPU_F_SSSE3 (0x00200000L) - #define CPU_F_LZCNT (0x00400000L) + #define CPU_F_LZCNT (0x00400000L) #define CPU_F_AVX_DISABLED (0x00800000L) /* supported but disabled (no OS support) */ #define CPU_F_AVX (0x01000000L) /* supported and enabled */ #define CPU_F_AVX2 (0x02000000L) /* supported and enabled */ diff --git a/common/mail.cpp b/common/mail.cpp index 7d73b324b..975a071d3 100644 --- a/common/mail.cpp +++ b/common/mail.cpp @@ -564,8 +564,8 @@ static int smtp_send_message_header( void * net, if (errcode == 0) //send the date { sprintf( buffer, "\r\nDate: %s" - "\r\nX-Mailer: distributed.net v"CLIENT_VERSIONSTRING - " client for "CLIENT_OS_NAME_EXTENDED, rfc822Date( buffer + 256 ) ); + "\r\nX-Mailer: distributed.net v" CLIENT_VERSIONSTRING + " client for " CLIENT_OS_NAME_EXTENDED, rfc822Date( buffer + 256 ) ); 
if ( put_smtp_line( net, buffer, strlen( buffer ) ) ) errcode = -1; } diff --git a/common/problem.h b/common/problem.h index 4a6178979..3f6d07e3a 100644 --- a/common/problem.h +++ b/common/problem.h @@ -47,10 +47,10 @@ #define CORE_MEM_ALIGNMENT 4 #endif #else - // For x86, alignment must be 8 for MMX core and 16 for SSE. - #if CORE_MEM_ALIGNMENT < 4 + // For x86, alignment must be 8 for MMX core, 16 for SSE and 32 for AVX2. + #if CORE_MEM_ALIGNMENT < 5 #undef CORE_MEM_ALIGNMENT - #define CORE_MEM_ALIGNMENT 4 + #define CORE_MEM_ALIGNMENT 5 #endif #endif #endif diff --git a/common/util.cpp b/common/util.cpp index b55b99c7e..d2a40a7bb 100644 --- a/common/util.cpp +++ b/common/util.cpp @@ -73,7 +73,7 @@ void trace_setsrc( const char *filename ) void trace_out( int indlevel, const char *format, ... ) { static int indentlevel = -1; /* uninitialized */ - const char *tracefile = "trace"EXTN_SEP"out"; + const char *tracefile = "trace" EXTN_SEP "out"; int old_errno = errno; FILE *file; va_list arglist; diff --git a/configure b/configure index d35d740f2..fbccb8dbc 100755 --- a/configure +++ b/configure @@ -659,9 +659,11 @@ add_sources() # $1=os, $2=arch, $3=custom TARGET_ADDSRCS="$TARGET_ADDSRCS $OGR/amd64/ogrng64-cj1-generic.cpp" TARGET_ADDSRCS="$TARGET_ADDSRCS $OGR/amd64/ogrng64-cj1-sse2.cpp" TARGET_ADDSRCS="$TARGET_ADDSRCS $OGR/amd64/ogrng64-cj1-sse2-lzcnt.cpp" + TARGET_ADDSRCS="$TARGET_ADDSRCS $OGR/amd64/ogrng64-cj1-avx2.cpp" TARGET_ADDNASMS="$TARGET_ADDNASMS $OGR/amd64/ogrng64-cj1-generic-asm.asm" TARGET_ADDNASMS="$TARGET_ADDNASMS $OGR/amd64/ogrng64-cj1-sse2-asm.asm" TARGET_ADDNASMS="$TARGET_ADDNASMS $OGR/amd64/ogrng64-cj1-sse2-lzcnt-asm.asm" + TARGET_ADDNASMS="$TARGET_ADDNASMS $OGR/amd64/ogrng64-cj1-avx2-asm.asm" fi if [ "$HAVE_OGR_P2" = "1" ]; then diff --git a/makefile.vc b/makefile.vc index a710678d4..c8267d0e7 100644 --- a/makefile.vc +++ b/makefile.vc @@ -136,7 +136,7 @@ ZIPEXTRAS = \ OPTS_MSVC = -nologo -D__WIN32__ \ -W4 -GR- -GA -GF -Gy \ - -Dsnprintf=_snprintf 
-DHAVE_SNPRINTF \ + -DHAVE_SNPRINTF \ -D_M_$(OPTS_M_PLAT) $(OPTS_CC_CPU) $(OPTS_CC_DEBUG) ## *** +++++++++++++++++++++++++++++++++++++++++ OPTS_LIBS = advapi32.lib user32.lib kernel32.lib gdi32.lib @@ -157,6 +157,12 @@ OPTS_RC = -d_Windows -d_M_$(OPTS_M_PLAT) # cl 16.00.xxxx = Visual Studio 2010 (VC10) # cl 17.00.xxxx = Visual Studio 2012 (VC11) # cl 18.00.xxxx = Visual Studio 2013 (VC12) +# cl 19.00.xxxx = Visual Studio 2015 (VC14) + +# snprintf needs to be defined for Visual Studio 2015 and earlier +!if ( [plat\win\msvcver.cmd] < 19 ) +OPTS_MSVC = $(OPTS_MSVC) -Dsnprintf=_snprintf +!endif !if ( [plat\win\msvcver.cmd] >= 15 ) OPTS_MSVC = $(OPTS_MSVC) -EHs-c- -GS- -wd4996 @@ -470,6 +476,8 @@ OGRNG_OBJS = \ $(OUTPUTPATH)/ogrng64-cj1-sse2-asm.obj \ $(OUTPUTPATH)/ogrng64-cj1-sse2-lzcnt.obj \ $(OUTPUTPATH)/ogrng64-cj1-sse2-lzcnt-asm.obj \ + $(OUTPUTPATH)/ogrng64-cj1-avx2.obj \ + $(OUTPUTPATH)/ogrng64-cj1-avx2-asm.obj \ $(OUTPUTPATH)/ogrng_init.obj \ $(OUTPUTPATH)/ogrng_dat.obj !elseif "$(PROCESSOR_ARCHITECTURE)" == "x86" diff --git a/ogr/amd64/ogrng64-cj1-avx2-asm.asm b/ogr/amd64/ogrng64-cj1-avx2-asm.asm new file mode 100644 index 000000000..fb489fd82 --- /dev/null +++ b/ogr/amd64/ogrng64-cj1-avx2-asm.asm @@ -0,0 +1,515 @@ +; +; Assembly core for OGR-NG, 64bit with AVX2. Based on SSE2 core (ogrng-cj1-sse2-asm.asm). 
+; $Id: ogrng64-cj1-avx2-asm.asm,v 1.0 2013/06/28 05:35:17 stream Exp $ +; +; Created by Craig Johnston (craig.johnston@dolby.com) +; +; 2017-04-17: Initial AVX2 version +; + +%ifdef __NASM_VER__ + cpu 686 +%else + cpu p3 mmx sse sse2 sse41 avx avx2 lzcnt + BITS 64 +%endif + +%ifdef __OMF__ ; Watcom and Borland compilers/linkers + [SECTION _DATA USE32 ALIGN=16 CLASS=DATA] + [SECTION _TEXT FLAT USE32 align=16 CLASS=CODE] +%else + [SECTION .data] + [SECTION .text] +%endif + + %define CHOOSE_DIST_BITS 16 ; /* number of bits to take into account */ + + ; Register renames + %define xmm_newbit xmm0 + %define ymm_newbit ymm0 ; Used only when blending + %define ymm_list ymm1 + %define ymm_comp ymm2 + %define xmm_comp xmm2 ; Used for when only the lowest 128 bits of comp is requred + %define ymm_dist ymm3 + %define xmm_dist xmm3 ; Used for when only the lowest 128 bits of dist is requred + + %define xmm_temp_s xmm4 + %define xmm_temp_ss xmm5 + + %define ymm_temp_A ymm6 + %define ymm_temp_B ymm7 + + %define xmm_zero xmm14 + %define ymm_zero ymm14 + %define xmm_one xmm15 + + + ; REGISTER - globals + ; ebx = mark + ; edi = limit + ; edx = work depth + ; ebp = stack location + ; r12 = half mark addr + ; r13 = pchoose + ; r14d = max_depth_m1 + ; r15d = nodes + + %define worksize 30h + + %define work_halfdepth rsp+00h + %define work_halfdepth2 rsp+04h + %define work_maxlen_m1 rsp+08h + %define work_stopdepth rsp+0Ch + + ; 64 bit work elements + %define work_oState rsp+10h + %define work_pnodes rsp+18h + %define work_oldrsp rsp+20h + + ; State Offsets + %define oState_max 00h + %define oState_maxdepthm1 08h + %define oState_half_depth 0Ch + %define oState_half_depth2 10h + %define oState_stopdepth 18h + %define oState_depth 1Ch + %define oState_Levels 20h + +; It's possible to put rbp (current level) a little forward and reference +; elements as 'rbp-nn' and 'rbp+nn' (with signed byte offsets) to avoid +; long command 'rbp+nnnnnnnn' (dword offset). 
+ %define rbp_shift 128 ; may be up to 128 + %define sizeof_level 128 ; (32*3+8+8*6) + %define level_list 00h + %define level_dist 20h + %define level_comp 40h + %define level_mark 60h + %define level_limit 64h + +%define cur(el, index) [rbp+level_ %+ el + ((index)*8) - rbp_shift] + + + ; Macro defining the whole body of the function + ; Parameter 1 = The Name of this block + ; Parameter 2 = The Name of the block to jump to when pushing + ; Parameter 3 = The Name of the block to jump to when popping +%macro func 3 + + align 16 +do_loop_split%1: + vmovdqa ymm_list, cur(list, 0) + vmovdqa ymm_dist, cur(dist, 0) + +for_loop%1: + + ; REGISTER - end + ; eax = inverse shift amount (location of 0) + ; ecx = shift amount (ecx - eax) + + xor rax, -1 + jz full_shift%1 + +%ifdef use_lzcnt + lzcnt rcx, rax + mov eax, 63 + sub eax, ecx + add ecx, 1 +%else + bsr rax, rax + mov ecx, 64 + sub ecx, eax ; s = ecx-bsr +%endif + + ; REGISTER - start + ; eax = inverse shift amount (location of 0) + ; ecx = shift amount (64 - eax) + + ; if ((mark += s) > limit) { + ; break; + ; } + add ebx, ecx + cmp ebx, edi ; limit (==lev->limit) + ja break_for%1 + + ; COMP_LEFT_LIST_RIGHT(lev, s); + ; !!! 
+ + ; Input + ; comp [D C B A] + ; list [D C B A] + ; newb [0 0 0 N] + ; + ; Output + ; comp >>[0 D C B] (temp_B) | <<[D C B A] (comp) + ; list >>[D C B A] (list) | <<[C B A N] (temp_A) + ; newb [X X X X] + + vmovq xmm_temp_ss, rax + vmovq xmm_temp_s, rcx + vmovdqa cur(dist, 0), ymm_dist + + ; newbit + list goes right and comp goes left + + vpsllq ymm_temp_A, ymm_list, xmm_temp_ss + vpsrlq ymm_temp_B, ymm_comp, xmm_temp_ss + vpsllq xmm_newbit, xmm_newbit, xmm_temp_ss + vpsllq ymm_comp, ymm_comp, xmm_temp_s + vpermpd ymm_temp_A, ymm_temp_A, 90h ; Reorder temp to be [C B A D] + vpblendd ymm_temp_B, ymm_temp_B, ymm_zero, 3 ; overwrite lowest quadword with 0 + vpsrlq ymm_list, ymm_list, xmm_temp_s + vpermpd ymm_temp_B, ymm_temp_B, 39h ; Reorder temp to be [0 D C B] + + ; ebx = mark + + ; if (depth == oState->maxdepthm1) { + ; goto exit; /* Ruler found */ + ; } + cmp r14d, edx + je ruler_found%1 + + vpblendd ymm_temp_A, ymm_temp_A, ymm_newbit, 3 ; overwrite lowest quadword with N + vpor ymm_list, ymm_temp_A + vpor ymm_comp, ymm_temp_B + + ; PUSH_LEVEL_UPDATE_STATE(lev); + ; !!! + ; ** LIST[lev+1] = LIST[lev] + ; ** DIST[lev+1] = (DIST[lev] | LIST[lev+1]) + ; ** COMP[lev+1] = (COMP[lev] | DIST[lev+1]) + ; ** newbit = 1; + + ; Save our loaded values + vmovdqa cur(list, 0), ymm_list + + ; ** LIST[lev+1] = LIST[lev] ; No need as we keep list in registers + ; ** DIST[lev+1] = (DIST[lev] | LIST[lev+1]) + ; ** COMP[lev+1] = (COMP[lev] | DIST[lev+1]) + + vmovdqa cur(comp, 0), ymm_comp + vpor ymm_dist, ymm_list + vpor ymm_comp, ymm_dist + +; !! delay init !! 
+; newbit = 1 + + ; lev->mark = mark; + mov [rbp+level_mark-rbp_shift], ebx + mov [rbp+level_limit-rbp_shift], edi + + ; lev++; + add rbp, sizeof_level + + ; depth++; + inc edx + + ; /* Compute the maximum position for the next level */ + ; #define choose(dist,seg) pchoose[(dist >> (SCALAR_BITS-CHOOSE_DIST_BITS)) * 32 + (seg)] + ; limit = choose(dist0, depth); + + vpextrw rax, xmm_dist, 3 ; Extract the first 16 bits from dist + shl eax, 5 + add eax, edx + movzx edi, word [r13+rax*2] + + ; if (depth > oState->half_depth && depth <= oState->half_depth2) { + ;;; if (depth > halfdepth && depth <= halfdepth2) { + cmp edx, [work_halfdepth2] + jbe continue_if_depth%1 + +skip_if_depth%1: + + vpextrq rax, xmm_comp, 0 + vmovq xmm_newbit, xmm_one; + + ; if (--nodes <= 0) { + + sub r15d, 1 + jg for_loop%2 + + ; goto exit; + jmp exit + + align 16 +continue_if_depth%1: + cmp edx, [work_halfdepth] + jbe skip_if_depth%1 + +; int temp = maxlen_m1 - oState->Levels[oState->half_depth].mark; +;; int temp = oState->max - 1 - oState->Levels[halfdepth].mark; + + mov esi, [work_maxlen_m1] + sub esi, [r12] + +; if (depth < oState->half_depth2) { + cmp edx, [work_halfdepth2] + jae update_limit_temp%1 + +; temp -= LOOKUP_FIRSTBLANK(dist0); // "33" version +;;; temp -= LOOKUP_FIRSTBLANK(dist0 & -((SCALAR)1 << 32)); + + vpextrq rcx, xmm_dist, 0 ; move upper part of dist into rcx + not rcx + +%ifdef use_lzcnt + sub esi, 1 + lzcnt rcx, rcx + sub esi, ecx +%else + mov eax, -1 + bsr rcx, rcx + cmovz ecx, eax + add esi, ecx + sub esi, 64 +%endif + +update_limit_temp%1: +; if (limit > temp) { +; limit = temp; +; } + + cmp edi, esi + cmovg edi, esi + jmp skip_if_depth%1 + + align 16 +full_shift%1: + ; else { /* s >= SCALAR_BITS */ + + ; if ((mark += SCALAR_BITS) > limit) { + ; break; + ; } + add ebx, 64 + cmp ebx, edi ; limit (==lev->limit) + ja break_for%1 + + ; COMP_LEFT_LIST_RIGHT_WORD(lev); + ; continue; + + ; COMP_LEFT_LIST_RIGHT_WORD(lev); + ; !!! 
+ + ; Input + ; comp [D C B A] + ; list [D C B A] + ; newb [0 0 0 N] + ; + ; Output + ; comp >>[0 D C B] + ; list [C B A N] + ; newb [0 0 0 0] + + vpermpd ymm_comp, ymm_comp, 39h ; Reorder to be [A D C B] + vpermpd ymm_list, ymm_list, 90h ; Reorder to be [C B A D] + + vpblendd ymm_comp, ymm_comp, ymm_zero, 192 ; overwrite highest quadword with 0 + vpblendd ymm_list, ymm_list, ymm_newbit, 3 ; overwrite lowest quadword with N + vmovq xmm_newbit, xmm_zero ; Clear newbit + + vpextrq rax, xmm_comp, 0 + jmp for_loop%1 + + align 16 +break_for%1: + + ; lev--; + sub rbp, sizeof_level + + ; depth--; + dec edx + + ; POP_LEVEL(lev); + ; !!! + vmovdqa ymm_comp, cur(comp, 0) + vmovq xmm_newbit, xmm_zero ; newbit = 0; + + ; } while (depth > oState->stopdepth); + mov ecx, [work_stopdepth] + + vpextrq rax, xmm_comp, 0 + + ; split loop header + mov ebx, [rbp+level_mark-rbp_shift] ; mark = lev->mark; + mov edi, [rbp+level_limit-rbp_shift] + + cmp ecx, edx + jb do_loop_split%3 + + vmovdqa ymm_list, cur(list, 0) + vmovdqa ymm_dist, cur(dist, 0) + jmp exit + +ruler_found%1: + vpblendd ymm_temp_A, ymm_temp_A, ymm_newbit, 3 ; overwrite lowest quadword with N + vpor ymm_list, ymm_temp_A + vpor ymm_comp, ymm_temp_B + jmp exit +%endmacro + +%macro header 1 + ; Although Linux requires less registers to save, common code + ; is simpler to manage. So save maximum amount required to work with all OS'es. + push rsi ; Windows + push rdi ; Windows + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + ; According to x64 ABI, stack must be aligned by 16 before call => + ; it'll be xxxxxxx8 after call. We've pushed EVEN number of registers above => + ; stack is still at xxxxxxx8. Subtracting ***8 will make it aligned to 16, + ; so we can save XMM registers (required for Windows only, but see above). 
+ sub rsp, 0xA8 + movdqa [rsp+0x00], xmm6 + movdqa [rsp+0x10], xmm7 + movdqa [rsp+0x20], xmm8 + movdqa [rsp+0x30], xmm9 + movdqa [rsp+0x40], xmm10 + movdqa [rsp+0x50], xmm11 + movdqa [rsp+0x60], xmm12 + movdqa [rsp+0x70], xmm13 + movdqa [rsp+0x80], xmm14 + movdqa [rsp+0x90], xmm15 + +%ifdef _WINDOWS + ; Switch to linux calling convention + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 +%endif + + ; Create work area and align it to 32 bytes + mov rcx, rsp + sub rsp, worksize + and rsp, -32 + mov [work_oldrsp], rcx + +start: + ; write the paramters in the aligned work space + mov [work_oState], rdi + mov [work_pnodes], rsi + mov r13, rdx + + mov edx, [rdi+oState_depth] + + imul eax, edx, sizeof_level + lea rbp, [rax+rdi+oState_Levels+rbp_shift] ; lev = &oState->Levels[oState->depth] + mov r15d, [rsi] ; nodes = *pnodes + + mov eax, [rdi+oState_half_depth] + mov [work_halfdepth], eax ; halfdepth = oState->half_depth + + ; get address of oState->Levels[oState->half_depth].mark + ; value of this var can be changed during crunching, but addr is const + imul eax, sizeof_level + lea r12, [rax+rdi+oState_Levels+level_mark] + + mov eax, [rdi+oState_half_depth2] + mov [work_halfdepth2], eax ; halfdepth2 = oState->half_depth2 + + mov eax, [rdi+oState_max] + dec eax + mov [work_maxlen_m1], eax ; maxlen_m1 = oState->max - 1 + + mov r14d, [rdi+oState_maxdepthm1] + + mov eax, [rdi+oState_stopdepth] + mov [work_stopdepth], eax + + ; Zero all vector registers + vzeroall + + mov eax, 1 + vmovq xmm_one, rax + + ; SETUP_TOP_STATE(lev); + ; !!! + vmovdqa ymm_comp, cur(comp, 0) + + ; int newbit = (depth < oState->maxdepthm1) ? 1 : 0; + xor eax, eax + cmp edx, r14d + setl al + vmovq xmm_newbit, rax + + ; mm0..mm3 = comp + ; mm4 = newbit + + ; split loop header + mov ebx, [rbp+level_mark-rbp_shift] ; mark = lev->mark; + mov edi, [rbp+level_limit-rbp_shift] + + vpextrq rax, xmm_comp, 0 + + jmp do_loop_split%1 +%endmacro + +%macro footer 0 +exit: + ; SAVE_FINAL_STATE(lev); + ; !!! 
+ vmovdqa cur(list, 0), ymm_list + vmovdqa cur(dist, 0), ymm_dist + vmovdqa cur(comp, 0), ymm_comp + + ; lev->mark = mark; + mov [rbp+level_mark-rbp_shift], ebx + mov [rbp+level_limit-rbp_shift], edi + + mov rbx, [work_pnodes] ; *pnodes -= nodes; + sub [rbx], r15d + + mov eax, edx ; return depth; + + mov rsp, [work_oldrsp] + movdqa xmm6, [rsp+0x00] + movdqa xmm7, [rsp+0x10] + movdqa xmm8, [rsp+0x20] + movdqa xmm9, [rsp+0x30] + movdqa xmm10, [rsp+0x40] + movdqa xmm11, [rsp+0x50] + movdqa xmm12, [rsp+0x60] + movdqa xmm13, [rsp+0x70] + movdqa xmm14, [rsp+0x80] + movdqa xmm15, [rsp+0x90] + add rsp, 0xA8 + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + emms + ret +%endmacro + +%macro body 1 + %assign max_id %1 + %assign id 1 + %rep %1 + %assign next_id id + 1 + %if next_id > max_id + %assign next_id max_id + %endif + + %assign prev_id id - 1 + %if prev_id < 1 + %assign prev_id 1 + %endif + + func id, next_id, prev_id + %assign id id + 1 + %endrep +%endmacro + +global _ogrng64_cycle_256_cj1_avx2 +global ogrng64_cycle_256_cj1_avx2 +_ogrng64_cycle_256_cj1_avx2: +ogrng64_cycle_256_cj1_avx2: + + header 5 + body 30 + footer diff --git a/ogr/amd64/ogrng64-cj1-avx2.cpp b/ogr/amd64/ogrng64-cj1-avx2.cpp new file mode 100644 index 000000000..8cace7046 --- /dev/null +++ b/ogr/amd64/ogrng64-cj1-avx2.cpp @@ -0,0 +1,54 @@ +#include "ansi/ogrng-64.h" + +#define OGROPT_HAVE_FIND_FIRST_ZERO_BIT_ASM 0 /* 0-2 - 'no' */ +#define OGROPT_ALTERNATE_CYCLE 1 /* 0/1 - 'yes' */ +#define OGR_NG_GET_DISPATCH_TABLE_FXN ogrng64_get_dispatch_table_cj1_avx2 +#define OGROPT_SPECIFIC_LEVEL_STRUCT + +/* + ** Level datas. 
+ */ +struct OgrLevel { + BMAP list[OGRNG_BITMAPS_WORDS]; + BMAP dist[OGRNG_BITMAPS_WORDS]; + BMAP comp[OGRNG_BITMAPS_WORDS]; + int mark; + int limit; + int pad0; + int pad1; + int pad2; + int pad3; + int pad4; + int pad5; +}; + +#include "ansi/ogrng_codebase.cpp" + +#include "ccoreio.h" /* CDECL */ +#include <stddef.h> /* offsetof */ + +extern "C" int CDECL ogrng64_cycle_256_cj1_avx2(struct OgrState *oState, int *pnodes, const u16* pchoose); + +static int ogr_cycle_256(struct OgrState *oState, int *pnodes, const u16* pchoose) +{ + /* Check structures layout and alignment to match assembly */ + + STATIC_ASSERT(offsetof(struct OgrState, max) == 0 ); + STATIC_ASSERT(offsetof(struct OgrState, maxdepthm1) == 8 ); + STATIC_ASSERT(offsetof(struct OgrState, half_depth) == 12); + STATIC_ASSERT(offsetof(struct OgrState, half_depth2) == 16); + STATIC_ASSERT(offsetof(struct OgrState, stopdepth) == 24); + STATIC_ASSERT(offsetof(struct OgrState, depth) == 28); + STATIC_ASSERT(offsetof(struct OgrState, Levels) == 32); + + STATIC_ASSERT(sizeof(struct OgrLevel) == 128); + STATIC_ASSERT(sizeof(oState->Levels) == 128 * OGR_MAXDEPTH); + + STATIC_ASSERT(offsetof(struct OgrLevel, list) == 0); + STATIC_ASSERT(offsetof(struct OgrLevel, dist) == 32); + STATIC_ASSERT(offsetof(struct OgrLevel, comp) == 64); + STATIC_ASSERT(offsetof(struct OgrLevel, mark) == 96); + STATIC_ASSERT(offsetof(struct OgrLevel, limit) == 100); + + return ogrng64_cycle_256_cj1_avx2(oState, pnodes, pchoose); +} diff --git a/plat/win/msvcver.cmd b/plat/win/msvcver.cmd index 6ee1bc114..31bf1e695 100755 --- a/plat/win/msvcver.cmd +++ b/plat/win/msvcver.cmd @@ -13,6 +13,7 @@ rem 15 = cl 15.00.xxxx = Visual Studio 2008 (VC9) rem 16 = cl 16.00.xxxx = Visual Studio 2010 (VC10) rem 17 = cl 17.00.xxxx = Visual Studio 2012 (VC11) rem 18 = cl 18.00.xxxx = Visual Studio 2013 (VC12) +rem 19 = cl 19.00.xxxx = Visual Studio 2015 (VC14) for %%i in (cl.exe) do if "%%~$PATH:i"=="" goto notfound diff --git a/plat/win/w32cons.cpp 
b/plat/win/w32cons.cpp index 5732e559c..dba446637 100644 --- a/plat/win/w32cons.cpp +++ b/plat/win/w32cons.cpp @@ -7629,7 +7629,7 @@ static int __w32ConOutX(const char *text, int iserr) if (!handled) { /* note the spaces around the caption! Don't let this window be "findable" */ - MessageBox(NULL,text, " "W32CLI_CONSOLE_NAME" ",MB_OK|MB_TASKMODAL + MessageBox(NULL,text, " " W32CLI_CONSOLE_NAME " ",MB_OK|MB_TASKMODAL |(iserr?MB_ICONHAND:MB_ICONINFORMATION)); } return 0; diff --git a/plat/win/w32svc.cpp b/plat/win/w32svc.cpp index b00a0a495..8bc0dfbc0 100644 --- a/plat/win/w32svc.cpp +++ b/plat/win/w32svc.cpp @@ -92,9 +92,9 @@ return "@(#)$Id: w32svc.cpp,v 1.12 2012/08/21 18:55:55 sla Exp $"; } const char *W9xSERVICEKEY = "distributed.net client"; #endif #endif /* PROXYTYPE or not */ -const char *APPDESCRIP = "distributed.net "SERVICEFOR; +const char *APPDESCRIP = "distributed.net " SERVICEFOR; -#define SERVICEMUTEX "distributed.net "SERVICEFOR" service mutex" +#define SERVICEMUTEX "distributed.net " SERVICEFOR " service mutex" /* ---------------------------------------------------------- */ @@ -529,7 +529,7 @@ int win32CliDetectRunningService(void) /* <0=err, 0=no, >0=yes */ int win32CliUninstallService(int quiet) { int retcode = -1; - const char *msg = "A distributed.net "SERVICEFOR" could not be uninstalled"; + const char *msg = "A distributed.net " SERVICEFOR " could not be uninstalled"; if (__winGetVersion() < 400) /* win16 */ {