Skip to content

Commit

Permalink
Added AVX512 support
Browse files Browse the repository at this point in the history
  • Loading branch information
lemire committed Aug 25, 2016
1 parent b859343 commit abc2aaf
Show file tree
Hide file tree
Showing 7 changed files with 19,487 additions and 3,835 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ endif # debug
CFLAGS = -std=c99 $(GENFLAGS)
CXXFLAGS = -std=c++11 $(GENFLAGS)

HEADERS=src/bpacking.h src/dict.h src/scalarcodec.h src/avxbpacking.h src/avxcodec.h src/avxdict.h
HEADERS=src/bpacking.h src/dict.h src/scalarcodec.h src/avx512bpacking.h src/avx512codec.h src/avx512dict.h src/avxbpacking.h src/avxcodec.h src/avxdict.h
EXECUTABLES=scalartest avxtest decodebenchmark

all: $(EXECUTABLES)
Expand Down
97 changes: 97 additions & 0 deletions scripts/avx512dict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#!/usr/bin/env python
import sys
def howmany(bit):
    """Return the number of values packed per block.

    The count is a fixed 512 and does not depend on ``bit``; the
    parameter is accepted so every sizing helper shares one call shape.
    """
    values_per_block = 512
    return values_per_block

def howmany64perwideword():
    """Return how many 64-bit values fit in one 512-bit word.

    Floor division keeps the result an ``int`` on both Python 2 and
    Python 3; with ``/`` Python 3 would return ``8.0``, which then leaks
    into the generated C text and into ``range()`` arguments.
    """
    return 512 // 64

def howmanywords(bit):
    """Return the number of 512-bit words needed to hold one packed block.

    A block holds ``howmany(bit)`` values of ``bit`` bits each; the total
    bit count is rounded up to a whole number of 512-bit words.

    Floor division keeps the result an ``int`` on both Python 2 and
    Python 3 (plain ``/`` would yield a float on Python 3 and print as
    e.g. ``1.0`` in the generated comments).
    """
    total_bits = howmany(bit) * bit
    return (total_bits + 511) // 512

def howmanybytes(bit):
    """Return the size in bytes of one packed block of ``bit``-bit values.

    Each 512-bit word counted by ``howmanywords`` occupies 64 bytes.
    """
    bytes_per_word = 64
    word_count = howmanywords(bit)
    return word_count * bytes_per_word

# Opening banner of the generated C code plus the 64-bit integer typedef
# that the generated function signatures refer to.
header_banner = """
/** avxdict512 **/
typedef long long myint64;
"""
print(header_banner)

# Function-pointer type shared by every generated unpack routine.
unpack_fn_typedef = """typedef void (*avx512unpackdictfnc)(const __m512i * compressed, const myint64 * dictionary, int64_t * pout);"""
print(unpack_fn_typedef)






def plurial(number):
    """Return the plural suffix for ``number``.

    Returns ``""`` when ``number`` equals 1 and ``"s"`` otherwise, so a
    caller can build text such as "1 word" / "2 words".

    The comparison is written with ``==`` rather than the ``<>``
    operator, which was removed in Python 3 and is a syntax error there.
    """
    return "" if number == 1 else "s"

# Emit avx512unpackdict0, the zero-bit case: no packed input is read, and
# the single dictionary entry dictionary[0] is broadcast to every output.
#
# Number of 512-bit stores needed to write the whole block. Floor division
# keeps this an int on Python 3 (plain "/" would print "k < 64.0").
store_count = howmany(0) // howmany64perwideword()
print("static void avx512unpackdict0(const __m512i * compressed, const myint64 * dictionary, int64_t * pout) {")
print(" (void) compressed;")
print(" __m512i * out = (__m512i *) pout;")
# The 512-bit broadcast intrinsic is _mm512_set1_epi64; the "epi64x"
# spelling only exists for the 128/256-bit variants.
print(" const __m512i uniquew = _mm512_set1_epi64(dictionary[0]);")
print(" for(int k = 0; k < {0}; k++) {{".format(store_count))
print(" _mm512_storeu_si512(out + k, uniquew);")
print(" }")
print("}")
print("")

# Emit one unpack routine per bit width (1 through 32). Each generated
# routine loads the packed 512-bit words, rebuilds the 32-bit dictionary
# indexes with shifts and masks, then gathers the 64-bit dictionary
# entries and stores them to the output.
#
# AVX-512 gathers take (vindex, base_addr, scale) -- the index vector comes
# first, unlike the AVX2 form, which takes the base address first.
store_low_half = " _mm512_storeu_si512(out + {0},_mm512_i32gather_epi64(_mm512_castsi512_si256(wout),dictionary, 8)); // load from dictionary and store"
store_high_half = " _mm512_storeu_si512(out + {0},_mm512_i32gather_epi64(_mm512_extracti64x4_epi64(wout,1),dictionary, 8)); // load from dictionary and store"

for bit in range(1, 33):
    wordcount = howmanywords(bit)
    print("")
    print("/* we packed {0} {1}-bit values, touching {2} 512-bit words, using {3} bytes */ ".format(howmany(bit), bit, wordcount, howmanybytes(bit)))
    print("static void avx512unpackdict{0}(const __m512i * compressed, const myint64 * dictionary, int64_t * pout) {{".format(bit))
    print(" /* we are going to access {0} 512-bit word{1} */ ".format(wordcount, plurial(wordcount)))
    # Two registers are alternated (index modulo 2) whenever more than one
    # packed word is read.
    if wordcount == 1:
        print(" __m512i w0;")
    else:
        print(" __m512i w0, w1;")
    print(" __m512i wout;")
    print(" __m512i * out = (__m512i *) pout;")
    if bit < 32:
        print(" const __m512i mask = _mm512_set1_epi32({0});".format((1 << bit) - 1))
        maskstr = " _mm512_and_si512 ( mask, {0}) "
    else:
        maskstr = " {0} "  # full 32-bit values: no masking needed
    oldword = 0
    # There is no 512-bit "lddqu" intrinsic; the unaligned 512-bit load is
    # _mm512_loadu_si512.
    print(" w0 = _mm512_loadu_si512 (compressed);")
    # One iteration per group of 16 indexes (16 x 32-bit lanes per word).
    # Floor division keeps the range() argument and word indexes integral
    # on Python 3.
    for j in range(howmany(bit) // 16):
        firstword = j * bit // 32
        secondword = (j * bit + bit - 1) // 32
        if secondword > oldword:
            # First time this packed word is needed: load it into the
            # register slot selected by its parity.
            print(" w{0} = _mm512_loadu_si512 (compressed + {1});".format(secondword % 2, secondword))
            oldword = secondword
        firstshift = (j * bit) % 32
        if firstshift == 0:
            wfirst = " w{0} ".format(firstword % 2)  # no shift needed
        else:
            wfirst = "_mm512_srli_epi32( w{0} , {1}) ".format(firstword % 2, firstshift)
        if firstword == secondword:
            # The value lies inside one packed word; the mask is skipped
            # when the shift already leaves exactly the top `bit` bits.
            if firstshift + bit != 32:
                wfirst = maskstr.format(wfirst)
            wout_expr = wfirst
        else:
            # The value straddles two packed words: OR the low part from
            # the first word with the high part from the next one.
            secondshift = 32 - firstshift
            wsecond = "_mm512_slli_epi32( w{0} , {1} ) ".format((firstword + 1) % 2, secondshift)
            wout_expr = maskstr.format(" _mm512_or_si512 ({0},{1}) ".format(wfirst, wsecond))
        print(" wout = {0}; // 512-bit word to be output".format(wout_expr))
        # Each 256-bit half of wout provides 8 indexes, gathered into one
        # 512-bit store of 64-bit dictionary entries.
        print(store_low_half.format(2 * j))
        print(store_high_half.format(2 * j + 1))
    print("}")
    print("")



# Emit the dispatch table: one function pointer per bit width 0..32, so
# entry i points at avx512unpackdict<i>. Every entry but the last carries
# a trailing comma.
print("static avx512unpackdictfnc avx512funcUnpackDictArr[] = {")
table_entries = ["&avx512unpackdict{0},".format(width) for width in range(32)]
table_entries.append("&avx512unpackdict32")
for entry in table_entries:
    print(entry)
print("};")
print("/** end of avxdict **/")
7 changes: 3 additions & 4 deletions scripts/avxdict.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def howmanywords(bit):
return (howmany(bit) * bit + 255)/256

def howmanybytes(bit):
return howmanywords(bit) * 16
return howmanywords(bit) * 32

print("""
/** avxdict **/
Expand Down Expand Up @@ -74,16 +74,15 @@ def plurial(number):
if(firstshift + bit <> 32):
wfirst = maskstr.format(wfirst)
print(" wout = {0}; // 256-bit word to be output".format(wfirst));
#print(" _mm256_storeu_si256(out + {0}, {1});".format(j,wfirst))
print(" _mm256_storeu_si256(out + {0},_mm256_i32gather_epi64(dictionary,_mm256_extractf128_si256(wout,0), 8)); // load from dictionary and store".format(2*j))
print(" _mm256_storeu_si256(out + {0},_mm256_i32gather_epi64(dictionary,_mm256_castsi256_si128(wout), 8)); // load from dictionary and store".format(2*j))
print(" _mm256_storeu_si256(out + {0},_mm256_i32gather_epi64(dictionary,_mm256_extractf128_si256(wout,1), 8)); // load from dictionary and store".format(2*j+1))
else:
secondshift = (32-firstshift)
wsecond = "_mm256_slli_epi32( w{0} , {1} ) ".format((firstword+1)%2,secondshift)
wfirstorsecond = " _mm256_or_si256 ({0},{1}) ".format(wfirst,wsecond)
wfirstorsecond = maskstr.format(wfirstorsecond)
print(" wout = {0}; // 256-bit word to be output".format(wfirstorsecond));
print(" _mm256_storeu_si256(out + {0},_mm256_i32gather_epi64(dictionary,_mm256_extractf128_si256(wout,0), 8)); // load from dictionary and store".format(2*j))
print(" _mm256_storeu_si256(out + {0},_mm256_i32gather_epi64(dictionary,_mm256_castsi256_si128(wout), 8)); // load from dictionary and store".format(2*j))
print(" _mm256_storeu_si256(out + {0},_mm256_i32gather_epi64(dictionary,_mm256_extractf128_si256(wout,1), 8)); // load from dictionary and store".format(2*j+1))
print("}");
print("")
Expand Down
Loading

0 comments on commit abc2aaf

Please sign in to comment.