Commit

Adding range decoding.

lemire committed Aug 24, 2016
1 parent 91f3e15 commit 3de5f35
Showing 7 changed files with 178 additions and 11 deletions.
2 changes: 1 addition & 1 deletion Makefile
@@ -12,7 +12,7 @@ endif # debug
CFLAGS = -std=c99 $(GENFLAGS)
CXXFLAGS = -std=c++11 -fpermissive $(GENFLAGS)

HEADERS=src/bpacking.h src/dict.h src/scalar.h src/avxbpacking.h src/avxcodec.h src/avxdict.h
HEADERS=src/bpacking.h src/dict.h src/scalarcodec.h src/avxbpacking.h src/avxcodec.h src/avxdict.h
EXECUTABLES=scalartest avxtest decodebenchmark

all: $(EXECUTABLES)
26 changes: 23 additions & 3 deletions README.md
@@ -26,9 +26,30 @@ for the sake of this experiment.
It is tempting, in dictionary coding, to first unpack the indexes to a temporary buffer
and then run through it, looking up the values in the dictionary. What if it were possible
to decode the indexes and look up the values in the dictionary in one step?
It is possible with vector instructions as long as you have access to a ``gather``
instruction. Thankfully, recent commodity x64 processors have such an instruction.
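
To illustrate the idea, here is a minimal sketch (assuming AVX2, and assuming the 32-bit
indexes have already been unpacked); ``gather_lookup`` is illustrative, not this
repository's actual kernel:

```cpp
#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

// Illustrative only: fetch four 64-bit dictionary values per iteration.
// Assumes AVX2 and that `length` is a multiple of 4.
void gather_lookup(const int64_t *dictionary, const uint32_t *indexes,
                   int64_t *out, size_t length) {
    for (size_t i = 0; i < length; i += 4) {
        // load 4 packed 32-bit indexes
        __m128i idx = _mm_loadu_si128((const __m128i *)(indexes + i));
        // dictionary[idx[0..3]] in one gather instruction (scale = 8 bytes)
        __m256i values = _mm256_i32gather_epi64((const long long *)dictionary, idx, 8);
        _mm256_storeu_si256((__m256i *)(out + i), values);
    }
}
```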

## A word on RAM access

There is no slower processor than an idle processor waiting for the memory
subsystem.

When working with large data sets, it is tempting to decompress them from RAM
to RAM, converting gigabytes of compressed data into (many more) gigabytes of
uncompressed data.

If the purpose of compression is to keep more of the data close to the CPU,
then this is wasteful.

One should engineer applications so as to work on cache-friendly blocks. For
example, if you have an array made of billions of values, instead of decoding them
all to RAM and then reading them, it is much better to decode them one small block
at a time. In fact, one would prefer not to decode the data at all if possible:
working directly over the compressed data would be ideal.
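
To make this concrete, here is a sketch of the block-wise pattern using the
``rangeuncompress`` method added in this commit; ``sum_in_blocks`` is a hypothetical
helper, and the summation stands in for whatever per-block work the application does:

```cpp
#include <cstdint>
#include <vector>

#include "scalarcodec.h"

// Hypothetical helper (not part of this commit): decode one cache-sized block
// at a time with rangeuncompress and consume it while it is still hot.
uint64_t sum_in_blocks(SimpleDictCODEC &codec, const dictionary_coded_t &t) {
    const size_t blocksize = 1 << 16; // a multiple of 32, as rangeuncompress requires
    std::vector<uint64_t> block(blocksize);
    uint64_t total = 0;
    size_t leftover = t.array_length;
    for (size_t start = 0; start < t.array_length; start += blocksize) {
        size_t todecode = leftover > blocksize ? blocksize : leftover;
        leftover = codec.rangeuncompress(t, block.data(), start, todecode);
        for (size_t i = 0; i < todecode; i++) {
            total += block[i]; // work on the block while it is still in cache
        }
    }
    return total;
}
```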

If you must decode gigabytes of data to RAM or to disk, then you should expect
to be wasting enormous quantities of CPU cycles.

## Credit

Builds on work done by Eric Daniel for ``parquet-cpp``.
@@ -44,7 +65,7 @@ make && make test

We find that an AVX2 dictionary decoder can be more than twice as fast as a good scalar decoder
on a recent Intel processor (Skylake). See results below. We expect results on older
Intel architectures to be less impressive because the ``vpgather`` instruction that we use was
quite slow in its early incarnations.

```bash
@@ -119,4 +140,3 @@ testing with dictionary of size 32768
- This code makes up its own convenient format. It is not meant to plug as-is into an existing framework.
- We assume that the arrays are large. If you have tiny arrays... well...
- We effectively measure steady-state throughput. So we ignore costs such as loading up the dictionary in CPU cache.

64 changes: 60 additions & 4 deletions benchmarks/decodebenchmark.cpp
@@ -10,7 +10,7 @@
#include "avxcodec.h"
#endif

#include "scalar.h"
#include "scalarcodec.h"


void fill_buffer(uint64_t * buf, uint32_t length, uint32_t distinct)
@@ -38,6 +38,34 @@ void scalartest(uint32_t distinct, uint32_t length = 1<<16, int repeat = 500) {
delete[] buf;
}


size_t decodetocache(SimpleDictCODEC * scalarcodec, dictionary_coded_t * t, uint64_t * newbuf, size_t blocksize) {
size_t totaldecoded = 0;
size_t leftover = t->array_length;
for(size_t i = 0; i < t->array_length; i += blocksize) {
size_t todecode = leftover > blocksize ? blocksize : leftover;
totaldecoded += todecode;
leftover = scalarcodec->rangeuncompress(*t,newbuf, i , todecode);
}
return totaldecoded;
}

void scalarcachetest(uint32_t distinct, uint32_t length = 1<<16, int repeat = 500) {
uint64_t * buf = new uint64_t[length];
fill_buffer(buf, length, distinct);
SimpleDictCODEC sc;
dictionary_coded_t t (sc.compress(buf, length) );
size_t bufsize = 1 << 16;
uint64_t * newbuf = new uint64_t[bufsize];
BEST_TIME(decodetocache(&sc, &t,newbuf,bufsize), length, repeat, length);
for(size_t i = length - bufsize; i < length; i++) {
assert(buf[i] == newbuf[i - length + bufsize]);
}
delete[] newbuf;
delete[] buf;
}


#ifdef __AVX2__
void mediumtest(uint32_t distinct, uint32_t length = 1<<16, int repeat = 500) {
uint64_t * buf = new uint64_t[length];
@@ -58,9 +86,7 @@ void mediumtest(uint32_t distinct, uint32_t length = 1<<16, int repeat = 500) {
void fasttest(uint32_t distinct, uint32_t length = 1<<16, int repeat = 500) {
uint64_t * buf = new uint64_t[length];
fill_buffer(buf, length, distinct);

AVXDictCODEC codec;
dictionary_coded_t t (codec.compress(buf, length) );
dictionary_coded_t t (AVXDictCODEC().compress(buf, length) );
uint64_t * newbuf = new uint64_t[length];
BEST_TIME(AVXDictCODEC::fastuncompress(t,newbuf), length, repeat, length);
for(size_t i = 0; i < length; i++) {
@@ -69,19 +95,49 @@
delete[] newbuf;
delete[] buf;
}

size_t AVXdecodetocache(dictionary_coded_t * t, uint64_t * newbuf, size_t blocksize) {
size_t totaldecoded = 0;
size_t leftover = t->array_length;
for(size_t i = 0; i < t->array_length; i += blocksize) {
size_t todecode = leftover > blocksize ? blocksize : leftover;
totaldecoded += todecode;
leftover = AVXDictCODEC::fastrangeuncompress(*t,newbuf, i , todecode);
}
return totaldecoded;
}

void fastcachetest(uint32_t distinct, uint32_t length = 1<<16, int repeat = 500) {
uint64_t * buf = new uint64_t[length];
fill_buffer(buf, length, distinct);

dictionary_coded_t t (AVXDictCODEC().compress(buf, length) );
size_t bufsize = 1 << 16;
uint64_t * newbuf = new uint64_t[bufsize];
BEST_TIME(AVXdecodetocache(&t,newbuf,bufsize), length, repeat, length);
for(size_t i = length - bufsize; i < length; i++) {
assert(buf[i] == newbuf[i - length + bufsize]);
}
delete[] newbuf;
delete[] buf;
}
#endif

int main() {
printf("For this benchmark, use a recent (Skylake) Intel processor for best results.\n");
tellmeall();
uint32_t length = 1<<23; // larger than L3 cache
printf("Using array sizes of %u values or %lu kiB.\n", length, length * sizeof(uint64_t) / 1024);
int repeat = 5;
for(uint32_t distinct = 2; distinct <= (1<<20); distinct *=2) {
std::cout << "testing with dictionary of size " << distinct << std::endl;
scalartest(distinct, length, repeat);
scalarcachetest(distinct, length, repeat);

#ifdef __AVX2__
mediumtest(distinct, length, repeat);
fasttest(distinct, length, repeat);
fastcachetest(distinct, length, repeat);
#endif
std::cout<<std::endl;
}
24 changes: 24 additions & 0 deletions src/avxcodec.h
@@ -66,6 +66,7 @@ class AVXDictCODEC {
assert(length % 256 == 0);
out.bit_width = 32 - __builtin_clz(out.dictionary_size);
out.compressed_data_size = sizeof(uint32_t) * out.bit_width * length / 32;
assert(out.array_length * out.bit_width == out.compressed_data_size * 8);
out.compressed_data = new char[out.compressed_data_size];
avxpackwithoutmask(tmpbuffer,(__m256i *) out.compressed_data, out.array_length, out.bit_width);
return out;
@@ -86,6 +87,7 @@
inline uint32_t uncompress(const dictionary_coded_t & t, uint64_t * out) {
ensureBufferCapacity(t.array_length);
assert(t.array_length % 256 == 0);
assert(t.array_length * t.bit_width == t.compressed_data_size * 8);
avxunpack((const __m256i*) t.compressed_data, tmpbuffer, t.array_length, t.bit_width);
for(size_t i = 0; i < t.array_length; ++i) {
out[i] = t.dictionary[tmpbuffer[i]];
@@ -97,18 +99,40 @@
* Prototype code that uncompresses an array of 64-bit integers.
* The out array should have enough space.
*
* If the compressed data does not fit in fast CPU cache,
* consider using fastrangeuncompress instead, to decompress the data to cache
* in blocks. Pushing data back and forth from RAM can be slow.
*
* For simplicity, array lengths are assumed to be multiples of 256.
*
* Returns the array size.
*/
static inline uint32_t fastuncompress(const dictionary_coded_t & t, uint64_t * out) {
assert(t.array_length % 256 == 0);
assert(t.array_length * t.bit_width == t.compressed_data_size * 8);
avxunpackdict((const __m256i*) t.compressed_data,
(const int64_t *) t.dictionary,(int64_t *) out, t.array_length, t.bit_width);
return t.array_length;
}

/**
* Uncompresses length values starting at index start.
* This can be used to uncompress data to cache.
*
* For simplicity, all indexes and lengths are assumed to be multiples of 256.
* Returns the number of values left to be decoded (starting at index start+length).
*/
static inline uint32_t fastrangeuncompress(const dictionary_coded_t & t, uint64_t * out, size_t start, size_t length) {
assert(t.array_length % 256 == 0);
assert(start % 256 == 0);
assert(length % 256 == 0);
assert(t.array_length * t.bit_width == t.compressed_data_size * 8);
assert(start + length <= t.array_length);
avxunpackdict((const __m256i*) ( t.compressed_data + start * t.bit_width / 8),
(const int64_t *) t.dictionary,(int64_t *) out, length, t.bit_width);
return t.array_length - start - length;
}


inline void clearBuffer() {
buffercapacity = 0;
20 changes: 20 additions & 0 deletions src/scalar.h → src/scalarcodec.h
@@ -79,6 +79,26 @@ class SimpleDictCODEC {
return t.array_length;
}

/**
* Uncompresses length values starting at index start.
* This can be used to uncompress data to cache.
*
* For simplicity, all indexes and lengths are assumed to be multiples of 32.
* Returns the number of values left to be decoded (starting at index start+length).
*/
inline uint32_t rangeuncompress(const dictionary_coded_t & t, uint64_t * out, size_t start, size_t length) {
assert(t.array_length % 32 == 0);
assert(start % 32 == 0);
assert(length % 32 == 0);
assert(t.array_length * t.bit_width == t.compressed_data_size * 8);
assert(start + length <= t.array_length);
ensureBufferCapacity(t.array_length);
unpack32((const uint32_t*) (t.compressed_data + start * t.bit_width / 8), tmpbuffer, length, t.bit_width);
for(size_t i = 0; i < length; ++i) {
out[i] = t.dictionary[tmpbuffer[i]];
}
return t.array_length - start - length;
}


inline void clearBuffer() {
27 changes: 25 additions & 2 deletions tests/avxtest.cpp
@@ -1,3 +1,4 @@
#include <string.h>
#include <cassert>
#include <iostream>

@@ -12,6 +13,7 @@ void basictest(uint32_t distinct, uint32_t length) {
AVXDictCODEC codec;
dictionary_coded_t t (codec.compress(buf, length));
uint64_t * newbuf = new uint64_t[length];
memset(newbuf,0,sizeof(uint64_t) * length);

size_t newlength = codec.uncompress(t,newbuf);
assert(length == newlength);
@@ -28,7 +30,7 @@ void fasttest(uint32_t distinct, uint32_t length) {
AVXDictCODEC codec;
dictionary_coded_t t (codec.compress(buf, length) );
uint64_t * newbuf = new uint64_t[length];

memset(newbuf,0,sizeof(uint64_t) * length);
size_t newlength = AVXDictCODEC::fastuncompress(t,newbuf);
assert(length == newlength);
for(size_t i = 0; i < length; i++) {
@@ -37,13 +39,34 @@
delete[] newbuf;
delete[] buf;
}

void fastrangetest(uint32_t distinct, uint32_t length) {
uint64_t * buf = new uint64_t[length];
for(size_t i = 0; i < length; i++) {
buf[i] = (i % distinct) * UINT64_C(0xcb9fe8c7cff9982a) + 77777 ;// made up
}
AVXDictCODEC codec;
dictionary_coded_t t (codec.compress(buf, length) );
for(size_t blocksize = 256; blocksize <= length; blocksize += 256) {
uint64_t * newbuf = new uint64_t[length];
memset(newbuf,0,sizeof(uint64_t) * length);
size_t leftover = length;
for(size_t i = 0; i < length; i += blocksize) {
leftover = AVXDictCODEC::fastrangeuncompress(t,newbuf + i , i , leftover > blocksize ? blocksize : leftover);
}
for(size_t i = 0; i < length; i++) {
assert(buf[i] == newbuf[i]);
}
delete[] newbuf;
}
delete[] buf;
}

int main() {
for(uint32_t length = 256; length <= 65536; length *=2) {
for(uint32_t distinct = 1; distinct <= 65536; distinct *=2) {
basictest(distinct, length);
fasttest(distinct, length);
fastrangetest(distinct, length);
}
std::cout << ".";
std::cout.flush();
26 changes: 25 additions & 1 deletion tests/scalartest.cpp
@@ -1,7 +1,7 @@
#include <string.h>
#include <cassert>
#include <iostream>

#include "scalar.h"
#include "scalarcodec.h"


void basictest(uint32_t distinct, uint32_t length) {
@@ -22,10 +22,34 @@ void basictest(uint32_t distinct, uint32_t length) {
delete[] buf;
}

void rangetest(uint32_t distinct, uint32_t length) {
uint64_t * buf = new uint64_t[length];
for(size_t i = 0; i < length; i++) {
buf[i] = (i % distinct) * UINT64_C(0xcb9fe8c7cff9982a) + 77777 ;// made up
}
SimpleDictCODEC codec;
dictionary_coded_t t (codec.compress(buf, length) );
for(size_t blocksize = 32; blocksize <= length; blocksize += 32) {
uint64_t * newbuf = new uint64_t[length];
memset(newbuf,0,sizeof(uint64_t) * length);
size_t leftover = length;
for(size_t i = 0; i < length; i += blocksize) {
leftover = codec.rangeuncompress(t,newbuf + i , i , leftover > blocksize ? blocksize : leftover);
}
for(size_t i = 0; i < length; i++) {
assert(buf[i] == newbuf[i]);
}
delete[] newbuf;
}
delete[] buf;
}


int main() {
for(uint32_t length = 256; length < 65536; length *=2) {
for(uint32_t distinct = 1; distinct < 65536; distinct *=2) {
basictest(distinct, length);
rangetest(distinct, length);
}
std::cout << ".";
std::cout.flush();
