Commit

Adding range decoding.

lemire committed Aug 24, 2016
1 parent 91f3e15 commit 3de5f35
Showing 7 changed files with 178 additions and 11 deletions.
2 changes: 1 addition & 1 deletion Makefile
@@ -12,7 +12,7 @@ endif # debug
CFLAGS = -std=c99 $(GENFLAGS)
CXXFLAGS = -std=c++11 -fpermissive $(GENFLAGS)

HEADERS=src/bpacking.h src/dict.h src/scalar.h src/avxbpacking.h src/avxcodec.h src/avxdict.h
HEADERS=src/bpacking.h src/dict.h src/scalarcodec.h src/avxbpacking.h src/avxcodec.h src/avxdict.h
EXECUTABLES=scalartest avxtest decodebenchmark

all: $(EXECUTABLES)
26 changes: 23 additions & 3 deletions README.md
@@ -26,9 +26,30 @@ for the sake of this experiment.
It is tempting, in dictionary coding, to first unpack the indexes to a temporary buffer
and then run through it, looking up the values in the dictionary. What if it were possible
to decode the indexes and look up the values in the dictionary in one step?
It is possible with vector instructions as long as you have access to a ``gather``
instruction. Thankfully, recent commodity x64 processors have such an instruction.
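
To illustrate the idea, here is a minimal sketch (assuming AVX2, and assuming the 32-bit
indexes have already been unpacked); ``gather_lookup`` is illustrative, not this
repository's actual kernel:

```cpp
#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

// Illustrative only: fetch four 64-bit dictionary values per iteration.
// Assumes AVX2 and that `length` is a multiple of 4.
void gather_lookup(const int64_t *dictionary, const uint32_t *indexes,
                   int64_t *out, size_t length) {
    for (size_t i = 0; i < length; i += 4) {
        // load 4 packed 32-bit indexes
        __m128i idx = _mm_loadu_si128((const __m128i *)(indexes + i));
        // dictionary[idx[0..3]] in one gather instruction (scale = 8 bytes)
        __m256i values = _mm256_i32gather_epi64((const long long *)dictionary, idx, 8);
        _mm256_storeu_si256((__m256i *)(out + i), values);
    }
}
```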

## A word on RAM access

There is no slower processor than an idle processor waiting for the memory
subsystem.

When working with large data sets, it is tempting to decompress them from RAM
to RAM, converting gigabytes of compressed data into (many more) gigabytes of
uncompressed data.

If the purpose of compression is to keep more of the data close to the CPU,
then this is wasteful.

One should engineer applications so as to work on cache-friendly blocks. For
example, if you have an array made of billions of values, instead of decoding them
all to RAM and then reading them, it is much better to decode them one small block
at a time. In fact, one would prefer not to decode the data at all if possible:
working directly over the compressed data would be ideal.
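
To make this concrete, here is a sketch of the block-wise pattern using the
``rangeuncompress`` method added in this commit; ``sum_in_blocks`` is a hypothetical
helper, and the summation stands in for whatever per-block work the application does:

```cpp
#include <cstdint>
#include <vector>

#include "scalarcodec.h"

// Hypothetical helper (not part of this commit): decode one cache-sized block
// at a time with rangeuncompress and consume it while it is still hot.
uint64_t sum_in_blocks(SimpleDictCODEC &codec, const dictionary_coded_t &t) {
    const size_t blocksize = 1 << 16; // a multiple of 32, as rangeuncompress requires
    std::vector<uint64_t> block(blocksize);
    uint64_t total = 0;
    size_t leftover = t.array_length;
    for (size_t start = 0; start < t.array_length; start += blocksize) {
        size_t todecode = leftover > blocksize ? blocksize : leftover;
        leftover = codec.rangeuncompress(t, block.data(), start, todecode);
        for (size_t i = 0; i < todecode; i++) {
            total += block[i]; // work on the block while it is still in cache
        }
    }
    return total;
}
```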

If you must decode gigabytes of data to RAM or to disk, then you should expect
to be wasting enormous quantities of CPU cycles.

## Credit

Builds on work done by Eric Daniel for ``parquet-cpp``.
@@ -44,7 +65,7 @@ make && make test

We find that an AVX2 dictionary decoder can be more than twice as fast as a good scalar decoder
on a recent Intel processor (Skylake). See results below. We expect results on older
Intel architectures to be less impressive because the ``vpgather`` instruction that we use was
quite slow in its early incarnations.

```bash
@@ -119,4 +140,3 @@ testing with dictionary of size 32768
- This code makes up its own convenient format. It is not meant to plug as-is into an existing framework.
- We assume that the arrays are large. If you have tiny arrays... well...
- We effectively measure steady-state throughput. So we ignore costs such as loading up the dictionary in CPU cache.

64 changes: 60 additions & 4 deletions benchmarks/decodebenchmark.cpp
@@ -10,7 +10,7 @@
#include "avxcodec.h"
#endif

#include "scalar.h"
#include "scalarcodec.h"


void fill_buffer(uint64_t * buf, uint32_t length, uint32_t distinct)
@@ -38,6 +38,34 @@ void scalartest(uint32_t distinct, uint32_t length = 1<<16, int repeat = 500) {
delete[] buf;
}


size_t decodetocache(SimpleDictCODEC * scalarcodec, dictionary_coded_t * t, uint64_t * newbuf, size_t blocksize) {
size_t totaldecoded = 0;
size_t leftover = t->array_length;
for(size_t i = 0; i < t->array_length; i += blocksize) {
size_t todecode = leftover > blocksize ? blocksize : leftover;
totaldecoded += todecode;
leftover = scalarcodec->rangeuncompress(*t,newbuf, i , todecode);
}
return totaldecoded;
}

void scalarcachetest(uint32_t distinct, uint32_t length = 1<<16, int repeat = 500) {
uint64_t * buf = new uint64_t[length];
fill_buffer(buf, length, distinct);
SimpleDictCODEC sc;
dictionary_coded_t t (sc.compress(buf, length) );
size_t bufsize = 1 << 16;
uint64_t * newbuf = new uint64_t[bufsize];
BEST_TIME(decodetocache(&sc, &t,newbuf,bufsize), length, repeat, length);
for(size_t i = length - bufsize; i < length; i++) {
assert(buf[i] == newbuf[i - length + bufsize]);
}
delete[] newbuf;
delete[] buf;
}


#ifdef __AVX2__
void mediumtest(uint32_t distinct, uint32_t length = 1<<16, int repeat = 500) {
uint64_t * buf = new uint64_t[length];
@@ -58,9 +86,7 @@ void mediumtest(uint32_t distinct, uint32_t length = 1<<16, int repeat = 500) {
void fasttest(uint32_t distinct, uint32_t length = 1<<16, int repeat = 500) {
uint64_t * buf = new uint64_t[length];
fill_buffer(buf, length, distinct);

AVXDictCODEC codec;
dictionary_coded_t t (codec.compress(buf, length) );
dictionary_coded_t t (AVXDictCODEC().compress(buf, length) );
uint64_t * newbuf = new uint64_t[length];
BEST_TIME(AVXDictCODEC::fastuncompress(t,newbuf), length, repeat, length);
for(size_t i = 0; i < length; i++) {
@@ -69,19 +95,49 @@
delete[] newbuf;
delete[] buf;
}

size_t AVXdecodetocache(dictionary_coded_t * t, uint64_t * newbuf, size_t blocksize) {
size_t totaldecoded = 0;
size_t leftover = t->array_length;
for(size_t i = 0; i < t->array_length; i += blocksize) {
size_t todecode = leftover > blocksize ? blocksize : leftover;
totaldecoded += todecode;
leftover = AVXDictCODEC::fastrangeuncompress(*t,newbuf, i , todecode);
}
return totaldecoded;
}

void fastcachetest(uint32_t distinct, uint32_t length = 1<<16, int repeat = 500) {
uint64_t * buf = new uint64_t[length];
fill_buffer(buf, length, distinct);

dictionary_coded_t t (AVXDictCODEC().compress(buf, length) );
size_t bufsize = 1 << 16;
uint64_t * newbuf = new uint64_t[bufsize];
BEST_TIME(AVXdecodetocache(&t,newbuf,bufsize), length, repeat, length);
for(size_t i = length - bufsize; i < length; i++) {
assert(buf[i] == newbuf[i - length + bufsize]);
}
delete[] newbuf;
delete[] buf;
}
#endif

int main() {
printf("For this benchmark, use a recent (Skylake) Intel processor for best results.\n");
tellmeall();
uint32_t length = 1<<23; // larger than L3 cache
printf("Using array sizes of %u values or %lu kiB.\n", length, length * sizeof(uint64_t) / 1024);
int repeat = 5;
for(uint32_t distinct = 2; distinct <= (1<<20); distinct *=2) {
std::cout << "testing with dictionary of size " << distinct << std::endl;
scalartest(distinct, length, repeat);
scalarcachetest(distinct, length, repeat);

#ifdef __AVX2__
mediumtest(distinct, length, repeat);
fasttest(distinct, length, repeat);
fastcachetest(distinct, length, repeat);
#endif
std::cout<<std::endl;
}
24 changes: 24 additions & 0 deletions src/avxcodec.h
@@ -66,6 +66,7 @@ class AVXDictCODEC {
assert(length % 256 == 0);
out.bit_width = 32 - __builtin_clz(out.dictionary_size);
out.compressed_data_size = sizeof(uint32_t) * out.bit_width * length / 32;
assert(out.array_length * out.bit_width == out.compressed_data_size * 8);
out.compressed_data = new char[out.compressed_data_size];
avxpackwithoutmask(tmpbuffer,(__m256i *) out.compressed_data, out.array_length, out.bit_width);
return out;
@@ -86,6 +87,7 @@
inline uint32_t uncompress(const dictionary_coded_t & t, uint64_t * out) {
ensureBufferCapacity(t.array_length);
assert(t.array_length % 256 == 0);
assert(t.array_length * t.bit_width == t.compressed_data_size * 8);
avxunpack((const __m256i*) t.compressed_data, tmpbuffer, t.array_length, t.bit_width);
for(size_t i = 0; i < t.array_length; ++i) {
out[i] = t.dictionary[tmpbuffer[i]];
@@ -97,18 +99,40 @@
* Prototype code that uncompresses an array of 64-bit integers.
* The out array should have enough space.
*
* If the compressed data does not fit in fast CPU cache,
* consider using fastrangeuncompress instead, to decompress the data to cache
* in blocks. Pushing data back and forth from RAM can be slow.
*
* For simplicity, array lengths are assumed to be multiples of 256.
*
* Returns the array size.
*/
static inline uint32_t fastuncompress(const dictionary_coded_t & t, uint64_t * out) {
assert(t.array_length % 256 == 0);
assert(t.array_length * t.bit_width == t.compressed_data_size * 8);
avxunpackdict((const __m256i*) t.compressed_data,
(const int64_t *) t.dictionary,(int64_t *) out, t.array_length, t.bit_width);
return t.array_length;
}

/**
* Uncompresses length values starting at index start.
* This can be used to uncompress data to cache.
*
* For simplicity, all indexes and lengths are assumed to be multiples of 256.
* Returns the number of values left to be decoded (starting at index start+length).
*/
static inline uint32_t fastrangeuncompress(const dictionary_coded_t & t, uint64_t * out, size_t start, size_t length) {
assert(t.array_length % 256 == 0);
assert(start % 256 == 0);
assert(length % 256 == 0);
assert(t.array_length * t.bit_width == t.compressed_data_size * 8);
assert(start + length <= t.array_length);
avxunpackdict((const __m256i*) ( t.compressed_data + start * t.bit_width / 8),
(const int64_t *) t.dictionary,(int64_t *) out, length, t.bit_width);
return t.array_length - start - length;
}


inline void clearBuffer() {
buffercapacity = 0;
20 changes: 20 additions & 0 deletions src/scalar.h → src/scalarcodec.h
@@ -79,6 +79,26 @@ class SimpleDictCODEC {
return t.array_length;
}

/**
* Uncompresses length values starting at index start.
* This can be used to uncompress data to cache.
*
* For simplicity, all indexes and lengths are assumed to be multiples of 32.
* Returns the number of values left to be decoded (starting at index start+length).
*/
inline uint32_t rangeuncompress(const dictionary_coded_t & t, uint64_t * out, size_t start, size_t length) {
assert(t.array_length % 32 == 0);
assert(start % 32 == 0);
assert(length % 32 == 0);
assert(t.array_length * t.bit_width == t.compressed_data_size * 8);
assert(start + length <= t.array_length);
ensureBufferCapacity(t.array_length);
unpack32((const uint32_t*) (t.compressed_data + start * t.bit_width / 8), tmpbuffer, length, t.bit_width);
for(size_t i = 0; i < length; ++i) {
out[i] = t.dictionary[tmpbuffer[i]];
}
return t.array_length - start - length;
}


inline void clearBuffer() {
27 changes: 25 additions & 2 deletions tests/avxtest.cpp
@@ -1,3 +1,4 @@
#include <string.h>
#include <cassert>
#include <iostream>

@@ -12,6 +13,7 @@ void basictest(uint32_t distinct, uint32_t length) {
AVXDictCODEC codec;
dictionary_coded_t t (codec.compress(buf, length));
uint64_t * newbuf = new uint64_t[length];
memset(newbuf,0,sizeof(uint64_t) * length);

size_t newlength = codec.uncompress(t,newbuf);
assert(length == newlength);
@@ -28,7 +30,7 @@ void fasttest(uint32_t distinct, uint32_t length) {
AVXDictCODEC codec;
dictionary_coded_t t (codec.compress(buf, length) );
uint64_t * newbuf = new uint64_t[length];

memset(newbuf,0,sizeof(uint64_t) * length);
size_t newlength = AVXDictCODEC::fastuncompress(t,newbuf);
assert(length == newlength);
for(size_t i = 0; i < length; i++) {
@@ -37,13 +39,34 @@
delete[] newbuf;
delete[] buf;
}

void fastrangetest(uint32_t distinct, uint32_t length) {
uint64_t * buf = new uint64_t[length];
for(size_t i = 0; i < length; i++) {
buf[i] = (i % distinct) * UINT64_C(0xcb9fe8c7cff9982a) + 77777 ;// made up
}
AVXDictCODEC codec;
dictionary_coded_t t (codec.compress(buf, length) );
for(size_t blocksize = 256; blocksize <= length; blocksize += 256) {
uint64_t * newbuf = new uint64_t[length];
memset(newbuf,0,sizeof(uint64_t) * length);
size_t leftover = length;
for(size_t i = 0; i < length; i += blocksize) {
leftover = AVXDictCODEC::fastrangeuncompress(t,newbuf + i , i , leftover > blocksize ? blocksize : leftover);
}
for(size_t i = 0; i < length; i++) {
assert(buf[i] == newbuf[i]);
}
delete[] newbuf;
}
delete[] buf;
}

int main() {
for(uint32_t length = 256; length <= 65536; length *=2) {
for(uint32_t distinct = 1; distinct <= 65536; distinct *=2) {
basictest(distinct, length);
fasttest(distinct, length);
fastrangetest(distinct, length);
}
std::cout << ".";
std::cout.flush();
26 changes: 25 additions & 1 deletion tests/scalartest.cpp
@@ -1,7 +1,7 @@
#include <string.h>
#include <cassert>
#include <iostream>

#include "scalar.h"
#include "scalarcodec.h"


void basictest(uint32_t distinct, uint32_t length) {
@@ -22,10 +22,34 @@ void basictest(uint32_t distinct, uint32_t length) {
delete[] buf;
}

void rangetest(uint32_t distinct, uint32_t length) {
uint64_t * buf = new uint64_t[length];
for(size_t i = 0; i < length; i++) {
buf[i] = (i % distinct) * UINT64_C(0xcb9fe8c7cff9982a) + 77777 ;// made up
}
SimpleDictCODEC codec;
dictionary_coded_t t (codec.compress(buf, length) );
for(size_t blocksize = 32; blocksize <= length; blocksize += 32) {
uint64_t * newbuf = new uint64_t[length];
memset(newbuf,0,sizeof(uint64_t) * length);
size_t leftover = length;
for(size_t i = 0; i < length; i += blocksize) {
leftover = codec.rangeuncompress(t,newbuf + i , i , leftover > blocksize ? blocksize : leftover);
}
for(size_t i = 0; i < length; i++) {
assert(buf[i] == newbuf[i]);
}
delete[] newbuf;
}
delete[] buf;
}


int main() {
for(uint32_t length = 256; length < 65536; length *=2) {
for(uint32_t distinct = 1; distinct < 65536; distinct *=2) {
basictest(distinct, length);
rangetest(distinct, length);
}
std::cout << ".";
std::cout.flush();
