diff --git a/src/cuda/decode.cuh b/src/cuda/decode.cuh
index bebf5da31..018a1f900 100644
--- a/src/cuda/decode.cuh
+++ b/src/cuda/decode.cuh
@@ -403,6 +403,14 @@ decode3(
   uint granularity
 );
 
+#pragma pack(1)
+typedef struct {
+  uint64 base_offset;
+  uint16 lengths[ZFP_PARTITION_SIZE];
+} hybrid_index_entry;
+
+static_assert(ZFP_PARTITION_SIZE == 32, "CUDA backend expects that the hybrid index partition size is equal to the size of a CUDA warp");
+
 // compute bit offset to compressed block
 inline __device__ unsigned long long
@@ -412,22 +420,20 @@ block_offset(const Word* d_index, zfp_index_type index_type, size_t chunk_idx)
     return d_index[chunk_idx];
 
   if (index_type == zfp_index_hybrid) {
-    const size_t thread_idx = threadIdx.x;
-    // TODO: Why subtract thread_idx? And should granularity not matter?
-    const size_t warp_idx = (chunk_idx - thread_idx) / 32;
-    // warp operates on 32 blocks indexed by one 64-bit offset, 32 16-bit sizes
-    const uint64* data64 = (const uint64*)d_index + warp_idx * 9;
-    const uint16* data16 = (const uint16*)data64 + 3;
-    // TODO: use warp shuffle instead of shared memory
-    __shared__ uint64 offset[32];
-    offset[thread_idx] = thread_idx ? data16[thread_idx] : *data64;
-    // compute prefix sum in parallel
-    for (uint i = 1u; i < 32u; i <<= 1) {
-      if (thread_idx + i < 32u)
-        offset[thread_idx + i] += offset[thread_idx];
-      __syncthreads();
+    const uint local_id = threadIdx.x & 0x1f;
+    const size_t index_partition = chunk_idx / ZFP_PARTITION_SIZE;
+    const hybrid_index_entry * const idx = (hybrid_index_entry *)d_index + index_partition;
+
+    size_t offset = local_id > 0 ? idx->lengths[local_id - 1] : idx->base_offset;
+
+    __syncwarp();
+    #pragma unroll
+    for (uint i = 1; i < 32; i <<= 1) {
+      size_t v = __shfl_up_sync(__activemask(), offset, i);
+      offset = local_id >= i ? offset + v : offset;
     }
-    return offset[thread_idx];
+    __syncwarp();
+    return offset;
   }
 
   return 0;
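
For reference, below is a minimal standalone sketch (not part of the patch) of how the new hybrid_index_entry layout and the warp-shuffle scan recover per-block bit offsets: lane 0 seeds the scan with the partition's 64-bit base offset, lane k > 0 contributes the 16-bit length of block k-1, and an inclusive Kogge-Stone prefix sum across the warp leaves each lane holding its block's offset. The kernel name partition_offsets, the PARTITION_SIZE stand-in for ZFP_PARTITION_SIZE, the main() harness, and the test data are hypothetical and use plain <cstdint> types rather than zfp's uint64/uint16 typedefs.

// Standalone illustration of the hybrid index decode path above; not zfp API.
#include <cstdio>
#include <cstdint>
#include <cuda_runtime.h>

#define PARTITION_SIZE 32  // stand-in for ZFP_PARTITION_SIZE (one CUDA warp)

#pragma pack(1)
struct hybrid_index_entry {
  uint64_t base_offset;              // bit offset of the partition's first block
  uint16_t lengths[PARTITION_SIZE];  // compressed bit lengths of the 32 blocks
};
#pragma pack()

// one warp: lane k computes the bit offset of block k within the partition
__global__ void partition_offsets(const hybrid_index_entry* entry, uint64_t* out)
{
  const unsigned lane = threadIdx.x & 0x1f;

  // lane 0 starts from the base offset; lane k > 0 contributes length of block k-1
  uint64_t offset = lane ? (uint64_t)entry->lengths[lane - 1] : entry->base_offset;

  // Kogge-Stone inclusive prefix sum across the warp via register shuffles
  #pragma unroll
  for (unsigned i = 1; i < 32; i <<= 1) {
    uint64_t v = __shfl_up_sync(0xffffffffu, offset, i);
    if (lane >= i)
      offset += v;
  }

  out[lane] = offset;
}

int main()
{
  // build one partition with arbitrary block lengths
  hybrid_index_entry h = {};
  h.base_offset = 1000;
  for (int i = 0; i < PARTITION_SIZE; i++)
    h.lengths[i] = (uint16_t)(100 + i);

  hybrid_index_entry* d_entry;
  uint64_t* d_out;
  cudaMalloc((void**)&d_entry, sizeof(h));
  cudaMalloc((void**)&d_out, PARTITION_SIZE * sizeof(uint64_t));
  cudaMemcpy(d_entry, &h, sizeof(h), cudaMemcpyHostToDevice);

  partition_offsets<<<1, PARTITION_SIZE>>>(d_entry, d_out);

  uint64_t out[PARTITION_SIZE];
  cudaMemcpy(out, d_out, sizeof(out), cudaMemcpyDeviceToHost);

  // sequential reference: offset[k] = base_offset + sum of lengths[0..k-1]
  uint64_t expect = h.base_offset;
  for (int k = 0; k < PARTITION_SIZE; k++) {
    if (out[k] != expect)
      printf("mismatch at block %d: %llu != %llu\n",
             k, (unsigned long long)out[k], (unsigned long long)expect);
    expect += h.lengths[k];
  }
  printf("done\n");

  cudaFree(d_entry);
  cudaFree(d_out);
  return 0;
}

The sketch can use a full 0xffffffff shuffle mask because it launches exactly one 32-thread warp with no divergence before the scan; the patch instead pairs __shfl_up_sync with __activemask() and __syncwarp(), presumably because block_offset() may be reached with only a subset of lanes active.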