gpu library

friofry · May 7, 2021 · 5387185 · 5387185
1 parent f25b21d
commit 5387185
Show file tree

Hide file tree

Showing 22 changed files with 838 additions and 1 deletion.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -26,6 +26,11 @@ add_subdirectory(cpu_lib)
 add_subdirectory(benchmark_cpu)
 #add_subdirectory(motif_finder)
 
+
+
+add_subdirectory(gpu_lib)
+add_subdirectory(benchmark_gpu)
+
 find_package( CUDA 10.0 QUIET )
 if(NOT CUDA_FOUND)
     message("CUDA 10.0 not found")

diff --git a/benchmark_gpu/CMakeLists.txt b/benchmark_gpu/CMakeLists.txt
@@ -0,0 +1,13 @@
+cmake_minimum_required(VERSION 3.17)
+project(benchmark_gpu CXX)
+
+set(CMAKE_CXX_STANDARD 14)
+
+add_executable(benchmark_gpu
+               main.cpp)
+
+target_link_libraries(benchmark_gpu
+        lib::common
+        lib::gpu
+        3rdparty::cxxopts
+        )
diff --git a/benchmark_gpu/main.cpp b/benchmark_gpu/main.cpp
@@ -0,0 +1,37 @@
+#include <sstream>
+#include <vector>
+#include <cstdint>
+
+#include <config.h>
+#include <motif_finder_gpu_internal.h>
+#include <motif_finder_gpu_external.h>
+#include <fst_reader.h>
+
+using namespace std;
+
+void run()
+{
+    GpuInternalParams params;
+    params.sequences = read_fasta("test.fst");
+    params.search_complementary = false;
+    params.gpu_count = 1;
+    params.unified_memory = false;
+    params.motif_range_size = MOT_PER_CHUNK;
+    params.threads_per_block = THREADS_PER_BLOCK;
+
+    std::vector<uint16_t> res;
+    find_motifs_internal_gpu(res, params);
+
+
+    std::vector<uint32_t> motif_hashes;
+    GpuExternalParams external_params(motif_hashes);
+    find_motifs_external_gpu(res, params);
+}
+
+int main(int argc, char **argv)
+{
+    argc = 0;
+    argv = 0;
+    run();
+    return 0;
+}
diff --git a/benchmark_gpu/test.fst b/benchmark_gpu/test.fst
@@ -0,0 +1,21 @@
+> 0
+CGAATAATGGCGCAACGACC
+> 1
+TCGCACCAAATATAGGCTGG
+> 2
+AGCATCTGTCAAAAATGGGA
+> 3
+GCCCCTCGGTAATTAAGGCA
+> 4
+ACACCTTAAATTTAGGACAA
+> 5
+AAAAAATAAAATTGGGCTTA
+> 6
+AAAAAAGGCGACTCTGGGGT
+> 7
+TAAACGCACTTAACAAATCG
+> 8
+TCTCACTTTCTTTACTATTA
+> 9
+CTGCGGACATAATCGTGATG
+
diff --git a/common_lib/config.h b/common_lib/config.h
@@ -17,7 +17,7 @@
 const unsigned int MOT_PER_CHUNK = 512*64*64*4;
 
 // CUDA unified memory
-#define UNIFIED_MEMORY_SUPPORT 0
+#define UNIFIED_MEMORY_SUPPORT false
 
 // CUDA threads per block
 #define THREADS_PER_BLOCK (512)

diff --git a/gpu_lib/CMakeLists.txt b/gpu_lib/CMakeLists.txt
@@ -0,0 +1,44 @@
+cmake_minimum_required(VERSION 3.17)
+project(motif_finder_gpu LANGUAGES CUDA)
+
+set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_CUDA_STANDARD 14)
+include_directories(.)
+set(COMMON_NVCC_FLAGS -Xptxas -O3)
+
+set(ARCH "30 35 37 50 52 60 61 70 80 86")
+
+set(SOURCES
+        finder_kernel.cu
+        finder_kernel.cuh
+        gpu_cuda_params.h
+        gpu_info.cpp
+        gpu_info.h
+        gpu_memory.cu
+        gpu_memory.h
+        gpu_range_motif_finder_external.cpp
+        gpu_range_motif_finder_external.h
+        gpu_range_motif_finder_internal.cpp
+        gpu_range_motif_finder_internal.h
+        motif_finder_gpu_external.h
+        motif_finder_gpu_external.cpp
+        motif_finder_gpu_internal.cpp
+        motif_finder_gpu_internal.h
+        sequence_hashes_params.h)
+
+add_library(${PROJECT_NAME} ${SOURCES})
+add_library(lib::gpu ALIAS ${PROJECT_NAME})
+
+set_target_properties(
+        ${PROJECT_NAME}
+        PROPERTIES
+        CUDA_SEPARABLE_COMPILATION ON
+        CUDA_ARCHITECTURES ${ARCH})
+
+
+target_compile_options(${PROJECT_NAME} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:${COMMON_NVCC_FLAGS}>)
+target_link_libraries(${PROJECT_NAME} lib::common)
+target_include_directories(${PROJECT_NAME} PUBLIC ${PROJECT_SOURCE_DIR}/.)
+target_include_directories(${PROJECT_NAME} PRIVATE .)
+
+
diff --git a/gpu_lib/finder_kernel.cu b/gpu_lib/finder_kernel.cu
@@ -0,0 +1,132 @@
+#include "finder_kernel.cuh"
+
+namespace {
+__device__ uint16_t get_occurrence(uint32_t motif_hash,
+                                   uint32_t *seq_hashes,
+                                   uint32_t sequences_count,
+                                   uint32_t *hash_lengths,
+                                   uint32_t *hash_begins,
+                                   uint32_t threads_per_block)
+{
+    // Shared memory size bytes: hash length * sizeof (uint32_t)
+    // Max sequence length: shared memory size / 4bytes (16-25k)
+    extern __shared__ uint32_t sequence[];
+
+    // Max number of sequences: 64k
+    uint16_t result = 0;
+
+    for (uint32_t j = 0; j < sequences_count; j++) {
+        uint32_t seq_offset = hash_begins[j];
+        uint32_t seq_length = hash_lengths[j];
+
+        // Load sequence to fast shared memory
+        for (uint32_t i = threadIdx.x; i < seq_length; i += threads_per_block) {
+            if (i < seq_length) {
+                sequence[i] = seq_hashes[i + seq_offset];
+            }
+        }
+
+        __syncthreads();
+        // Match motif to sequence
+        uint8_t found = 0;
+        for (uint32_t i = 0; (i < length) && !found; i++) {
+            found = (sequence[i] & motif_hash) == sequence[i];
+        }
+        result += found;
+        __syncthreads();
+    }
+    return result;
+}
+
+ __device__ uint32_t motif_index_to_hash(uint32_t motif_index) {
+     uint32_t result = 0;
+     uint32_t mult = 1;
+     for (uint32_t i = MOTIV_LEN - 1; i >= 0; i--) {
+         result += (motif_index % ALPH_SIZE + 1) * mult;
+         mult *= HASH_BASE;
+         motif_index /= ALPH_SIZE;
+     }
+     return result;
+ }
+
+__global__ void motif_finder_kernel_external(
+        uint16_t *weights_out,
+        uint32_t *seq_hashes,
+        uint32_t sequences_count,
+        uint32_t *hash_lengths,
+        uint32_t *hash_begins,
+        uint32_t *mot_hashes,
+        uint32_t mots_to_copy,
+        uint32_t threads_per_block)
+{
+    uint32_t index = blockIdx.x * blockDim.x + threadIdx.x;
+    uint32_t motif_hash = (index <= mots_to_copy) ? mot_hashes[index] : 0;
+    weights_out[index] = get_occurrence(motif_hash, seq_hashes, sequences_count, hash_lengths, hash_begins, threads_per_block);
+}
+
+__global__ void motif_finder_kernel_internal(
+        uint16_t *weights_out,
+        uint32_t *seq_hashes,
+        uint32_t sequences_count,
+        uint32_t *hash_lengths,
+        uint32_t *hash_begins,
+        uint32_t mots_to_copy,
+        uint32_t motif_idx_offset,
+        uint32_t threads_per_block)
+{
+    uint32_t index = blockIdx.x * blockDim.x + threadIdx.x;
+    uint32_t motif_hash = (index <= mots_to_copy) ? motif_index_to_hash(motif_idx_offset + index) : 0;
+    weights_out[index] = get_occurrence(motif_hash, seq_hashes, sequences_count, hash_lengths, hash_begins, threads_per_block);
+}
+} // namespace
+
+void motif_finder_gpu_internal(
+        uint16_t *weights_out,
+        uint32_t *seq_hashes,
+        uint32_t sequences_count,
+        uint32_t *hash_lengths,
+        uint32_t *hash_begins,
+        uint32_t mots_to_copy,
+        uint32_t motif_range_size,
+        uint32_t shared_memory_size,
+        uint32_t motif_idx_offset,
+        uint32_t threads_per_block)
+{
+    motif_finder_kernel_internal<<<motif_range_size/threads_per_block, threads_per_block, shared_memory_size>>>(
+            weights_out,
+            seq_hashes,
+            sequences_count,
+            hash_lengths,
+            hash_begins,
+            mots_to_copy,
+            motif_idx_offset,
+            threads_per_block);
+    cudaDeviceSynchronize();
+}
+
+void motif_finder_gpu_external(
+        uint16_t *weights_out,
+        uint32_t *seq_hashes,
+        uint32_t sequences_count,
+        uint32_t *hash_lengths,
+        uint32_t *hash_begins,
+        uint32_t *motif_hashes,
+        uint32_t mots_to_copy,
+        uint32_t motif_range_size,
+        uint32_t shared_memory_size,
+        uint32_t motif_idx_offset,
+        uint32_t threads_per_block)
+{
+    motif_finder_kernel_external<<<motif_range_size/threads_per_block, threads_per_block, shared_memory_size>>>(
+            weights_out,
+            seq_hashes,
+            sequences_count,
+            hash_lengths,
+            hash_begins,
+            motif_hashes,
+            mots_to_copy,
+            motif_idx_offset,
+            threads_per_block);
+    cudaDeviceSynchronize();
+}
+
diff --git a/gpu_lib/finder_kernel.cuh b/gpu_lib/finder_kernel.cuh
@@ -0,0 +1,31 @@
+#ifndef MOTIF_FINDER_FINDER_KERNEL_CUH
+#define MOTIF_FINDER_FINDER_KERNEL_CUH
+
+// these are wrapper functions for cuda kernels
+
+void motif_finder_gpu_external(
+        uint16_t *weights_out,
+        uint32_t *seq_hashes,
+        uint32_t sequences_count,
+        uint32_t *hash_lengths,
+        uint32_t *hash_begins,
+        uint32_t *motif_hashes,
+        uint32_t mots_to_copy,
+        uint32_t motif_range_size,
+        uint32_t shared_memory_size,
+        uint32_t motif_idx_offset,
+        uint32_t threads_per_block);
+
+void motif_finder_gpu_internal(
+        uint16_t *weights_out,
+        uint32_t *seq_hashes,
+        uint32_t sequences_count,
+        uint32_t *hash_lengths,
+        uint32_t *hash_begins,
+        uint32_t mots_to_copy,
+        uint32_t motif_range_size,
+        uint32_t shared_memory_size,
+        uint32_t motif_idx_offset,
+        uint32_t threads_per_block);
+
+#endif //MOTIF_FINDER_FINDER_KERNEL_CUH
diff --git a/gpu_lib/gpu_cuda_params.h b/gpu_lib/gpu_cuda_params.h
@@ -0,0 +1,11 @@
+#ifndef MOTIF_FINDER_GPU_CUDA_PARAMS_H
+#define MOTIF_FINDER_GPU_CUDA_PARAMS_H
+#include <config.h>
+
+struct GpuCudaParams {
+    bool unified_memory {UNIFIED_MEMORY_SUPPORT};
+    uint32_t motif_range_size {MOT_PER_CHUNK};
+    uint32_t threads_per_block {THREADS_PER_BLOCK};
+};
+
+#endif //MOTIF_FINDER_GPU_CUDA_PARAMS_H
diff --git a/gpu_lib/gpu_info.cpp b/gpu_lib/gpu_info.cpp
@@ -0,0 +1,27 @@
+#include "gpu_info.h"
+
+#include <stdio.h>
+#include <cuda_runtime_api.h>
+#include <cuda.h>
+
+int gpu_count()
+{
+    int nDevices;
+    cudaGetDeviceCount(&nDevices);
+    //printf("Device Count: %d\n", nDevices);
+    for (int i = 0; i < nDevices; i++) {
+        cudaDeviceProp prop;
+        cudaGetDeviceProperties(&prop, i);
+        if (false) {
+            printf("Device Number: %d\n", i);
+            printf("  Device name: %s\n", prop.name);
+            printf("  Memory Clock Rate (KHz): %d\n",
+                   prop.memoryClockRate);
+            printf("  Memory Bus Width (bits): %d\n",
+                   prop.memoryBusWidth);
+            printf("  Peak Memory Bandwidth (GB/s): %f\n\n",
+                   2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
+        }
+    }
+    return nDevices;
+}
diff --git a/gpu_lib/gpu_info.h b/gpu_lib/gpu_info.h
@@ -0,0 +1,7 @@
+#ifndef MOTIF_FINDER_GPU_INFO_H
+#define MOTIF_FINDER_GPU_INFO_H
+
+// get number of available GPUs
+int gpu_count();
+
+#endif //MOTIF_FINDER_GPU_INFO_H