Skip to content

Commit

Permalink
gpu library
Browse files Browse the repository at this point in the history
  • Loading branch information
friofry committed May 7, 2021
1 parent f25b21d commit 5387185
Show file tree
Hide file tree
Showing 22 changed files with 838 additions and 1 deletion.
5 changes: 5 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@ add_subdirectory(cpu_lib)
add_subdirectory(benchmark_cpu)
#add_subdirectory(motif_finder)



add_subdirectory(gpu_lib)
add_subdirectory(benchmark_gpu)

find_package( CUDA 10.0 QUIET )
if(NOT CUDA_FOUND)
message("CUDA 10.0 not found")
Expand Down
13 changes: 13 additions & 0 deletions benchmark_gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
cmake_minimum_required(VERSION 3.17)
project(benchmark_gpu CXX)

set(CMAKE_CXX_STANDARD 14)

add_executable(benchmark_gpu
main.cpp)

target_link_libraries(benchmark_gpu
lib::common
lib::gpu
3rdparty::cxxopts
)
37 changes: 37 additions & 0 deletions benchmark_gpu/main.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#include <sstream>
#include <vector>
#include <cstdint>

#include <config.h>
#include <motif_finder_gpu_internal.h>
#include <motif_finder_gpu_external.h>
#include <fst_reader.h>

using namespace std;

void run()
{
GpuInternalParams params;
params.sequences = read_fasta("test.fst");
params.search_complementary = false;
params.gpu_count = 1;
params.unified_memory = false;
params.motif_range_size = MOT_PER_CHUNK;
params.threads_per_block = THREADS_PER_BLOCK;

std::vector<uint16_t> res;
find_motifs_internal_gpu(res, params);


std::vector<uint32_t> motif_hashes;
GpuExternalParams external_params(motif_hashes);
find_motifs_external_gpu(res, params);
}

int main(int argc, char **argv)
{
argc = 0;
argv = 0;
run();
return 0;
}
21 changes: 21 additions & 0 deletions benchmark_gpu/test.fst
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
> 0
CGAATAATGGCGCAACGACC
> 1
TCGCACCAAATATAGGCTGG
> 2
AGCATCTGTCAAAAATGGGA
> 3
GCCCCTCGGTAATTAAGGCA
> 4
ACACCTTAAATTTAGGACAA
> 5
AAAAAATAAAATTGGGCTTA
> 6
AAAAAAGGCGACTCTGGGGT
> 7
TAAACGCACTTAACAAATCG
> 8
TCTCACTTTCTTTACTATTA
> 9
CTGCGGACATAATCGTGATG

2 changes: 1 addition & 1 deletion common_lib/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
const unsigned int MOT_PER_CHUNK = 512*64*64*4;

// CUDA unified memory
#define UNIFIED_MEMORY_SUPPORT 0
#define UNIFIED_MEMORY_SUPPORT false

// CUDA threads per block
#define THREADS_PER_BLOCK (512)
Expand Down
44 changes: 44 additions & 0 deletions gpu_lib/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
cmake_minimum_required(VERSION 3.17)
project(motif_finder_gpu LANGUAGES CUDA)

set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CUDA_STANDARD 14)
include_directories(.)
set(COMMON_NVCC_FLAGS -Xptxas -O3)

set(ARCH "30 35 37 50 52 60 61 70 80 86")

set(SOURCES
finder_kernel.cu
finder_kernel.cuh
gpu_cuda_params.h
gpu_info.cpp
gpu_info.h
gpu_memory.cu
gpu_memory.h
gpu_range_motif_finder_external.cpp
gpu_range_motif_finder_external.h
gpu_range_motif_finder_internal.cpp
gpu_range_motif_finder_internal.h
motif_finder_gpu_external.h
motif_finder_gpu_external.cpp
motif_finder_gpu_internal.cpp
motif_finder_gpu_internal.h
sequence_hashes_params.h)

add_library(${PROJECT_NAME} ${SOURCES})
add_library(lib::gpu ALIAS ${PROJECT_NAME})

set_target_properties(
${PROJECT_NAME}
PROPERTIES
CUDA_SEPARABLE_COMPILATION ON
CUDA_ARCHITECTURES ${ARCH})


target_compile_options(${PROJECT_NAME} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:${COMMON_NVCC_FLAGS}>)
target_link_libraries(${PROJECT_NAME} lib::common)
target_include_directories(${PROJECT_NAME} PUBLIC ${PROJECT_SOURCE_DIR}/.)
target_include_directories(${PROJECT_NAME} PRIVATE .)


132 changes: 132 additions & 0 deletions gpu_lib/finder_kernel.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
#include "finder_kernel.cuh"

namespace {
__device__ uint16_t get_occurrence(uint32_t motif_hash,
uint32_t *seq_hashes,
uint32_t sequences_count,
uint32_t *hash_lengths,
uint32_t *hash_begins,
uint32_t threads_per_block)
{
// Shared memory size bytes: hash length * sizeof (uint32_t)
// Max sequence length: shared memory size / 4bytes (16-25k)
extern __shared__ uint32_t sequence[];

// Max number of sequences: 64k
uint16_t result = 0;

for (uint32_t j = 0; j < sequences_count; j++) {
uint32_t seq_offset = hash_begins[j];
uint32_t seq_length = hash_lengths[j];

// Load sequence to fast shared memory
for (uint32_t i = threadIdx.x; i < seq_length; i += threads_per_block) {
if (i < seq_length) {
sequence[i] = seq_hashes[i + seq_offset];
}
}

__syncthreads();
// Match motif to sequence
uint8_t found = 0;
for (uint32_t i = 0; (i < length) && !found; i++) {
found = (sequence[i] & motif_hash) == sequence[i];
}
result += found;
__syncthreads();
}
return result;
}

__device__ uint32_t motif_index_to_hash(uint32_t motif_index) {
uint32_t result = 0;
uint32_t mult = 1;
for (uint32_t i = MOTIV_LEN - 1; i >= 0; i--) {
result += (motif_index % ALPH_SIZE + 1) * mult;
mult *= HASH_BASE;
motif_index /= ALPH_SIZE;
}
return result;
}

__global__ void motif_finder_kernel_external(
uint16_t *weights_out,
uint32_t *seq_hashes,
uint32_t sequences_count,
uint32_t *hash_lengths,
uint32_t *hash_begins,
uint32_t *mot_hashes,
uint32_t mots_to_copy,
uint32_t threads_per_block)
{
uint32_t index = blockIdx.x * blockDim.x + threadIdx.x;
uint32_t motif_hash = (index <= mots_to_copy) ? mot_hashes[index] : 0;
weights_out[index] = get_occurrence(motif_hash, seq_hashes, sequences_count, hash_lengths, hash_begins, threads_per_block);
}

__global__ void motif_finder_kernel_internal(
uint16_t *weights_out,
uint32_t *seq_hashes,
uint32_t sequences_count,
uint32_t *hash_lengths,
uint32_t *hash_begins,
uint32_t mots_to_copy,
uint32_t motif_idx_offset,
uint32_t threads_per_block)
{
uint32_t index = blockIdx.x * blockDim.x + threadIdx.x;
uint32_t motif_hash = (index <= mots_to_copy) ? motif_index_to_hash(motif_idx_offset + index) : 0;
weights_out[index] = get_occurrence(motif_hash, seq_hashes, sequences_count, hash_lengths, hash_begins, threads_per_block);
}
} // namespace

void motif_finder_gpu_internal(
uint16_t *weights_out,
uint32_t *seq_hashes,
uint32_t sequences_count,
uint32_t *hash_lengths,
uint32_t *hash_begins,
uint32_t mots_to_copy,
uint32_t motif_range_size,
uint32_t shared_memory_size,
uint32_t motif_idx_offset,
uint32_t threads_per_block)
{
motif_finder_kernel_internal<<<motif_range_size/threads_per_block, threads_per_block, shared_memory_size>>>(
weights_out,
seq_hashes,
sequences_count,
hash_lengths,
hash_begins,
mots_to_copy,
motif_idx_offset,
threads_per_block);
cudaDeviceSynchronize();
}

void motif_finder_gpu_external(
uint16_t *weights_out,
uint32_t *seq_hashes,
uint32_t sequences_count,
uint32_t *hash_lengths,
uint32_t *hash_begins,
uint32_t *motif_hashes,
uint32_t mots_to_copy,
uint32_t motif_range_size,
uint32_t shared_memory_size,
uint32_t motif_idx_offset,
uint32_t threads_per_block)
{
motif_finder_kernel_external<<<motif_range_size/threads_per_block, threads_per_block, shared_memory_size>>>(
weights_out,
seq_hashes,
sequences_count,
hash_lengths,
hash_begins,
motif_hashes,
mots_to_copy,
motif_idx_offset,
threads_per_block);
cudaDeviceSynchronize();
}

31 changes: 31 additions & 0 deletions gpu_lib/finder_kernel.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#ifndef MOTIF_FINDER_FINDER_KERNEL_CUH
#define MOTIF_FINDER_FINDER_KERNEL_CUH

// these are wrapper functions for cuda kernels

void motif_finder_gpu_external(
uint16_t *weights_out,
uint32_t *seq_hashes,
uint32_t sequences_count,
uint32_t *hash_lengths,
uint32_t *hash_begins,
uint32_t *motif_hashes,
uint32_t mots_to_copy,
uint32_t motif_range_size,
uint32_t shared_memory_size,
uint32_t motif_idx_offset,
uint32_t threads_per_block);

void motif_finder_gpu_internal(
uint16_t *weights_out,
uint32_t *seq_hashes,
uint32_t sequences_count,
uint32_t *hash_lengths,
uint32_t *hash_begins,
uint32_t mots_to_copy,
uint32_t motif_range_size,
uint32_t shared_memory_size,
uint32_t motif_idx_offset,
uint32_t threads_per_block);

#endif //MOTIF_FINDER_FINDER_KERNEL_CUH
11 changes: 11 additions & 0 deletions gpu_lib/gpu_cuda_params.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#ifndef MOTIF_FINDER_GPU_CUDA_PARAMS_H
#define MOTIF_FINDER_GPU_CUDA_PARAMS_H
#include <config.h>

struct GpuCudaParams {
bool unified_memory {UNIFIED_MEMORY_SUPPORT};
uint32_t motif_range_size {MOT_PER_CHUNK};
uint32_t threads_per_block {THREADS_PER_BLOCK};
};

#endif //MOTIF_FINDER_GPU_CUDA_PARAMS_H
27 changes: 27 additions & 0 deletions gpu_lib/gpu_info.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#include "gpu_info.h"

#include <stdio.h>
#include <cuda_runtime_api.h>
#include <cuda.h>

int gpu_count()
{
int nDevices;
cudaGetDeviceCount(&nDevices);
//printf("Device Count: %d\n", nDevices);
for (int i = 0; i < nDevices; i++) {
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, i);
if (false) {
printf("Device Number: %d\n", i);
printf(" Device name: %s\n", prop.name);
printf(" Memory Clock Rate (KHz): %d\n",
prop.memoryClockRate);
printf(" Memory Bus Width (bits): %d\n",
prop.memoryBusWidth);
printf(" Peak Memory Bandwidth (GB/s): %f\n\n",
2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
}
}
return nDevices;
}
7 changes: 7 additions & 0 deletions gpu_lib/gpu_info.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#ifndef MOTIF_FINDER_GPU_INFO_H
#define MOTIF_FINDER_GPU_INFO_H

// get number of available GPUs
int gpu_count();

#endif //MOTIF_FINDER_GPU_INFO_H
Loading

0 comments on commit 5387185

Please sign in to comment.