Fix gpu benchmark

Author: friofry · May 11, 2021
Commit: 564ad36 (1 parent: cb1c333)


Showing 17 changed files with 168 additions and 140 deletions.
diff --git a/3rdparty/json/json.hpp b/3rdparty/json/json.hpp
@@ -2387,7 +2387,7 @@ using is_detected_convertible =
 
 #include <cstdint> // int64_t, uint64_t
 #include <map> // map
-#include <memory> // allocator
+#include <memory> // allocator
 #include <string> // string
 #include <vector> // vector
 
@@ -14640,7 +14640,7 @@ default; will be used in @ref number_integer_t)
 `uint64_t` by default; will be used in @ref number_unsigned_t)
 @tparam NumberFloatType type for JSON floating-point numbers (`double` by
 default; will be used in @ref number_float_t)
-@tparam AllocatorType type of the allocator to use (`std::allocator` by
+@tparam AllocatorType type of the allocator to use (`std::allocator` by
 default)
 @tparam JSONSerializer the serializer to resolve internal calls to `to_json()`
 and `from_json()` (@ref adl_serializer by default)
@@ -14808,7 +14808,7 @@ class basic_json
     /// a type to represent container sizes
     using size_type = std::size_t;
 
-    /// the allocator type
+    /// the allocator type
     using allocator_type = AllocatorType<basic_json>;
 
     /// the type of an element pointer
@@ -14829,7 +14829,7 @@ class basic_json
 
 
     /*!
-    @brief returns the allocator associated with the container
+    @brief returns the allocator associated with the container
     */
     static allocator_type get_allocator()
     {
@@ -14952,21 +14952,21 @@ class basic_json
     @tparam StringType the type of the keys or names (e.g., `std::string`).
     The comparison function `std::less<StringType>` is used to order elements
     inside the container.
-    @tparam AllocatorType the allocator to use for objects (e.g.,
-    `std::allocator`)
+    @tparam AllocatorType the allocator to use for objects (e.g.,
+    `std::allocator`)
 
     #### Default type
 
     With the default values for @a ObjectType (`std::map`), @a StringType
-    (`std::string`), and @a AllocatorType (`std::allocator`), the default
+    (`std::string`), and @a AllocatorType (`std::allocator`), the default
     value for @a object_t is:
 
     @code {.cpp}
     std::map<
       std::string, // key_type
       basic_json, // value_type
       std::less<std::string>, // key_compare
-      std::allocator<std::pair<const std::string, basic_json>> // allocator_type
+      std::allocator<std::pair<const std::string, basic_json>> // allocator_type
     >
     @endcode
 
@@ -15036,17 +15036,17 @@ class basic_json
 
     @tparam ArrayType  container type to store arrays (e.g., `std::vector` or
     `std::list`)
-    @tparam AllocatorType allocator to use for arrays (e.g., `std::allocator`)
+    @tparam AllocatorType allocator to use for arrays (e.g., `std::allocator`)
 
     #### Default type
 
     With the default values for @a ArrayType (`std::vector`) and @a
-    AllocatorType (`std::allocator`), the default value for @a array_t is:
+    AllocatorType (`std::allocator`), the default value for @a array_t is:
 
     @code {.cpp}
     std::vector<
       basic_json, // value_type
-      std::allocator<basic_json> // allocator_type
+      std::allocator<basic_json> // allocator_type
     >
     @endcode
 

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -32,5 +32,4 @@ if(NOT CUDA_FOUND)
 else()
     add_subdirectory(gpu_lib)
     add_subdirectory(benchmark_gpu)
-    add_subdirectory(motif_finder_gpu)
 endif()
diff --git a/benchmark_common_lib/sequence_generator.cpp b/benchmark_common_lib/sequence_generator.cpp
@@ -1,4 +1,5 @@
 #include "sequence_generator.h"
+#include <stdexcept>
 
 #include <letter_conversions.h>
 #include <fst_reader.h>

diff --git a/benchmark_gpu/CMakeLists.txt b/benchmark_gpu/CMakeLists.txt
@@ -1,5 +1,5 @@
 cmake_minimum_required(VERSION 3.17)
-project(benchmark_gpu CXX)
+project(benchmark_gpu CUDA CXX)
 
 set(CMAKE_CXX_STANDARD 14)
 

diff --git a/benchmark_gpu/cmd_parser.cpp b/benchmark_gpu/cmd_parser.cpp
@@ -70,7 +70,7 @@ TestParams parse_gpu_input_args(int argc, char **argv)
     res.seq_lengths = parse_int_vec(r, "lengths", {10});
     res.gpu_counts = parse_int_vec(r, "gpu_counts", {1});
     res.complementary = parse_int_vec(r, "complementary", {0});
-    res.algorithms = parse_string_vec(r, "algorithms", {"gpu_naive_full"});
+    res.algorithms = parse_string_vec(r, "algorithms", {"internal"});
     res.sequences_file = parse_string_param(r, "sequencesfile", "");
     res.threads_per_block = parse_int_vec(r, "threads_per_block", {THREADS_PER_BLOCK});
     res.chunk_sizes = parse_int_vec(r, "chunk_sizes", {MOT_PER_CHUNK});
@@ -81,7 +81,7 @@ TestParams parse_gpu_input_args(int argc, char **argv)
     return res;
 }
 
-std::string gpu_result_json(const RunParams &params, float result, unsigned int motifs_count)
+std::string gpu_result_json(const GpuRunParams &params, float result, unsigned int motifs_count)
 {
     json o;
     o["count"] = params.count;

diff --git a/benchmark_gpu/cmd_parser.h b/benchmark_gpu/cmd_parser.h
@@ -17,7 +17,7 @@ struct TestParams {
     std::vector<int> unified_memory;
 };
 
-struct RunParams {
+struct GpuRunParams {
     int count {0};
     int length{0};
     int gpus{0};
@@ -31,7 +31,7 @@ struct RunParams {
 };
 
 TestParams parse_gpu_input_args(int argc, char **argv);
-std::string gpu_result_json(const RunParams &params, float result, unsigned int motifs_count);
+std::string gpu_result_json(const GpuRunParams &params, float result, unsigned int motifs_count);
 std::string combine_results_json(const std::vector<std::string> &results);
 
 #endif //MOTIF_FINDER_CMD_PARSER_H
diff --git a/benchmark_gpu/main.cpp b/benchmark_gpu/main.cpp
@@ -33,6 +33,12 @@ std::string run(const vector<uint32_t> &motif_hashes, const SequenceHashes &sequ
     Timer t;
     t.silence();
 
+    GpuCudaParams cuda_params;
+    cuda_params.gpu_count = params.gpus;
+    cuda_params.unified_memory = params.unified_memory;
+    cuda_params.threads_per_block = params.threads_per_block;
+    cuda_params.motif_range_size = params.motif_chunk_size;
+
     if (params.algorithm == "internal") {
         std::vector<uint16_t> occurrences;
         internal_gpu_algorithm(sequence_hashes, occurrences, cuda_params);
@@ -104,7 +110,7 @@ int main(int argc, char **argv)
                                 for (int threads_per_block : params.threads_per_block) {
                                     for (int chunk_size : params.chunk_sizes) {
                                         for (bool unified_mem : params.unified_memory) {
-                                            RunParams cur_params;
+                                            GpuRunParams cur_params;
                                             cur_params.count = count;
                                             cur_params.length = length;
                                             cur_params.gpus = gpu_count;

diff --git a/conversion_lib/hash_conversions_x4.h b/conversion_lib/hash_conversions_x4.h
@@ -2,6 +2,7 @@
 #define MOTIF_FINDER_HASH_CONVERSIONS_X4_H
 
 #include <vector>
+#include <cstdint>
 
 /// Precompute the hashes for all possible 4-letter indices
 std::vector<uint32_t> calc_hash_x4();

diff --git a/gpu_lib/CMakeLists.txt b/gpu_lib/CMakeLists.txt
@@ -33,7 +33,8 @@ set_target_properties(
 
 target_compile_options(${PROJECT_NAME} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:${COMMON_NVCC_FLAGS}>)
 target_link_libraries(${PROJECT_NAME}
-        lib::common)
+        lib::common
+        lib::conversion)
 
 target_include_directories(${PROJECT_NAME} PUBLIC ${PROJECT_SOURCE_DIR}/.)
 target_include_directories(${PROJECT_NAME} PRIVATE .)

diff --git a/gpu_lib/external_gpu_algorithm.cpp b/gpu_lib/external_gpu_algorithm.cpp
@@ -1,5 +1,7 @@
 #include "external_gpu_algorithm.h"
 
+#include <thread>
+
 #include <config.h>
 #include <hash_conversions.h>
 #include <run_parallel.h>
@@ -15,18 +17,19 @@ void external_gpu_algorithm(const std::vector<uint32_t> &motif_hashes,
                             const GpuCudaParams &params)
 {
     out_motif_weights.resize(TOTAL_MOT, 0);
-    threads = (threads > 0) ? threads : std::thread::hardware_concurrency();
+    uint32_t threads = (params.gpu_count > 0) ? params.gpu_count : std::thread::hardware_concurrency();
 
     SafeCounter motifs_counter(TOTAL_MOT);
 
     run_parallel(threads, [&](uint32_t thread_id) {
-        GpuInternalMemory gpu_memory(params, sequence_hashes);
+        GpuExternalMemory gpu_memory(params, sequence_hashes);
         while (true) {
             const auto range = motifs_counter.get_and_increment_range_info(params.motif_range_size);
             if (range.count() == 0) {
                 break;
             }
-            motif_finder_gpu_internal(gpu_memory, params, out_motif_weights, range.start, range.count(), thread_id);
+            motif_finder_gpu_external(
+                motif_hashes, gpu_memory, params, out_motif_weights, range.start, range.count(), thread_id);
         }
     });
 }
diff --git a/gpu_lib/external_gpu_algorithm.h b/gpu_lib/external_gpu_algorithm.h
@@ -1,5 +1,3 @@
-#endif //MOTIF_FINDER_EXTERNAL_CPU_ALGORITHM_H
-
 #ifndef MOTIF_FINDER_MOTIF_FINDER_GPU_EXTERNAL_H
 #define MOTIF_FINDER_MOTIF_FINDER_GPU_EXTERNAL_H