From e95c15842dfb2577cc519352024cd2163a35e899 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Tue, 31 May 2016 20:37:17 +0200 Subject: [PATCH 1/9] Unit-tests are now based on string-kernels instead of external-file-kernels to make it possible to run the unit test executables anywhere --- CHANGELOG | 3 +++ test/tuner.cc | 25 +++++++++++++++++++++---- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 8395f9c..d9a3f92 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,7 @@ +Development version (next release) +- Made it possible to run the unit-tests independently of the provided OpenCL kernel samples + Version 2.3.1 - Fixed a bug where an output buffer could not be used as input at the same time - Fixed computing the validation error for half-precision fp16 data-types diff --git a/test/tuner.cc b/test/tuner.cc index 0f09db6..4bd09d3 100644 --- a/test/tuner.cc +++ b/test/tuner.cc @@ -20,6 +20,23 @@ const size_t kDeviceID = 0; const size_t kInvalidPlatformID = 99; const size_t kInvalidDeviceID = 99; +// Example kernels +const auto kernel1 = R"( +__kernel void small_kernel(__global float* array) { + array[get_global_id(0)] = result; +})"; +const auto kernel2 = R"( +__kernel void matvec_reference(const int kSizeM, const int kSizeN, + const __global float* mat_a, + const __global float* vec_x, + __global float* vec_y) { + float result = 0.0f; + for (int j=0; j> kExampleKernels = { - {"../samples/multiple_kernels/multiple_kernels_reference.opencl","matvec_reference"}, - {"../samples/multiple_kernels/multiple_kernels_unroll.opencl","matvec_unroll"} + {kernel1,"small_kernel"}, + {kernel2,"matvec_reference"} }; const auto kExampleParameter = std::string{"TEST_PARAM"}; const auto kExampleParameterValues = std::initializer_list{6, 9, 1003}; @@ -55,8 +72,8 @@ SCENARIO("kernels can be added", "[Tuner]") { for (; counter Date: Thu, 16 Jun 2016 20:18:49 +0200 Subject: [PATCH 2/9] Added a VERBOSE option to CMake to get additional diagnostic messages --- CHANGELOG | 1 + CMakeLists.txt | 7 +++++++ include/internal/tuner_impl.h | 1 + src/tuner_impl.cc | 36 ++++++++++++++++++++++++----------- 4 files changed, 34 insertions(+), 11 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index d9a3f92..f064355 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,7 @@ Development version (next release) - Made it possible to run the unit-tests independently of the provided OpenCL kernel samples +- Added an option to compile in verbose mode for additional diagnostic messages (-DVERBOSE=ON) Version 2.3.1 - Fixed a bug where an output buffer could not be used as input at the same time diff --git a/CMakeLists.txt b/CMakeLists.txt index dff5b2b..159a56a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,6 +48,13 @@ else() message("-- Building with CUDA") endif() +# Compile in verbose mode with additional diagnostic messages +option(VERBOSE "Compile in verbose mode for additional diagnostic messages" OFF) +if(VERBOSE) + message("-- Building in verbose mode") + add_definitions(-DVERBOSE) +endif() + # ================================================================================================== # RPATH settings diff --git a/include/internal/tuner_impl.h b/include/internal/tuner_impl.h index 1267d11..ffb1b69 100644 --- a/include/internal/tuner_impl.h +++ b/include/internal/tuner_impl.h @@ -81,6 +81,7 @@ class TunerImpl { static const std::string kMessageHead; static const std::string kMessageRun; static const std::string kMessageInfo; + static const std::string kMessageVerbose; static const std::string kMessageOK; static const std::string kMessageWarning; static const std::string kMessageFailure; diff --git a/src/tuner_impl.cc b/src/tuner_impl.cc index 76ee05c..5cb7908 100644 --- a/src/tuner_impl.cc +++ b/src/tuner_impl.cc @@ -53,6 +53,7 @@ const std::string TunerImpl::kMessageFull = "\x1b[32m[==========]\x1b[0m"; const std::string TunerImpl::kMessageHead = "\x1b[32m[----------]\x1b[0m"; const std::string TunerImpl::kMessageRun = "\x1b[32m[ RUN ]\x1b[0m"; const std::string TunerImpl::kMessageInfo = "\x1b[32m[ INFO ]\x1b[0m"; +const std::string TunerImpl::kMessageVerbose = "\x1b[39m[ VERBOSE ]\x1b[0m"; const std::string TunerImpl::kMessageOK = "\x1b[32m[ OK ]\x1b[0m"; const std::string TunerImpl::kMessageWarning = "\x1b[33m[ WARNING ]\x1b[0m"; const std::string TunerImpl::kMessageFailure = "\x1b[31m[ FAILED ]\x1b[0m"; @@ -162,6 +163,9 @@ void TunerImpl::Tune() { } else { // Computes the permutations of all parameters and pass them to a (smart) search algorithm + #ifdef VERBOSE + fprintf(stdout, "%s Computing the permutations of all parameters\n", kMessageVerbose.c_str()); + #endif kernel.SetConfigurations(); // Creates the selected search algorithm @@ -185,6 +189,10 @@ void TunerImpl::Tune() { // Iterates over all possible configurations (the permutations of the tuning parameters) for (auto p=size_t{0}; pNumConfigurations(); ++p) { + #ifdef VERBOSE + fprintf(stdout, "%s Exploring configuration (%zu out of %zu)\n", kMessageVerbose.c_str(), + p + 1, search->NumConfigurations()); + #endif auto permutation = search->GetConfiguration(); // Adds the parameters to the source-code string as defines @@ -238,21 +246,14 @@ TunerImpl::TunerResult TunerImpl::RunKernel(const std::string &source, const Ker const size_t configuration_id, const size_t num_configurations) { - // Note: the following code is disabled because of GCC 4.8.0 compatibility - auto processed_source = source; - /* - // Removes the use of C++11 string literals (if any) from the kernel source code - auto string_literal_start = std::regex{"R\"\\("}; - auto string_literal_end = std::regex{"\\)\";"}; - auto processed_source = std::regex_replace(source, string_literal_start, ""); - processed_source = std::regex_replace(processed_source, string_literal_end, ""); - */ - // In case of an exception, skip this run try { // Compiles the kernel and prints the compiler errors/warnings - auto program = Program(context_, processed_source); + #ifdef VERBOSE + fprintf(stdout, "%s Starting compilation\n", kMessageVerbose.c_str()); + #endif + auto program = Program(context_, source); auto options = std::vector{}; auto build_status = program.Build(device_, options); if (build_status == BuildStatus::kError) { @@ -263,6 +264,9 @@ TunerImpl::TunerResult TunerImpl::RunKernel(const std::string &source, const Ker if (build_status == BuildStatus::kInvalid) { throw std::runtime_error("Invalid program binary"); } + #ifdef VERBOSE + fprintf(stdout, "%s Finished compilation\n", kMessageVerbose.c_str()); + #endif // Clears all previous copies of output buffer(s) for (auto &mem_info: arguments_output_copy_) { @@ -275,6 +279,9 @@ TunerImpl::TunerResult TunerImpl::RunKernel(const std::string &source, const Ker arguments_output_copy_.clear(); // Creates a copy of the output buffer(s) + #ifdef VERBOSE + fprintf(stdout, "%s Creating a copy of the output buffer\n", kMessageVerbose.c_str()); + #endif for (auto &output: arguments_output_) { switch (output.type) { case MemType::kShort: arguments_output_copy_.push_back(CopyOutputBuffer(output)); break; @@ -290,6 +297,9 @@ TunerImpl::TunerResult TunerImpl::RunKernel(const std::string &source, const Ker } // Sets the kernel and its arguments + #ifdef VERBOSE + fprintf(stdout, "%s Setting kernel arguments\n", kMessageVerbose.c_str()); + #endif auto tune_kernel = Kernel(program, kernel.name()); for (auto &i: arguments_input_) { tune_kernel.SetArgument(i.index, i.buffer); } for (auto &i: arguments_output_copy_) { tune_kernel.SetArgument(i.index, i.buffer); } @@ -317,6 +327,10 @@ TunerImpl::TunerResult TunerImpl::RunKernel(const std::string &source, const Ker fprintf(stdout, "%s Running %s\n", kMessageRun.c_str(), kernel.name().c_str()); auto events = std::vector(kNumRuns); for (auto t=size_t{0}; t Date: Thu, 16 Jun 2016 20:20:44 +0200 Subject: [PATCH 3/9] Fixed the RPATH settings for OSX --- CHANGELOG | 1 + CMakeLists.txt | 5 +---- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index f064355..1d18f46 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,7 @@ Development version (next release) - Made it possible to run the unit-tests independently of the provided OpenCL kernel samples - Added an option to compile in verbose mode for additional diagnostic messages (-DVERBOSE=ON) +- Fixed the RPATH settings on OSX Version 2.3.1 - Fixed a bug where an output buffer could not be used as input at the same time diff --git a/CMakeLists.txt b/CMakeLists.txt index 159a56a..8b03acf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -58,10 +58,7 @@ endif() # ================================================================================================== # RPATH settings -set(CMAKE_SKIP_BUILD_RPATH false) # Use, i.e. don't skip the full RPATH for the build tree -set(CMAKE_BUILD_WITH_INSTALL_RPATH false) # When building, don't use the install RPATH already -set(CMAKE_INSTALL_RPATH "") # The RPATH to be used when installing -set(CMAKE_INSTALL_RPATH_USE_LINK_PATH false) # Don't add the automatically determined parts +set(CMAKE_MACOSX_RPATH 1) # ================================================================================================== From fca2ad1ccd97691f096d1e17b9de544c75e8925e Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 29 Jun 2016 17:03:42 +0200 Subject: [PATCH 4/9] Added Appveyor CI and added OS X compilation for Travis --- .appveyor.yml | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++ .gitignore | 3 ++- .travis.yml | 8 +++++-- CHANGELOG | 1 + README.md | 5 +++- 5 files changed, 78 insertions(+), 4 deletions(-) create mode 100644 .appveyor.yml diff --git a/.appveyor.yml b/.appveyor.yml new file mode 100644 index 0000000..a9142b1 --- /dev/null +++ b/.appveyor.yml @@ -0,0 +1,65 @@ +environment: + global: + CLTUNE_ROOT: "%APPVEYOR_BUILD_FOLDER%\\bin\\cltune" + OPENCL_REGISTRY: "https://www.khronos.org/registry/cl" + OPENCL_ROOT: "%APPVEYOR_BUILD_FOLDER%\\bin\\opencl" + +platform: + - x64 + - x86 + +configuration: + - Release + +init: + - cmake --version + - C:\"Program Files (x86)"\"Microsoft Visual Studio 14.0"\VC\vcvarsall.bat %PLATFORM% + +# Creates an OpenCL library to link against. Taken from clMathLibraries/clBLAS +install: + - ps: mkdir $env:OPENCL_ROOT + - ps: pushd $env:OPENCL_ROOT + - ps: $opencl_registry = $env:OPENCL_REGISTRY + # This downloads the source to the Khronos ICD library + - git clone --depth 1 https://github.com/KhronosGroup/OpenCL-ICD-Loader.git + - ps: mv ./OpenCL-ICD-Loader/* . + # This downloads all the opencl header files + # The cmake build files expect a directory called inc + - ps: mkdir inc/CL + - git clone --depth 1 https://github.com/KhronosGroup/OpenCL-Headers.git inc/CL + - ps: wget $opencl_registry/api/2.1/cl.hpp -OutFile inc/CL/cl.hpp + # - ps: dir; if( $lastexitcode -eq 0 ){ dir include/CL } else { Write-Output boom } + # Create the static import lib in a directory called lib, so findopencl() will find it + - ps: mkdir lib + - ps: pushd lib + - cmake -G "NMake Makefiles" .. + - nmake + - ps: popd + # Switch to OpenCL 1.2 headers + - ps: pushd inc/CL + - git fetch origin opencl12:opencl12 + - git checkout opencl12 + - ps: popd + # Rename the inc directory to include, so FindOpencl() will find it + - ps: ren inc include + - ps: popd + +before_build: + - ps: mkdir $env:CLTUNE_ROOT + - ps: pushd $env:CLTUNE_ROOT + - ps: mkdir install_dir + - cmake -G "NMake Makefiles" -DCMAKE_INSTALL_PREFIX=install_dir -DCMAKE_BUILD_TYPE=%CONFIGURATION% -DSAMPLES=ON -DTESTS=ON %APPVEYOR_BUILD_FOLDER% + +build_script: + - nmake + - nmake install + +after_build: + - ps: pushd $env:CLTUNE_ROOT + - 7z a CLTune-Windows-%PLATFORM%.zip .\install_dir\* + - ps: mv CLTune-Windows-%PLATFORM%.zip $env:APPVEYOR_BUILD_FOLDER + +artifacts: + - path: '*.zip' + name: release + type: zip diff --git a/.gitignore b/.gitignore index 567609b..53f935f 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ -build/ +build +.* diff --git a/.travis.yml b/.travis.yml index b6f3a42..fd31c0e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,6 +2,10 @@ language: cpp sudo: required dist: trusty +os: + - linux + - osx + compiler: - gcc - clang @@ -16,7 +20,7 @@ addons: env: global: - - CLTUNE_ROOT=${TRAVIS_BUILD_DIR}/bin/make/release + - CLTUNE_ROOT=${TRAVIS_BUILD_DIR}/bin/cltune - OPENCL_REGISTRY=https://www.khronos.org/registry/cl - OPENCL_ROOT=${TRAVIS_BUILD_DIR}/bin/opencl @@ -55,7 +59,7 @@ install: before_script: - mkdir -p ${CLTUNE_ROOT} - pushd ${CLTUNE_ROOT} - - cmake -DSAMPLES=ON -DTESTS=ON -DOPENCL_ROOT=${OPENCL_ROOT} ${TRAVIS_BUILD_DIR} + - cmake -DOPENCL_ROOT=${OPENCL_ROOT} -DSAMPLES=ON -DTESTS=ON ${TRAVIS_BUILD_DIR} script: - make diff --git a/CHANGELOG b/CHANGELOG index 1d18f46..aa780d4 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -3,6 +3,7 @@ Development version (next release) - Made it possible to run the unit-tests independently of the provided OpenCL kernel samples - Added an option to compile in verbose mode for additional diagnostic messages (-DVERBOSE=ON) - Fixed the RPATH settings on OSX +- Added Appveyor continuous integration and increased coverage of the Travis builds Version 2.3.1 - Fixed a bug where an output buffer could not be used as input at the same time diff --git a/README.md b/README.md index 14ff2f4..0445696 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,10 @@ CLTune: Automatic OpenCL kernel tuning ================ -[![Build Status](https://travis-ci.org/CNugteren/CLTune.svg?branch=master)](https://travis-ci.org/CNugteren/CLTune) +| | master | development | +|-----|-----|-----| +| Linux/OS X | [![Build Status](https://travis-ci.org/CNugteren/CLTune.svg?branch=master)](https://travis-ci.org/CNugteren/CLTune/branches) | [![Build Status](https://travis-ci.org/CNugteren/CLTune.svg?branch=development)](https://travis-ci.org/CNugteren/CLTune/branches) | +| Windows | [![Build Status](https://ci.appveyor.com/api/projects/status/github/cnugteren/cltune?branch=master&svg=true)](https://ci.appveyor.com/project/CNugteren/cltune) | [![Build Status](https://ci.appveyor.com/api/projects/status/github/cnugteren/cltune?branch=development&svg=true)](https://ci.appveyor.com/project/CNugteren/cltune) | CLTune is a C++ library which can be used to automatically tune your OpenCL and CUDA kernels. The only thing you'll need to provide is a tuneable kernel and a list of allowed parameters and values. From 609ea4cf78ad9ddb2788f692f5802ae52e17a441 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 29 Jun 2016 17:49:52 +0200 Subject: [PATCH 5/9] Removed building of tests for AppVeyor CI --- .appveyor.yml | 2 +- .gitignore | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index a9142b1..3e26ba8 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -48,7 +48,7 @@ before_build: - ps: mkdir $env:CLTUNE_ROOT - ps: pushd $env:CLTUNE_ROOT - ps: mkdir install_dir - - cmake -G "NMake Makefiles" -DCMAKE_INSTALL_PREFIX=install_dir -DCMAKE_BUILD_TYPE=%CONFIGURATION% -DSAMPLES=ON -DTESTS=ON %APPVEYOR_BUILD_FOLDER% + - cmake -G "NMake Makefiles" -DCMAKE_INSTALL_PREFIX=install_dir -DCMAKE_BUILD_TYPE=%CONFIGURATION% -DSAMPLES=ON %APPVEYOR_BUILD_FOLDER% build_script: - nmake diff --git a/.gitignore b/.gitignore index 53f935f..2502bdc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,2 @@ build -.* +.* \ No newline at end of file From 6177c14d0828a5093e93640cbb6cb4fa893bb1fe Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 29 Jun 2016 17:50:12 +0200 Subject: [PATCH 6/9] Updated to version 6.0 of the CLCudaAPI header --- CHANGELOG | 1 + include/internal/clpp11.h | 51 ++++++++++++++++++++++++++++++--------- include/internal/cupp11.h | 34 ++++++++++++++++++++------ src/tuner_impl.cc | 2 +- 4 files changed, 68 insertions(+), 20 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index aa780d4..62d306f 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,7 @@ Development version (next release) - Made it possible to run the unit-tests independently of the provided OpenCL kernel samples - Added an option to compile in verbose mode for additional diagnostic messages (-DVERBOSE=ON) +- Now using version 6.0 of the CLCudaAPI header - Fixed the RPATH settings on OSX - Added Appveyor continuous integration and increased coverage of the Travis builds diff --git a/include/internal/clpp11.h b/include/internal/clpp11.h index 91daa6b..e3bcb01 100644 --- a/include/internal/clpp11.h +++ b/include/internal/clpp11.h @@ -11,7 +11,7 @@ // Portability here means that a similar header exists for CUDA with the same classes and // interfaces. In other words, moving from the OpenCL API to the CUDA API becomes a one-line change. // -// This is version 4.0 of CLCudaAPI . +// This is version 6.0 of CLCudaAPI . // // ================================================================================================= // @@ -214,6 +214,14 @@ class Device { return true; } + // Query for a specific type of device or brand + bool IsCPU() const { return Type() == "CPU"; } + bool IsGPU() const { return Type() == "GPU"; } + bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc."; } + bool IsNVIDIA() const { return Vendor() == "NVIDIA" || Vendor() == "NVIDIA Corporation"; } + bool IsIntel() const { return Vendor() == "Intel" || Vendor() == "GenuineIntel"; } + bool IsARM() const { return Vendor() == "ARM"; } + // Accessor to the private data-member const cl_device_id& operator()() const { return device_; } private: @@ -276,10 +284,14 @@ class Context { // Accessor to the private data-member const cl_context& operator()() const { return *context_; } + cl_context* pointer() const { return &(*context_); } private: std::shared_ptr context_; }; +// Pointer to an OpenCL context +using ContextPointer = cl_context*; + // ================================================================================================= // Enumeration of build statuses of the run-time compilation process @@ -290,7 +302,7 @@ class Program { public: // Note that there is no constructor based on the regular OpenCL data-type because of extra state - // Regular constructor with memory management + // Source-based constructor with memory management explicit Program(const Context &context, std::string source): program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }), length_(source.length()), @@ -301,6 +313,22 @@ class Program { CheckError(status); } + // Binary-based constructor with memory management + explicit Program(const Device &device, const Context &context, const std::string& binary): + program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }), + length_(binary.length()), + source_(binary), + source_ptr_(&source_[0]) { + auto status1 = CL_SUCCESS; + auto status2 = CL_SUCCESS; + const cl_device_id dev = device(); + *program_ = clCreateProgramWithBinary(context(), 1, &dev, &length_, + reinterpret_cast(&source_ptr_), + &status1, &status2); + CheckError(status1); + CheckError(status2); + } + // Compiles the device program and returns whether or not there where any warnings/errors BuildStatus Build(const Device &device, std::vector &options) { auto options_string = std::accumulate(options.begin(), options.end(), std::string{" "}); @@ -329,7 +357,7 @@ class Program { return result; } - // Retrieves an intermediate representation of the compiled program + // Retrieves a binary or an intermediate representation of the compiled program std::string GetIR() const { auto bytes = size_t{0}; CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bytes, nullptr)); @@ -345,7 +373,7 @@ class Program { private: std::shared_ptr program_; size_t length_; - std::string source_; + std::string source_; // Note: the source can also be a binary or IR const char* source_ptr_; }; @@ -627,15 +655,15 @@ class Kernel { // Launches a kernel onto the specified queue void Launch(const Queue &queue, const std::vector &global, - const std::vector &local, Event &event) { + const std::vector &local, EventPointer event) { CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast(global.size()), nullptr, global.data(), local.data(), - 0, nullptr, &(event()))); + 0, nullptr, event)); } // As above, but with an event waiting list void Launch(const Queue &queue, const std::vector &global, - const std::vector &local, Event &event, + const std::vector &local, EventPointer event, std::vector& waitForEvents) { if (waitForEvents.size() == 0) { return Launch(queue, global, local, event); } @@ -648,15 +676,16 @@ class Kernel { // Launches the kernel while waiting for other events CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast(global.size()), nullptr, global.data(), local.data(), - waitForEventsPlain.size(), waitForEventsPlain.data(), - &(event()))); + static_cast(waitForEventsPlain.size()), + waitForEventsPlain.data(), + event)); } // As above, but with the default local workgroup size - void Launch(const Queue &queue, const std::vector &global, Event &event) { + void Launch(const Queue &queue, const std::vector &global, EventPointer event) { CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast(global.size()), nullptr, global.data(), nullptr, - 0, nullptr, &(event()))); + 0, nullptr, event)); } // Accessor to the private data-member diff --git a/include/internal/cupp11.h b/include/internal/cupp11.h index 515fee6..c1098f4 100644 --- a/include/internal/cupp11.h +++ b/include/internal/cupp11.h @@ -11,7 +11,7 @@ // Portability here means that a similar header exists for OpenCL with the same classes and // interfaces. In other words, moving from the CUDA API to the OpenCL API becomes a one-line change. // -// This is version 4.0 of CLCudaAPI . +// This is version 6.0 of CLCudaAPI . // // ================================================================================================= // @@ -106,13 +106,14 @@ class Event { // Accessors to the private data-members const CUevent& start() const { return *start_; } const CUevent& end() const { return *end_; } + Event* pointer() { return this; } private: std::shared_ptr start_; std::shared_ptr end_; }; // Pointer to a CUDA event -using EventPointer = CUevent*; +using EventPointer = Event*; // ================================================================================================= @@ -159,6 +160,11 @@ class Device { CheckError(cuDriverGetVersion(&result)); return "CUDA driver "+std::to_string(result); } + size_t VersionNumber() const { + auto result = 0; + CheckError(cuDriverGetVersion(&result)); + return static_cast(result); + } std::string Vendor() const { return "NVIDIA Corporation"; } std::string Name() const { auto result = std::string{}; @@ -206,6 +212,14 @@ class Device { return true; } + // Query for a specific type of device or brand + bool IsCPU() const { return false; } + bool IsGPU() const { return true; } + bool IsAMD() const { return false; } + bool IsNVIDIA() const { return true; } + bool IsIntel() const { return false; } + bool IsARM() const { return false; } + // Accessor to the private data-member const CUdevice& operator()() const { return device_; } private: @@ -239,10 +253,14 @@ class Context { // Accessor to the private data-member const CUcontext& operator()() const { return *context_; } + CUcontext* pointer() const { return &(*context_); } private: std::shared_ptr context_; }; +// Pointer to an OpenCL context +using ContextPointer = CUcontext*; + // ================================================================================================= // Enumeration of build statuses of the run-time compilation process @@ -253,7 +271,7 @@ class Program { public: // Note that there is no constructor based on the regular CUDA data-type because of extra state - // Regular constructor with memory management + // Source-based constructor with memory management explicit Program(const Context &, std::string source): program_(new nvrtcProgram, [](nvrtcProgram* p) { CheckError(nvrtcDestroyProgram(p)); delete p; }), @@ -555,7 +573,7 @@ class Kernel { // Launches a kernel onto the specified queue void Launch(const Queue &queue, const std::vector &global, - const std::vector &local, Event &event) { + const std::vector &local, EventPointer event) { // Creates the grid (number of threadblocks) and sets the block sizes (threads per block) auto grid = std::vector{1, 1, 1}; @@ -571,16 +589,16 @@ class Kernel { } // Launches the kernel, its execution time is recorded by events - CheckError(cuEventRecord(event.start(), queue())); + CheckError(cuEventRecord(event->start(), queue())); CheckError(cuLaunchKernel(kernel_, grid[0], grid[1], grid[2], block[0], block[1], block[2], 0, queue(), pointers.data(), nullptr)); - CheckError(cuEventRecord(event.end(), queue())); + CheckError(cuEventRecord(event->end(), queue())); } // As above, but with an event waiting list // TODO: Implement this function void Launch(const Queue &queue, const std::vector &global, - const std::vector &local, Event &event, + const std::vector &local, EventPointer event, std::vector& waitForEvents) { if (waitForEvents.size() == 0) { return Launch(queue, global, local, event); } Error("launching with an event waiting list is not implemented for the CUDA back-end"); @@ -588,7 +606,7 @@ class Kernel { // As above, but with the default local workgroup size // TODO: Implement this function - void Launch(const Queue &, const std::vector &, Event &) { + void Launch(const Queue &, const std::vector &, EventPointer) { Error("launching with a default workgroup size is not implemented for the CUDA back-end"); } diff --git a/src/tuner_impl.cc b/src/tuner_impl.cc index 5cb7908..4a67495 100644 --- a/src/tuner_impl.cc +++ b/src/tuner_impl.cc @@ -331,7 +331,7 @@ TunerImpl::TunerResult TunerImpl::RunKernel(const std::string &source, const Ker fprintf(stdout, "%s Launching kernel (%zu out of %zu for averaging)\n", kMessageVerbose.c_str(), t + 1, kNumRuns); #endif - tune_kernel.Launch(queue_, global, local, events[t]); + tune_kernel.Launch(queue_, global, local, events[t].pointer()); queue_.Finish(events[t]); } queue_.Finish(); From 0526f9ded2e3976abd68d618b66e7ee00601ee43 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 29 Jun 2016 18:08:10 +0200 Subject: [PATCH 7/9] Made it possible to run some of the GEMM kernels using CUDA (those without shared memory) --- samples/cl_to_cuda.h | 1 + 1 file changed, 1 insertion(+) diff --git a/samples/cl_to_cuda.h b/samples/cl_to_cuda.h index 3f23d2e..badfa5a 100644 --- a/samples/cl_to_cuda.h +++ b/samples/cl_to_cuda.h @@ -17,6 +17,7 @@ #define __local __shared__ #define restrict __restrict__ #define __constant const +#define inline __device__ inline // assumes all device functions are annotated with inline in OpenCL // Replaces OpenCL synchronisation with CUDA synchronisation #define barrier(x) __syncthreads() From 45b2c5262c19855b9ce8c4ee6589b47449d84312 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 29 Jun 2016 18:10:46 +0200 Subject: [PATCH 8/9] Updated to version 2.4.0 --- .appveyor.yml | 4 ++-- CHANGELOG | 2 +- CMakeLists.txt | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index 3e26ba8..ca885b8 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -56,8 +56,8 @@ build_script: after_build: - ps: pushd $env:CLTUNE_ROOT - - 7z a CLTune-Windows-%PLATFORM%.zip .\install_dir\* - - ps: mv CLTune-Windows-%PLATFORM%.zip $env:APPVEYOR_BUILD_FOLDER + - 7z a CLTune-Windows-$env:PLATFORM.zip .\install_dir\* + - ps: mv CLTune-Windows-$env:PLATFORM.zip $env:APPVEYOR_BUILD_FOLDER artifacts: - path: '*.zip' diff --git a/CHANGELOG b/CHANGELOG index 62d306f..fd6ff46 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,5 +1,5 @@ -Development version (next release) +Version 2.4.0 - Made it possible to run the unit-tests independently of the provided OpenCL kernel samples - Added an option to compile in verbose mode for additional diagnostic messages (-DVERBOSE=ON) - Now using version 6.0 of the CLCudaAPI header diff --git a/CMakeLists.txt b/CMakeLists.txt index 8b03acf..831b421 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,8 +32,8 @@ set(CMAKE_USER_MAKE_RULES_OVERRIDE_CXX ${CMAKE_CURRENT_SOURCE_DIR}/cmake/cxx_fla # CMake project details project("cltune" CXX) set(cltune_VERSION_MAJOR 2) -set(cltune_VERSION_MINOR 3) -set(cltune_VERSION_PATCH 1) +set(cltune_VERSION_MINOR 4) +set(cltune_VERSION_PATCH 0) # Options option(SAMPLES "Enable compilation of sample programs" ON) From a001605156e7e6f6ad88c624e8246ee85a2c40a4 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 29 Jun 2016 18:21:25 +0200 Subject: [PATCH 9/9] Minor fix to the AppVeyor CI build --- .appveyor.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index ca885b8..508489e 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -56,8 +56,8 @@ build_script: after_build: - ps: pushd $env:CLTUNE_ROOT - - 7z a CLTune-Windows-$env:PLATFORM.zip .\install_dir\* - - ps: mv CLTune-Windows-$env:PLATFORM.zip $env:APPVEYOR_BUILD_FOLDER + - 7z a CLTune-Windows.zip .\install_dir\* + - ps: mv CLTune-Windows.zip $env:APPVEYOR_BUILD_FOLDER artifacts: - path: '*.zip'