From f6b81460a1f623634d8ace89bec277aa7a2fabaa Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Fri, 8 May 2015 16:25:49 +0200 Subject: [PATCH 01/10] Added support for the OPENCL_ROOT environmental variable --- cmake/Modules/FindOpenCL.cmake | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/cmake/Modules/FindOpenCL.cmake b/cmake/Modules/FindOpenCL.cmake index f31807b..9c0e7f2 100644 --- a/cmake/Modules/FindOpenCL.cmake +++ b/cmake/Modules/FindOpenCL.cmake @@ -50,15 +50,17 @@ find_path(OPENCL_INCLUDE_DIRS NAMES OpenCL/cl.h CL/cl.h HINTS - ${OPENCL_ROOT}/include - $ENV{AMDAPPSDKROOT}/include - $ENV{CUDA_PATH}/include + ${OPENCL_ROOT} + $ENV{OPENCL_ROOT} + $ENV{AMDAPPSDKROOT} + $ENV{CUDA_PATH} + DOC "OpenCL header file path" + PATH_SUFFIXES include PATHS /usr/include /usr/local/include /usr/local/cuda/include /opt/cuda/include - DOC "OpenCL header file path" ) mark_as_advanced( OPENCL_INCLUDE_DIRS ) @@ -69,11 +71,12 @@ if( LIB64 ) find_library( OPENCL_LIBRARIES NAMES OpenCL HINTS - ${OPENCL_ROOT}/lib - $ENV{AMDAPPSDKROOT}/lib - $ENV{CUDA_PATH}/lib + ${OPENCL_ROOT} + $ENV{OPENCL_ROOT} + $ENV{AMDAPPSDKROOT} + $ENV{CUDA_PATH} DOC "OpenCL dynamic library path" - PATH_SUFFIXES x86_64 x64 + PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64 PATHS /usr/lib /usr/local/cuda/lib From 032aa2c9fb95534722419c3efda08967c2238c4e Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Fri, 8 May 2015 16:28:18 +0200 Subject: [PATCH 02/10] Added support for 16-wide vectors to the GEMM sample --- samples/gemm/gemm.opencl | 87 ++++++++++++++++++++++++++++++---------- 1 file changed, 66 insertions(+), 21 deletions(-) diff --git a/samples/gemm/gemm.opencl b/samples/gemm/gemm.opencl index c974443..fce29c2 100644 --- a/samples/gemm/gemm.opencl +++ b/samples/gemm/gemm.opencl @@ -80,7 +80,8 @@ #define NWB (NWG/NDIMB) // Amount of loads-per-thread for matrix B (N-dimension) // Settings -#define USE_MAD 0 // Uses the non IEEE-754 compliant mad() function +#define 
USE_VECTOR_MAD 1 // Don't unroll the vector MAD computation +#define USE_CL_MAD 0 // Uses the non-IEEE754 compliant OpenCL mad() (if above is 0) // ================================================================================================= @@ -90,6 +91,7 @@ typedef float2 real2; typedef float4 real4; typedef float8 real8; + typedef float16 real16; #define ZERO 0.0f #elif PRECISION == 64 #if __OPENCL_VERSION__ <= CL_VERSION_1_1 // This the default on OpenCL 1.2 or higher @@ -99,6 +101,7 @@ typedef double2 real2; typedef double4 real4; typedef double8 real8; + typedef double16 real16; #define ZERO 0.0 #endif @@ -113,6 +116,8 @@ typedef real4 realM; #elif VWM == 8 typedef real8 realM; +#elif VWM == 16 + typedef real16 realM; #endif // Data-widths in dimension N @@ -124,6 +129,8 @@ typedef real4 realN; #elif VWN == 8 typedef real8 realN; +#elif VWN == 16 + typedef real16 realN; #endif // ================================================================================================= @@ -300,7 +307,7 @@ inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int // ================================================================================================= // The basic scalar multiply-add function -#if USE_MAD == 1 +#if USE_CL_MAD == 1 #define MultiplyAdd(cval, aval, bval) (cval = mad(aval, bval, cval)) #else #define MultiplyAdd(cval, aval, bval) (cval += (aval) * (bval)) @@ -308,25 +315,46 @@ inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int // The vectorised multiply-add function inline realM MultiplyAddVector(realM cvec, const realM avec, const real bval) { - #if VWM == 1 - MultiplyAdd(cvec, avec, bval); - #elif VWM == 2 - MultiplyAdd(cvec.x , avec.x, bval); - MultiplyAdd(cvec.y , avec.y, bval); - #elif VWM == 4 - MultiplyAdd(cvec.x , avec.x, bval); - MultiplyAdd(cvec.y , avec.y, bval); - MultiplyAdd(cvec.z , avec.z, bval); - MultiplyAdd(cvec.w , avec.w, bval); - #elif VWM == 8 - MultiplyAdd(cvec.s0, 
avec.s0, bval); - MultiplyAdd(cvec.s1, avec.s1, bval); - MultiplyAdd(cvec.s2, avec.s2, bval); - MultiplyAdd(cvec.s3, avec.s3, bval); - MultiplyAdd(cvec.s4, avec.s4, bval); - MultiplyAdd(cvec.s5, avec.s5, bval); - MultiplyAdd(cvec.s6, avec.s6, bval); - MultiplyAdd(cvec.s7, avec.s7, bval); + #if USE_VECTOR_MAD == 1 + cvec += avec * bval; + #else + #if VWM == 1 + MultiplyAdd(cvec, avec, bval); + #elif VWM == 2 + MultiplyAdd(cvec.x , avec.x, bval); + MultiplyAdd(cvec.y , avec.y, bval); + #elif VWM == 4 + MultiplyAdd(cvec.x , avec.x, bval); + MultiplyAdd(cvec.y , avec.y, bval); + MultiplyAdd(cvec.z , avec.z, bval); + MultiplyAdd(cvec.w , avec.w, bval); + #elif VWM == 8 + MultiplyAdd(cvec.s0, avec.s0, bval); + MultiplyAdd(cvec.s1, avec.s1, bval); + MultiplyAdd(cvec.s2, avec.s2, bval); + MultiplyAdd(cvec.s3, avec.s3, bval); + MultiplyAdd(cvec.s4, avec.s4, bval); + MultiplyAdd(cvec.s5, avec.s5, bval); + MultiplyAdd(cvec.s6, avec.s6, bval); + MultiplyAdd(cvec.s7, avec.s7, bval); + #elif VWM == 16 + MultiplyAdd(cvec.s0, avec.s0, bval); + MultiplyAdd(cvec.s1, avec.s1, bval); + MultiplyAdd(cvec.s2, avec.s2, bval); + MultiplyAdd(cvec.s3, avec.s3, bval); + MultiplyAdd(cvec.s4, avec.s4, bval); + MultiplyAdd(cvec.s5, avec.s5, bval); + MultiplyAdd(cvec.s6, avec.s6, bval); + MultiplyAdd(cvec.s7, avec.s7, bval); + MultiplyAdd(cvec.s8, avec.s8, bval); + MultiplyAdd(cvec.s9, avec.s9, bval); + MultiplyAdd(cvec.sA, avec.sA, bval); + MultiplyAdd(cvec.sB, avec.sB, bval); + MultiplyAdd(cvec.sC, avec.sC, bval); + MultiplyAdd(cvec.sD, avec.sD, bval); + MultiplyAdd(cvec.sE, avec.sE, bval); + MultiplyAdd(cvec.sF, avec.sF, bval); + #endif #endif return cvec; } @@ -356,6 +384,23 @@ inline void MultiplyAccumulate(realM cpm[NWI][MWI/VWM], realM apm[MWI/VWM], real cpm[ni*VWN + 5][mi] = MultiplyAddVector(cpm[ni*VWN + 5][mi], apm[mi], bpm[ni].s5); cpm[ni*VWN + 6][mi] = MultiplyAddVector(cpm[ni*VWN + 6][mi], apm[mi], bpm[ni].s6); cpm[ni*VWN + 7][mi] = MultiplyAddVector(cpm[ni*VWN + 7][mi], apm[mi], 
bpm[ni].s7); + #elif VWN == 16 + cpm[ni*VWN + 0 ][mi] = MultiplyAddVector(cpm[ni*VWN + 0 ][mi], apm[mi], bpm[ni].s0); + cpm[ni*VWN + 1 ][mi] = MultiplyAddVector(cpm[ni*VWN + 1 ][mi], apm[mi], bpm[ni].s1); + cpm[ni*VWN + 2 ][mi] = MultiplyAddVector(cpm[ni*VWN + 2 ][mi], apm[mi], bpm[ni].s2); + cpm[ni*VWN + 3 ][mi] = MultiplyAddVector(cpm[ni*VWN + 3 ][mi], apm[mi], bpm[ni].s3); + cpm[ni*VWN + 4 ][mi] = MultiplyAddVector(cpm[ni*VWN + 4 ][mi], apm[mi], bpm[ni].s4); + cpm[ni*VWN + 5 ][mi] = MultiplyAddVector(cpm[ni*VWN + 5 ][mi], apm[mi], bpm[ni].s5); + cpm[ni*VWN + 6 ][mi] = MultiplyAddVector(cpm[ni*VWN + 6 ][mi], apm[mi], bpm[ni].s6); + cpm[ni*VWN + 7 ][mi] = MultiplyAddVector(cpm[ni*VWN + 7 ][mi], apm[mi], bpm[ni].s7); + cpm[ni*VWN + 8 ][mi] = MultiplyAddVector(cpm[ni*VWN + 8 ][mi], apm[mi], bpm[ni].s8); + cpm[ni*VWN + 9 ][mi] = MultiplyAddVector(cpm[ni*VWN + 9 ][mi], apm[mi], bpm[ni].s9); + cpm[ni*VWN + 10][mi] = MultiplyAddVector(cpm[ni*VWN + 10][mi], apm[mi], bpm[ni].sA); + cpm[ni*VWN + 11][mi] = MultiplyAddVector(cpm[ni*VWN + 11][mi], apm[mi], bpm[ni].sB); + cpm[ni*VWN + 12][mi] = MultiplyAddVector(cpm[ni*VWN + 12][mi], apm[mi], bpm[ni].sC); + cpm[ni*VWN + 13][mi] = MultiplyAddVector(cpm[ni*VWN + 13][mi], apm[mi], bpm[ni].sD); + cpm[ni*VWN + 14][mi] = MultiplyAddVector(cpm[ni*VWN + 14][mi], apm[mi], bpm[ni].sE); + cpm[ni*VWN + 15][mi] = MultiplyAddVector(cpm[ni*VWN + 15][mi], apm[mi], bpm[ni].sF); #endif } } From e0b16c10fdb2351b9f2e28ce5450b5dcd472f7d6 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Thu, 14 May 2015 09:38:59 +0200 Subject: [PATCH 03/10] Updated compiler check and per-compiler flags --- CMakeLists.txt | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3285152..c90f3be 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -46,12 +46,30 @@ set(CMAKE_INSTALL_RPATH_USE_LINK_PATH false) # Don't add the automatically deter # Compiler-version check 
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9) - message(FATAL_ERROR "GCC version must be at least 4.9 (for full C++11 compatibility)") + message(FATAL_ERROR "GCC version must be at least 4.9") + endif() +elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") # Note: what about AppleClang? + if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.3) + message(FATAL_ERROR "Clang version must be at least 3.3") + endif() +elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel") + if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 14.0) + message(FATAL_ERROR "ICC version must be at least 14.0") + endif() +elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") + if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 18.0) + message(FATAL_ERROR "Visual Studio version must be at least 18.0") endif() endif() -# C++11 compiler settings -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -std=c++11 -Wall -Wno-comment") +# C++ compiler settings +set(FLAGS "-O3 -std=c++11") +if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") + set(FLAGS "${FLAGS} -Wall -Wno-comment") +elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") # Note: what about AppleClang? 
+ #set(FLAGS "${FLAGS} -Weverything -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded") +endif() +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}") # ================================================================================================== From 517fc470aa434b5e70f75ec48114162458f02207 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Thu, 14 May 2015 09:40:54 +0200 Subject: [PATCH 04/10] Added support for multiple input files, minor fixes to the samples --- include/cltune.h | 6 +++--- samples/conv/conv.cc | 4 ++-- samples/gemm/gemm.cc | 4 ++-- samples/simple/simple.cc | 20 ++++++++++++-------- src/tuner.cc | 17 ++++++++++++----- 5 files changed, 31 insertions(+), 20 deletions(-) diff --git a/include/cltune.h b/include/cltune.h index 7b98d82..0454ab0 100644 --- a/include/cltune.h +++ b/include/cltune.h @@ -96,12 +96,12 @@ class Tuner { // Adds a new kernel to the list of tuning-kernels and returns a unique ID (to be used when // adding tuning parameters) - int AddKernel(const std::string &filename, const std::string &kernel_name, - const cl::NDRange &global, const cl::NDRange &local); + size_t AddKernel(const std::vector &filenames, const std::string &kernel_name, + const cl::NDRange &global, const cl::NDRange &local); // Sets the reference kernel. Same as the AddKernel function, but in this case there is only one // reference kernel. Calling this function again will overwrite the previous reference kernel. - void SetReference(const std::string &filename, const std::string &kernel_name, + void SetReference(const std::vector &filenames, const std::string &kernel_name, const cl::NDRange &global, const cl::NDRange &local); // Adds a new tuning parameter for a kernel with a specific ID. 
The parameter has a name, the diff --git a/samples/conv/conv.cc b/samples/conv/conv.cc index be047e4..ffcc7ce 100644 --- a/samples/conv/conv.cc +++ b/samples/conv/conv.cc @@ -121,7 +121,7 @@ int main(int argc, char* argv[]) { // =============================================================================================== // Adds a heavily tuneable kernel and some example parameter values - auto id = tuner.AddKernel("../samples/conv/conv.opencl", "conv", {kSizeX, kSizeY}, {1, 1}); + auto id = tuner.AddKernel({"../samples/conv/conv.opencl"}, "conv", {kSizeX, kSizeY}, {1, 1}); tuner.AddParameter(id, "TBX", {8, 16, 32, 64}); tuner.AddParameter(id, "TBY", {8, 16, 32, 64}); tuner.AddParameter(id, "LOCAL", {0, 1, 2}); @@ -181,7 +181,7 @@ int main(int argc, char* argv[]) { // Sets the tuner's golden reference function. This kernel contains the reference code to which // the output is compared. Supplying such a function is not required, but it is necessary for // correctness checks to be enabled. - tuner.SetReference("../samples/conv/conv_reference.opencl", "conv_reference", {kSizeX, kSizeY}, {8,8}); + tuner.SetReference({"../samples/conv/conv_reference.opencl"}, "conv_reference", {kSizeX, kSizeY}, {8,8}); // Sets the function's arguments. Note that all kernels have to accept (but not necessarily use) // all input arguments. diff --git a/samples/gemm/gemm.cc b/samples/gemm/gemm.cc index 3b68087..9821b5d 100644 --- a/samples/gemm/gemm.cc +++ b/samples/gemm/gemm.cc @@ -105,7 +105,7 @@ int main(int argc, char* argv[]) { // Adds a heavily tuneable kernel and some example parameter values. Others can be added, but for // this example this already leads to plenty of kernels to test. 
- auto id = tuner.AddKernel("../samples/gemm/gemm.opencl", "gemm_fast", {kSizeM, kSizeN}, {1, 1}); + auto id = tuner.AddKernel({"../samples/gemm/gemm.opencl"}, "gemm_fast", {kSizeM, kSizeN}, {1, 1}); tuner.AddParameter(id, "MWG", {16, 32, 64, 128}); tuner.AddParameter(id, "NWG", {16, 32, 64, 128}); tuner.AddParameter(id, "KWG", {16, 32}); @@ -164,7 +164,7 @@ int main(int argc, char* argv[]) { // Sets the tuner's golden reference function. This kernel contains the reference code to which // the output is compared. Supplying such a function is not required, but it is necessarily for // correctness checks to be enabled. - tuner.SetReference("../samples/gemm/gemm_reference.opencl", "gemm_reference", {kSizeM, kSizeN}, {8,8}); + tuner.SetReference({"../samples/gemm/gemm_reference.opencl"}, "gemm_reference", {kSizeM, kSizeN}, {8,8}); // Sets the function's arguments. Note that all kernels have to accept (but not necessarily use) // all input arguments. diff --git a/samples/simple/simple.cc b/samples/simple/simple.cc index f05cf24..a73d54d 100644 --- a/samples/simple/simple.cc +++ b/samples/simple/simple.cc @@ -25,9 +25,9 @@ // // ================================================================================================= -#include -#include #include +#include +#include // Includes the OpenCL tuner library #include "cltune.h" @@ -48,10 +48,14 @@ int main() { std::vector vec_x(kSizeN); std::vector vec_y(kSizeM); + // Create a random number generator + const auto random_seed = std::chrono::system_clock::now().time_since_epoch().count(); + std::default_random_engine generator(random_seed); + std::uniform_real_distribution distribution(-2.0f, 2.0f); + // Populates input data structures - srand(time(nullptr)); - for (auto &item: mat_a) { item = (float)rand() / (float)RAND_MAX; } - for (auto &item: vec_x) { item = (float)rand() / (float)RAND_MAX; } + for (auto &item: mat_a) { item = distribution(generator); } + for (auto &item: vec_x) { item = distribution(generator); } for 
(auto &item: vec_y) { item = 0.0; } // Initializes the tuner (platform 0, device 1) @@ -60,20 +64,20 @@ int main() { // Adds a kernel which supports unrolling through the UNROLL parameter. Note that the kernel // itself needs to implement the UNROLL parameter and (in this case) only accepts a limited // amount of values. - auto id = tuner.AddKernel("../samples/simple/simple_unroll.opencl", "matvec_unroll", {kSizeM}, {128}); + auto id = tuner.AddKernel({"../samples/simple/simple_unroll.opencl"}, "matvec_unroll", {kSizeM}, {128}); tuner.AddParameter(id, "UNROLL", {1, 2, 4}); // Adds another kernel and its parameters. This kernel caches the input vector X into local // memory to save global memory accesses. Note that the kernel's workgroup size is determined by // the tile size parameter TS. - id = tuner.AddKernel("../samples/simple/simple_tiled.opencl", "matvec_tiled", {kSizeM}, {1}); + id = tuner.AddKernel({"../samples/simple/simple_tiled.opencl"}, "matvec_tiled", {kSizeM}, {1}); tuner.AddParameter(id, "TS", {32, 64, 128, 256, 512}); tuner.MulLocalSize(id, {"TS"}); // Sets the tuner's golden reference function. This kernel contains the reference code to which // the output is compared. Supplying such a function is not required, but it is necessarily for // correctness checks to be enabled. - tuner.SetReference("../samples/simple/simple_reference.opencl", "matvec_reference", {kSizeM}, {128}); + tuner.SetReference({"../samples/simple/simple_reference.opencl"}, "matvec_reference", {kSizeM}, {128}); // Sets the function's arguments. Note that all kernels have to accept (but not necessarily use) // all input arguments. diff --git a/src/tuner.cc b/src/tuner.cc index d09880a..d2dd86d 100644 --- a/src/tuner.cc +++ b/src/tuner.cc @@ -94,11 +94,14 @@ Tuner::~Tuner() { // Loads the OpenCL source-code from a file and creates a new variable of type KernelInfo to store // all the kernel-information. 
-int Tuner::AddKernel(const std::string &filename, const std::string &kernel_name, - const cl::NDRange &global, const cl::NDRange &local) { +size_t Tuner::AddKernel(const std::vector &filenames, const std::string &kernel_name, + const cl::NDRange &global, const cl::NDRange &local) { // Loads the source-code and adds the kernel - auto source = LoadFile(filename); + auto source = std::string{}; + for (auto &filename: filenames) { + source += LoadFile(filename); + } kernels_.push_back(KernelInfo(kernel_name, source, opencl_)); // Sets the global and local thread sizes @@ -113,10 +116,13 @@ int Tuner::AddKernel(const std::string &filename, const std::string &kernel_name // Sets the reference kernel (source-code location, kernel name, global/local thread-sizes) and // sets a flag to indicate that there is now a reference. Calling this function again will simply // overwrite the old reference. -void Tuner::SetReference(const std::string &filename, const std::string &kernel_name, +void Tuner::SetReference(const std::vector &filenames, const std::string &kernel_name, const cl::NDRange &global, const cl::NDRange &local) { has_reference_ = true; - auto source = LoadFile(filename); + auto source = std::string{}; + for (auto &filename: filenames) { + source += LoadFile(filename); + } reference_kernel_.reset(new KernelInfo(kernel_name, source, opencl_)); reference_kernel_->set_global_base(global); reference_kernel_->set_local_base(local); @@ -211,6 +217,7 @@ void Tuner::AddArgumentScalar(const T argument) { arguments_scalar_.push_back({argument_counter_++, argument}); } template void Tuner::AddArgumentScalar(const int); +template void Tuner::AddArgumentScalar(const size_t); template void Tuner::AddArgumentScalar(const float); template void Tuner::AddArgumentScalar(const double); From cce96129135f7b5b0d73b7130df46a61f44020b3 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Thu, 14 May 2015 10:11:06 +0200 Subject: [PATCH 05/10] Added support for std::complex data-types --- 
include/cltune.h | 8 +++++- include/cltune/memory.h | 9 +++++- src/memory.cc | 6 +++- src/tuner.cc | 63 ++++++++++++++++++++++++++++++++++------- 4 files changed, 72 insertions(+), 14 deletions(-) diff --git a/include/cltune.h b/include/cltune.h index 0454ab0..25fbe32 100644 --- a/include/cltune.h +++ b/include/cltune.h @@ -173,6 +173,7 @@ class Tuner { // Downloads the output of a tuning run and compares it against the reference run bool VerifyOutput(); template bool DownloadAndCompare(const MemArgument &device_buffer, const size_t i); + template double AbsoluteDifference(const T reference, const T result); // Prints results of a particular kernel run void PrintResult(FILE* fp, const TunerResult &result, const std::string &message) const; @@ -201,7 +202,12 @@ class Tuner { std::vector kernels_; std::vector arguments_input_; std::vector arguments_output_; - std::vector> arguments_scalar_; + std::vector> arguments_int_; + std::vector> arguments_size_t_; + std::vector> arguments_float_; + std::vector> arguments_double_; + std::vector> arguments_float2_; + std::vector> arguments_double2_; // Storage for the reference kernel and output std::unique_ptr reference_kernel_; diff --git a/include/cltune/memory.h b/include/cltune/memory.h index 181d129..2919ab2 100644 --- a/include/cltune/memory.h +++ b/include/cltune/memory.h @@ -34,14 +34,21 @@ #include #include #include +#include #include "cltune/opencl.h" namespace cltune { // ================================================================================================= +// Shorthands for complex data-types +using float2 = std::complex; // cl_float2; +using double2 = std::complex; // cl_double2; + +// ================================================================================================= + // Enumeration of currently supported data-types by this class -enum class MemType { kInt, kFloat, kDouble }; +enum class MemType { kInt, kFloat, kDouble, kFloat2, kDouble2 }; // See comment at top of file for a description 
of the class template diff --git a/src/memory.cc b/src/memory.cc index 903fa63..af6b4db 100644 --- a/src/memory.cc +++ b/src/memory.cc @@ -35,6 +35,8 @@ namespace cltune { template <> const MemType Memory::type = MemType::kInt; template <> const MemType Memory::type = MemType::kFloat; template <> const MemType Memory::type = MemType::kDouble; +template <> const MemType Memory::type = MemType::kFloat2; +template <> const MemType Memory::type = MemType::kDouble2; // Initializes the memory class, creating a host array with zeroes and an uninitialized device // buffer. @@ -42,7 +44,7 @@ template Memory::Memory(const size_t size, cl::CommandQueue queue, const cl::Context &context, const cl_mem_flags flags): size_(size), - host_(size, static_cast(0)), + host_(size, T{0}), device_(new cl::Buffer(context, flags, size*sizeof(T))), queue_(queue) { } @@ -81,6 +83,8 @@ void Memory::DownloadFromDevice() { template class Memory; template class Memory; template class Memory; +template class Memory; +template class Memory; // ================================================================================================= } // namespace cltune diff --git a/src/tuner.cc b/src/tuner.cc index d2dd86d..592d5e8 100644 --- a/src/tuner.cc +++ b/src/tuner.cc @@ -198,6 +198,8 @@ void Tuner::AddArgumentInput(const std::vector &source) { template void Tuner::AddArgumentInput(const std::vector&); template void Tuner::AddArgumentInput(const std::vector&); template void Tuner::AddArgumentInput(const std::vector&); +template void Tuner::AddArgumentInput(const std::vector&); +template void Tuner::AddArgumentInput(const std::vector&); // As above, but now marked as output buffer template @@ -210,16 +212,28 @@ void Tuner::AddArgumentOutput(const std::vector &source) { template void Tuner::AddArgumentOutput(const std::vector&); template void Tuner::AddArgumentOutput(const std::vector&); template void Tuner::AddArgumentOutput(const std::vector&); +template void Tuner::AddArgumentOutput(const 
std::vector&); +template void Tuner::AddArgumentOutput(const std::vector&); -// Sets a simple scalar value as an argument to the kernel -template -void Tuner::AddArgumentScalar(const T argument) { - arguments_scalar_.push_back({argument_counter_++, argument}); +// Sets a scalar value as an argument to the kernel +template <> void Tuner::AddArgumentScalar(const int argument) { + arguments_int_.push_back({argument_counter_++, argument}); +} +template <> void Tuner::AddArgumentScalar(const size_t argument) { + arguments_size_t_.push_back({argument_counter_++, argument}); +} +template <> void Tuner::AddArgumentScalar(const float argument) { + arguments_float_.push_back({argument_counter_++, argument}); +} +template <> void Tuner::AddArgumentScalar(const double argument) { + arguments_double_.push_back({argument_counter_++, argument}); +} +template <> void Tuner::AddArgumentScalar(const float2 argument) { + arguments_float2_.push_back({argument_counter_++, argument}); +} +template <> void Tuner::AddArgumentScalar(const double2 argument) { + arguments_double2_.push_back({argument_counter_++, argument}); } -template void Tuner::AddArgumentScalar(const int); -template void Tuner::AddArgumentScalar(const size_t); -template void Tuner::AddArgumentScalar(const float); -template void Tuner::AddArgumentScalar(const double); // ================================================================================================= @@ -495,6 +509,8 @@ Tuner::TunerResult Tuner::RunKernel(const std::string &source, const KernelInfo case MemType::kInt: ResetMemArgument(output); break; case MemType::kFloat: ResetMemArgument(output); break; case MemType::kDouble: ResetMemArgument(output); break; + case MemType::kFloat2: ResetMemArgument(output); break; + case MemType::kDouble2: ResetMemArgument(output); break; default: throw Exception("Unsupported reference output data-type"); } } @@ -503,7 +519,12 @@ Tuner::TunerResult Tuner::RunKernel(const std::string &source, const KernelInfo auto 
tune_kernel = cl::Kernel(program, kernel.name().c_str()); for (auto &i: arguments_input_) { tune_kernel.setArg(i.index, i.buffer); } for (auto &i: arguments_output_) { tune_kernel.setArg(i.index, i.buffer); } - for (auto &i: arguments_scalar_) { tune_kernel.setArg(i.first, i.second); } + for (auto &i: arguments_int_) { tune_kernel.setArg(i.first, i.second); } + for (auto &i: arguments_size_t_) { tune_kernel.setArg(i.first, i.second); } + for (auto &i: arguments_float_) { tune_kernel.setArg(i.first, i.second); } + for (auto &i: arguments_double_) { tune_kernel.setArg(i.first, i.second); } + for (auto &i: arguments_float2_) { tune_kernel.setArg(i.first, i.second); } + for (auto &i: arguments_double2_) { tune_kernel.setArg(i.first, i.second); } // Sets the global and local thread-sizes auto global = kernel.global(); @@ -569,7 +590,7 @@ template void Tuner::ResetMemArgument(MemArgument &argument) { // Create an array with zeroes - std::vector buffer(argument.size, static_cast(0)); + std::vector buffer(argument.size, T{0}); // Copy the new array to the OpenCL buffer on the device auto bytes = sizeof(T)*argument.size; @@ -589,6 +610,8 @@ void Tuner::StoreReferenceOutput() { case MemType::kInt: DownloadReference(output_buffer); break; case MemType::kFloat: DownloadReference(output_buffer); break; case MemType::kDouble: DownloadReference(output_buffer); break; + case MemType::kFloat2: DownloadReference(output_buffer); break; + case MemType::kDouble2: DownloadReference(output_buffer); break; default: throw Exception("Unsupported reference output data-type"); } } @@ -615,6 +638,8 @@ bool Tuner::VerifyOutput() { case MemType::kInt: status &= DownloadAndCompare(output_buffer, i); break; case MemType::kFloat: status &= DownloadAndCompare(output_buffer, i); break; case MemType::kDouble: status &= DownloadAndCompare(output_buffer, i); break; + case MemType::kFloat2: status &= DownloadAndCompare(output_buffer, i); break; + case MemType::kDouble2: status &= 
DownloadAndCompare(output_buffer, i); break; default: throw Exception("Unsupported output data-type"); } ++i; @@ -636,7 +661,7 @@ bool Tuner::DownloadAndCompare(const MemArgument &device_buffer, const size_t i) // Compares the results (L2 norm) T* reference_output = (T*)reference_outputs_[i]; for (auto j=0UL; j +double Tuner::AbsoluteDifference(const T reference, const T result) { + return fabs(static_cast(reference) - static_cast(result)); +} +template <> double Tuner::AbsoluteDifference(const float2 reference, const float2 result) { + auto real = fabs(static_cast(reference.real()) - static_cast(result.real())); + auto imag = fabs(static_cast(reference.imag()) - static_cast(result.imag())); + return real + imag; +} +template <> double Tuner::AbsoluteDifference(const double2 reference, const double2 result) { + auto real = fabs(reference.real() - result.real()); + auto imag = fabs(reference.imag() - result.imag()); + return real + imag; +} + // ================================================================================================= // Prints a result by looping over all its configuration parameters From d2e118062079a129315fb36cce58fc0c365d92bc Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Fri, 15 May 2015 09:33:26 +0200 Subject: [PATCH 06/10] Better size_t support --- include/cltune.h | 26 +++++++++++++------------- include/cltune/kernel_info.h | 10 +++++----- samples/conv/conv.cc | 24 +++++++++++++----------- samples/gemm/gemm.cc | 10 +++++----- src/kernel_info.cc | 8 ++++---- src/tuner.cc | 12 ++++++------ 6 files changed, 46 insertions(+), 44 deletions(-) diff --git a/include/cltune.h b/include/cltune.h index 25fbe32..b73ec9d 100644 --- a/include/cltune.h +++ b/include/cltune.h @@ -67,7 +67,7 @@ class Tuner { // Helper structure to store an OpenCL memory argument for a kernel struct MemArgument { - int index; // The OpenCL kernel-argument index + size_t index; // The OpenCL kernel-argument index size_t size; // The number of elements (not bytes) 
MemType type; // The data-type (e.g. float) cl::Buffer buffer; // The host memory and OpenCL buffer on the device @@ -91,7 +91,7 @@ class Tuner { // Initialize either with platform 0 and device 0 or with a custom platform/device explicit Tuner(); - explicit Tuner(int platform_id, int device_id); + explicit Tuner(size_t platform_id, size_t device_id); ~Tuner(); // Adds a new kernel to the list of tuning-kernels and returns a unique ID (to be used when @@ -108,8 +108,8 @@ class Tuner { // number of values, and a list of values. // TODO: Remove all following functions (those that take "const size_t id" as first argument) and // make the KernelInfo class publicly accessible instead. - void AddParameter(const size_t id, const std::string parameter_name, - const std::initializer_list values); + void AddParameter(const size_t id, const std::string ¶meter_name, + const std::initializer_list &values); // Modifies the global or local thread-size (in NDRange form) by one of the parameters (in // StringRange form). The modifier can be multiplication or division. @@ -137,8 +137,8 @@ class Tuner { // Configures a specific search method. 
The default search method is "FullSearch" void UseFullSearch(); - void UseRandomSearch(const float fraction); - void UseAnnealing(const float fraction, const double max_temperature); + void UseRandomSearch(const double fraction); + void UseAnnealing(const double fraction, const double max_temperature); void UsePSO(const double fraction, const size_t swarm_size, const double influence_global, const double influence_local, const double influence_random); @@ -198,16 +198,16 @@ class Tuner { std::vector search_args_; // Storage of kernel sources, arguments, and parameters - int argument_counter_; + size_t argument_counter_; std::vector kernels_; std::vector arguments_input_; std::vector arguments_output_; - std::vector> arguments_int_; - std::vector> arguments_size_t_; - std::vector> arguments_float_; - std::vector> arguments_double_; - std::vector> arguments_float2_; - std::vector> arguments_double2_; + std::vector> arguments_int_; + std::vector> arguments_size_t_; + std::vector> arguments_float_; + std::vector> arguments_double_; + std::vector> arguments_float2_; + std::vector> arguments_double2_; // Storage for the reference kernel and output std::unique_ptr reference_kernel_; diff --git a/include/cltune/kernel_info.h b/include/cltune/kernel_info.h index dc806bb..a4457bf 100644 --- a/include/cltune/kernel_info.h +++ b/include/cltune/kernel_info.h @@ -55,14 +55,14 @@ class KernelInfo { // Helper structure holding a parameter name and a list of all values struct Parameter { std::string name; - std::vector values; + std::vector values; }; // Helper structure holding a setting: a name and a value. Multiple settings combined make a // single configuration. 
struct Setting { std::string name; - int value; + size_t value; std::string GetDefine() const { return "#define "+name+" "+GetValueString()+"\n"; } std::string GetConfig() const { return name+" "+GetValueString(); } std::string GetDatabase() const { return "{\""+name+"\","+GetValueString()+"}"; } @@ -78,14 +78,14 @@ class KernelInfo { // Helper structure holding a constraint on parameters. This constraint consists of a constraint // function object and a vector of paramater names represented as strings. - using ConstraintFunction = std::function)>; + using ConstraintFunction = std::function)>; struct Constraint { ConstraintFunction valid_if; std::vector parameters; }; // As above, but for local memory size. - using LocalMemoryFunction = std::function)>; + using LocalMemoryFunction = std::function)>; struct LocalMemory { LocalMemoryFunction amount; std::vector parameters; @@ -116,7 +116,7 @@ class KernelInfo { void set_local_base(cl::NDRange local) { local_base_ = local; local_ = local; } // Adds a new parameter with a name and a vector of possible values - void AddParameter(const std::string name, const std::vector values); + void AddParameter(const std::string &name, const std::vector &values); // Checks wheter a parameter exists, returns "true" if it does exist bool ParameterExists(const std::string parameter_name); diff --git a/samples/conv/conv.cc b/samples/conv/conv.cc index ffcc7ce..df1fd94 100644 --- a/samples/conv/conv.cc +++ b/samples/conv/conv.cc @@ -35,10 +35,10 @@ #include "cltune.h" // Helper function to perform an integer division + ceiling (round-up) -int CeilDiv(int a, int b) { return (a + b - 1)/b; } +size_t CeilDiv(size_t a, size_t b) { return (a + b - 1)/b; } // Helper function to determine whether or not 'a' is a multiple of 'b' -bool IsMultiple(int a, int b) { +bool IsMultiple(size_t a, size_t b) { return ((a/b)*b == a) ? 
true : false; }; @@ -135,13 +135,15 @@ int main(int argc, char* argv[]) { // In this case, the workgroup size (TBX by TBY) is extra large (TBX_XL by TBY_XL) because it uses // extra threads to compute the halo threads. How many extra threads are needed is dependend on // the filter size. Here we support a the TBX and TBY size plus up to 10 extra threads. - auto integers = {8,9,10,11,12,13,14,15, - 16,17,18,19,20,21,22,23,24,25,26, - 32,33,34,35,36,37,38,39,40,41,42, - 64,65,66,67,68,69,70,71,72,73,74}; + auto integers = std::initializer_list{ + 8,9,10,11,12,13,14,15, + 16,17,18,19,20,21,22,23,24,25,26, + 32,33,34,35,36,37,38,39,40,41,42, + 64,65,66,67,68,69,70,71,72,73,74 + }; tuner.AddParameter(id, "TBX_XL", integers); tuner.AddParameter(id, "TBY_XL", integers); - auto HaloThreads = [] (std::vector v) { + auto HaloThreads = [] (std::vector v) { if (v[0] == 2) { return (v[1] == v[2] + CeilDiv(2*HFS,v[3])); } // With halo threads else { return (v[1] == v[2]); } // Without halo threads }; @@ -149,22 +151,22 @@ int main(int argc, char* argv[]) { tuner.AddConstraint(id, HaloThreads, {"LOCAL", "TBY_XL", "TBY", "WPTY"}); // Sets the constrains on the vector size - auto VectorConstraint = [] (std::vector v) { + auto VectorConstraint = [] (std::vector v) { if (v[0] == 2) { return IsMultiple(v[2],v[1]) && IsMultiple(2*HFS,v[1]); } else { return IsMultiple(v[2],v[1]); } }; tuner.AddConstraint(id, VectorConstraint, {"LOCAL", "VECTOR", "WPTX"}); // Makes sure the work per thread is not too high, otherwise too many registers would be used. 
- //auto WorkPerThreadConstraint = [] (std::vector v) { return (v[0]*v[1] < 32); }; + //auto WorkPerThreadConstraint = [] (std::vector v) { return (v[0]*v[1] < 32); }; //tuner.AddConstraint(id, WorkPerThreadConstraint, {"WPTX", "WPTY"}); // Sets padding to zero in case local memory is not used - auto PaddingConstraint = [] (std::vector v) { return (v[1] == 0 || v[0] != 0); }; + auto PaddingConstraint = [] (std::vector v) { return (v[1] == 0 || v[0] != 0); }; tuner.AddConstraint(id, PaddingConstraint, {"LOCAL", "PADDING"}); // Sets the constraints for local memory size limitations - auto LocalMemorySize = [] (std::vector v) { + auto LocalMemorySize = [] (std::vector v) { if (v[0] != 0) { return ((v[3]*v[4] + 2*HFS) * (v[1]*v[2] + 2*HFS + v[5]))*sizeof(float); } else { return 0UL; } }; diff --git a/samples/gemm/gemm.cc b/samples/gemm/gemm.cc index 9821b5d..dfa5f37 100644 --- a/samples/gemm/gemm.cc +++ b/samples/gemm/gemm.cc @@ -38,7 +38,7 @@ #include "cltune.h" // Helper function to determine whether or not 'a' is a multiple of 'b' -bool IsMultiple(int a, int b) { +bool IsMultiple(size_t a, size_t b) { return ((a/b)*b == a) ? true : false; }; @@ -129,9 +129,9 @@ int main(int argc, char* argv[]) { // a boolean value whether or not the tuning configuration is legal. In this case, the helper // function 'IsMultiple' is employed for convenience. In the calls to 'AddConstraint' below, the // vector of parameter names (as strings) matches the input integer vector of the lambda's. 
- auto MultipleOfX = [] (std::vector v) { return IsMultiple(v[0], v[1]); }; - auto MultipleOfXMulY = [] (std::vector v) { return IsMultiple(v[0], v[1]*v[2]); }; - auto MultipleOfXMulYDivZ = [] (std::vector v) { return IsMultiple(v[0], (v[1]*v[2])/v[3]); }; + auto MultipleOfX = [] (std::vector v) { return IsMultiple(v[0], v[1]); }; + auto MultipleOfXMulY = [] (std::vector v) { return IsMultiple(v[0], v[1]*v[2]); }; + auto MultipleOfXMulYDivZ = [] (std::vector v) { return IsMultiple(v[0], (v[1]*v[2])/v[3]); }; // Sets constraints: Requirement for unrolling the KWG loop tuner.AddConstraint(id, MultipleOfX, {"KWG", "KWI"}); @@ -149,7 +149,7 @@ int main(int argc, char* argv[]) { tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "NDIMB"}); // Sets the constraints for local memory size limitations - auto LocalMemorySize = [] (std::vector v) { + auto LocalMemorySize = [] (std::vector v) { return (((v[0]*v[1]*v[2]/v[3]) + (v[4]*v[5]*v[6]/v[7]))*sizeof(float)); }; tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"SA", "KWG", "MWG", "VWM", "SB", "KWG", "NWG", "VWN"}); diff --git a/src/kernel_info.cc b/src/kernel_info.cc index 2dc407d..2947d1b 100644 --- a/src/kernel_info.cc +++ b/src/kernel_info.cc @@ -41,7 +41,7 @@ KernelInfo::KernelInfo(const std::string name, const std::string source, parameters_(), configurations_(), constraints_(), - local_memory_(LocalMemory{[] (std::vector v) { return 0UL; }, std::vector(0)}), + local_memory_(LocalMemory{[] (std::vector v) { return 0UL; }, std::vector(0)}), opencl_(opencl), global_base_(), local_base_(), global_(), local_(), @@ -51,7 +51,7 @@ KernelInfo::KernelInfo(const std::string name, const std::string source, // ================================================================================================= // Pushes a new parameter to the list of parameters -void KernelInfo::AddParameter(const std::string name, const std::vector values) { +void KernelInfo::AddParameter(const std::string &name, const 
std::vector &values) { Parameter parameter = {name, values}; parameters_.push_back(parameter); } @@ -196,7 +196,7 @@ bool KernelInfo::ValidConfiguration(const Configuration &config) { for (auto &constraint: constraints_) { // Finds the values of the parameters - std::vector values(0); + std::vector values(size_t{0}); for (auto &parameter: constraint.parameters) { for (auto &setting: config) { if (setting.name == parameter) { @@ -222,7 +222,7 @@ bool KernelInfo::ValidConfiguration(const Configuration &config) { if (!opencl_->ValidThreadSizes(global_, local_)) { return false; }; // Verifies the local memory usage - std::vector values_local_memory(0); + std::vector values_local_memory(size_t{0}); for (auto &parameter: local_memory_.parameters) { for (auto &setting: config) { if (setting.name == parameter) { diff --git a/src/tuner.cc b/src/tuner.cc index 592d5e8..007eca7 100644 --- a/src/tuner.cc +++ b/src/tuner.cc @@ -69,7 +69,7 @@ Tuner::Tuner(): } // Initializes with a custom platform and device -Tuner::Tuner(int platform_id, int device_id): +Tuner::Tuner(size_t platform_id, size_t device_id): opencl_(new OpenCL(platform_id, device_id)), has_reference_(false), suppress_output_(false), @@ -131,8 +131,8 @@ void Tuner::SetReference(const std::vector &filenames, const std::s // ================================================================================================= // Adds parameters for a kernel to tune. Also checks whether this parameter already exists. -void Tuner::AddParameter(const size_t id, const std::string parameter_name, - const std::initializer_list values) { +void Tuner::AddParameter(const size_t id, const std::string &parameter_name, + const std::initializer_list &values) { if (id >= kernels_.size()) { throw Exception("Invalid kernel ID"); } if (kernels_[id].ParameterExists(parameter_name)) { throw Exception("Parameter already exists"); @@ -243,13 +243,13 @@ void Tuner::UseFullSearch() { } // Use random search as a search strategy. 
-void Tuner::UseRandomSearch(const float fraction) { +void Tuner::UseRandomSearch(const double fraction) { search_method_ = SearchMethod::RandomSearch; search_args_.push_back(fraction); } // Use simulated annealing as a search strategy. -void Tuner::UseAnnealing(const float fraction, const double max_temperature) { +void Tuner::UseAnnealing(const double fraction, const double max_temperature) { search_method_ = SearchMethod::Annealing; search_args_.push_back(fraction); search_args_.push_back(max_temperature); @@ -458,7 +458,7 @@ void Tuner::PrintToFile(const std::string &filename) const { fprintf(file, "%.2lf;", tuning_result.time); fprintf(file, "%lu;", tuning_result.threads); for (auto &setting: tuning_result.configuration) { - fprintf(file, "%d;", setting.value); + fprintf(file, "%lu;", setting.value); } fprintf(file, "\n"); } From 8d989b002b4d1cc7f8aef45a8453efd7f032274a Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Mon, 18 May 2015 16:10:29 +0200 Subject: [PATCH 07/10] Updated the tests --- test/kernel_info.cc | 28 ++++++++++++++------------ test/tuner.cc | 48 ++++++++++++++++++++++----------------------- 2 files changed, 39 insertions(+), 37 deletions(-) diff --git a/test/kernel_info.cc b/test/kernel_info.cc index 4999e7b..87efc17 100644 --- a/test/kernel_info.cc +++ b/test/kernel_info.cc @@ -37,24 +37,25 @@ // Initializes a KernelInfo test fixture class KernelInfoTest : public testing::Test { protected: - static constexpr auto kNumParameters = 8; - static constexpr auto kNumRanges = 8; + static constexpr auto kNumParameters = size_t{8}; + static constexpr auto kNumRanges = size_t{8}; // Constructor explicit KernelInfoTest() : - kernel_{new cltune::KernelInfo("name", "source")} { + opencl_{new cltune::OpenCL(0, 0)}, + kernel_{new cltune::KernelInfo("name", "source", opencl_)} { } // Initializes the tester virtual void SetUp() { // Sets a bunch of parameters to test - for (auto i=0; i(i)); - auto values = std::vector{1, 6+i, 9, 1*i, 2000}; - for (auto 
j=0; j{1, 6+i, 9, 1*i, 2000}; + for (auto j=size_t{0}; j(i*i); @@ -100,9 +101,10 @@ class KernelInfoTest : public testing::Test { } // Member variables + std::shared_ptr opencl_; std::unique_ptr kernel_; std::vector names_; - std::vector> values_list_; + std::vector> values_list_; std::vector ranges_; std::vector string_ranges_; }; @@ -111,7 +113,7 @@ class KernelInfoTest : public testing::Test { // Tests set_global_base for a number of example NDRange values TEST_F(KernelInfoTest, SetGlobalBase) { - for (auto i=0; iset_global_base(ranges_[i]); ASSERT_EQ(ranges_[i].dimensions(), kernel_->global_base().dimensions()); for (auto j=static_cast(0); jglobal_base().dimensions(); ++j) { @@ -122,7 +124,7 @@ TEST_F(KernelInfoTest, SetGlobalBase) { // Tests set_local_base for a number of example NDRange values TEST_F(KernelInfoTest, SetLocalBase) { - for (auto i=0; iset_local_base(ranges_[i]); ASSERT_EQ(ranges_[i].dimensions(), kernel_->local_base().dimensions()); for (auto j=static_cast(0); jlocal_base().dimensions(); ++j) { @@ -135,12 +137,12 @@ TEST_F(KernelInfoTest, SetLocalBase) { TEST_F(KernelInfoTest, AddParameter) { // Adds several parameters - for (auto i=0; iAddParameter(names_[i], values_list_[i]); } // Tests each parameter - for (auto i=0; iparameters()[i].values.size()); EXPECT_EQ(names_[i], kernel_->parameters()[i].name); for (auto j=static_cast(0); jparameters()[i].values.size(); ++j) { @@ -157,7 +159,7 @@ TEST_F(KernelInfoTest, CreateLocalRange) { config.push_back(cltune::KernelInfo::Setting({"PARAM", 32})); // Tests a couple of different ranges against this configuration - for (auto i=0; iset_global_base(ranges_[i]); kernel_->set_local_base(ranges_[i]); kernel_->ComputeRanges(config); diff --git a/test/tuner.cc b/test/tuner.cc index f2152c5..60b5270 100644 --- a/test/tuner.cc +++ b/test/tuner.cc @@ -42,21 +42,21 @@ class TunerTest : public testing::Test { }; // Test parameters - static constexpr auto kNumKernelAdditions = 1; - static constexpr auto 
kNumParameters = 2; - static constexpr auto kNumParameterAdditions = 3; + static constexpr auto kNumKernelAdditions = size_t{1}; + static constexpr auto kNumParameters = size_t{2}; + static constexpr auto kNumParameterAdditions = size_t{3}; // Test kernels (taken from the samples folder) - static constexpr auto kNumKernels = 2; + static constexpr auto kNumKernels = size_t{2}; const std::vector kKernelFiles = { - {"../samples/simple_reference.opencl","matvec_reference"}, - {"../samples/simple_unroll.opencl","matvec_unroll"} + {"../samples/simple/simple_reference.opencl","matvec_reference"}, + {"../samples/simple/simple_unroll.opencl","matvec_unroll"} }; // Test matrix sizes - static constexpr auto kSizeM = 128; - static constexpr auto kSizeN = 512; - static constexpr auto kSizeK = 256; + static constexpr auto kSizeM = size_t{128}; + static constexpr auto kSizeN = size_t{512}; + static constexpr auto kSizeK = size_t{256}; // Constructor explicit TunerTest() : @@ -72,9 +72,9 @@ class TunerTest : public testing::Test { local_ = cl::NDRange{8, 1}; // Adds example parameters - for (auto k=0; k(k)); - auto values = {5, 1, 999}; + auto values = {size_t{5}, size_t{1}, size_t{999}}; auto string_range = cltune::StringRange{parameter, parameter}; parameter_list_.push_back(parameter); values_list_.push_back(values); @@ -90,7 +90,7 @@ class TunerTest : public testing::Test { cl::NDRange global_; cl::NDRange local_; std::vector parameter_list_; - std::vector> values_list_; + std::vector> values_list_; std::vector string_ranges_; }; @@ -113,10 +113,10 @@ TEST_F(TunerTest, InitOpenCL) { // Checks whether AddKernel returns an incrementing ID TEST_F(TunerTest, AddKernel) { - auto counter = 0; + auto counter = size_t{0}; for (auto &kernel_file: kKernelFiles) { - for (auto i=0; iAddKernel(kernel_file.filename, kernel_file.kernel_name, global_, local_); + for (auto i=size_t{0}; iAddKernel({kernel_file.filename}, kernel_file.kernel_name, global_, local_); EXPECT_EQ(counter, id); 
counter++; } @@ -127,17 +127,17 @@ TEST_F(TunerTest, AddKernel) { TEST_F(TunerTest, AddParameter) { // Adds parameters for invalid kernels, expecting a crash - for (auto k=0; kAddParameter(k, parameter_list_[k], values_list_[k]), cltune::Tuner::Exception); } // Adds a new kernel and then adds parameters for (auto &kernel_file: kKernelFiles) { - for (auto i=0; iAddKernel(kernel_file.filename, kernel_file.kernel_name, global_, local_); - for (auto k=0; kAddKernel({kernel_file.filename}, kernel_file.kernel_name, global_, local_); + for (auto k=size_t{0}; kAddParameter(id, parameter_list_[k], values_list_[k]); } @@ -156,7 +156,7 @@ TEST_F(TunerTest, AddParameter) { TEST_F(TunerTest, ModifyThreadSize) { // Modifies parameters for invalid kernels, expecting a crash - for (auto k=0; kMulGlobalSize(k, string_ranges_[k]), cltune::Tuner::Exception); ASSERT_THROW(tuner_->DivGlobalSize(k, string_ranges_[k]), cltune::Tuner::Exception); ASSERT_THROW(tuner_->MulLocalSize(k, string_ranges_[k]), cltune::Tuner::Exception); @@ -165,9 +165,9 @@ TEST_F(TunerTest, ModifyThreadSize) { // Adds a new kernel and then modifies the thread-sizes for (auto &kernel_file: kKernelFiles) { - for (auto i=0; iAddKernel(kernel_file.filename, kernel_file.kernel_name, global_, local_); - for (auto k=0; kAddKernel({kernel_file.filename}, kernel_file.kernel_name, global_, local_); + for (auto k=size_t{0}; kMulGlobalSize(id, string_ranges_[k]); tuner_->DivGlobalSize(id, string_ranges_[k]); tuner_->MulLocalSize(id, string_ranges_[k]); From 4bbd62687b2c9fc89da221d553de11d92367fc5f Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Mon, 18 May 2015 16:10:59 +0200 Subject: [PATCH 08/10] New custom FindOpenCL.cmake --- cmake/Modules/FindOpenCL.cmake | 159 ++++++++++++++------------------- 1 file changed, 69 insertions(+), 90 deletions(-) diff --git a/cmake/Modules/FindOpenCL.cmake b/cmake/Modules/FindOpenCL.cmake index 9c0e7f2..3ca8fa8 100644 --- a/cmake/Modules/FindOpenCL.cmake +++ 
b/cmake/Modules/FindOpenCL.cmake @@ -1,107 +1,86 @@ -# ######################################################################## -# Copyright 2013 Advanced Micro Devices, Inc. +# ================================================================================================== +# This file is part of the CLTune project, which loosely follows the Google C++ styleguide and uses +# a tab-size of two spaces and a max-width of 100 characters per line. # +# Author: cedric.nugteren@surfsara.nl (Cedric Nugteren) +# +# Defines the following variables: +# OPENCL_FOUND Boolean holding whether or not the OpenCL library was found +# OPENCL_INCLUDE_DIRS The OpenCL include directory +# OPENCL_LIBRARIES The OpenCL library +# +# In case OpenCL is not installed in the default directory, set the OPENCL_ROOT variable to point to +# the root of OpenCL, such that 'OpenCL/cl.h' or 'CL/cl.h' can be found in $OPENCL_ROOT/include. +# This can either be done using an environmental variable (e.g. export OPENCL_ROOT=/path/to/opencl) +# or using a CMake variable (e.g. cmake -DOPENCL_ROOT=/path/to/opencl ..). +# +# -------------------------------------------------------------------------------------------------- +# +# Copyright 2014 SURFsara +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# +# +# http://www.apache.org/licenses/LICENSE-2.0 +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# ######################################################################## - - -# Locate an OpenCL implementation. 
-# Currently supports AMD APP SDK (http://developer.amd.com/sdks/AMDAPPSDK/Pages/default.aspx/) -# -# Defines the following variables: -# -# OPENCL_FOUND - Found the OPENCL framework -# OPENCL_INCLUDE_DIRS - Include directories -# -# Also defines the library variables below as normal -# variables. These contain debug/optimized keywords when -# a debugging library is found. -# -# OPENCL_LIBRARIES - libopencl # -# Accepts the following variables as input: -# -# OPENCL_ROOT - (as a CMake or environment variable) -# The root directory of the OpenCL implementation found -# -# FIND_LIBRARY_USE_LIB64_PATHS - Global property that controls whether findOpenCL should search for -# 64bit or 32bit libs -#----------------------- -# Example Usage: -# -# find_package(OPENCL REQUIRED) -# include_directories(${OPENCL_INCLUDE_DIRS}) -# -# add_executable(foo foo.cc) -# target_link_libraries(foo ${OPENCL_LIBRARIES}) -# -#----------------------- +# ================================================================================================== -find_path(OPENCL_INCLUDE_DIRS - NAMES OpenCL/cl.h CL/cl.h - HINTS - ${OPENCL_ROOT} - $ENV{OPENCL_ROOT} - $ENV{AMDAPPSDKROOT} - $ENV{CUDA_PATH} - DOC "OpenCL header file path" - PATH_SUFFIXES include - PATHS - /usr/include - /usr/local/include - /usr/local/cuda/include - /opt/cuda/include +# Sets the possible install locations +set(OPENCL_HINTS + ${OPENCL_ROOT} + $ENV{OPENCL_ROOT} + $ENV{AMDAPPSDKROOT} + $ENV{CUDA_PATH} + $ENV{INTELOCLSDKROOT} + $ENV{NVSDKCOMPUTE_ROOT} + $ENV{ATISTREAMSDKROOT} +) +set(OPENCL_PATHS + /usr/local/cuda + /opt/cuda + /usr + /usr/local ) -mark_as_advanced( OPENCL_INCLUDE_DIRS ) -# Search for 64bit libs if FIND_LIBRARY_USE_LIB64_PATHS is set to true in the global environment, 32bit libs else -get_property( LIB64 GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS ) +# Finds the include directories +find_path(OPENCL_INCLUDE_DIRS + NAMES OpenCL/cl.h CL/cl.h + HINTS ${OPENCL_HINTS} + PATH_SUFFIXES include OpenCL/common/inc inc 
include/x86_64 include/x64 + PATHS ${OPENCL_PATHS} + DOC "OpenCL include header OpenCL/cl.h or CL/cl.h" +) +mark_as_advanced(OPENCL_INCLUDE_DIRS) -if( LIB64 ) - find_library( OPENCL_LIBRARIES - NAMES OpenCL - HINTS - ${OPENCL_ROOT} - $ENV{OPENCL_ROOT} - $ENV{AMDAPPSDKROOT} - $ENV{CUDA_PATH} - DOC "OpenCL dynamic library path" - PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64 - PATHS - /usr/lib - /usr/local/cuda/lib - /opt/cuda/lib - ) -else( ) - find_library( OPENCL_LIBRARIES - NAMES OpenCL - HINTS - ${OPENCL_ROOT}/lib - $ENV{AMDAPPSDKROOT}/lib - $ENV{CUDA_PATH}/lib - DOC "OpenCL dynamic library path" - PATH_SUFFIXES x86 Win32 - PATHS - /usr/lib - /usr/local/cuda/lib - /opt/cuda/lib - ) -endif( ) -mark_as_advanced( OPENCL_LIBRARIES ) +# Finds the library +find_library(OPENCL_LIBRARIES + NAMES OpenCL + HINTS ${OPENCL_HINTS} + PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 OpenCL/common/lib/x64 + PATHS ${OPENCL_PATHS} + DOC "OpenCL library" +) +mark_as_advanced(OPENCL_LIBRARIES) -include( FindPackageHandleStandardArgs ) -FIND_PACKAGE_HANDLE_STANDARD_ARGS( OPENCL DEFAULT_MSG OPENCL_LIBRARIES OPENCL_INCLUDE_DIRS ) +# ================================================================================================== -if( NOT OPENCL_FOUND ) - message( STATUS "FindOpenCL looked for libraries named: OpenCL" ) +# Notification messages +if(NOT OPENCL_INCLUDE_DIRS) + message(STATUS "Could NOT find 'OpenCL/cl.h' or 'CL/cl.h', install OpenCL or set OPENCL_ROOT") endif() +if(NOT OPENCL_LIBRARIES) + message(STATUS "Could NOT find OpenCL library, install it or set OPENCL_ROOT") +endif() + +# Determines whether or not OpenCL was found +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(OpenCL DEFAULT_MSG OPENCL_INCLUDE_DIRS OPENCL_LIBRARIES) + +# ================================================================================================== From c7a2078a74b1952350f4b3a8f3d6bcfbe94e3776 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Mon, 18 
May 2015 16:11:18 +0200 Subject: [PATCH 09/10] Updated the README w.r.t. the latest changes --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 7eaa022..a378634 100644 --- a/README.md +++ b/README.md @@ -47,9 +47,9 @@ Before we start using the tuner, we'll have to create one. The constructor takes cltune::Tuner my_tuner(0, 1); // Tuner on device 1 of OpenCL platform 0 -Now that we have a tuner, we can add a tuning kernel. This is done by providing the path to an OpenCL kernel (first argument), the name of the kernel (second argument), a list of global thread dimensions (third argument), and a list of local thread or workgroup dimensions (fourth argument). Here is an example: +Now that we have a tuner, we can add a tuning kernel. This is done by providing a list of paths to OpenCL kernel files (first argument), the name of the kernel (second argument), a list of global thread dimensions (third argument), and a list of local thread or workgroup dimensions (fourth argument). Here is an example: - int id = my_tuner.AddKernel("path/to/kernel.opencl", "my_kernel", {1024,512}, {16,8}); + size_t id = my_tuner.AddKernel({"path/to/kernel.opencl"}, "my_kernel", {1024,512}, {16,8}); Notice that the AddKernel function returns an integer: it is the ID of the added kernel. We'll need this ID when we want to add tuning parameters to this kernel. Let's say that our kernel has two pre-processor parameters named `PARAM_1` and `PARAM_2`: @@ -58,7 +58,7 @@ Notice that the AddKernel function returns an integer: it is the ID of the added Now that we've added a kernel and its parameters, we can add another one if we wish. When we're done, there are a couple of things left to be done. Let's start with adding an reference kernel. This reference kernel can provide the tuner with the ground-truth and is optional - only when it is provided will the tuner perform verification checks to ensure correctness. 
- my_tuner.SetReference("path/to/reference.opencl", "my_reference", {8192}, {128}); + my_tuner.SetReference({"path/to/reference.opencl"}, "my_reference", {8192}, {128}); The tuner also needs to know which arguments the kernels take. Scalar arguments can be provided as-is and are passed-by-value, whereas arrays have to be provided as C++ `std::vector`s. That's right, we won't have to create OpenCL buffers, CLTune will handle that for us! Here is an example: From 2416b3b2d6d9af5b1870b8afc3f52a6547dc5995 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Mon, 18 May 2015 16:12:48 +0200 Subject: [PATCH 10/10] Updated to version 1.5.1 --- CHANGELOG | 8 ++++++++ CMakeLists.txt | 10 +++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index c74b8d6..0612960 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,12 @@ +Version 1.5.1 +- Improved the GEMM example to support the Intel MIC (Xeon Phi) accelerators +- Updated compiler check and compiler flags +- Adds support for multiple OpenCL kernel files at once (e.g. 
when wanting to include a header file) +- Adds support for the std::complex data-types +- Fixed some compilation warnings regarding size_t conversions +- Updated the FindOpenCL.cmake file + Version 1.5.0 - OpenCL local work size and memory size constraints are now automatically handled - Greatly improved the new 2D convolution example: diff --git a/CMakeLists.txt b/CMakeLists.txt index c90f3be..2ade1e2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,7 +27,7 @@ cmake_minimum_required(VERSION 2.8) project("cltune" CXX) set(cltune_VERSION_MAJOR 1) set(cltune_VERSION_MINOR 5) -set(cltune_VERSION_PATCH 0) +set(cltune_VERSION_PATCH 1) # Options option(SAMPLES "Enable compilation of sample programs" ON) @@ -48,10 +48,14 @@ if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9) message(FATAL_ERROR "GCC version must be at least 4.9") endif() -elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") # Note: what about AppleClang? +elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.3) message(FATAL_ERROR "Clang version must be at least 3.3") endif() +elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang") + if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0) + message(FATAL_ERROR "Clang version must be at least 5.0") + endif() elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel") if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 14.0) message(FATAL_ERROR "ICC version must be at least 14.0") @@ -66,7 +70,7 @@ endif() set(FLAGS "-O3 -std=c++11") if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") set(FLAGS "${FLAGS} -Wall -Wno-comment") -elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") # Note: what about AppleClang? +elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") #set(FLAGS "${FLAGS} -Weverything -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded") endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}")