diff --git a/CHANGELOG b/CHANGELOG
index aaca86b..9445bcb 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,4 +1,8 @@
+Version 2.1.0
+- Added exports to be able to create a DLL on Windows (thanks to Marco Hutter)
+- Added command-line OpenCL platform selection in the examples (thanks to William J Shipman)
+
 Version 2.0.0
 - Added support for machine learning models. These models can be trained on a small fraction of the tuning configurations and can be used to predict the remainder. Two models are supported:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 67069a2..03fb90e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,7 +27,7 @@ cmake_minimum_required(VERSION 2.8.10)
 project("cltune" CXX)
 set(cltune_VERSION_MAJOR 2)
-set(cltune_VERSION_MINOR 0)
+set(cltune_VERSION_MINOR 1)
 set(cltune_VERSION_PATCH 0)
 
 # Options
@@ -78,7 +78,7 @@ endif()
 
 # C++ compiler settings
 if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
-  set(FLAGS "/Ox")
+  set(FLAGS "/Ox /wd4715 /wd4996")
 else ()
   set(FLAGS "-O3 -std=c++11")
   if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
diff --git a/README.md b/README.md
index 01244d8..a446c67 100644
--- a/README.md
+++ b/README.md
@@ -89,7 +89,9 @@ Several examples are included as part of the CLTune distribution. They illustrat
 * `gemm.cc` providing an advanced and heavily tunable implementation of matrix-matrix multiplication (GEMM)
 * `conv.cc` providing an advanced and heavily tunable implementation of 2D convolution
 
-The latter two optionally take command-line arguments. The first argument is an integer for the device to run on, the second argument is an integer to select a search strategy (0=random, 1=annealing, 2=PSO, 3=fullsearch), and the third an optional search-strategy parameter.
+The latter two optionally take command-line arguments. The first argument is an integer to select the platform (NVIDIA, AMD, etc.), the second argument is an integer for the device to run on, the third argument is an integer to select a search strategy (0=random, 1=annealing, 2=PSO, 3=fullsearch), and the fourth an optional search-strategy parameter.
+
+Other examples are found in the [CLTuneDemos repository](https://github.com/williamjshipman/CLTuneDemos). CLTune is also used in the [CLBlast library](https://github.com/CNugteren/CLBlast).
 
 Search strategies and machine-learning
@@ -123,9 +125,7 @@ The samples ship with a basic header to convert the included OpenCL samples to C
 Development and tests
 -------------
-The CLTune project follows the Google C++ styleguide (with some exceptions) and uses a tab-size of two spaces and a max-width of 100 characters per line. It is furthermore based on practises from the third edition of Effective C++ and the first edition of Effective Modern C++. The project is licensed under the APACHE 2.0 license by SURFsara, (c) 2014. The contributing authors so far are:
-
-* Cedric Nugteren
+The CLTune project follows the Google C++ styleguide (with some exceptions) and uses a tab-size of two spaces and a max-width of 100 characters per line. It is furthermore based on practises from the third edition of Effective C++ and the first edition of Effective Modern C++. The project is licensed under the APACHE 2.0 license by SURFsara, (c) 2014.
 
 CLTune is packaged with Catch 1.2.1 and a custom test suite. No external dependencies are needed. The tests will be compiled when providing the `TESTS=ON` option to CMake.
 Running the tests goes as follows:
@@ -137,9 +137,11 @@ However, the more useful tests are the provided examples, since they include a v
     ./sample_gemm X Y
 
-Citation
+More information
 -------------
-If you refer to this work in a scientific publication, please cite the corresponding CLTune paper published in MCSoC '15:
+A how-to-use CLTune tutorial written by William J Shipman is available on [his blog](https://williamjshipman.wordpress.com/2016/01/31/autotuning-opencl-kernels-cltune-on-windows-7/).
+
+More in-depth information and experimental results are also available in a scientific paper. If you refer to this work in a scientific publication, please cite the corresponding CLTune paper published in MCSoC '15:
 
 > Cedric Nugteren and Valeriu Codreanu. CLTune: A Generic Auto-Tuner for OpenCL Kernels. In: MCSoC: 9th International Symposium on Embedded Multicore/Many-core Systems-on-Chip. IEEE, 2015.
diff --git a/cmake/Modules/FindOpenCL.cmake b/cmake/Modules/FindOpenCL.cmake
index 3ca8fa8..b8d47bb 100644
--- a/cmake/Modules/FindOpenCL.cmake
+++ b/cmake/Modules/FindOpenCL.cmake
@@ -45,6 +45,7 @@ set(OPENCL_HINTS
 set(OPENCL_PATHS
   /usr/local/cuda
   /opt/cuda
+  /opt/intel/opencl
   /usr
   /usr/local
 )
@@ -63,7 +64,7 @@ mark_as_advanced(OPENCL_INCLUDE_DIRS)
 find_library(OPENCL_LIBRARIES
   NAMES OpenCL
   HINTS ${OPENCL_HINTS}
-  PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 OpenCL/common/lib/x64
+  PATH_SUFFIXES lib lib64 lib/x86_64 lib/x86_64/sdk lib/x64 lib/x86 lib/Win32 OpenCL/common/lib/x64
   PATHS ${OPENCL_PATHS}
   DOC "OpenCL library"
 )
diff --git a/include/cltune.h b/include/cltune.h
index af8dfdc..eaf8a72 100644
--- a/include/cltune.h
+++ b/include/cltune.h
@@ -35,6 +35,14 @@
 #include <functional> // std::function
 #include <utility> // std::pair
 
+// Exports library functions under Windows when building a DLL. See also:
+// https://msdn.microsoft.com/en-us/library/a90k134d.aspx
+#ifdef _WIN32
+  #define PUBLIC_API __declspec(dllexport)
+#else
+  #define PUBLIC_API
+#endif
+
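A note on the PUBLIC_API macro introduced above: as patched, it always expands to `__declspec(dllexport)` under _WIN32, which is what is needed while compiling the DLL itself. Code that merely links against the resulting DLL conventionally sees `__declspec(dllimport)` instead. A minimal sketch of that conventional split, using a hypothetical CLTUNE_DLL_BUILD define that is not part of this patch:

    // Hypothetical refinement, not part of this patch: the usual export/import
    // split for a Windows DLL, driven by a build-system define.
    #ifdef _WIN32
      #ifdef CLTUNE_DLL_BUILD                     // defined only while compiling the DLL itself
        #define PUBLIC_API __declspec(dllexport)
      #else                                       // clients linking against the DLL
        #define PUBLIC_API __declspec(dllimport)
      #endif
    #else
      #define PUBLIC_API                          // no-op on non-Windows platforms
    #endif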
 namespace cltune {
 
 // =================================================================================================
@@ -58,52 +66,52 @@ class Tuner {
  public:
 
   // Initializes the tuner either with platform 0 and device 0 or with a custom platform/device
-  explicit Tuner();
-  explicit Tuner(size_t platform_id, size_t device_id);
-  ~Tuner();
+  explicit PUBLIC_API Tuner();
+  explicit PUBLIC_API Tuner(size_t platform_id, size_t device_id);
+  PUBLIC_API ~Tuner();
 
   // Adds a new kernel to the list of tuning-kernels and returns a unique ID (to be used when
   // adding tuning parameters). Either loads the source from filenames or from string.
-  size_t AddKernel(const std::vector<std::string> &filenames, const std::string &kernel_name,
-                   const IntRange &global, const IntRange &local);
-  size_t AddKernelFromString(const std::string &source, const std::string &kernel_name,
-                             const IntRange &global, const IntRange &local);
+  size_t PUBLIC_API AddKernel(const std::vector<std::string> &filenames, const std::string &kernel_name,
+                              const IntRange &global, const IntRange &local);
+  size_t PUBLIC_API AddKernelFromString(const std::string &source, const std::string &kernel_name,
+                                        const IntRange &global, const IntRange &local);
 
   // Sets the reference kernel. Same as the AddKernel function, but in this case there is only one
   // reference kernel. Calling this function again will overwrite the previous reference kernel.
-  void SetReference(const std::vector<std::string> &filenames,
-                    const std::string &kernel_name,
-                    const IntRange &global, const IntRange &local);
-  void SetReferenceFromString(const std::string &source,
-                              const std::string &kernel_name,
-                              const IntRange &global, const IntRange &local);
+  void PUBLIC_API SetReference(const std::vector<std::string> &filenames,
+                               const std::string &kernel_name,
+                               const IntRange &global, const IntRange &local);
+  void PUBLIC_API SetReferenceFromString(const std::string &source,
+                                         const std::string &kernel_name,
+                                         const IntRange &global, const IntRange &local);
 
   // Adds a new tuning parameter for a kernel with a specific ID. The parameter has a name, the
   // number of values, and a list of values.
-  void AddParameter(const size_t id, const std::string &parameter_name,
-                    const std::initializer_list<size_t> &values);
+  void PUBLIC_API AddParameter(const size_t id, const std::string &parameter_name,
+                               const std::initializer_list<size_t> &values);
 
   // As above, but now adds a single valued parameter to the reference
-  void AddParameterReference(const std::string &parameter_name, const size_t value);
+  void PUBLIC_API AddParameterReference(const std::string &parameter_name, const size_t value);
 
   // Modifies the global or local thread-size (integers) by one of the parameters (strings). The
   // modifier can be multiplication or division.
-  void MulGlobalSize(const size_t id, const StringRange range);
-  void DivGlobalSize(const size_t id, const StringRange range);
-  void MulLocalSize(const size_t id, const StringRange range);
-  void DivLocalSize(const size_t id, const StringRange range);
+  void PUBLIC_API MulGlobalSize(const size_t id, const StringRange range);
+  void PUBLIC_API DivGlobalSize(const size_t id, const StringRange range);
+  void PUBLIC_API MulLocalSize(const size_t id, const StringRange range);
+  void PUBLIC_API DivLocalSize(const size_t id, const StringRange range);
 
   // Adds a new constraint to the set of parameters (e.g. must be equal or larger than). The
   // constraints come in the form of a function object which takes a number of tuning parameters,
   // given as a vector of strings (parameter names). Their names are later substituted by actual
   // values.
-  void AddConstraint(const size_t id, ConstraintFunction valid_if,
-                     const std::vector<std::string> &parameters);
+  void PUBLIC_API AddConstraint(const size_t id, ConstraintFunction valid_if,
+                                const std::vector<std::string> &parameters);
 
   // As above, but for local memory usage. If this function is not called, it is assumed that the
   // local memory usage is 0: no configurations will be excluded because of too much local memory.
-  void SetLocalMemoryUsage(const size_t id, LocalMemoryFunction amount,
-                           const std::vector<std::string> &parameters);
+  void PUBLIC_API SetLocalMemoryUsage(const size_t id, LocalMemoryFunction amount,
+                                      const std::vector<std::string> &parameters);
 
   // Functions to add kernel-arguments for input buffers, output buffers, and scalars. Make sure to
   // call these in the order in which the arguments appear in the kernel.
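To see how the annotated kernel- and parameter-related calls above chain together, here is a minimal sketch; the kernel file name, kernel name, thread sizes, parameter name and constraint are invented for illustration and are not taken from this patch:

    #include <vector>
    #include "cltune.h"

    int main() {
      cltune::Tuner tuner(0, 0);                                 // platform 0, device 0
      const auto id = tuner.AddKernel({"my_kernel.opencl"}, "my_kernel",
                                      {2048, 2048}, {8, 8});     // global and local thread sizes
      tuner.AddParameter(id, "TILE_SIZE", {8, 16, 32});          // one tunable parameter
      tuner.MulLocalSize(id, {"TILE_SIZE", "TILE_SIZE"});        // local size scales with the parameter
      tuner.AddConstraint(id, [](std::vector<std::size_t> v) { return v[0] <= 32; },
                          {"TILE_SIZE"});                        // constraint on the parameter values
      // Kernel arguments, a reference kernel and a search strategy would be set next,
      // followed by a call to Tune().
      return 0;
    }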
@@ -113,35 +121,35 @@ class Tuner {
 
   // Configures a specific search method. The default search method is "FullSearch". These are
   // implemented as separate functions since they each take a different number of arguments.
-  void UseFullSearch();
-  void UseRandomSearch(const double fraction);
-  void UseAnnealing(const double fraction, const double max_temperature);
-  void UsePSO(const double fraction, const size_t swarm_size, const double influence_global,
-              const double influence_local, const double influence_random);
+  void PUBLIC_API UseFullSearch();
+  void PUBLIC_API UseRandomSearch(const double fraction);
+  void PUBLIC_API UseAnnealing(const double fraction, const double max_temperature);
+  void PUBLIC_API UsePSO(const double fraction, const size_t swarm_size, const double influence_global,
+                         const double influence_local, const double influence_random);
 
   // Outputs the search process to a file
-  void OutputSearchLog(const std::string &filename);
+  void PUBLIC_API OutputSearchLog(const std::string &filename);
 
   // Starts the tuning process: compile all kernels and run them for each permutation of the tuning-
   // parameters. Note that this might take a while.
-  void Tune();
+  void PUBLIC_API Tune();
 
   // Trains a machine learning model based on the search space explored so far. Then, all the
   // missing data-points are estimated based on this model. This is only useful if a fraction of
   // the search space is explored, as is the case when doing random-search.
-  void ModelPrediction(const Model model_type, const float validation_fraction,
-                       const size_t test_top_x_configurations);
+  void PUBLIC_API ModelPrediction(const Model model_type, const float validation_fraction,
+                                  const size_t test_top_x_configurations);
 
   // Prints the results of the tuning either to screen (stdout) or to a specific output-file.
   // Returns the execution time in miliseconds.
-  double PrintToScreen() const;
-  void PrintFormatted() const;
-  void PrintJSON(const std::string &filename,
-                 const std::vector<std::pair<std::string,std::string>> &descriptions) const;
-  void PrintToFile(const std::string &filename) const;
+  double PUBLIC_API PrintToScreen() const;
+  void PUBLIC_API PrintFormatted() const;
+  void PUBLIC_API PrintJSON(const std::string &filename,
+                            const std::vector<std::pair<std::string,std::string>> &descriptions) const;
+  void PUBLIC_API PrintToFile(const std::string &filename) const;
 
   // Disables all further printing to stdout
-  void SuppressOutput();
+  void PUBLIC_API SuppressOutput();
 
  private:
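For the search-strategy entry points just annotated, the calls look roughly as follows; the fractions and counts below are arbitrary illustration values, not defaults of the library:

    #include "cltune.h"

    // Sketch only: 'tuner' is a cltune::Tuner that already has kernels, parameters
    // and arguments configured.
    void RunSearch(cltune::Tuner &tuner) {
      tuner.UseRandomSearch(1.0/16.0);               // explore 1/16th of all configurations
      // Alternatives declared above:
      //   tuner.UseAnnealing(1.0/16.0, 4.0);        // fraction plus a maximum annealing temperature
      //   tuner.UsePSO(1.0/16.0, 8, 0.4, 0.0, 0.4); // fraction, swarm size, PSO influences
      tuner.Tune();                                  // compiles and benchmarks the sampled configurations
      tuner.PrintToScreen();                         // prints results; returns the best time in milliseconds
      // ModelPrediction(...) could follow to estimate the configurations that were not measured.
    }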
diff --git a/samples/conv/conv.cc b/samples/conv/conv.cc
index 29a6bc7..b37fbc9 100644
--- a/samples/conv/conv.cc
+++ b/samples/conv/conv.cc
@@ -46,6 +46,7 @@ bool IsMultiple(size_t a, size_t b) {
 // Constants
 constexpr auto kDefaultDevice = size_t{0};
+constexpr auto kDefaultPlatform = size_t{0};
 constexpr auto kDefaultSearchMethod = size_t{1};
 constexpr auto kDefaultSearchParameter1 = size_t{4};
@@ -73,16 +74,20 @@ int main(int argc, char* argv[]) {
   // Selects the device, the search method and its first parameter. These parameters are all
   // optional and are thus also given default values.
   auto device_id = kDefaultDevice;
+  auto platform_id = kDefaultPlatform;
   auto method = kDefaultSearchMethod;
   auto search_param_1 = kDefaultSearchParameter1;
   if (argc >= 2) {
-    device_id = static_cast<size_t>(std::stoi(std::string{argv[1]}));
-    if (argc >= 3) {
-      method = static_cast<size_t>(std::stoi(std::string{argv[2]}));
+    platform_id = static_cast<size_t>(std::stoi(std::string{argv[1]}));
+    if (argc >= 3) {
+      device_id = static_cast<size_t>(std::stoi(std::string{argv[2]}));
       if (argc >= 4) {
-        search_param_1 = static_cast<size_t>(std::stoi(std::string{argv[3]}));
+        method = static_cast<size_t>(std::stoi(std::string{argv[3]}));
+        if (argc >= 5) {
+          search_param_1 = static_cast<size_t>(std::stoi(std::string{argv[4]}));
+        }
       }
-    }
+    }
   }
 
   // Creates data structures
@@ -115,8 +120,8 @@ int main(int argc, char* argv[]) {
   // ===============================================================================================
 
-  // Initializes the tuner (platform 0, device 'device_id')
-  cltune::Tuner tuner(size_t{0}, static_cast<size_t>(device_id));
+  // Initializes the tuner (platform 'platform_id', device 'device_id')
+  cltune::Tuner tuner(static_cast<size_t>(platform_id), static_cast<size_t>(device_id));
 
   // Sets one of the following search methods:
   // 0) Random search
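To make the new argument order concrete: an invocation such as `./sample_conv 1 0 2 8` (all numbers purely illustrative) now means platform 1, device 0, search method 2 (PSO) and 8 as the optional search-strategy parameter. The platform, device and search-method selection then boils down to API calls like these; the fraction and PSO influence constants shown are the ones visible in the gemm sample, not necessarily those of conv.cc:

    #include "cltune.h"

    // Sketch only: what "./sample_conv 1 0 2 8" selects after this patch.
    int main() {
      cltune::Tuner tuner(1, 0);                 // argv[1] = platform, argv[2] = device
      const auto fraction = 1.0f/2048.0f;        // fraction of the search space to explore
      tuner.UsePSO(fraction, 8, 0.4, 0.0, 0.4);  // argv[3] = 2 selects PSO; argv[4] = 8 is the swarm size
      return 0;
    }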
diff --git a/samples/gemm/gemm.cc b/samples/gemm/gemm.cc
index 9bdf4e6..5a2f111 100644
--- a/samples/gemm/gemm.cc
+++ b/samples/gemm/gemm.cc
@@ -46,6 +46,7 @@ bool IsMultiple(size_t a, size_t b) {
 // Constants
 constexpr auto kDefaultDevice = size_t{0};
+constexpr auto kDefaultPlatform = size_t{0};
 constexpr auto kDefaultSearchMethod = size_t{1};
 constexpr auto kDefaultSearchParameter1 = size_t{4};
@@ -71,16 +72,20 @@ int main(int argc, char* argv[]) {
   // Selects the device, the search method and its first parameter. These parameters are all
   // optional and are thus also given default values.
   auto device_id = kDefaultDevice;
+  auto platform_id = kDefaultPlatform;
   auto method = kDefaultSearchMethod;
   auto search_param_1 = kDefaultSearchParameter1;
   if (argc >= 2) {
-    device_id = static_cast<size_t>(std::stoi(std::string{argv[1]}));
-    if (argc >= 3) {
-      method = static_cast<size_t>(std::stoi(std::string{argv[2]}));
+    platform_id = static_cast<size_t>(std::stoi(std::string{argv[1]}));
+    if (argc >= 3) {
+      device_id = static_cast<size_t>(std::stoi(std::string{argv[2]}));
       if (argc >= 4) {
-        search_param_1 = static_cast<size_t>(std::stoi(std::string{argv[3]}));
+        method = static_cast<size_t>(std::stoi(std::string{argv[3]}));
+        if (argc >= 5) {
+          search_param_1 = static_cast<size_t>(std::stoi(std::string{argv[4]}));
+        }
       }
-    }
+    }
   }
 
   // Creates input matrices
@@ -99,7 +104,7 @@ int main(int argc, char* argv[]) {
   for (auto &item: mat_c) { item = 0.0f; }
 
   // Initializes the tuner (platform 0, device 'device_id')
-  cltune::Tuner tuner(size_t{0}, static_cast<size_t>(device_id));
+  cltune::Tuner tuner(static_cast<size_t>(platform_id), static_cast<size_t>(device_id));
 
   // Sets one of the following search methods:
   // 0) Random search
@@ -108,7 +113,7 @@
   // 3) Full search
   auto fraction = 1.0f/2048.0f;
   if (method == 0) { tuner.UseRandomSearch(fraction); }
-  else if (method == 1) { tuner.UseAnnealing(fraction, static_cast(search_param_1)); }
+  else if (method == 1) { tuner.UseAnnealing(fraction, static_cast(search_param_1)); }
   else if (method == 2) { tuner.UsePSO(fraction, static_cast(search_param_1), 0.4, 0.0, 0.4); }
   else { tuner.UseFullSearch(); }
diff --git a/src/cltune.cc b/src/cltune.cc
index 8d3ce73..356eb21 100644
--- a/src/cltune.cc
+++ b/src/cltune.cc
@@ -152,7 +152,7 @@ void Tuner::SetLocalMemoryUsage(const size_t id, LocalMemoryFunction amount,
       throw std::runtime_error("Invalid parameter");
     }
   }
-  pimpl->kernels_[id].SetLocalMemoryUsage(amount, parameters);
+  pimpl->kernels_[id].SetLocalMemoryUsage(amount, parameters);
 }
@@ -170,12 +170,12 @@ void Tuner::AddArgumentInput(const std::vector &source) {
 }
 
 // Compiles the function for various data-types
-template void Tuner::AddArgumentInput<int>(const std::vector<int>&);
-template void Tuner::AddArgumentInput<size_t>(const std::vector<size_t>&);
-template void Tuner::AddArgumentInput<float>(const std::vector<float>&);
-template void Tuner::AddArgumentInput<double>(const std::vector<double>&);
-template void Tuner::AddArgumentInput<float2>(const std::vector<float2>&);
-template void Tuner::AddArgumentInput<double2>(const std::vector<double2>&);
+template void PUBLIC_API Tuner::AddArgumentInput<int>(const std::vector<int>&);
+template void PUBLIC_API Tuner::AddArgumentInput<size_t>(const std::vector<size_t>&);
+template void PUBLIC_API Tuner::AddArgumentInput<float>(const std::vector<float>&);
+template void PUBLIC_API Tuner::AddArgumentInput<double>(const std::vector<double>&);
+template void PUBLIC_API Tuner::AddArgumentInput<float2>(const std::vector<float2>&);
+template void PUBLIC_API Tuner::AddArgumentInput<double2>(const std::vector<double2>&);
 
 // Similar to the above function, but now marked as output buffer. Output buffers are special in the
 // sense that they will be checked in the verification process.
@@ -188,32 +188,32 @@ void Tuner::AddArgumentOutput(const std::vector &source) {
 }
 
 // Compiles the function for various data-types
-template void Tuner::AddArgumentOutput<int>(const std::vector<int>&);
-template void Tuner::AddArgumentOutput<size_t>(const std::vector<size_t>&);
-template void Tuner::AddArgumentOutput<float>(const std::vector<float>&);
-template void Tuner::AddArgumentOutput<double>(const std::vector<double>&);
-template void Tuner::AddArgumentOutput<float2>(const std::vector<float2>&);
-template void Tuner::AddArgumentOutput<double2>(const std::vector<double2>&);
+template void PUBLIC_API Tuner::AddArgumentOutput<int>(const std::vector<int>&);
+template void PUBLIC_API Tuner::AddArgumentOutput<size_t>(const std::vector<size_t>&);
+template void PUBLIC_API Tuner::AddArgumentOutput<float>(const std::vector<float>&);
+template void PUBLIC_API Tuner::AddArgumentOutput<double>(const std::vector<double>&);
+template void PUBLIC_API Tuner::AddArgumentOutput<float2>(const std::vector<float2>&);
+template void PUBLIC_API Tuner::AddArgumentOutput<double2>(const std::vector<double2>&);
 
 // Sets a scalar value as an argument to the kernel. Since a vector of scalars of any type doesn't
 // exist, there is no general implemenation. Instead, each data-type has its specialised version in
 // which it stores to a specific vector.
-template <> void Tuner::AddArgumentScalar(const int argument) {
+template <> void PUBLIC_API Tuner::AddArgumentScalar(const int argument) {
   pimpl->arguments_int_.push_back({pimpl->argument_counter_++, argument});
 }
-template <> void Tuner::AddArgumentScalar(const size_t argument) {
+template <> void PUBLIC_API Tuner::AddArgumentScalar(const size_t argument) {
   pimpl->arguments_size_t_.push_back({pimpl->argument_counter_++, argument});
 }
-template <> void Tuner::AddArgumentScalar(const float argument) {
+template <> void PUBLIC_API Tuner::AddArgumentScalar(const float argument) {
   pimpl->arguments_float_.push_back({pimpl->argument_counter_++, argument});
 }
-template <> void Tuner::AddArgumentScalar(const double argument) {
+template <> void PUBLIC_API Tuner::AddArgumentScalar(const double argument) {
   pimpl->arguments_double_.push_back({pimpl->argument_counter_++, argument});
 }
-template <> void Tuner::AddArgumentScalar(const float2 argument) {
+template <> void PUBLIC_API Tuner::AddArgumentScalar(const float2 argument) {
  pimpl->arguments_float2_.push_back({pimpl->argument_counter_++, argument});
 }
-template <> void Tuner::AddArgumentScalar(const double2 argument) {
+template <> void PUBLIC_API Tuner::AddArgumentScalar(const double2 argument) {
   pimpl->arguments_double2_.push_back({pimpl->argument_counter_++, argument});
 }
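Finally, the explicitly instantiated buffer types and scalar specialisations above determine what client code can pass to AddArgumentInput, AddArgumentOutput and AddArgumentScalar. A small sketch, with invented buffer sizes and values; remember from the header that arguments must be added in the order in which they appear in the kernel:

    #include <vector>
    #include "cltune.h"

    // Sketch only: argument types chosen from the instantiations/specialisations above.
    void SetArguments(cltune::Tuner &tuner) {
      std::vector<float> input(1024, 1.0f);                      // input buffer
      std::vector<float> output(1024, 0.0f);                     // output buffer, checked during verification
      tuner.AddArgumentScalar(static_cast<int>(input.size()));   // int specialisation
      tuner.AddArgumentInput(input);                             // float buffer instantiation
      tuner.AddArgumentOutput(output);
      tuner.AddArgumentScalar(3.14f);                            // float specialisation
    }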