diff --git a/.appveyor.yml b/.appveyor.yml index 508489e..8f96e0c 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -1,8 +1,8 @@ environment: global: - CLTUNE_ROOT: "%APPVEYOR_BUILD_FOLDER%\\bin\\cltune" + CLTUNE_ROOT: "C:\\cltune\\build" OPENCL_REGISTRY: "https://www.khronos.org/registry/cl" - OPENCL_ROOT: "%APPVEYOR_BUILD_FOLDER%\\bin\\opencl" + OPENCL_ROOT: "C:\\dependencies\\opencl" platform: - x64 @@ -21,13 +21,21 @@ install: - ps: pushd $env:OPENCL_ROOT - ps: $opencl_registry = $env:OPENCL_REGISTRY # This downloads the source to the Khronos ICD library - - git clone --depth 1 https://github.com/KhronosGroup/OpenCL-ICD-Loader.git + - git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader.git + - ps: pushd OpenCL-ICD-Loader + - git checkout cb4acb9 # older version (pre 2.2 support) + - ps: popd - ps: mv ./OpenCL-ICD-Loader/* . # This downloads all the opencl header files # The cmake build files expect a directory called inc - ps: mkdir inc/CL - git clone --depth 1 https://github.com/KhronosGroup/OpenCL-Headers.git inc/CL - ps: wget $opencl_registry/api/2.1/cl.hpp -OutFile inc/CL/cl.hpp + # Switch to OpenCL 2.1 headers + - ps: pushd inc/CL + - git fetch origin opencl21:opencl21 + - git checkout opencl21 + - ps: popd # - ps: dir; if( $lastexitcode -eq 0 ){ dir include/CL } else { Write-Output boom } # Create the static import lib in a directory called lib, so findopencl() will find it - ps: mkdir lib @@ -35,11 +43,6 @@ install: - cmake -G "NMake Makefiles" .. 
- nmake - ps: popd - # Switch to OpenCL 1.2 headers - - ps: pushd inc/CL - - git fetch origin opencl12:opencl12 - - git checkout opencl12 - - ps: popd # Rename the inc directory to include, so FindOpencl() will find it - ps: ren inc include - ps: popd diff --git a/CHANGELOG b/CHANGELOG index 19c7709..fef5f23 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,10 @@ +Version 2.7.0 +- CLTune now automatically ensures global size is a multiple of the local workgroup size +- Added GetBestResult() to the tuner's API to retrieve the best parameters programmatically +- Changed std::initializer_list in the AddParameters API to std::vector +- Fixed a bug in the simulated annealing search method + Version 2.6.0 - Changed timing measurements to now also include the (varying) kernel launch overhead - It is now possible to set OpenCL compiler options through the env variable CLTUNE_BUILD_OPTIONS diff --git a/CMakeLists.txt b/CMakeLists.txt index a3c3882..00a7377 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,7 +32,7 @@ set(CMAKE_USER_MAKE_RULES_OVERRIDE_CXX ${CMAKE_CURRENT_SOURCE_DIR}/cmake/cxx_fla # CMake project details project("cltune" CXX) set(cltune_VERSION_MAJOR 2) -set(cltune_VERSION_MINOR 6) +set(cltune_VERSION_MINOR 7) set(cltune_VERSION_PATCH 0) # Options and their default values diff --git a/doc/api.md b/doc/api.md index cffbdea..9f5c223 100644 --- a/doc/api.md +++ b/doc/api.md @@ -23,7 +23,7 @@ Adds a new kernel to the list of tuning-kernels and returns a unique ID (to be u * `size_t AddKernelFromString(const std::string &source, const std::string &kernel_name, const IntRange &global, const IntRange &local)`: As above, but now the kernel is loaded from a string instead of from a file. -* `void AddParameter(const size_t id, const std::string &parameter_name, const std::initializer_list<size_t> &values)`: +* `void AddParameter(const size_t id, const std::string &parameter_name, const std::vector<size_t> &values)`: Adds a new tuning parameter for the kernel with the given `id`. 
The parameter has as a name `parameter_name`, and a list of tuneable integer values. * `void MulGlobalSize(const size_t id, const StringRange range)`: @@ -89,6 +89,9 @@ Call this method *after* calling the `Tune()` method. Trains a machine learning Output ------------- +* `std::unordered_map<std::string, size_t> GetBestResult()`: +Retrieves the parameters of the best tuning result and returns them to the caller as a map of strings (parameter names) to integers (parameter values). + * `void OutputSearchLog(const std::string &filename)`: Outputs the search process to the file `filename`. diff --git a/include/cltune.h b/include/cltune.h index 8af3dff..f79834b 100644 --- a/include/cltune.h +++ b/include/cltune.h @@ -34,6 +34,7 @@ #include <memory> // std::unique_ptr #include <functional> // std::function #include <utility> // std::pair +#include <unordered_map> // std::unordered_map // Exports library functions under Windows when building a DLL. See also: // https://msdn.microsoft.com/en-us/library/a90k134d.aspx @@ -93,7 +94,7 @@ class Tuner { // Adds a new tuning parameter for a kernel with a specific ID. The parameter has a name, the // number of values, and a list of values. void PUBLIC_API AddParameter(const size_t id, const std::string &parameter_name, - const std::initializer_list<size_t> &values); + const std::vector<size_t> &values); // As above, but now adds a single valued parameter to the reference void PUBLIC_API AddParameterReference(const std::string &parameter_name, const size_t value); @@ -144,6 +145,9 @@ class Tuner { void PUBLIC_API ModelPrediction(const Model model_type, const float validation_fraction, const size_t test_top_x_configurations); + // Retrieves the parameters of the best tuning result + std::unordered_map<std::string, size_t> GetBestResult() const; + // Prints the results of the tuning either to screen (stdout) or to a specific output-file. // Returns the execution time in miliseconds. 
double PUBLIC_API PrintToScreen() const; diff --git a/include/internal/tuner_impl.h b/include/internal/tuner_impl.h index 8b703a7..fea52e2 100644 --- a/include/internal/tuner_impl.h +++ b/include/internal/tuner_impl.h @@ -136,6 +136,9 @@ class TunerImpl { // Prints results of a particular kernel run void PrintResult(FILE* fp, const TunerResult &result, const std::string &message) const; + // Retrieves the best tuning result + TunerResult GetBestResult() const; + // Loads a file from disk into a string std::string LoadFile(const std::string &filename); @@ -146,6 +149,14 @@ class TunerImpl { // argument. Supports all enumerations of MemType. template MemType GetType(); + // Rounding functions performing ceiling and division operations + size_t CeilDiv(const size_t x, const size_t y) { + return 1 + ((x - 1) / y); + } + size_t Ceil(const size_t x, const size_t y) { + return CeilDiv(x,y)*y; + } + // Accessors to device data-types const Device device() const { return device_; } const Context context() const { return context_; } diff --git a/src/cltune.cc b/src/cltune.cc index 72e3fb7..15b1d25 100644 --- a/src/cltune.cc +++ b/src/cltune.cc @@ -95,7 +95,7 @@ void Tuner::SetReferenceFromString(const std::string &source, const std::string // Adds parameters for a kernel to tune. Also checks whether this parameter already exists. 
void Tuner::AddParameter(const size_t id, const std::string &parameter_name, - const std::initializer_list<size_t> &values) { + const std::vector<size_t> &values) { if (id >= pimpl->kernels_.size()) { throw std::runtime_error("Invalid kernel ID"); } if (pimpl->kernels_[id].ParameterExists(parameter_name)) { throw std::runtime_error("Parameter already exists"); @@ -283,19 +283,27 @@ void Tuner::ModelPrediction(const Model model_type, const float validation_fract // ================================================================================================= + +// Retrieves the parameters of the best tuning result +std::unordered_map<std::string, size_t> Tuner::GetBestResult() const { + const auto best_result = pimpl->GetBestResult(); + const auto best_configuration = best_result.configuration; + + // Converts the std::vector into an unordered map of strings and integers + auto parameters = std::unordered_map<std::string, size_t>{}; + for (const auto &parameter_setting : best_configuration) { + parameters[parameter_setting.name] = parameter_setting.value; + } + return parameters; +} + // Iterates over all tuning results and prints each parameter configuration and the corresponding // timing-results. Printing is to stdout. 
double Tuner::PrintToScreen() const { // Finds the best result - auto best_result = pimpl->tuning_results_[0]; - auto best_time = std::numeric_limits::max(); - for (auto &tuning_result: pimpl->tuning_results_) { - if (tuning_result.status && best_time >= tuning_result.time) { - best_result = tuning_result; - best_time = tuning_result.time; - } - } + const auto best_result = pimpl->GetBestResult(); + const auto best_time = best_result.time; // Aborts if there was no best time found if (best_time == std::numeric_limits::max()) { @@ -321,14 +329,8 @@ double Tuner::PrintToScreen() const { void Tuner::PrintFormatted() const { // Finds the best result - auto best_result = pimpl->tuning_results_[0]; - auto best_time = std::numeric_limits::max(); - for (auto &tuning_result: pimpl->tuning_results_) { - if (tuning_result.status && best_time >= tuning_result.time) { - best_result = tuning_result; - best_time = tuning_result.time; - } - } + const auto best_result = pimpl->GetBestResult(); + const auto best_time = best_result.time; // Prints the best result in C++ database format auto count = size_t{0}; diff --git a/src/searchers/annealing.cc b/src/searchers/annealing.cc index cdbcad6..a5792ae 100644 --- a/src/searchers/annealing.cc +++ b/src/searchers/annealing.cc @@ -75,7 +75,11 @@ KernelInfo::Configuration Annealing::GetConfiguration() { void Annealing::CalculateNextIndex() { // Computes the new temperature - auto progress = num_visited_states_ / static_cast(NumConfigurations()); + const auto num_configurations = static_cast(NumConfigurations()); + if (num_configurations == 0.0) { + throw std::runtime_error("Running annealing with 0 configurations, aborting"); + } + auto progress = num_visited_states_ / num_configurations; auto temperature = max_temperature_ * (1.0 - progress); // Determines whether to continue with the neighbour or with the current ID @@ -89,7 +93,8 @@ void Annealing::CalculateNextIndex() { // Computes the new neighbour state auto neighbours = 
GetNeighboursOf(current_state_); - neighbour_state_ = neighbours[static_cast(int_distribution_(generator_))%neighbours.size()]; + const auto random_integer = static_cast(std::abs(int_distribution_(generator_))); + neighbour_state_ = neighbours[random_integer % neighbours.size()]; // Checks whether this neighbour was already visited. If so, calculate a new neighbour instead. // This continues up to a maximum number, because all neighbours might already be visited. In @@ -109,7 +114,7 @@ void Annealing::CalculateNextIndex() { // The number of configurations is equal to all possible configurations size_t Annealing::NumConfigurations() { - return std::max(size_t{1}, static_cast(configurations_.size()*fraction_)); + return std::max(size_t{1}, static_cast(static_cast(configurations_.size())*fraction_)); } // ================================================================================================= @@ -142,11 +147,14 @@ std::vector Annealing::GetNeighboursOf(const size_t reference_id) const } // Consider this configuration a neighbour if there is at most a certain amount of differences - if (differences == kMaxDifferences) { + if (differences <= kMaxDifferences) { neighbours.push_back(other_id); } ++other_id; } + if (neighbours.size() == 0) { + throw std::runtime_error("Running annealing and found no neighbours, aborting"); + } return neighbours; } diff --git a/src/tuner_impl.cc b/src/tuner_impl.cc index 37df3f7..5b8b0c3 100644 --- a/src/tuner_impl.cc +++ b/src/tuner_impl.cc @@ -326,6 +326,11 @@ TunerImpl::TunerResult TunerImpl::RunKernel(const std::string &source, const Ker auto global = kernel.global(); auto local = kernel.local(); + // Makes sure that the global size is a multiple of the local + for (auto i=size_t{0}; i::max(); + for (auto &tuning_result: tuning_results_) { + if (tuning_result.status && best_time >= tuning_result.time) { + best_result = tuning_result; + best_time = tuning_result.time; + } + } + return best_result; +} + +// 
================================================================================================= + // Loads a file into a stringstream and returns the result as a string std::string TunerImpl::LoadFile(const std::string &filename) { std::ifstream file(filename);