diff --git a/CHANGELOG b/CHANGELOG index 9334ddc..cc1443b 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,9 @@ +Version 2.3.0 +- Added support for 'short' and 'cl_half' data-types as kernel buffer and scalar arguments +- Fixed a bug where failed results would still show up in the tuning results +- Made MSVC link the run-time libraries statically + Version 2.2.0 - Added two new simpler samples of using the tuner (vector-add and convolution) - Updated the general documentation diff --git a/CMakeLists.txt b/CMakeLists.txt index 24ad9e8..d43c869 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,11 +23,16 @@ # # ================================================================================================== -# CMake project cmake_minimum_required(VERSION 2.8.10) + +# Overrides for MSVC static runtime +set(CMAKE_USER_MAKE_RULES_OVERRIDE ${CMAKE_CURRENT_SOURCE_DIR}/cmake/c_flag_overrides.cmake) +set(CMAKE_USER_MAKE_RULES_OVERRIDE_CXX ${CMAKE_CURRENT_SOURCE_DIR}/cmake/cxx_flag_overrides.cmake) + +# CMake project details project("cltune" CXX) set(cltune_VERSION_MAJOR 2) -set(cltune_VERSION_MINOR 2) +set(cltune_VERSION_MINOR 3) set(cltune_VERSION_PATCH 0) # Options @@ -54,40 +59,41 @@ set(CMAKE_INSTALL_RPATH_USE_LINK_PATH false) # Don't add the automatically deter # ================================================================================================== # Compiler-version check (requires at least CMake 2.8.10) -if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.7) +if(CMAKE_CXX_COMPILER_ID STREQUAL GNU) + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.7) message(FATAL_ERROR "GCC version must be at least 4.7") endif() -elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.3) +elseif(CMAKE_CXX_COMPILER_ID STREQUAL Clang) + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.3) message(FATAL_ERROR "Clang version must be at least 3.3") endif() -elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang") - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0) - message(FATAL_ERROR "Clang version must be at least 5.0") +elseif(CMAKE_CXX_COMPILER_ID STREQUAL AppleClang) + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0) + message(FATAL_ERROR "AppleClang version must be at least 5.0") endif() -elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel") - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 14.0) +elseif(CMAKE_CXX_COMPILER_ID STREQUAL Intel) + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 14.0) message(FATAL_ERROR "ICC version must be at least 14.0") endif() -elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 18.0) +elseif(MSVC) + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 18.0) message(FATAL_ERROR "MS Visual Studio version must be at least 18.0") endif() endif() # C++ compiler settings -if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") - set(FLAGS "/Ox /wd4715 /wd4996") -else () +if(MSVC) + set(FLAGS "/Ox") + set(FLAGS "${FLAGS} /wd4715 /wd4996") +else() set(FLAGS "-O3 -std=c++11") - if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") + if(CMAKE_CXX_COMPILER_ID STREQUAL GNU) set(FLAGS "${FLAGS} -Wall -Wno-comment") - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.8.4) + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.8.4) set(FLAGS "${FLAGS} -Wno-attributes") endif() - elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") - set(FLAGS "${FLAGS} -Wextra") + elseif(CMAKE_CXX_COMPILER_ID MATCHES Clang) + set(FLAGS "${FLAGS} -Wextra -Wno-c++98-compat -Wno-c++98-compat-pedantic") endif() endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}") diff --git a/cmake/c_flag_overrides.cmake b/cmake/c_flag_overrides.cmake new file mode 100644 index 0000000..c2eb0d1 --- /dev/null +++ b/cmake/c_flag_overrides.cmake @@ -0,0 +1,8 @@ +# Overriding the CMake flags to use static runtime libraries +# See http://www.cmake.org/Wiki/CMake_FAQ#How_can_I_build_my_MSVC_application_with_a_static_runtime.3F +if(MSVC) + set(CMAKE_C_FLAGS_DEBUG_INIT "/D_DEBUG /MTd /Zi /Ob0 /Od /RTC1") + set(CMAKE_C_FLAGS_MINSIZEREL_INIT "/MT /O1 /Ob1 /D NDEBUG") + set(CMAKE_C_FLAGS_RELEASE_INIT "/MT /O2 /Ob2 /D NDEBUG") + set(CMAKE_C_FLAGS_RELWITHDEBINFO_INIT "/MT /Zi /O2 /Ob1 /D NDEBUG") +endif() diff --git a/cmake/cxx_flag_overrides.cmake b/cmake/cxx_flag_overrides.cmake new file mode 100644 index 0000000..1511f5f --- /dev/null +++ b/cmake/cxx_flag_overrides.cmake @@ -0,0 +1,8 @@ +# Overriding the CMake flags to use static runtime libraries +# See http://www.cmake.org/Wiki/CMake_FAQ#How_can_I_build_my_MSVC_application_with_a_static_runtime.3F +if(MSVC) + set(CMAKE_CXX_FLAGS_DEBUG_INIT "/D_DEBUG /MTd /Zi /Ob0 /Od /RTC1") + set(CMAKE_CXX_FLAGS_MINSIZEREL_INIT "/MT /O1 /Ob1 /D NDEBUG") + set(CMAKE_CXX_FLAGS_RELEASE_INIT "/MT /O2 /Ob2 /D NDEBUG") + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO_INIT "/MT /Zi /O2 /Ob1 /D NDEBUG") +endif() diff --git a/include/internal/tuner_impl.h b/include/internal/tuner_impl.h index 4de8004..1ec560b 100644 --- a/include/internal/tuner_impl.h +++ b/include/internal/tuner_impl.h @@ -50,6 +50,13 @@ namespace cltune { // ================================================================================================= +// Host data-type for half-precision floating-point (16-bit) +#if USE_OPENCL + using half = cl_half; +#else + using half = short unsigned int; +#endif + // Shorthands for complex data-types using float2 = std::complex; // cl_float2; using double2 = std::complex; // cl_double2; @@ -62,7 +69,7 @@ using double2 = std::complex; // cl_double2; #endif // Enumeration of currently supported data-types by this class -enum class MemType { kInt, kSizeT, kFloat, kDouble, kFloat2, kDouble2 }; +enum class MemType { kShort, kInt, kSizeT, kHalf, kFloat, kDouble, kFloat2, kDouble2 }; // See comment at top of file for a description of the class class TunerImpl { diff --git a/src/cltune.cc b/src/cltune.cc index 356eb21..0057d00 100644 --- a/src/cltune.cc +++ b/src/cltune.cc @@ -170,8 +170,10 @@ void Tuner::AddArgumentInput(const std::vector &source) { } // Compiles the function for various data-types +template void PUBLIC_API Tuner::AddArgumentInput(const std::vector&); template void PUBLIC_API Tuner::AddArgumentInput(const std::vector&); template void PUBLIC_API Tuner::AddArgumentInput(const std::vector&); +template void PUBLIC_API Tuner::AddArgumentInput(const std::vector&); template void PUBLIC_API Tuner::AddArgumentInput(const std::vector&); template void PUBLIC_API Tuner::AddArgumentInput(const std::vector&); template void PUBLIC_API Tuner::AddArgumentInput(const std::vector&); @@ -188,8 +190,10 @@ void Tuner::AddArgumentOutput(const std::vector &source) { } // Compiles the function for various data-types +template void PUBLIC_API Tuner::AddArgumentOutput(const std::vector&); template void PUBLIC_API Tuner::AddArgumentOutput(const std::vector&); template void PUBLIC_API Tuner::AddArgumentOutput(const std::vector&); +template void PUBLIC_API Tuner::AddArgumentOutput(const std::vector&); template void PUBLIC_API Tuner::AddArgumentOutput(const std::vector&); template void PUBLIC_API Tuner::AddArgumentOutput(const std::vector&); template void PUBLIC_API Tuner::AddArgumentOutput(const std::vector&); @@ -198,12 +202,18 @@ template void PUBLIC_API Tuner::AddArgumentOutput(const std::vector void PUBLIC_API Tuner::AddArgumentScalar(const short argument) { + pimpl->arguments_int_.push_back({pimpl->argument_counter_++, argument}); +} template <> void PUBLIC_API Tuner::AddArgumentScalar(const int argument) { pimpl->arguments_int_.push_back({pimpl->argument_counter_++, argument}); } template <> void PUBLIC_API Tuner::AddArgumentScalar(const size_t argument) { pimpl->arguments_size_t_.push_back({pimpl->argument_counter_++, argument}); } +template <> void PUBLIC_API Tuner::AddArgumentScalar(const half argument) { + pimpl->arguments_float_.push_back({pimpl->argument_counter_++, argument}); +} template <> void PUBLIC_API Tuner::AddArgumentScalar(const float argument) { pimpl->arguments_float_.push_back({pimpl->argument_counter_++, argument}); } @@ -352,10 +362,18 @@ void Tuner::PrintJSON(const std::string &filename, fprintf(file, " \"device_compute_units\": \"%zu\",\n", pimpl->device().ComputeUnits()); fprintf(file, " \"results\": [\n"); + // Filters failed configurations + auto results = std::vector(); + for (const auto &tuning_result: pimpl->tuning_results_) { + if (tuning_result.status && tuning_result.time != std::numeric_limits::max()) { + results.push_back(tuning_result); + } + } + // Loops over all the results - auto num_results = pimpl->tuning_results_.size(); + auto num_results = results.size(); for (auto r=size_t{0}; rtuning_results_[r]; + auto result = results[r]; fprintf(file, " {\n"); fprintf(file, " \"kernel\": \"%s\",\n", result.kernel_name.c_str()); fprintf(file, " \"time\": %.3lf,\n", result.time); diff --git a/src/tuner_impl.cc b/src/tuner_impl.cc index 51fa96e..8556eda 100644 --- a/src/tuner_impl.cc +++ b/src/tuner_impl.cc @@ -206,15 +206,16 @@ void TunerImpl::Tune() { // Stores the parameters and the timing-result tuning_result.configuration = permutation; - tuning_results_.push_back(tuning_result); if (tuning_result.time == std::numeric_limits::max()) { tuning_result.time = 0.0; PrintResult(stdout, tuning_result, kMessageFailure); tuning_result.time = std::numeric_limits::max(); + tuning_result.status = false; } else if (!tuning_result.status) { PrintResult(stdout, tuning_result, kMessageWarning); } + tuning_results_.push_back(tuning_result); } // Prints a log of the searching process. This is disabled per default, but can be enabled @@ -265,8 +266,10 @@ TunerImpl::TunerResult TunerImpl::RunKernel(const std::string &source, const Ker // Sets the output buffer(s) to zero for (auto &output: arguments_output_) { switch (output.type) { + case MemType::kShort: ResetMemArgument(output); break; case MemType::kInt: ResetMemArgument(output); break; case MemType::kSizeT: ResetMemArgument(output); break; + case MemType::kHalf: ResetMemArgument(output); break; case MemType::kFloat: ResetMemArgument(output); break; case MemType::kDouble: ResetMemArgument(output); break; case MemType::kFloat2: ResetMemArgument(output); break; @@ -357,8 +360,10 @@ void TunerImpl::StoreReferenceOutput() { reference_outputs_.clear(); for (auto &output_buffer: arguments_output_) { switch (output_buffer.type) { + case MemType::kShort: DownloadReference(output_buffer); break; case MemType::kInt: DownloadReference(output_buffer); break; case MemType::kSizeT: DownloadReference(output_buffer); break; + case MemType::kHalf: DownloadReference(output_buffer); break; case MemType::kFloat: DownloadReference(output_buffer); break; case MemType::kDouble: DownloadReference(output_buffer); break; case MemType::kFloat2: DownloadReference(output_buffer); break; @@ -385,8 +390,10 @@ bool TunerImpl::VerifyOutput() { auto i = size_t{0}; for (auto &output_buffer: arguments_output_) { switch (output_buffer.type) { + case MemType::kShort: status &= DownloadAndCompare(output_buffer, i); break; case MemType::kInt: status &= DownloadAndCompare(output_buffer, i); break; case MemType::kSizeT: status &= DownloadAndCompare(output_buffer, i); break; + case MemType::kHalf: status &= DownloadAndCompare(output_buffer, i); break; case MemType::kFloat: status &= DownloadAndCompare(output_buffer, i); break; case MemType::kDouble: status &= DownloadAndCompare(output_buffer, i); break; case MemType::kFloat2: status &= DownloadAndCompare(output_buffer, i); break; @@ -612,8 +619,10 @@ void TunerImpl::PrintHeader(const std::string &header_name) const { // ================================================================================================= // Get the MemType based on a template argument +template <> MemType TunerImpl::GetType() { return MemType::kShort; } template <> MemType TunerImpl::GetType() { return MemType::kInt; } template <> MemType TunerImpl::GetType() { return MemType::kSizeT; } +template <> MemType TunerImpl::GetType() { return MemType::kHalf; } template <> MemType TunerImpl::GetType() { return MemType::kFloat; } template <> MemType TunerImpl::GetType() { return MemType::kDouble; } template <> MemType TunerImpl::GetType() { return MemType::kFloat2; }