diff --git a/CHANGELOG b/CHANGELOG
index 2d51152..fb55234 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,4 +1,8 @@
+Version 1.6.1
+- Fixed a couple of issues related to exceptions
+- Improved reporting of failed runs
+
 Version 1.6.0
 - Much cleaner API due to Pimpl idiom: only cltune.h header is now required
 - Replaced Khronos' cl.hpp with a custom C++11 version tailored for CLTune
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 118330a..b53314f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,7 +27,7 @@ cmake_minimum_required(VERSION 2.8)
 project("cltune" CXX)
 set(cltune_VERSION_MAJOR 1)
 set(cltune_VERSION_MINOR 6)
-set(cltune_VERSION_PATCH 0)
+set(cltune_VERSION_PATCH 1)
 
 # Options
 option(SAMPLES "Enable compilation of sample programs" ON)
diff --git a/include/cltune.h b/include/cltune.h
index 8fd1127..0dd0763 100644
--- a/include/cltune.h
+++ b/include/cltune.h
@@ -103,7 +103,7 @@ class Tuner {
 
   // Functions to add kernel-arguments for input buffers, output buffers, and scalars. Make sure to
   // call these in the order in which the arguments appear in the OpenCL kernel.
-  template <typename T> void AddArgumentInput(std::vector<T> &source);
+  template <typename T> void AddArgumentInput(const std::vector<T> &source);
   template <typename T> void AddArgumentOutput(const std::vector<T> &source);
   template <typename T> void AddArgumentScalar(const T argument);
 
diff --git a/include/internal/clpp11.h b/include/internal/clpp11.h
index a722067..dffe112 100644
--- a/include/internal/clpp11.h
+++ b/include/internal/clpp11.h
@@ -46,25 +46,30 @@
 #endif
 
 namespace cltune {
-
 // =================================================================================================
 
+// Base class for any object
 class Object {
  protected:
 
   // Error handling
-  void Error(const std::string &message) {
+  [[noreturn]] void Error(const std::string &message) {
     throw std::runtime_error("Internal OpenCL error: "+message);
   }
+  [[noreturn]] void Error(const cl_int status) {
+    throw std::runtime_error("Internal OpenCL error with status: "+std::to_string(status));
+  }
 };
 
 // =================================================================================================
 
+// Base class for objects which require memory management
 class ObjectWithState: public Object {
 };
 
 // =================================================================================================
 
+// C++11 version of cl_event
 class Event: public Object {
  public:
@@ -114,12 +119,12 @@ class Platform: public Object {
   Platform(const size_t platform_id) {
     auto num_platforms = cl_uint{0};
     auto status = clGetPlatformIDs(0, nullptr, &num_platforms);
-    if (status != CL_SUCCESS) { Error("status "+status); }
+    if (status != CL_SUCCESS) { Error(status); }
     if (num_platforms == 0) { Error("no platforms found"); }
     auto platforms = std::vector<cl_platform_id>(num_platforms);
     status = clGetPlatformIDs(num_platforms, platforms.data(), nullptr);
-    if (status != CL_SUCCESS) { Error("status "+status); }
-    if (platform_id >= num_platforms) { Error("invalid platform ID "+platform_id); }
+    if (status != CL_SUCCESS) { Error(status); }
+    if (platform_id >= num_platforms) { Error("invalid platform ID "+std::to_string(platform_id)); }
     platform_ = platforms[platform_id];
   }
 
@@ -143,12 +148,12 @@ class Device: public Object {
   Device(const Platform &platform, const cl_device_type type, const size_t device_id) {
     auto num_devices = cl_uint{0};
     auto status = clGetDeviceIDs(platform(), type, 0, nullptr, &num_devices);
-    if (status != CL_SUCCESS) { Error("status "+status); }
+    if (status != CL_SUCCESS) { Error(status); }
     if (num_devices == 0) { Error("no devices found"); }
     auto devices = std::vector<cl_device_id>(num_devices);
     status = clGetDeviceIDs(platform(), type, num_devices, devices.data(), nullptr);
-    if (status != CL_SUCCESS) { Error("status "+status); }
-    if (device_id >= num_devices) { Error("invalid device ID "+device_id); }
+    if (status != CL_SUCCESS) { Error(status); }
+    if (device_id >= num_devices) { Error("invalid device ID "+std::to_string(device_id)); }
     device_ = devices[device_id];
   }
 
@@ -229,7 +234,7 @@ class Context: public ObjectWithState {
     auto status = CL_SUCCESS;
     const cl_device_id dev = device();
     context_ = clCreateContext(nullptr, 1, &dev, nullptr, nullptr, &status);
-    if (status != CL_SUCCESS) { Error("status "+status); }
+    if (status != CL_SUCCESS) { Error(status); }
   }
   ~Context() {
     clReleaseContext(context_);
@@ -268,7 +273,7 @@ class Program: public ObjectWithState {
     source_ptr_ = source_.data();
     auto status = CL_SUCCESS;
     program_ = clCreateProgramWithSource(context(), 1, &source_ptr_, &length_, &status);
-    if (status != CL_SUCCESS) { Error("status "+status); }
+    if (status != CL_SUCCESS) { Error(status); }
   }
   ~Program() {
     clReleaseProgram(program_);
@@ -329,7 +334,7 @@ class Kernel: public ObjectWithState {
   Kernel(const Program &program, const std::string &name) {
     auto status = CL_SUCCESS;
     kernel_ = clCreateKernel(program(), name.c_str(), &status);
-    if (status != CL_SUCCESS) { Error("status "+status); }
+    if (status != CL_SUCCESS) { Error(status); }
   }
   ~Kernel() {
     clReleaseKernel(kernel_);
@@ -381,7 +386,7 @@ class CommandQueue: public ObjectWithState {
   CommandQueue(const Context &context, const Device &device) {
     auto status = CL_SUCCESS;
     queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
-    if (status != CL_SUCCESS) { Error("status "+status); }
+    if (status != CL_SUCCESS) { Error(status); }
   }
   ~CommandQueue() {
     clReleaseCommandQueue(queue_);
@@ -411,6 +416,13 @@ class CommandQueue: public ObjectWithState {
     clGetCommandQueueInfo(queue_, CL_QUEUE_CONTEXT, bytes, &result, nullptr);
     return Context(result);
   }
+  Device GetDevice() const {
+    auto bytes = size_t{0};
+    clGetCommandQueueInfo(queue_, CL_QUEUE_DEVICE, 0, nullptr, &bytes);
+    cl_device_id result;
+    clGetCommandQueueInfo(queue_, CL_QUEUE_DEVICE, bytes, &result, nullptr);
+    return Device(result);
+  }
   cl_int Finish() {
     return clFinish(queue_);
   }
@@ -437,7 +449,7 @@ class Buffer: public ObjectWithState {
   Buffer(const Context &context, const cl_mem_flags flags, const size_t bytes) {
     auto status = CL_SUCCESS;
     buffer_ = clCreateBuffer(context(), flags, bytes, nullptr, &status);
-    if (status != CL_SUCCESS) { Error("status "+status); }
+    if (status != CL_SUCCESS) { Error(status); }
   }
   ~Buffer() {
     clReleaseMemObject(buffer_);
@@ -464,12 +476,12 @@ class Buffer: public ObjectWithState {
     return ReadBuffer(queue, bytes, host.data());
   }
   template <typename T>
-  cl_int WriteBuffer(const CommandQueue &queue, const size_t bytes, T* host) {
+  cl_int WriteBuffer(const CommandQueue &queue, const size_t bytes, const T* host) {
     return clEnqueueWriteBuffer(queue(), buffer_, CL_TRUE, 0, bytes, host, 0, nullptr, nullptr);
   }
   template <typename T>
-  cl_int WriteBuffer(const CommandQueue &queue, const size_t bytes, std::vector<T> &host) {
-    return WriteBuffer(queue, bytes, host.data());
+  cl_int WriteBuffer(const CommandQueue &queue, const size_t bytes, const std::vector<T> &host) {
+    return WriteBuffer(queue, bytes, &host[0]);
   }
 
   // Accessors to the private data-member
diff --git a/include/internal/tuner_impl.h b/include/internal/tuner_impl.h
index f3ff742..29fa0cd 100644
--- a/include/internal/tuner_impl.h
+++ b/include/internal/tuner_impl.h
@@ -40,6 +40,7 @@
 #include <vector>    // std::vector
 #include <memory>    // std::shared_ptr
 #include <complex>   // std::complex
+#include <stdexcept> // std::runtime_error
 
 namespace cltune {
 // =================================================================================================
@@ -91,6 +92,13 @@ class TunerImpl {
     KernelInfo::Configuration configuration;
   };
 
+  // OpenCL exception with status printing
+  class OpenCLException: public std::runtime_error {
+   public:
+    OpenCLException(const std::string &message, int status):
+        std::runtime_error(message + std::to_string(status)) {}
+  };
+
   // Initialize either with platform 0 and device 0 or with a custom platform/device
   explicit TunerImpl();
   explicit TunerImpl(size_t platform_id, size_t device_id);
diff --git a/src/cltune.cc b/src/cltune.cc
index 02f5722..053f90b 100644
--- a/src/cltune.cc
+++ b/src/cltune.cc
@@ -161,7 +161,7 @@ void Tuner::SetLocalMemoryUsage(const size_t id, LocalMemoryFunction amount,
 // Creates a new buffer of type Memory (containing both host and device data) based on a source
 // vector of data. Then, upload it to the device and store the argument in a list.
 template <typename T>
-void Tuner::AddArgumentInput(std::vector<T> &source) {
+void Tuner::AddArgumentInput(const std::vector<T> &source) {
   auto device_buffer = Buffer(pimpl->context(), CL_MEM_READ_ONLY, source.size()*sizeof(T));
   auto status = device_buffer.WriteBuffer(pimpl->queue(), source.size()*sizeof(T), source);
   if (status != CL_SUCCESS) { throw std::runtime_error("Write buffer error: " + status); }
@@ -171,12 +171,12 @@ void Tuner::AddArgumentInput(std::vector<T> &source) {
 }
 
 // Compiles the function for various data-types
-template void Tuner::AddArgumentInput<int>(std::vector<int>&);
-template void Tuner::AddArgumentInput<size_t>(std::vector<size_t>&);
-template void Tuner::AddArgumentInput<float>(std::vector<float>&);
-template void Tuner::AddArgumentInput<double>(std::vector<double>&);
-template void Tuner::AddArgumentInput<float2>(std::vector<float2>&);
-template void Tuner::AddArgumentInput<double2>(std::vector<double2>&);
+template void Tuner::AddArgumentInput<int>(const std::vector<int>&);
+template void Tuner::AddArgumentInput<size_t>(const std::vector<size_t>&);
+template void Tuner::AddArgumentInput<float>(const std::vector<float>&);
+template void Tuner::AddArgumentInput<double>(const std::vector<double>&);
+template void Tuner::AddArgumentInput<float2>(const std::vector<float2>&);
+template void Tuner::AddArgumentInput<double2>(const std::vector<double2>&);
 
 // Similar to the above function, but now marked as output buffer. Output buffers are special in the
 // sense that they will be checked in the verification process.
@@ -288,7 +288,7 @@ double Tuner::PrintToScreen() const {
   // Prints all valid results and the one with the lowest execution time
   pimpl->PrintHeader("Printing results to stdout");
   for (auto &tuning_result: pimpl->tuning_results_) {
-    if (tuning_result.status) {
+    if (tuning_result.status && tuning_result.time != std::numeric_limits<float>::max()) {
       pimpl->PrintResult(stdout, tuning_result, pimpl->kMessageResult);
     }
   }
diff --git a/src/tuner_impl.cc b/src/tuner_impl.cc
index 75cda61..6b18297 100644
--- a/src/tuner_impl.cc
+++ b/src/tuner_impl.cc
@@ -193,6 +193,11 @@ void TunerImpl::Tune() {
     if (!tuning_result.status) {
       PrintResult(stdout, tuning_result, kMessageWarning);
     }
+    if (tuning_result.time == std::numeric_limits<float>::max()) {
+      tuning_result.time = 0.0;
+      PrintResult(stdout, tuning_result, kMessageFailure);
+      tuning_result.time = std::numeric_limits<float>::max();
+    }
   }
 
   // Prints a log of the searching process. This is disabled per default, but can be enabled
@@ -220,48 +225,48 @@ TunerImpl::TunerResult TunerImpl::RunKernel(const std::string &source, const Ker
   auto processed_source = std::regex_replace(source, string_literal_start, "");
   processed_source = std::regex_replace(processed_source, string_literal_end, "");
 
-  // Compiles the kernel and prints the compiler errors/warnings
-  auto status = CL_SUCCESS;
-  auto program = Program(context_, processed_source);
-  status = program.Build(device_, "");
-  if (status == CL_BUILD_PROGRAM_FAILURE) {
-    auto message = program.GetBuildInfo(device_);
-    fprintf(stdout, "OpenCL compiler error/warning: %s\n", message.c_str());
-    throw std::runtime_error("OpenCL compiler error/warning occurred ^^\n");
-  }
-  if (status != CL_SUCCESS) { throw std::runtime_error("Program build error: " + status); }
-
-  // Sets the output buffer(s) to zero
-  for (auto &output: arguments_output_) {
-    switch (output.type) {
-      case MemType::kInt: ResetMemArgument<int>(output); break;
-      case MemType::kFloat: ResetMemArgument<float>(output); break;
-      case MemType::kDouble: ResetMemArgument<double>(output); break;
-      case MemType::kFloat2: ResetMemArgument<float2>(output); break;
-      case MemType::kDouble2: ResetMemArgument<double2>(output); break;
-      default: throw std::runtime_error("Unsupported reference output data-type");
-    }
-  }
-
-  // Sets the kernel and its arguments
-  auto tune_kernel = Kernel(program, kernel.name());
-  if (status != CL_SUCCESS) { throw std::runtime_error("Kernel creation error: " + status); }
-  for (auto &i: arguments_input_) { tune_kernel.SetArgument(static_cast<cl_uint>(i.index), i.buffer); }
-  for (auto &i: arguments_output_) { tune_kernel.SetArgument(static_cast<cl_uint>(i.index), i.buffer); }
-  for (auto &i: arguments_int_) { tune_kernel.SetArgument(static_cast<cl_uint>(i.first), i.second); }
-  for (auto &i: arguments_size_t_) { tune_kernel.SetArgument(static_cast<cl_uint>(i.first), i.second); }
-  for (auto &i: arguments_float_) { tune_kernel.SetArgument(static_cast<cl_uint>(i.first), i.second); }
-  for (auto &i: arguments_double_) { tune_kernel.SetArgument(static_cast<cl_uint>(i.first), i.second); }
-  for (auto &i: arguments_float2_) { tune_kernel.SetArgument(static_cast<cl_uint>(i.first), i.second); }
-  for (auto &i: arguments_double2_) { tune_kernel.SetArgument(static_cast<cl_uint>(i.first), i.second); }
-
-  // Sets the global and local thread-sizes
-  auto global = kernel.global();
-  auto local = kernel.local();
-
   // In case of an exception, skip this run
   try {
 
+    // Compiles the kernel and prints the compiler errors/warnings
+    auto status = CL_SUCCESS;
+    auto program = Program(context_, processed_source);
+    status = program.Build(device_, "");
+    if (status == CL_BUILD_PROGRAM_FAILURE) {
+      auto message = program.GetBuildInfo(device_);
+      fprintf(stdout, "OpenCL compiler error/warning: %s\n", message.c_str());
+      throw std::runtime_error("OpenCL compiler error/warning occurred ^^\n");
+    }
+    if (status != CL_SUCCESS) { throw OpenCLException("Program build error: ", status); }
+
+    // Sets the output buffer(s) to zero
+    for (auto &output: arguments_output_) {
+      switch (output.type) {
+        case MemType::kInt: ResetMemArgument<int>(output); break;
+        case MemType::kFloat: ResetMemArgument<float>(output); break;
+        case MemType::kDouble: ResetMemArgument<double>(output); break;
+        case MemType::kFloat2: ResetMemArgument<float2>(output); break;
+        case MemType::kDouble2: ResetMemArgument<double2>(output); break;
+        default: throw std::runtime_error("Unsupported reference output data-type");
+      }
+    }
+
+    // Sets the kernel and its arguments
+    auto tune_kernel = Kernel(program, kernel.name());
+    if (status != CL_SUCCESS) { throw OpenCLException("Kernel creation error: ", status); }
+    for (auto &i: arguments_input_) { tune_kernel.SetArgument(static_cast<cl_uint>(i.index), i.buffer); }
+    for (auto &i: arguments_output_) { tune_kernel.SetArgument(static_cast<cl_uint>(i.index), i.buffer); }
+    for (auto &i: arguments_int_) { tune_kernel.SetArgument(static_cast<cl_uint>(i.first), i.second); }
+    for (auto &i: arguments_size_t_) { tune_kernel.SetArgument(static_cast<cl_uint>(i.first), i.second); }
+    for (auto &i: arguments_float_) { tune_kernel.SetArgument(static_cast<cl_uint>(i.first), i.second); }
+    for (auto &i: arguments_double_) { tune_kernel.SetArgument(static_cast<cl_uint>(i.first), i.second); }
+    for (auto &i: arguments_float2_) { tune_kernel.SetArgument(static_cast<cl_uint>(i.first), i.second); }
+    for (auto &i: arguments_double2_) { tune_kernel.SetArgument(static_cast<cl_uint>(i.first), i.second); }
+
+    // Sets the global and local thread-sizes
+    auto global = kernel.global();
+    auto local = kernel.local();
+
     // Verifies the local memory usage of the kernel
     auto local_mem_usage = tune_kernel.LocalMemUsage(device_);
     if (!device_.IsLocalMemoryValid(local_mem_usage)) {
@@ -270,22 +275,19 @@ TunerImpl::TunerResult TunerImpl::RunKernel(const std::string &source, const Ker
 
     // Prepares the kernel
     status = queue_.Finish();
-    if (status != CL_SUCCESS) { throw std::runtime_error("Command queue error: " + status); }
+    if (status != CL_SUCCESS) { throw OpenCLException("Command queue error: ", status); }
 
     // Runs the kernel (this is the timed part)
     fprintf(stdout, "%s Running %s\n", kMessageRun.c_str(), kernel.name().c_str());
     auto events = std::vector<Event>(kNumRuns);
     for (auto t=0; t<kNumRuns; ++t) {
[...]
@@ -309,6 +311,8 @@ TunerImpl::TunerResult TunerImpl::RunKernel(const std::string &source, const Ker
 
   // There was an exception, now return an invalid tuner results
   catch(std::exception& e) {
+    fprintf(stdout, "%s Kernel %s failed\n", kMessageFailure.c_str(), kernel.name().c_str());
+    fprintf(stdout, "%s catched exception: %s\n", kMessageFailure.c_str(), e.what());
     TunerResult result = {kernel.name(), std::numeric_limits<float>::max(), 0, false, {}};
     return result;
   }
@@ -326,7 +330,7 @@ void TunerImpl::ResetMemArgument(MemArgument &argument) {
   // Copy the new array to the OpenCL buffer on the device
   auto bytes = sizeof(T)*argument.size;
   auto status = argument.buffer.WriteBuffer(queue_, bytes, buffer);
-  if (status != CL_SUCCESS) { throw std::runtime_error("Write buffer error: " + status); }
+  if (status != CL_SUCCESS) { throw OpenCLException("Write buffer error: ", status); }
 }
 
 // =================================================================================================
@@ -350,7 +354,7 @@ template <typename T> void TunerImpl::DownloadReference(MemArgument &device_buff
   auto host_buffer = new T[device_buffer.size];
   auto bytes = sizeof(T)*device_buffer.size;
   auto status = device_buffer.buffer.ReadBuffer(queue_, bytes, host_buffer);
-  if (status != CL_SUCCESS) { throw std::runtime_error("Read buffer error: " + status); }
+  if (status != CL_SUCCESS) { throw OpenCLException("Read buffer error: ", status); }
   reference_outputs_.push_back(host_buffer);
 }
 
@@ -388,7 +392,7 @@ bool TunerImpl::DownloadAndCompare(MemArgument &device_buffer, const size_t i) {
   std::vector<T> host_buffer(device_buffer.size);
   auto bytes = sizeof(T)*device_buffer.size;
   auto status = device_buffer.buffer.ReadBuffer(queue_, bytes, host_buffer);
-  if (status != CL_SUCCESS) { throw std::runtime_error("Read buffer error: " + status); }
+  if (status != CL_SUCCESS) { throw OpenCLException("Read buffer error: ", status); }
 
   // Compares the results (L2 norm)
   T* reference_output = (T*)reference_outputs_[i];
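
Illustrative usage sketch (not part of the patch above). It relies only on what this diff shows of the public API: a Tuner constructed from a platform and device id (mirroring the TunerImpl constructors), the AddArgumentInput/AddArgumentOutput/AddArgumentScalar calls, and the fact that internal OpenCL failures (including the new OpenCLException) derive from std::runtime_error. With 1.6.1 a caller can pass const input vectors directly and read the numeric OpenCL status from the exception message; kernel registration and the tuning run itself are omitted here.

// caller.cc -- hypothetical example, not part of the CLTune sources
#include <cstdio>
#include <stdexcept>
#include <vector>
#include "cltune.h"

int main() {
  try {
    cltune::Tuner tuner(0, 0);                          // platform 0, device 0
    const auto input = std::vector<float>(2048, 1.0f);  // const data is accepted since 1.6.1...
    auto output = std::vector<float>(2048, 0.0f);
    tuner.AddArgumentInput(input);                      // ...through the const std::vector<T>& overload
    tuner.AddArgumentOutput(output);
    tuner.AddArgumentScalar(2048);
    // ... kernel registration, tuning parameters, and the tuning run itself omitted ...
  }
  catch (const std::runtime_error &e) {
    // Internal OpenCL failures now carry the numeric status code in the message,
    // e.g. "Program build error: -11" from the new OpenCLException
    std::fprintf(stderr, "CLTune error: %s\n", e.what());
    return 1;
  }
  return 0;
}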