From d8318a507e6df42164b48c664e37a5c3e4d02ee2 Mon Sep 17 00:00:00 2001
From: williamjshipman
Date: Sun, 31 Jul 2016 01:31:22 +0200
Subject: [PATCH 1/6] Fix bug in Kernel::LocalMemUsage where the Intel CPU
 runtime returns a size of 0 in the first call to clGetKernelWorkGroupInfo.
 The cause seems to be an ambiguity in the OpenCL standard.

---
 include/internal/clpp11.h | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/include/internal/clpp11.h b/include/internal/clpp11.h
index e3bcb01..1b0025b 100644
--- a/include/internal/clpp11.h
+++ b/include/internal/clpp11.h
@@ -86,12 +86,10 @@ class Event {
   //   http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx
   float GetElapsedTime() const {
     WaitForCompletion();
-    auto bytes = size_t{0};
-    clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes);
-    auto time_start = size_t{0};
+    auto bytes = sizeof(cl_ulong);
+    auto time_start = cl_ulong{0};
     clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr);
-    clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, 0, nullptr, &bytes);
-    auto time_end = size_t{0};
+    auto time_end = cl_ulong{0};
     clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr);
     return (time_end - time_start) * 1.0e-6f;
   }
@@ -592,8 +590,7 @@ class Buffer {
 
   // Retrieves the actual allocated size in bytes
   size_t GetSize() const {
-    auto bytes = size_t{0};
-    CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, 0, nullptr, &bytes));
+    auto bytes = sizeof(size_t);
     auto result = size_t{0};
     CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, bytes, &result, nullptr));
     return result;
@@ -645,9 +642,8 @@ class Kernel {
 
   // Retrieves the amount of local memory used per work-group for this kernel
   size_t LocalMemUsage(const Device &device) const {
-    auto bytes = size_t{0};
+    auto bytes = sizeof(cl_ulong);
     auto query = cl_kernel_work_group_info{CL_KERNEL_LOCAL_MEM_SIZE};
-    CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, 0, nullptr, &bytes));
     auto result = size_t{0};
     CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, bytes, &result, nullptr));
     return result;
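The fix above works because CL_KERNEL_LOCAL_MEM_SIZE is specified to return a cl_ulong, so the usual two-step OpenCL query (first ask for the value's size, then fetch the value) can be collapsed into a single call with a compile-time size. A minimal sketch of both patterns, using a hypothetical helper that assumes valid kernel and device handles:

#include <CL/cl.h>

// Illustration only; not part of the patch.
cl_ulong QueryLocalMemSize(cl_kernel kernel, cl_device_id device) {
  // Two-step pattern: some runtimes (here: the Intel CPU runtime, as reported
  // above) return a size of 0 from this first call, breaking the second one.
  auto bytes = size_t{0};
  clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_LOCAL_MEM_SIZE,
                           0, nullptr, &bytes);

  // Fixed-size pattern adopted by the patch: sizeof(cl_ulong) is known
  // up-front, so the preliminary size query can be skipped entirely.
  auto result = cl_ulong{0};
  clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_LOCAL_MEM_SIZE,
                           sizeof(result), &result, nullptr);
  return result;
}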
From 68cb1d44d0bd1d14a4df042de02ea21f12b3f479 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren
Date: Wed, 3 Aug 2016 20:17:41 +0200
Subject: [PATCH 2/6] Updated to version 7.0 of the CLCudaAPI header

---
 CHANGELOG                 |   3 ++
 include/internal/clpp11.h | 108 ++++++++++++++++++++++----------------
 2 files changed, 65 insertions(+), 46 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index fd6ff46..8a7553f 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,4 +1,7 @@
 
+Next release (development version)
+- Updated to version 7.0 of the CLCudaAPI header
+
 Version 2.4.0
 - Made it possible to run the unit-tests independently of the provided OpenCL kernel samples
 - Added an option to compile in verbose mode for additional diagnostic messages (-DVERBOSE=ON)

diff --git a/include/internal/clpp11.h b/include/internal/clpp11.h
index 1b0025b..e630187 100644
--- a/include/internal/clpp11.h
+++ b/include/internal/clpp11.h
@@ -70,15 +70,24 @@ inline void CheckError(const cl_int status) {
 class Event {
  public:
 
-  // Constructor based on the regular OpenCL data-type
-  explicit Event(const cl_event event): event_(event) { }
+  // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere
+  explicit Event(const cl_event event):
+      event_(new cl_event) {
+    *event_ = event;
+  }
 
-  // Regular constructor
-  explicit Event(): event_(nullptr) { }
+  // Regular constructor with memory management
+  explicit Event():
+      event_(new cl_event, [](cl_event* e) {
+        if (*e) { CheckError(clReleaseEvent(*e)); }
+        delete e;
+      }) {
+    *event_ = nullptr;
+  }
 
   // Waits for completion of this event
   void WaitForCompletion() const {
-    CheckError(clWaitForEvents(1, &event_));
+    CheckError(clWaitForEvents(1, &(*event_)));
   }
 
   // Retrieves the elapsed time of the last recorded event. Note that no error checking is done on
@@ -86,19 +95,21 @@ class Event {
   //   http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx
   float GetElapsedTime() const {
     WaitForCompletion();
-    auto bytes = sizeof(cl_ulong);
+    const auto bytes = sizeof(cl_ulong);
     auto time_start = cl_ulong{0};
-    clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr);
+    clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr);
     auto time_end = cl_ulong{0};
-    clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr);
-    return (time_end - time_start) * 1.0e-6f;
+    clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr);
+    return static_cast<float>(time_end - time_start) * 1.0e-6f;
   }
 
   // Accessor to the private data-member
-  cl_event& operator()() { return event_; }
-  cl_event* pointer() { return &event_; }
+  cl_event& operator()() { return *event_; }
+  const cl_event& operator()() const { return *event_; }
+  cl_event* pointer() { return &(*event_); }
+  const cl_event* pointer() const { return &(*event_); }
  private:
-  cl_event event_;
+  std::shared_ptr<cl_event> event_;
 };
 
 // Pointer to an OpenCL event
@@ -181,24 +192,32 @@ class Device {
   }
   size_t MaxWorkGroupSize() const { return GetInfo<size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE); }
   size_t MaxWorkItemDimensions() const {
-    return GetInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS);
+    return static_cast<size_t>(GetInfo<cl_uint>(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS));
   }
   std::vector<size_t> MaxWorkItemSizes() const {
     return GetInfoVector<size_t>(CL_DEVICE_MAX_WORK_ITEM_SIZES);
   }
-  size_t LocalMemSize() const {
-    return static_cast<size_t>(GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE));
+  unsigned long LocalMemSize() const {
+    return GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE);
   }
   std::string Capabilities() const { return GetInfoString(CL_DEVICE_EXTENSIONS); }
-  size_t CoreClock() const { return GetInfo(CL_DEVICE_MAX_CLOCK_FREQUENCY); }
-  size_t ComputeUnits() const { return GetInfo(CL_DEVICE_MAX_COMPUTE_UNITS); }
-  size_t MemorySize() const { return GetInfo(CL_DEVICE_GLOBAL_MEM_SIZE); }
-  size_t MaxAllocSize() const { return GetInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE); }
+  size_t CoreClock() const {
+    return static_cast<size_t>(GetInfo<cl_uint>(CL_DEVICE_MAX_CLOCK_FREQUENCY));
+  }
+  size_t ComputeUnits() const {
+    return static_cast<size_t>(GetInfo<cl_uint>(CL_DEVICE_MAX_COMPUTE_UNITS));
+  }
+  unsigned long MemorySize() const {
+    return static_cast<unsigned long>(GetInfo<cl_ulong>(CL_DEVICE_GLOBAL_MEM_SIZE));
+  }
+  unsigned long MaxAllocSize() const {
+    return static_cast<unsigned long>(GetInfo<cl_ulong>(CL_DEVICE_MAX_MEM_ALLOC_SIZE));
+  }
   size_t MemoryClock() const { return 0; }    // Not exposed in OpenCL
   size_t MemoryBusWidth() const { return 0; } // Not exposed in OpenCL
 
   // Configuration-validity checks
-  bool IsLocalMemoryValid(const size_t local_mem_usage) const {
+  bool IsLocalMemoryValid(const cl_ulong local_mem_usage) const {
     return (local_mem_usage <= LocalMemSize());
   }
   bool IsThreadConfigValid(const std::vector<size_t> &local) const {
@@ -215,9 +234,11 @@ class Device {
   // Query for a specific type of device or brand
   bool IsCPU() const { return Type() == "CPU"; }
   bool IsGPU() const { return Type() == "GPU"; }
-  bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc."; }
+  bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc." ||
+                              Vendor() == "AuthenticAMD"; }
   bool IsNVIDIA() const { return Vendor() == "NVIDIA" || Vendor() == "NVIDIA Corporation"; }
-  bool IsIntel() const { return Vendor() == "Intel" || Vendor() == "GenuineIntel"; }
+  bool IsIntel() const { return Vendor() == "INTEL" || Vendor() == "Intel" ||
+                                Vendor() == "GenuineIntel"; }
   bool IsARM() const { return Vendor() == "ARM"; }
 
   // Accessor to the private data-member
@@ -234,13 +255,6 @@ class Device {
     CheckError(clGetDeviceInfo(device_, info, bytes, &result, nullptr));
     return result;
   }
-  size_t GetInfo(const cl_device_info info) const {
-    auto bytes = size_t{0};
-    CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes));
-    auto result = cl_uint(0);
-    CheckError(clGetDeviceInfo(device_, info, bytes, &result, nullptr));
-    return static_cast<size_t>(result);
-  }
   template <typename T>
   std::vector<T> GetInfoVector(const cl_device_info info) const {
     auto bytes = size_t{0};
@@ -590,7 +604,7 @@ class Buffer {
 
   // Retrieves the actual allocated size in bytes
   size_t GetSize() const {
-    auto bytes = sizeof(size_t);
+    const auto bytes = sizeof(size_t);
     auto result = size_t{0};
     CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, bytes, &result, nullptr));
     return result;
@@ -641,12 +655,22 @@ class Kernel {
   }
 
   // Retrieves the amount of local memory used per work-group for this kernel
-  size_t LocalMemUsage(const Device &device) const {
-    auto bytes = sizeof(cl_ulong);
+  unsigned long LocalMemUsage(const Device &device) const {
+    const auto bytes = sizeof(cl_ulong);
     auto query = cl_kernel_work_group_info{CL_KERNEL_LOCAL_MEM_SIZE};
-    auto result = size_t{0};
+    auto result = cl_ulong{0};
     CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, bytes, &result, nullptr));
-    return result;
+    return static_cast<unsigned long>(result);
+  }
+
+  // Retrieves the name of the kernel
+  std::string GetFunctionName() const {
+    auto bytes = size_t{0};
+    CheckError(clGetKernelInfo(*kernel_, CL_KERNEL_FUNCTION_NAME, 0, nullptr, &bytes));
+    auto result = std::string{};
+    result.resize(bytes);
+    CheckError(clGetKernelInfo(*kernel_, CL_KERNEL_FUNCTION_NAME, bytes, &result[0], nullptr));
+    return std::string{result.c_str()}; // Removes any trailing '\0'-characters
   }
 
   // Launches a kernel onto the specified queue
@@ -660,30 +684,22 @@ class Kernel {
   // As above, but with an event waiting list
   void Launch(const Queue &queue, const std::vector<size_t> &global,
               const std::vector<size_t> &local, EventPointer event,
-              std::vector<Event>& waitForEvents) {
-    if (waitForEvents.size() == 0) { return Launch(queue, global, local, event); }
+              const std::vector<Event> &waitForEvents) {
 
     // Builds a plain version of the events waiting list
     auto waitForEventsPlain = std::vector<cl_event>();
     for (auto &waitEvent : waitForEvents) {
-      waitForEventsPlain.push_back(waitEvent());
+      if (waitEvent()) { waitForEventsPlain.push_back(waitEvent()); }
     }
 
     // Launches the kernel while waiting for other events
     CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
-                                      nullptr, global.data(), local.data(),
+                                      nullptr, global.data(), !local.empty() ? local.data() : nullptr,
                                       static_cast<cl_uint>(waitForEventsPlain.size()),
-                                      waitForEventsPlain.data(),
+                                      !waitForEventsPlain.empty() ? waitForEventsPlain.data() : nullptr,
                                       event));
   }
 
-  // As above, but with the default local workgroup size
-  void Launch(const Queue &queue, const std::vector<size_t> &global, EventPointer event) {
-    CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
-                                      nullptr, global.data(), nullptr,
-                                      0, nullptr, event));
-  }
-
   // Accessor to the private data-member
   const cl_kernel& operator()() const { return *kernel_; }
  private:
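Version 7.0's main change is that Event now owns its cl_event through a std::shared_ptr with a custom deleter, so the OpenCL object is released exactly once, when the last copy of the wrapper goes out of scope. The same pattern in isolation, as a minimal standalone sketch (the class name is hypothetical):

#include <CL/cl.h>
#include <memory>

class ManagedEvent {
 public:
  ManagedEvent():
      event_(new cl_event, [](cl_event* e) {
        if (*e) { clReleaseEvent(*e); }  // release the OpenCL object, if set
        delete e;                        // then free the holder itself
      }) {
    *event_ = nullptr;
  }
  cl_event& operator()() { return *event_; }
 private:
  std::shared_ptr<cl_event> event_;  // copies share one reference count
};

Note that the constructor taking a raw cl_event installs no releasing deleter, matching its "memory management is handled elsewhere" comment: events adopted from outside are never released by the wrapper.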
"GPU"; } - bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc."; } + bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc." || + Vendor() == "AuthenticAMD";; } bool IsNVIDIA() const { return Vendor() == "NVIDIA" || Vendor() == "NVIDIA Corporation"; } - bool IsIntel() const { return Vendor() == "Intel" || Vendor() == "GenuineIntel"; } + bool IsIntel() const { return Vendor() == "INTEL" || Vendor() == "Intel" || + Vendor() == "GenuineIntel"; } bool IsARM() const { return Vendor() == "ARM"; } // Accessor to the private data-member @@ -234,13 +255,6 @@ class Device { CheckError(clGetDeviceInfo(device_, info, bytes, &result, nullptr)); return result; } - size_t GetInfo(const cl_device_info info) const { - auto bytes = size_t{0}; - CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes)); - auto result = cl_uint(0); - CheckError(clGetDeviceInfo(device_, info, bytes, &result, nullptr)); - return static_cast(result); - } template std::vector GetInfoVector(const cl_device_info info) const { auto bytes = size_t{0}; @@ -590,7 +604,7 @@ class Buffer { // Retrieves the actual allocated size in bytes size_t GetSize() const { - auto bytes = sizeof(size_t); + const auto bytes = sizeof(size_t); auto result = size_t{0}; CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, bytes, &result, nullptr)); return result; @@ -641,12 +655,22 @@ class Kernel { } // Retrieves the amount of local memory used per work-group for this kernel - size_t LocalMemUsage(const Device &device) const { - auto bytes = sizeof(cl_ulong); + unsigned long LocalMemUsage(const Device &device) const { + const auto bytes = sizeof(cl_ulong); auto query = cl_kernel_work_group_info{CL_KERNEL_LOCAL_MEM_SIZE}; - auto result = size_t{0}; + auto result = cl_ulong{0}; CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, bytes, &result, nullptr)); - return result; + return static_cast(result); + } + + // Retrieves the name of the kernel + std::string GetFunctionName() const { + auto bytes = size_t{0}; + CheckError(clGetKernelInfo(*kernel_, CL_KERNEL_FUNCTION_NAME, 0, nullptr, &bytes)); + auto result = std::string{}; + result.resize(bytes); + CheckError(clGetKernelInfo(*kernel_, CL_KERNEL_FUNCTION_NAME, bytes, &result[0], nullptr)); + return std::string{result.c_str()}; // Removes any trailing '\0'-characters } // Launches a kernel onto the specified queue @@ -660,30 +684,22 @@ class Kernel { // As above, but with an event waiting list void Launch(const Queue &queue, const std::vector &global, const std::vector &local, EventPointer event, - std::vector& waitForEvents) { - if (waitForEvents.size() == 0) { return Launch(queue, global, local, event); } + const std::vector &waitForEvents) { // Builds a plain version of the events waiting list auto waitForEventsPlain = std::vector(); for (auto &waitEvent : waitForEvents) { - waitForEventsPlain.push_back(waitEvent()); + if (waitEvent()) { waitForEventsPlain.push_back(waitEvent()); } } // Launches the kernel while waiting for other events CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast(global.size()), - nullptr, global.data(), local.data(), + nullptr, global.data(), !local.empty() ? local.data() : nullptr, static_cast(waitForEventsPlain.size()), - waitForEventsPlain.data(), + !waitForEventsPlain.empty() ? 
From bb4ba83d3c517b8659c7be3416878cb0459fb3c1 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren
Date: Tue, 27 Sep 2016 20:48:10 +0200
Subject: [PATCH 4/6] Updated to version 8.0 of CLCudaAPI

---
 CHANGELOG                 |  2 +-
 include/internal/clpp11.h | 16 +++++++++--
 include/internal/cupp11.h | 60 ++++++++++++++++++++++++++++-----------
 3 files changed, 59 insertions(+), 19 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 8a7553f..4b824c2 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,6 +1,6 @@
 
 Next release (development version)
-- Updated to version 7.0 of the CLCudaAPI header
+- Updated to version 8.0 of the CLCudaAPI header
 
 Version 2.4.0
 - Made it possible to run the unit-tests independently of the provided OpenCL kernel samples

diff --git a/include/internal/clpp11.h b/include/internal/clpp11.h
index e630187..2a7b22f 100644
--- a/include/internal/clpp11.h
+++ b/include/internal/clpp11.h
@@ -11,7 +11,8 @@
 // Portability here means that a similar header exists for CUDA with the same classes and
 // interfaces. In other words, moving from the OpenCL API to the CUDA API becomes a one-line change.
 //
-// This is version 6.0 of CLCudaAPI <https://github.com/CNugteren/CLCudaAPI>.
+// This file is taken from the CLCudaAPI project <https://github.com/CNugteren/CLCudaAPI> and
+// therefore contains the following header copyright notice:
 //
 // =================================================================================================
 //
@@ -148,6 +149,17 @@ class Platform {
   cl_platform_id platform_;
 };
 
+// Retrieves a vector with all platforms
+inline std::vector<Platform> GetAllPlatforms() {
+  auto num_platforms = cl_uint{0};
+  CheckError(clGetPlatformIDs(0, nullptr, &num_platforms));
+  auto all_platforms = std::vector<Platform>();
+  for (size_t platform_id = 0; platform_id < static_cast<size_t>(num_platforms); ++platform_id) {
+    all_platforms.push_back(Platform(platform_id));
+  }
+  return all_platforms;
+}
+
 // =================================================================================================
 
 // C++11 version of 'cl_device_id'
@@ -198,7 +210,7 @@ class Device {
     return GetInfoVector<size_t>(CL_DEVICE_MAX_WORK_ITEM_SIZES);
   }
   unsigned long LocalMemSize() const {
-    return GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE);
+    return static_cast<unsigned long>(GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE));
   }
   std::string Capabilities() const { return GetInfoString(CL_DEVICE_EXTENSIONS); }
   size_t CoreClock() const {
diff --git a/include/internal/cupp11.h b/include/internal/cupp11.h
index c1098f4..67a99d7 100644
--- a/include/internal/cupp11.h
+++ b/include/internal/cupp11.h
@@ -11,7 +11,8 @@
 // Portability here means that a similar header exists for OpenCL with the same classes and
 // interfaces. In other words, moving from the CUDA API to the OpenCL API becomes a one-line change.
 //
-// This is version 6.0 of CLCudaAPI <https://github.com/CNugteren/CLCudaAPI>.
+// This file is taken from the CLCudaAPI project <https://github.com/CNugteren/CLCudaAPI> and
+// therefore contains the following header copyright notice:
 //
 // =================================================================================================
 //
@@ -138,6 +139,12 @@ class Platform {
   size_t platform_id_;
 };
 
+// Retrieves a vector with all platforms. Note that there is just one platform in CUDA.
+inline std::vector<Platform> GetAllPlatforms() {
+  auto all_platforms = std::vector<Platform>{ Platform(size_t{0}) };
+  return all_platforms;
+}
+
 // =================================================================================================
 
 // C++11 version of 'CUdevice'
@@ -180,7 +187,9 @@ class Device {
            GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y),
            GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z)};
   }
-  size_t LocalMemSize() const { return GetInfo(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK); }
+  unsigned long LocalMemSize() const {
+    return static_cast<unsigned long>(GetInfo(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK));
+  }
   std::string Capabilities() const {
     auto major = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR);
     auto minor = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR);
@@ -188,12 +197,12 @@ class Device {
   }
   size_t CoreClock() const { return 1e-3*GetInfo(CU_DEVICE_ATTRIBUTE_CLOCK_RATE); }
   size_t ComputeUnits() const { return GetInfo(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT); }
-  size_t MemorySize() const {
+  unsigned long MemorySize() const {
     auto result = size_t{0};
     CheckError(cuDeviceTotalMem(&result, device_));
-    return result;
+    return static_cast<unsigned long>(result);
   }
-  size_t MaxAllocSize() const { return MemorySize(); }
+  unsigned long MaxAllocSize() const { return MemorySize(); }
   size_t MemoryClock() const { return 1e-3*GetInfo(CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE); }
   size_t MemoryBusWidth() const { return GetInfo(CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH); }
 
@@ -276,12 +285,22 @@ class Program {
       program_(new nvrtcProgram, [](nvrtcProgram* p) { CheckError(nvrtcDestroyProgram(p));
                                                        delete p; }),
       source_(std::move(source)),
-      source_ptr_(&source_[0]) {
+      source_ptr_(&source_[0]),
+      from_binary_(false) {
     CheckError(nvrtcCreateProgram(program_.get(), source_ptr_, nullptr, 0, nullptr, nullptr));
   }
 
+  // PTX-based constructor
+  explicit Program(const Device &device, const Context &context, const std::string& binary):
+      program_(nullptr), // not used
+      source_(binary),
+      source_ptr_(&source_[0]), // not used
+      from_binary_(true) {
+  }
+
   // Compiles the device program and returns whether or not there were any warnings/errors
   BuildStatus Build(const Device &, std::vector<std::string> &options) {
+    if (from_binary_) { return BuildStatus::kSuccess; }
     auto raw_options = std::vector<const char*>();
     for (const auto &option: options) {
       raw_options.push_back(option.c_str());
@@ -301,6 +320,7 @@ class Program {
 
   // Retrieves the warning/error message from the compiler (if any)
   std::string GetBuildInfo(const Device &) const {
+    if (from_binary_) { return std::string{}; }
     auto bytes = size_t{0};
     CheckError(nvrtcGetProgramLogSize(*program_, &bytes));
     auto result = std::string{};
@@ -311,6 +331,7 @@ class Program {
 
   // Retrieves an intermediate representation of the compiled program (i.e. PTX)
   std::string GetIR() const {
+    if (from_binary_) { return source_; } // holds the PTX
     auto bytes = size_t{0};
     CheckError(nvrtcGetPTXSize(*program_, &bytes));
     auto result = std::string{};
@@ -325,6 +346,7 @@ class Program {
   std::shared_ptr<nvrtcProgram> program_;
   std::string source_;
   const char* source_ptr_;
+  const bool from_binary_;
 };
 
 // =================================================================================================
@@ -565,10 +587,15 @@ class Kernel {
 
   // Retrieves the amount of local memory used per work-group for this kernel. Note that this is
   // the shared memory in CUDA terminology.
-  size_t LocalMemUsage(const Device &) const {
+  unsigned long LocalMemUsage(const Device &) const {
    auto result = 0;
    CheckError(cuFuncGetAttribute(&result, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel_));
-    return static_cast<size_t>(result);
+    return static_cast<unsigned long>(result);
+  }
+
+  // Retrieves the name of the kernel
+  std::string GetFunctionName() const {
+    return std::string{"unknown"}; // Not implemented for the CUDA backend
   }
 
   // Launches a kernel onto the specified queue
@@ -600,14 +627,15 @@ class Kernel {
   void Launch(const Queue &queue, const std::vector<size_t> &global,
               const std::vector<size_t> &local, EventPointer event,
               std::vector<Event>& waitForEvents) {
-    if (waitForEvents.size() == 0) { return Launch(queue, global, local, event); }
-    Error("launching with an event waiting list is not implemented for the CUDA back-end");
-  }
-
-  // As above, but with the default local workgroup size
-  // TODO: Implement this function
-  void Launch(const Queue &, const std::vector<size_t> &, EventPointer) {
-    Error("launching with a default workgroup size is not implemented for the CUDA back-end");
+    if (local.size() == 0) {
+      Error("launching with a default workgroup size is not implemented for the CUDA back-end");
+    }
+    else if (waitForEvents.size() != 0) {
+      Error("launching with an event waiting list is not implemented for the CUDA back-end");
+    }
+    else {
+      return Launch(queue, global, local, event);
+    }
   }
 
   // Accessors to the private data-members
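Version 8.0's GetAllPlatforms() gives both back-ends the same enumeration entry point; the CUDA variant simply returns the single platform 0. A usage sketch against the OpenCL header (a minimal sketch, assuming CLTune's headers are on the include path and its classes live in the cltune namespace):

#include <cstddef>
#include <cstdio>
#include "internal/clpp11.h"

int main() {
  // Lists device 0 of every platform through the new helper.
  // Assumes each platform exposes at least one device; error handling omitted.
  auto platforms = cltune::GetAllPlatforms();
  for (auto platform_id = size_t{0}; platform_id < platforms.size(); ++platform_id) {
    auto device = cltune::Device(platforms[platform_id], size_t{0});
    std::printf("Platform %zu, device 0: %s\n", platform_id, device.Name().c_str());
  }
  return 0;
}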
From 2a567220df710065e6e2eecf43007e641719dcb3 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren
Date: Tue, 27 Sep 2016 20:49:47 +0200
Subject: [PATCH 5/6] Made the number of runs used for averaging configurable
 by the user

---
 CHANGELOG                     |  1 +
 include/cltune.h              |  3 +++
 include/internal/tuner_impl.h |  2 +-
 samples/simple/simple.cc      |  1 +
 src/cltune.cc                 |  5 +++++
 src/tuner_impl.cc             | 14 ++++++++------
 6 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 4b824c2..ecdc21c 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,6 +1,7 @@
 
 Next release (development version)
 - Updated to version 8.0 of the CLCudaAPI header
+- Made it possible to configure the number of times each kernel is run (to average results)
 
 Version 2.4.0
 - Made it possible to run the unit-tests independently of the provided OpenCL kernel samples

diff --git a/include/cltune.h b/include/cltune.h
index eaf8a72..9837556 100644
--- a/include/cltune.h
+++ b/include/cltune.h
@@ -151,6 +151,9 @@ class Tuner {
   // Disables all further printing to stdout
   void PUBLIC_API SuppressOutput();
 
+  // Changes the number of times each kernel should be run. Used for averaging execution times.
+  void PUBLIC_API SetNumRuns(const size_t num_runs);
+
  private:
 
   // This implements the pointer to implementation idiom (pimpl) and hides all private functions and
diff --git a/include/internal/tuner_impl.h b/include/internal/tuner_impl.h
index ffb1b69..b471fd3 100644
--- a/include/internal/tuner_impl.h
+++ b/include/internal/tuner_impl.h
@@ -74,7 +74,6 @@ class TunerImpl {
 
   // Parameters
   static constexpr auto kMaxL2Norm = 1e-4; // This is the threshold for 'correctness'
-  static constexpr auto kNumRuns = size_t{1}; // This is used for more-accurate execution time measurement
 
   // Messages printed to stdout (in colours)
   static const std::string kMessageFull;
@@ -158,6 +157,7 @@ class TunerImpl {
   Queue queue_;
 
   // Settings
+  size_t num_runs_; // This is used for more-accurate execution time measurement
   bool has_reference_;
   bool suppress_output_;
   bool output_search_process_;

diff --git a/samples/simple/simple.cc b/samples/simple/simple.cc
index fd7062e..0493746 100644
--- a/samples/simple/simple.cc
+++ b/samples/simple/simple.cc
@@ -52,6 +52,7 @@ int main() {
   tuner.AddArgumentOutput(vec_c);
 
   // Starts the tuner
+  tuner.SetNumRuns(10);
   tuner.Tune();
 
   // Prints the results to screen

diff --git a/src/cltune.cc b/src/cltune.cc
index 455fe8e..72e3fb7 100644
--- a/src/cltune.cc
+++ b/src/cltune.cc
@@ -441,5 +441,10 @@ void Tuner::SuppressOutput() {
   pimpl->suppress_output_ = true;
 }
 
+// Sets the number of runs to average time measurements.
+void Tuner::SetNumRuns(const size_t num_runs) {
+  pimpl->num_runs_ = num_runs;
+}
+
 // =================================================================================================
 } // namespace cltune

diff --git a/src/tuner_impl.cc b/src/tuner_impl.cc
index 4a67495..58bb171 100644
--- a/src/tuner_impl.cc
+++ b/src/tuner_impl.cc
@@ -68,6 +68,7 @@ TunerImpl::TunerImpl():
     device_(Device(platform_, size_t{0})),
     context_(Context(device_)),
     queue_(Queue(context_, device_)),
+    num_runs_(size_t{1}),
     has_reference_(false),
     suppress_output_(false),
     output_search_process_(false),
@@ -90,6 +91,7 @@ TunerImpl::TunerImpl(size_t platform_id, size_t device_id):
     device_(Device(platform_, device_id)),
     context_(Context(device_)),
    queue_(Queue(context_, device_)),
+    num_runs_(size_t{1}),
     has_reference_(false),
     suppress_output_(false),
     output_search_process_(false),
@@ -325,11 +327,11 @@ TunerImpl::TunerResult TunerImpl::RunKernel(const std::string &source, const Ker
 
     // Runs the kernel (this is the timed part)
     fprintf(stdout, "%s Running %s\n", kMessageRun.c_str(), kernel.name().c_str());
-    auto events = std::vector<Event>(kNumRuns);
-    for (auto t=size_t{0}; t<kNumRuns; ++t) {
+    auto events = std::vector<Event>(num_runs_);
+    for (auto t=size_t{0}; t<num_runs_; ++t) {
       kernel.Launch(queue_, global, local, events[t].pointer());
     }
     queue_.Finish();
     auto elapsed_time = std::numeric_limits<float>::max();
-    for (auto t=size_t{0}; t<kNumRuns; ++t) {
+    for (auto t=size_t{0}; t<num_runs_; ++t) {
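With the run count now configurable, each kernel is enqueued num_runs_ times; the initialisation to std::numeric_limits<float>::max() in the hunk above suggests the best (minimum) time over those runs is what gets kept, which filters out scheduler and warm-up noise better than a plain mean. The reduction, boiled down to a sketch ('RunOnce' is a hypothetical stand-in for one timed kernel execution):

#include <algorithm>
#include <cstddef>
#include <limits>

// Returns the fastest of num_runs executions, in milliseconds.
float BestOfRuns(const std::size_t num_runs, float (*RunOnce)()) {
  auto elapsed_time = std::numeric_limits<float>::max();
  for (auto t = std::size_t{0}; t < num_runs; ++t) {
    elapsed_time = std::min(elapsed_time, RunOnce());
  }
  return elapsed_time;
}

The sample's tuner.SetNumRuns(10) therefore trades a roughly ten-fold longer tuning session for more stable timings.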
From: Cedric Nugteren
Date: Tue, 27 Sep 2016 20:53:32 +0200
Subject: [PATCH 6/6] Updated to version 2.5.0

---
 CHANGELOG      | 2 +-
 CMakeLists.txt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index ecdc21c..5fcb1a6 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,5 +1,5 @@
 
-Next release (development version)
+Version 2.5.0
 - Updated to version 8.0 of the CLCudaAPI header
 - Made it possible to configure the number of times each kernel is run (to average results)
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 831b421..14a106a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,7 +32,7 @@ set(CMAKE_USER_MAKE_RULES_OVERRIDE_CXX ${CMAKE_CURRENT_SOURCE_DIR}/cmake/cxx_fla
 # CMake project details
 project("cltune" CXX)
 set(cltune_VERSION_MAJOR 2)
-set(cltune_VERSION_MINOR 4)
+set(cltune_VERSION_MINOR 5)
 set(cltune_VERSION_PATCH 0)
 
 # Options