From d8318a507e6df42164b48c664e37a5c3e4d02ee2 Mon Sep 17 00:00:00 2001
From: williamjshipman
Date: Sun, 31 Jul 2016 01:31:22 +0200
Subject: [PATCH 1/6] Fix bug in Kernel::LocalMemUsage where the Intel CPU
 runtime returns a size of 0 in the first call to clGetKernelWorkGroupInfo.
 The cause seems to be an ambiguity in the OpenCL standard.

---
 include/internal/clpp11.h | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/include/internal/clpp11.h b/include/internal/clpp11.h
index e3bcb01..1b0025b 100644
--- a/include/internal/clpp11.h
+++ b/include/internal/clpp11.h
@@ -86,12 +86,10 @@ class Event {
   //   http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx
   float GetElapsedTime() const {
     WaitForCompletion();
-    auto bytes = size_t{0};
-    clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes);
-    auto time_start = size_t{0};
+    auto bytes = sizeof(cl_ulong);
+    auto time_start = cl_ulong{0};
     clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr);
-    clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, 0, nullptr, &bytes);
-    auto time_end = size_t{0};
+    auto time_end = cl_ulong{0};
     clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr);
     return (time_end - time_start) * 1.0e-6f;
   }
@@ -592,8 +590,7 @@ class Buffer {
 
   // Retrieves the actual allocated size in bytes
   size_t GetSize() const {
-    auto bytes = size_t{0};
-    CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, 0, nullptr, &bytes));
+    auto bytes = sizeof(size_t);
     auto result = size_t{0};
     CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, bytes, &result, nullptr));
     return result;
@@ -645,9 +642,8 @@ class Kernel {
 
   // Retrieves the amount of local memory used per work-group for this kernel
   size_t LocalMemUsage(const Device &device) const {
-    auto bytes = size_t{0};
+    auto bytes = sizeof(cl_ulong);
     auto query = cl_kernel_work_group_info{CL_KERNEL_LOCAL_MEM_SIZE};
-    CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, 0, nullptr, &bytes));
     auto result = size_t{0};
     CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, bytes, &result, nullptr));
     return result;
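The fix above works because CL_KERNEL_LOCAL_MEM_SIZE is specified to return a cl_ulong, so the usual two-step OpenCL query (first ask for the value's size, then fetch the value) can be collapsed into a single call with a compile-time size. A minimal sketch of both patterns, using a hypothetical helper that assumes valid kernel and device handles:

#include <CL/cl.h>

// Illustration only; not part of the patch.
cl_ulong QueryLocalMemSize(cl_kernel kernel, cl_device_id device) {
  // Two-step pattern: some runtimes (here: the Intel CPU runtime, as reported
  // above) return a size of 0 from this first call, breaking the second one.
  auto bytes = size_t{0};
  clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_LOCAL_MEM_SIZE,
                           0, nullptr, &bytes);

  // Fixed-size pattern adopted by the patch: sizeof(cl_ulong) is known
  // up-front, so the preliminary size query can be skipped entirely.
  auto result = cl_ulong{0};
  clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_LOCAL_MEM_SIZE,
                           sizeof(result), &result, nullptr);
  return result;
}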
From 68cb1d44d0bd1d14a4df042de02ea21f12b3f479 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren
Date: Wed, 3 Aug 2016 20:17:41 +0200
Subject: [PATCH 2/6] Updated to version 7.0 of the CLCudaAPI header

---
 CHANGELOG                 |   3 ++
 include/internal/clpp11.h | 108 ++++++++++++++++++++++----------------
 2 files changed, 65 insertions(+), 46 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index fd6ff46..8a7553f 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,4 +1,7 @@
 
+Next release (development version)
+- Updated to version 7.0 of the CLCudaAPI header
+
 Version 2.4.0
 - Made it possible to run the unit-tests independently of the provided OpenCL kernel samples
 - Added an option to compile in verbose mode for additional diagnostic messages (-DVERBOSE=ON)

diff --git a/include/internal/clpp11.h b/include/internal/clpp11.h
index 1b0025b..e630187 100644
--- a/include/internal/clpp11.h
+++ b/include/internal/clpp11.h
@@ -70,15 +70,24 @@ inline void CheckError(const cl_int status) {
 class Event {
  public:
 
-  // Constructor based on the regular OpenCL data-type
-  explicit Event(const cl_event event): event_(event) { }
+  // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere
+  explicit Event(const cl_event event):
+      event_(new cl_event) {
+    *event_ = event;
+  }
 
-  // Regular constructor
-  explicit Event(): event_(nullptr) { }
+  // Regular constructor with memory management
+  explicit Event():
+      event_(new cl_event, [](cl_event* e) {
+        if (*e) { CheckError(clReleaseEvent(*e)); }
+        delete e;
+      }) {
+    *event_ = nullptr;
+  }
 
   // Waits for completion of this event
   void WaitForCompletion() const {
-    CheckError(clWaitForEvents(1, &event_));
+    CheckError(clWaitForEvents(1, &(*event_)));
   }
 
   // Retrieves the elapsed time of the last recorded event. Note that no error checking is done on
@@ -86,19 +95,21 @@ class Event {
   //   http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx
   float GetElapsedTime() const {
     WaitForCompletion();
-    auto bytes = sizeof(cl_ulong);
+    const auto bytes = sizeof(cl_ulong);
     auto time_start = cl_ulong{0};
-    clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr);
+    clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr);
     auto time_end = cl_ulong{0};
-    clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr);
-    return (time_end - time_start) * 1.0e-6f;
+    clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr);
+    return static_cast<float>(time_end - time_start) * 1.0e-6f;
   }
 
   // Accessor to the private data-member
-  cl_event& operator()() { return event_; }
-  cl_event* pointer() { return &event_; }
+  cl_event& operator()() { return *event_; }
+  const cl_event& operator()() const { return *event_; }
+  cl_event* pointer() { return &(*event_); }
+  const cl_event* pointer() const { return &(*event_); }
  private:
-  cl_event event_;
+  std::shared_ptr<cl_event> event_;
 };
 
 // Pointer to an OpenCL event
@@ -181,24 +192,32 @@ class Device {
   }
   size_t MaxWorkGroupSize() const { return GetInfo<size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE); }
   size_t MaxWorkItemDimensions() const {
-    return GetInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS);
+    return static_cast<size_t>(GetInfo<cl_uint>(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS));
   }
   std::vector<size_t> MaxWorkItemSizes() const {
     return GetInfoVector<size_t>(CL_DEVICE_MAX_WORK_ITEM_SIZES);
   }
-  size_t LocalMemSize() const {
-    return static_cast<size_t>(GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE));
+  unsigned long LocalMemSize() const {
+    return GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE);
   }
   std::string Capabilities() const { return GetInfoString(CL_DEVICE_EXTENSIONS); }
-  size_t CoreClock() const { return GetInfo(CL_DEVICE_MAX_CLOCK_FREQUENCY); }
-  size_t ComputeUnits() const { return GetInfo(CL_DEVICE_MAX_COMPUTE_UNITS); }
-  size_t MemorySize() const { return GetInfo(CL_DEVICE_GLOBAL_MEM_SIZE); }
-  size_t MaxAllocSize() const { return GetInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE); }
+  size_t CoreClock() const {
+    return static_cast<size_t>(GetInfo<cl_uint>(CL_DEVICE_MAX_CLOCK_FREQUENCY));
+  }
+  size_t ComputeUnits() const {
+    return static_cast<size_t>(GetInfo<cl_uint>(CL_DEVICE_MAX_COMPUTE_UNITS));
+  }
+  unsigned long MemorySize() const {
+    return static_cast<unsigned long>(GetInfo<cl_ulong>(CL_DEVICE_GLOBAL_MEM_SIZE));
+  }
+  unsigned long MaxAllocSize() const {
+    return static_cast<unsigned long>(GetInfo<cl_ulong>(CL_DEVICE_MAX_MEM_ALLOC_SIZE));
+  }
   size_t MemoryClock() const { return 0; }    // Not exposed in OpenCL
   size_t MemoryBusWidth() const { return 0; } // Not exposed in OpenCL
 
   // Configuration-validity checks
-  bool IsLocalMemoryValid(const size_t local_mem_usage) const {
+  bool IsLocalMemoryValid(const cl_ulong local_mem_usage) const {
     return (local_mem_usage <= LocalMemSize());
   }
   bool IsThreadConfigValid(const std::vector<size_t> &local) const {
@@ -215,9 +234,11 @@ class Device {
   // Query for a specific type of device or brand
   bool IsCPU() const { return Type() == "CPU"; }
   bool IsGPU() const { return Type() == "GPU"; }
-  bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc."; }
+  bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc." ||
+                              Vendor() == "AuthenticAMD"; }
   bool IsNVIDIA() const { return Vendor() == "NVIDIA" || Vendor() == "NVIDIA Corporation"; }
-  bool IsIntel() const { return Vendor() == "Intel" || Vendor() == "GenuineIntel"; }
+  bool IsIntel() const { return Vendor() == "INTEL" || Vendor() == "Intel" ||
+                                Vendor() == "GenuineIntel"; }
   bool IsARM() const { return Vendor() == "ARM"; }
 
   // Accessor to the private data-member
@@ -234,13 +255,6 @@ class Device {
     CheckError(clGetDeviceInfo(device_, info, bytes, &result, nullptr));
     return result;
   }
-  size_t GetInfo(const cl_device_info info) const {
-    auto bytes = size_t{0};
-    CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes));
-    auto result = cl_uint(0);
-    CheckError(clGetDeviceInfo(device_, info, bytes, &result, nullptr));
-    return static_cast<size_t>(result);
-  }
   template <typename T>
   std::vector<T> GetInfoVector(const cl_device_info info) const {
     auto bytes = size_t{0};
@@ -590,7 +604,7 @@ class Buffer {
 
   // Retrieves the actual allocated size in bytes
   size_t GetSize() const {
-    auto bytes = sizeof(size_t);
+    const auto bytes = sizeof(size_t);
     auto result = size_t{0};
     CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, bytes, &result, nullptr));
     return result;
@@ -641,12 +655,22 @@ class Kernel {
   }
 
   // Retrieves the amount of local memory used per work-group for this kernel
-  size_t LocalMemUsage(const Device &device) const {
-    auto bytes = sizeof(cl_ulong);
+  unsigned long LocalMemUsage(const Device &device) const {
+    const auto bytes = sizeof(cl_ulong);
     auto query = cl_kernel_work_group_info{CL_KERNEL_LOCAL_MEM_SIZE};
-    auto result = size_t{0};
+    auto result = cl_ulong{0};
     CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, bytes, &result, nullptr));
-    return result;
+    return static_cast<unsigned long>(result);
+  }
+
+  // Retrieves the name of the kernel
+  std::string GetFunctionName() const {
+    auto bytes = size_t{0};
+    CheckError(clGetKernelInfo(*kernel_, CL_KERNEL_FUNCTION_NAME, 0, nullptr, &bytes));
+    auto result = std::string{};
+    result.resize(bytes);
+    CheckError(clGetKernelInfo(*kernel_, CL_KERNEL_FUNCTION_NAME, bytes, &result[0], nullptr));
+    return std::string{result.c_str()}; // Removes any trailing '\0'-characters
   }
 
   // Launches a kernel onto the specified queue
@@ -660,30 +684,22 @@ class Kernel {
   // As above, but with an event waiting list
   void Launch(const Queue &queue, const std::vector<size_t> &global,
               const std::vector<size_t> &local, EventPointer event,
-              std::vector<Event>& waitForEvents) {
-    if (waitForEvents.size() == 0) { return Launch(queue, global, local, event); }
+              const std::vector<Event> &waitForEvents) {
 
     // Builds a plain version of the events waiting list
     auto waitForEventsPlain = std::vector<cl_event>();
     for (auto &waitEvent : waitForEvents) {
-      waitForEventsPlain.push_back(waitEvent());
+      if (waitEvent()) { waitForEventsPlain.push_back(waitEvent()); }
     }
 
     // Launches the kernel while waiting for other events
     CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
-                                      nullptr, global.data(), local.data(),
+                                      nullptr, global.data(), !local.empty() ? local.data() : nullptr,
                                       static_cast<cl_uint>(waitForEventsPlain.size()),
-                                      waitForEventsPlain.data(),
+                                      !waitForEventsPlain.empty() ? waitForEventsPlain.data() : nullptr,
                                       event));
   }
 
-  // As above, but with the default local workgroup size
-  void Launch(const Queue &queue, const std::vector<size_t> &global, EventPointer event) {
-    CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
-                                      nullptr, global.data(), nullptr,
-                                      0, nullptr, event));
-  }
-
   // Accessor to the private data-member
   const cl_kernel& operator()() const { return *kernel_; }
  private:
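Version 7.0's main change is that Event now owns its cl_event through a std::shared_ptr with a custom deleter, so the OpenCL object is released exactly once, when the last copy of the wrapper goes out of scope. The same pattern in isolation, as a minimal standalone sketch (the class name is hypothetical):

#include <CL/cl.h>
#include <memory>

class ManagedEvent {
 public:
  ManagedEvent():
      event_(new cl_event, [](cl_event* e) {
        if (*e) { clReleaseEvent(*e); }  // release the OpenCL object, if set
        delete e;                        // then free the holder itself
      }) {
    *event_ = nullptr;
  }
  cl_event& operator()() { return *event_; }
 private:
  std::shared_ptr<cl_event> event_;  // copies share one reference count
};

Note that the constructor taking a raw cl_event installs no releasing deleter, matching its "memory management is handled elsewhere" comment: events adopted from outside are never released by the wrapper.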
"GPU"; } - bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc."; } + bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc." || + Vendor() == "AuthenticAMD";; } bool IsNVIDIA() const { return Vendor() == "NVIDIA" || Vendor() == "NVIDIA Corporation"; } - bool IsIntel() const { return Vendor() == "Intel" || Vendor() == "GenuineIntel"; } + bool IsIntel() const { return Vendor() == "INTEL" || Vendor() == "Intel" || + Vendor() == "GenuineIntel"; } bool IsARM() const { return Vendor() == "ARM"; } // Accessor to the private data-member @@ -234,13 +255,6 @@ class Device { CheckError(clGetDeviceInfo(device_, info, bytes, &result, nullptr)); return result; } - size_t GetInfo(const cl_device_info info) const { - auto bytes = size_t{0}; - CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes)); - auto result = cl_uint(0); - CheckError(clGetDeviceInfo(device_, info, bytes, &result, nullptr)); - return static_cast(result); - } template std::vector GetInfoVector(const cl_device_info info) const { auto bytes = size_t{0}; @@ -590,7 +604,7 @@ class Buffer { // Retrieves the actual allocated size in bytes size_t GetSize() const { - auto bytes = sizeof(size_t); + const auto bytes = sizeof(size_t); auto result = size_t{0}; CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, bytes, &result, nullptr)); return result; @@ -641,12 +655,22 @@ class Kernel { } // Retrieves the amount of local memory used per work-group for this kernel - size_t LocalMemUsage(const Device &device) const { - auto bytes = sizeof(cl_ulong); + unsigned long LocalMemUsage(const Device &device) const { + const auto bytes = sizeof(cl_ulong); auto query = cl_kernel_work_group_info{CL_KERNEL_LOCAL_MEM_SIZE}; - auto result = size_t{0}; + auto result = cl_ulong{0}; CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, bytes, &result, nullptr)); - return result; + return static_cast(result); + } + + // Retrieves the name of the kernel + std::string GetFunctionName() const { + auto bytes = size_t{0}; + CheckError(clGetKernelInfo(*kernel_, CL_KERNEL_FUNCTION_NAME, 0, nullptr, &bytes)); + auto result = std::string{}; + result.resize(bytes); + CheckError(clGetKernelInfo(*kernel_, CL_KERNEL_FUNCTION_NAME, bytes, &result[0], nullptr)); + return std::string{result.c_str()}; // Removes any trailing '\0'-characters } // Launches a kernel onto the specified queue @@ -660,30 +684,22 @@ class Kernel { // As above, but with an event waiting list void Launch(const Queue &queue, const std::vector &global, const std::vector &local, EventPointer event, - std::vector& waitForEvents) { - if (waitForEvents.size() == 0) { return Launch(queue, global, local, event); } + const std::vector &waitForEvents) { // Builds a plain version of the events waiting list auto waitForEventsPlain = std::vector(); for (auto &waitEvent : waitForEvents) { - waitForEventsPlain.push_back(waitEvent()); + if (waitEvent()) { waitForEventsPlain.push_back(waitEvent()); } } // Launches the kernel while waiting for other events CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast(global.size()), - nullptr, global.data(), local.data(), + nullptr, global.data(), !local.empty() ? local.data() : nullptr, static_cast(waitForEventsPlain.size()), - waitForEventsPlain.data(), + !waitForEventsPlain.empty() ? 
From bb4ba83d3c517b8659c7be3416878cb0459fb3c1 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren
Date: Tue, 27 Sep 2016 20:48:10 +0200
Subject: [PATCH 4/6] Updated to version 8.0 of CLCudaAPI

---
 CHANGELOG                 |  2 +-
 include/internal/clpp11.h | 16 +++++++++--
 include/internal/cupp11.h | 60 ++++++++++++++++++++++++++++-----------
 3 files changed, 59 insertions(+), 19 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 8a7553f..4b824c2 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,6 +1,6 @@
 
 Next release (development version)
-- Updated to version 7.0 of the CLCudaAPI header
+- Updated to version 8.0 of the CLCudaAPI header
 
 Version 2.4.0
 - Made it possible to run the unit-tests independently of the provided OpenCL kernel samples

diff --git a/include/internal/clpp11.h b/include/internal/clpp11.h
index e630187..2a7b22f 100644
--- a/include/internal/clpp11.h
+++ b/include/internal/clpp11.h
@@ -11,7 +11,8 @@
 // Portability here means that a similar header exists for CUDA with the same classes and
 // interfaces. In other words, moving from the OpenCL API to the CUDA API becomes a one-line change.
 //
-// This is version 6.0 of CLCudaAPI <https://github.com/CNugteren/CLCudaAPI>.
+// This file is taken from the CLCudaAPI project <https://github.com/CNugteren/CLCudaAPI> and
+// therefore contains the following header copyright notice:
 //
 // =================================================================================================
 //
@@ -148,6 +149,17 @@ class Platform {
   cl_platform_id platform_;
 };
 
+// Retrieves a vector with all platforms
+inline std::vector<Platform> GetAllPlatforms() {
+  auto num_platforms = cl_uint{0};
+  CheckError(clGetPlatformIDs(0, nullptr, &num_platforms));
+  auto all_platforms = std::vector<Platform>();
+  for (size_t platform_id = 0; platform_id < static_cast<size_t>(num_platforms); ++platform_id) {
+    all_platforms.push_back(Platform(platform_id));
+  }
+  return all_platforms;
+}
+
 // =================================================================================================
 
 // C++11 version of 'cl_device_id'
@@ -198,7 +210,7 @@ class Device {
     return GetInfoVector<size_t>(CL_DEVICE_MAX_WORK_ITEM_SIZES);
   }
   unsigned long LocalMemSize() const {
-    return GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE);
+    return static_cast<unsigned long>(GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE));
   }
   std::string Capabilities() const { return GetInfoString(CL_DEVICE_EXTENSIONS); }
   size_t CoreClock() const {
diff --git a/include/internal/cupp11.h b/include/internal/cupp11.h
index c1098f4..67a99d7 100644
--- a/include/internal/cupp11.h
+++ b/include/internal/cupp11.h
@@ -11,7 +11,8 @@
 // Portability here means that a similar header exists for OpenCL with the same classes and
 // interfaces. In other words, moving from the CUDA API to the OpenCL API becomes a one-line change.
 //
-// This is version 6.0 of CLCudaAPI <https://github.com/CNugteren/CLCudaAPI>.
+// This file is taken from the CLCudaAPI project <https://github.com/CNugteren/CLCudaAPI> and
+// therefore contains the following header copyright notice:
 //
 // =================================================================================================
 //
@@ -138,6 +139,12 @@ class Platform {
   size_t platform_id_;
 };
 
+// Retrieves a vector with all platforms. Note that there is just one platform in CUDA.
+inline std::vector<Platform> GetAllPlatforms() {
+  auto all_platforms = std::vector<Platform>{ Platform(size_t{0}) };
+  return all_platforms;
+}
+
 // =================================================================================================
 
 // C++11 version of 'CUdevice'
@@ -180,7 +187,9 @@ class Device {
            GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y),
            GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z)};
   }
-  size_t LocalMemSize() const { return GetInfo(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK); }
+  unsigned long LocalMemSize() const {
+    return static_cast<unsigned long>(GetInfo(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK));
+  }
   std::string Capabilities() const {
     auto major = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR);
     auto minor = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR);
@@ -188,12 +197,12 @@ class Device {
   }
   size_t CoreClock() const { return 1e-3*GetInfo(CU_DEVICE_ATTRIBUTE_CLOCK_RATE); }
   size_t ComputeUnits() const { return GetInfo(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT); }
-  size_t MemorySize() const {
+  unsigned long MemorySize() const {
     auto result = size_t{0};
     CheckError(cuDeviceTotalMem(&result, device_));
-    return result;
+    return static_cast<unsigned long>(result);
   }
-  size_t MaxAllocSize() const { return MemorySize(); }
+  unsigned long MaxAllocSize() const { return MemorySize(); }
   size_t MemoryClock() const { return 1e-3*GetInfo(CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE); }
   size_t MemoryBusWidth() const { return GetInfo(CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH); }
 
@@ -276,12 +285,22 @@ class Program {
       program_(new nvrtcProgram, [](nvrtcProgram* p) { CheckError(nvrtcDestroyProgram(p));
                                                        delete p; }),
       source_(std::move(source)),
-      source_ptr_(&source_[0]) {
+      source_ptr_(&source_[0]),
+      from_binary_(false) {
     CheckError(nvrtcCreateProgram(program_.get(), source_ptr_, nullptr, 0, nullptr, nullptr));
   }
 
+  // PTX-based constructor
+  explicit Program(const Device &device, const Context &context, const std::string& binary):
+      program_(nullptr), // not used
+      source_(binary),
+      source_ptr_(&source_[0]), // not used
+      from_binary_(true) {
+  }
+
   // Compiles the device program and returns whether or not there were any warnings/errors
   BuildStatus Build(const Device &, std::vector<std::string> &options) {
+    if (from_binary_) { return BuildStatus::kSuccess; }
     auto raw_options = std::vector<const char*>();
     for (const auto &option: options) {
       raw_options.push_back(option.c_str());
@@ -301,6 +320,7 @@ class Program {
 
   // Retrieves the warning/error message from the compiler (if any)
   std::string GetBuildInfo(const Device &) const {
+    if (from_binary_) { return std::string{}; }
     auto bytes = size_t{0};
     CheckError(nvrtcGetProgramLogSize(*program_, &bytes));
     auto result = std::string{};
@@ -311,6 +331,7 @@ class Program {
 
   // Retrieves an intermediate representation of the compiled program (i.e. PTX)
   std::string GetIR() const {
+    if (from_binary_) { return source_; } // holds the PTX
     auto bytes = size_t{0};
     CheckError(nvrtcGetPTXSize(*program_, &bytes));
     auto result = std::string{};
@@ -325,6 +346,7 @@ class Program {
   std::shared_ptr<nvrtcProgram> program_;
   std::string source_;
   const char* source_ptr_;
+  const bool from_binary_;
 };
 
 // =================================================================================================
@@ -565,10 +587,15 @@ class Kernel {
 
   // Retrieves the amount of local memory used per work-group for this kernel. Note that this is
   // the shared memory in CUDA terminology.
-  size_t LocalMemUsage(const Device &) const {
+  unsigned long LocalMemUsage(const Device &) const {
    auto result = 0;
    CheckError(cuFuncGetAttribute(&result, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel_));
-    return static_cast<size_t>(result);
+    return static_cast<unsigned long>(result);
+  }
+
+  // Retrieves the name of the kernel
+  std::string GetFunctionName() const {
+    return std::string{"unknown"}; // Not implemented for the CUDA backend
   }
 
   // Launches a kernel onto the specified queue
@@ -600,14 +627,15 @@ class Kernel {
   void Launch(const Queue &queue, const std::vector<size_t> &global,
               const std::vector<size_t> &local, EventPointer event,
               std::vector<Event>& waitForEvents) {
-    if (waitForEvents.size() == 0) { return Launch(queue, global, local, event); }
-    Error("launching with an event waiting list is not implemented for the CUDA back-end");
-  }
-
-  // As above, but with the default local workgroup size
-  // TODO: Implement this function
-  void Launch(const Queue &, const std::vector<size_t> &, EventPointer) {
-    Error("launching with a default workgroup size is not implemented for the CUDA back-end");
+    if (local.size() == 0) {
+      Error("launching with a default workgroup size is not implemented for the CUDA back-end");
+    }
+    else if (waitForEvents.size() != 0) {
+      Error("launching with an event waiting list is not implemented for the CUDA back-end");
+    }
+    else {
+      return Launch(queue, global, local, event);
+    }
   }
 
   // Accessors to the private data-members
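Version 8.0's GetAllPlatforms() gives both back-ends the same enumeration entry point; the CUDA variant simply returns the single platform 0. A usage sketch against the OpenCL header (a minimal sketch, assuming CLTune's headers are on the include path and its classes live in the cltune namespace):

#include <cstddef>
#include <cstdio>
#include "internal/clpp11.h"

int main() {
  // Lists device 0 of every platform through the new helper.
  // Assumes each platform exposes at least one device; error handling omitted.
  auto platforms = cltune::GetAllPlatforms();
  for (auto platform_id = size_t{0}; platform_id < platforms.size(); ++platform_id) {
    auto device = cltune::Device(platforms[platform_id], size_t{0});
    std::printf("Platform %zu, device 0: %s\n", platform_id, device.Name().c_str());
  }
  return 0;
}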
From 2a567220df710065e6e2eecf43007e641719dcb3 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren
Date: Tue, 27 Sep 2016 20:49:47 +0200
Subject: [PATCH 5/6] Made the number of runs used for averaging configurable
 by the user

---
 CHANGELOG                     |  1 +
 include/cltune.h              |  3 +++
 include/internal/tuner_impl.h |  2 +-
 samples/simple/simple.cc      |  1 +
 src/cltune.cc                 |  5 +++++
 src/tuner_impl.cc             | 14 ++++++++------
 6 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 4b824c2..ecdc21c 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,6 +1,7 @@
 
 Next release (development version)
 - Updated to version 8.0 of the CLCudaAPI header
+- Made it possible to configure the number of times each kernel is run (to average results)
 
 Version 2.4.0
 - Made it possible to run the unit-tests independently of the provided OpenCL kernel samples

diff --git a/include/cltune.h b/include/cltune.h
index eaf8a72..9837556 100644
--- a/include/cltune.h
+++ b/include/cltune.h
@@ -151,6 +151,9 @@ class Tuner {
   // Disables all further printing to stdout
   void PUBLIC_API SuppressOutput();
 
+  // Changes the number of times each kernel should be run. Used for averaging execution times.
+  void PUBLIC_API SetNumRuns(const size_t num_runs);
+
  private:
 
   // This implements the pointer to implementation idiom (pimpl) and hides all private functions and
diff --git a/include/internal/tuner_impl.h b/include/internal/tuner_impl.h
index ffb1b69..b471fd3 100644
--- a/include/internal/tuner_impl.h
+++ b/include/internal/tuner_impl.h
@@ -74,7 +74,6 @@ class TunerImpl {
 
   // Parameters
   static constexpr auto kMaxL2Norm = 1e-4; // This is the threshold for 'correctness'
-  static constexpr auto kNumRuns = size_t{1}; // This is used for more-accurate execution time measurement
 
   // Messages printed to stdout (in colours)
   static const std::string kMessageFull;
@@ -158,6 +157,7 @@ class TunerImpl {
   Queue queue_;
 
   // Settings
+  size_t num_runs_; // This is used for more-accurate execution time measurement
   bool has_reference_;
   bool suppress_output_;
   bool output_search_process_;

diff --git a/samples/simple/simple.cc b/samples/simple/simple.cc
index fd7062e..0493746 100644
--- a/samples/simple/simple.cc
+++ b/samples/simple/simple.cc
@@ -52,6 +52,7 @@ int main() {
   tuner.AddArgumentOutput(vec_c);
 
   // Starts the tuner
+  tuner.SetNumRuns(10);
   tuner.Tune();
 
   // Prints the results to screen

diff --git a/src/cltune.cc b/src/cltune.cc
index 455fe8e..72e3fb7 100644
--- a/src/cltune.cc
+++ b/src/cltune.cc
@@ -441,5 +441,10 @@ void Tuner::SuppressOutput() {
   pimpl->suppress_output_ = true;
 }
 
+// Sets the number of runs to average time measurements.
+void Tuner::SetNumRuns(const size_t num_runs) {
+  pimpl->num_runs_ = num_runs;
+}
+
 // =================================================================================================
 } // namespace cltune

diff --git a/src/tuner_impl.cc b/src/tuner_impl.cc
index 4a67495..58bb171 100644
--- a/src/tuner_impl.cc
+++ b/src/tuner_impl.cc
@@ -68,6 +68,7 @@ TunerImpl::TunerImpl():
     device_(Device(platform_, size_t{0})),
     context_(Context(device_)),
     queue_(Queue(context_, device_)),
+    num_runs_(size_t{1}),
     has_reference_(false),
     suppress_output_(false),
     output_search_process_(false),
@@ -90,6 +91,7 @@ TunerImpl::TunerImpl(size_t platform_id, size_t device_id):
     device_(Device(platform_, device_id)),
     context_(Context(device_)),
    queue_(Queue(context_, device_)),
+    num_runs_(size_t{1}),
     has_reference_(false),
     suppress_output_(false),
     output_search_process_(false),
@@ -325,11 +327,11 @@ TunerImpl::TunerResult TunerImpl::RunKernel(const std::string &source, const Ker
 
     // Runs the kernel (this is the timed part)
     fprintf(stdout, "%s Running %s\n", kMessageRun.c_str(), kernel.name().c_str());
-    auto events = std::vector<Event>(kNumRuns);
-    for (auto t=size_t{0}; t<kNumRuns; ++t) {
+    auto events = std::vector<Event>(num_runs_);
+    for (auto t=size_t{0}; t<num_runs_; ++t) {
       kernel.Launch(queue_, global, local, events[t].pointer());
     }
     queue_.Finish();
     auto elapsed_time = std::numeric_limits<float>::max();
-    for (auto t=size_t{0}; t<kNumRuns; ++t) {
+    for (auto t=size_t{0}; t<num_runs_; ++t) {
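With the run count now configurable, each kernel is enqueued num_runs_ times; the initialisation to std::numeric_limits<float>::max() in the hunk above suggests the best (minimum) time over those runs is what gets kept, which filters out scheduler and warm-up noise better than a plain mean. The reduction, boiled down to a sketch ('RunOnce' is a hypothetical stand-in for one timed kernel execution):

#include <algorithm>
#include <cstddef>
#include <limits>

// Returns the fastest of num_runs executions, in milliseconds.
float BestOfRuns(const std::size_t num_runs, float (*RunOnce)()) {
  auto elapsed_time = std::numeric_limits<float>::max();
  for (auto t = std::size_t{0}; t < num_runs; ++t) {
    elapsed_time = std::min(elapsed_time, RunOnce());
  }
  return elapsed_time;
}

The sample's tuner.SetNumRuns(10) therefore trades a roughly ten-fold longer tuning session for more stable timings.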
From: Cedric Nugteren
Date: Tue, 27 Sep 2016 20:53:32 +0200
Subject: [PATCH 6/6] Updated to version 2.5.0

---
 CHANGELOG      | 2 +-
 CMakeLists.txt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index ecdc21c..5fcb1a6 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,5 +1,5 @@
 
-Next release (development version)
+Version 2.5.0
 - Updated to version 8.0 of the CLCudaAPI header
 - Made it possible to configure the number of times each kernel is run (to average results)
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 831b421..14a106a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,7 +32,7 @@ set(CMAKE_USER_MAKE_RULES_OVERRIDE_CXX ${CMAKE_CURRENT_SOURCE_DIR}/cmake/cxx_fla
 # CMake project details
 project("cltune" CXX)
 set(cltune_VERSION_MAJOR 2)
-set(cltune_VERSION_MINOR 4)
+set(cltune_VERSION_MINOR 5)
 set(cltune_VERSION_PATCH 0)
 
 # Options