From e95c15842dfb2577cc519352024cd2163a35e899 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Tue, 31 May 2016 20:37:17 +0200
Subject: [PATCH 1/9] Unit-tests are now based on string-kernels instead of
 external-file-kernels to make it possible to run the unit test executables
 anywhere

---
 CHANGELOG     |  3 +++
 test/tuner.cc | 25 +++++++++++++++++++++----
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 8395f9c..d9a3f92 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,4 +1,7 @@
 
+Development version (next release)
+- Made it possible to run the unit-tests independently of the provided OpenCL kernel samples
+
 Version 2.3.1
 - Fixed a bug where an output buffer could not be used as input at the same time
 - Fixed computing the validation error for half-precision fp16 data-types
diff --git a/test/tuner.cc b/test/tuner.cc
index 0f09db6..4bd09d3 100644
--- a/test/tuner.cc
+++ b/test/tuner.cc
@@ -20,6 +20,23 @@ const size_t kDeviceID = 0;
 const size_t kInvalidPlatformID = 99;
 const size_t kInvalidDeviceID = 99;
 
+// Example kernels
+const auto kernel1 = R"(
+__kernel void small_kernel(__global float* array) {
+  array[get_global_id(0)] = result;
+})";
+const auto kernel2 = R"(
+__kernel void matvec_reference(const int kSizeM, const int kSizeN,
+                               const __global float* mat_a,
+                               const __global float* vec_x,
+                               __global float* vec_y) {
+  float result = 0.0f;
+  for (int j=0; j<kSizeN; ++j) {
+    result += mat_a[j*kSizeM + get_global_id(0)] * vec_x[j];
+  }
+  vec_y[get_global_id(0)] = result;
+})";
+
 // =================================================================================================
 
 SCENARIO("tuners can be created", "[Tuner]") {
@@ -43,8 +60,8 @@ SCENARIO("kernels can be added", "[Tuner]") {
     const auto kConfigGlobal = cltune::IntRange(128, 256);
     const auto kConfigLocal = cltune::IntRange(8, 1);
     const std::vector<std::pair<std::string,std::string>> kExampleKernels = {
-      {"../samples/multiple_kernels/multiple_kernels_reference.opencl","matvec_reference"},
-      {"../samples/multiple_kernels/multiple_kernels_unroll.opencl","matvec_unroll"}
+      {kernel1,"small_kernel"},
+      {kernel2,"matvec_reference"}
     };
     const auto kExampleParameter = std::string{"TEST_PARAM"};
     const auto kExampleParameterValues = std::initializer_list<size_t>{6, 9, 1003};
@@ -55,8 +72,8 @@ SCENARIO("kernels can be added", "[Tuner]") {
 
       for (; counter<kExampleKernels.size(); ++counter) {
         auto example_kernel = kExampleKernels[counter];
-        auto id = tuner.AddKernel({example_kernel.first}, example_kernel.second,
-                                  kConfigGlobal, kConfigLocal);
+        auto id = tuner.AddKernelFromString(example_kernel.first, example_kernel.second,
+                                            kConfigGlobal, kConfigLocal);
 
         THEN("their IDs are monotonously increasing #" + std::to_string(counter)) {
           REQUIRE(counter == id);

From b516ef7d35bcf99c0ef1165140cef844b52649b1 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Thu, 16 Jun 2016 20:18:49 +0200
Subject: [PATCH 2/9] Added a VERBOSE option to CMake to get additional
 diagnostic messages

---
 CHANGELOG                     |  1 +
 CMakeLists.txt                |  7 +++++++
 include/internal/tuner_impl.h |  1 +
 src/tuner_impl.cc             | 36 ++++++++++++++++++++++++-----------
 4 files changed, 34 insertions(+), 11 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index d9a3f92..f064355 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,6 +1,7 @@
 
 Development version (next release)
 - Made it possible to run the unit-tests independently of the provided OpenCL kernel samples
+- Added an option to compile in verbose mode for additional diagnostic messages (-DVERBOSE=ON)
 
 Version 2.3.1
 - Fixed a bug where an output buffer could not be used as input at the same time
diff --git a/CMakeLists.txt b/CMakeLists.txt
index dff5b2b..159a56a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -48,6 +48,13 @@ else()
   message("-- Building with CUDA")
 endif()
 
+# Compile in verbose mode with additional diagnostic messages
+option(VERBOSE "Compile in verbose mode for additional diagnostic messages" OFF)
+if(VERBOSE)
+  message("-- Building in verbose mode")
+  add_definitions(-DVERBOSE)
+endif()
+
 # ==================================================================================================
 
 # RPATH settings
diff --git a/include/internal/tuner_impl.h b/include/internal/tuner_impl.h
index 1267d11..ffb1b69 100644
--- a/include/internal/tuner_impl.h
+++ b/include/internal/tuner_impl.h
@@ -81,6 +81,7 @@ class TunerImpl {
   static const std::string kMessageHead;
   static const std::string kMessageRun;
   static const std::string kMessageInfo;
+  static const std::string kMessageVerbose;
   static const std::string kMessageOK;
   static const std::string kMessageWarning;
   static const std::string kMessageFailure;
diff --git a/src/tuner_impl.cc b/src/tuner_impl.cc
index 76ee05c..5cb7908 100644
--- a/src/tuner_impl.cc
+++ b/src/tuner_impl.cc
@@ -53,6 +53,7 @@ const std::string TunerImpl::kMessageFull    = "\x1b[32m[==========]\x1b[0m";
 const std::string TunerImpl::kMessageHead    = "\x1b[32m[----------]\x1b[0m";
 const std::string TunerImpl::kMessageRun     = "\x1b[32m[ RUN      ]\x1b[0m";
 const std::string TunerImpl::kMessageInfo    = "\x1b[32m[   INFO   ]\x1b[0m";
+const std::string TunerImpl::kMessageVerbose = "\x1b[39m[ VERBOSE  ]\x1b[0m";
 const std::string TunerImpl::kMessageOK      = "\x1b[32m[       OK ]\x1b[0m";
 const std::string TunerImpl::kMessageWarning = "\x1b[33m[  WARNING ]\x1b[0m";
 const std::string TunerImpl::kMessageFailure = "\x1b[31m[   FAILED ]\x1b[0m";
@@ -162,6 +163,9 @@ void TunerImpl::Tune() {
     } else {
 
       // Computes the permutations of all parameters and pass them to a (smart) search algorithm
+      #ifdef VERBOSE
+        fprintf(stdout, "%s Computing the permutations of all parameters\n", kMessageVerbose.c_str());
+      #endif
       kernel.SetConfigurations();
 
       // Creates the selected search algorithm
@@ -185,6 +189,10 @@ void TunerImpl::Tune() {
 
       // Iterates over all possible configurations (the permutations of the tuning parameters)
       for (auto p=size_t{0}; p<search->NumConfigurations(); ++p) {
+        #ifdef VERBOSE
+          fprintf(stdout, "%s Exploring configuration (%zu out of %zu)\n", kMessageVerbose.c_str(),
+                  p + 1, search->NumConfigurations());
+        #endif
         auto permutation = search->GetConfiguration();
 
         // Adds the parameters to the source-code string as defines
@@ -238,21 +246,14 @@ TunerImpl::TunerResult TunerImpl::RunKernel(const std::string &source, const Ker
                                             const size_t configuration_id,
                                             const size_t num_configurations) {
 
-  // Note: the following code is disabled because of GCC 4.8.0 compatibility
-  auto processed_source = source;
-  /*
-  // Removes the use of C++11 string literals (if any) from the kernel source code
-  auto string_literal_start = std::regex{"R\"\\("};
-  auto string_literal_end = std::regex{"\\)\";"};
-  auto processed_source = std::regex_replace(source, string_literal_start, "");
-  processed_source = std::regex_replace(processed_source, string_literal_end, "");
-  */
-
   // In case of an exception, skip this run
   try {
 
     // Compiles the kernel and prints the compiler errors/warnings
-    auto program = Program(context_, processed_source);
+    #ifdef VERBOSE
+      fprintf(stdout, "%s Starting compilation\n", kMessageVerbose.c_str());
+    #endif
+    auto program = Program(context_, source);
     auto options = std::vector<std::string>{};
     auto build_status = program.Build(device_, options);
     if (build_status == BuildStatus::kError) {
@@ -263,6 +264,9 @@ TunerImpl::TunerResult TunerImpl::RunKernel(const std::string &source, const Ker
     if (build_status == BuildStatus::kInvalid) {
       throw std::runtime_error("Invalid program binary");
     }
+    #ifdef VERBOSE
+      fprintf(stdout, "%s Finished compilation\n", kMessageVerbose.c_str());
+    #endif
 
     // Clears all previous copies of output buffer(s)
     for (auto &mem_info: arguments_output_copy_) {
@@ -275,6 +279,9 @@ TunerImpl::TunerResult TunerImpl::RunKernel(const std::string &source, const Ker
     arguments_output_copy_.clear();
 
     // Creates a copy of the output buffer(s)
+    #ifdef VERBOSE
+      fprintf(stdout, "%s Creating a copy of the output buffer\n", kMessageVerbose.c_str());
+    #endif
     for (auto &output: arguments_output_) {
       switch (output.type) {
         case MemType::kShort: arguments_output_copy_.push_back(CopyOutputBuffer<short>(output)); break;
@@ -290,6 +297,9 @@ TunerImpl::TunerResult TunerImpl::RunKernel(const std::string &source, const Ker
     }
 
     // Sets the kernel and its arguments
+    #ifdef VERBOSE
+      fprintf(stdout, "%s Setting kernel arguments\n", kMessageVerbose.c_str());
+    #endif
     auto tune_kernel = Kernel(program, kernel.name());
     for (auto &i: arguments_input_) { tune_kernel.SetArgument(i.index, i.buffer); }
     for (auto &i: arguments_output_copy_) { tune_kernel.SetArgument(i.index, i.buffer); }
@@ -317,6 +327,10 @@ TunerImpl::TunerResult TunerImpl::RunKernel(const std::string &source, const Ker
     fprintf(stdout, "%s Running %s\n", kMessageRun.c_str(), kernel.name().c_str());
     auto events = std::vector<Event>(kNumRuns);
     for (auto t=size_t{0}; t<kNumRuns; ++t) {
+      #ifdef VERBOSE
+        fprintf(stdout, "%s Launching kernel (%zu out of %zu for averaging)\n", kMessageVerbose.c_str(),
+                t + 1, kNumRuns);
+      #endif
       tune_kernel.Launch(queue_, global, local, events[t]);
       queue_.Finish(events[t]);
     }

From 48719a2634555610c312e336fa6046a106d6a183 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Thu, 16 Jun 2016 20:20:44 +0200
Subject: [PATCH 3/9] Fixed the RPATH settings for OSX

---
 CHANGELOG      | 1 +
 CMakeLists.txt | 5 +----
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index f064355..1d18f46 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -2,6 +2,7 @@
 Development version (next release)
 - Made it possible to run the unit-tests independently of the provided OpenCL kernel samples
 - Added an option to compile in verbose mode for additional diagnostic messages (-DVERBOSE=ON)
+- Fixed the RPATH settings on OSX
 
 Version 2.3.1
 - Fixed a bug where an output buffer could not be used as input at the same time
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 159a56a..8b03acf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -58,10 +58,7 @@ endif()
 # ==================================================================================================
 
 # RPATH settings
-set(CMAKE_SKIP_BUILD_RPATH false) # Use, i.e. don't skip the full RPATH for the build tree
-set(CMAKE_BUILD_WITH_INSTALL_RPATH false) # When building, don't use the install RPATH already
-set(CMAKE_INSTALL_RPATH "") # The RPATH to be used when installing
-set(CMAKE_INSTALL_RPATH_USE_LINK_PATH false) # Don't add the automatically determined parts
+set(CMAKE_MACOSX_RPATH 1)
 
 # ==================================================================================================
 

From fca2ad1ccd97691f096d1e17b9de544c75e8925e Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Wed, 29 Jun 2016 17:03:42 +0200
Subject: [PATCH 4/9] Added Appveyor CI and added OS X compilation for Travis

---
 .appveyor.yml | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++
 .gitignore    |  3 ++-
 .travis.yml   |  8 +++++--
 CHANGELOG     |  1 +
 README.md     |  5 +++-
 5 files changed, 78 insertions(+), 4 deletions(-)
 create mode 100644 .appveyor.yml

diff --git a/.appveyor.yml b/.appveyor.yml
new file mode 100644
index 0000000..a9142b1
--- /dev/null
+++ b/.appveyor.yml
@@ -0,0 +1,65 @@
+environment:
+  global:
+    CLTUNE_ROOT: "%APPVEYOR_BUILD_FOLDER%\\bin\\cltune"
+    OPENCL_REGISTRY: "https://www.khronos.org/registry/cl"
+    OPENCL_ROOT: "%APPVEYOR_BUILD_FOLDER%\\bin\\opencl"
+
+platform:
+  - x64
+  - x86
+
+configuration:
+  - Release
+
+init:
+  - cmake --version
+  - C:\"Program Files (x86)"\"Microsoft Visual Studio 14.0"\VC\vcvarsall.bat %PLATFORM%
+
+# Creates an OpenCL library to link against. Taken from clMathLibraries/clBLAS
+install:
+  - ps: mkdir $env:OPENCL_ROOT
+  - ps: pushd $env:OPENCL_ROOT
+  - ps: $opencl_registry = $env:OPENCL_REGISTRY
+  # This downloads the source to the Khronos ICD library
+  - git clone --depth 1 https://github.com/KhronosGroup/OpenCL-ICD-Loader.git
+  - ps: mv ./OpenCL-ICD-Loader/* .
+  # This downloads all the opencl header files
+  # The cmake build files expect a directory called inc
+  - ps: mkdir inc/CL
+  - git clone --depth 1 https://github.com/KhronosGroup/OpenCL-Headers.git inc/CL
+  - ps: wget $opencl_registry/api/2.1/cl.hpp -OutFile inc/CL/cl.hpp
+  # - ps: dir; if( $lastexitcode -eq 0 ){ dir include/CL } else { Write-Output boom }
+  # Create the static import lib in a directory called lib, so findopencl() will find it
+  - ps: mkdir lib
+  - ps: pushd lib
+  - cmake -G "NMake Makefiles" ..
+  - nmake
+  - ps: popd
+  # Switch to OpenCL 1.2 headers
+  - ps: pushd inc/CL
+  - git fetch origin opencl12:opencl12
+  - git checkout opencl12
+  - ps: popd
+  # Rename the inc directory to include, so FindOpencl() will find it
+  - ps: ren inc include
+  - ps: popd
+
+before_build:
+  - ps: mkdir $env:CLTUNE_ROOT
+  - ps: pushd $env:CLTUNE_ROOT
+  - ps: mkdir install_dir
+  - cmake -G "NMake Makefiles" -DCMAKE_INSTALL_PREFIX=install_dir -DCMAKE_BUILD_TYPE=%CONFIGURATION% -DSAMPLES=ON -DTESTS=ON %APPVEYOR_BUILD_FOLDER%
+
+build_script:
+  - nmake
+  - nmake install
+
+after_build:
+  - ps: pushd $env:CLTUNE_ROOT
+  - 7z a CLTune-Windows-%PLATFORM%.zip .\install_dir\*
+  - ps: mv CLTune-Windows-%PLATFORM%.zip $env:APPVEYOR_BUILD_FOLDER
+
+artifacts:
+  - path: '*.zip'
+    name: release
+    type: zip
diff --git a/.gitignore b/.gitignore
index 567609b..53f935f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
-build/
+build
+.*
diff --git a/.travis.yml b/.travis.yml
index b6f3a42..fd31c0e 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,6 +2,10 @@ language: cpp
 sudo: required
 dist: trusty
 
+os:
+  - linux
+  - osx
+
 compiler:
   - gcc
   - clang
@@ -16,7 +20,7 @@ addons:
 
 env:
   global:
-    - CLTUNE_ROOT=${TRAVIS_BUILD_DIR}/bin/make/release
+    - CLTUNE_ROOT=${TRAVIS_BUILD_DIR}/bin/cltune
     - OPENCL_REGISTRY=https://www.khronos.org/registry/cl
     - OPENCL_ROOT=${TRAVIS_BUILD_DIR}/bin/opencl
 
@@ -55,7 +59,7 @@ install:
 before_script:
   - mkdir -p ${CLTUNE_ROOT}
   - pushd ${CLTUNE_ROOT}
-  - cmake -DSAMPLES=ON -DTESTS=ON -DOPENCL_ROOT=${OPENCL_ROOT} ${TRAVIS_BUILD_DIR}
+  - cmake -DOPENCL_ROOT=${OPENCL_ROOT} -DSAMPLES=ON -DTESTS=ON ${TRAVIS_BUILD_DIR}
 
 script:
   - make
diff --git a/CHANGELOG b/CHANGELOG
index 1d18f46..aa780d4 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -3,6 +3,7 @@ Development version (next release)
 - Made it possible to run the unit-tests independently of the provided OpenCL kernel samples
 - Added an option to compile in verbose mode for additional diagnostic messages (-DVERBOSE=ON)
 - Fixed the RPATH settings on OSX
+- Added Appveyor continuous integration and increased coverage of the Travis builds
 
 Version 2.3.1
 - Fixed a bug where an output buffer could not be used as input at the same time
diff --git a/README.md b/README.md
index 14ff2f4..0445696 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,10 @@
 CLTune: Automatic OpenCL kernel tuning
 ================
 
-[![Build Status](https://travis-ci.org/CNugteren/CLTune.svg?branch=master)](https://travis-ci.org/CNugteren/CLTune)
+| | master | development |
+|-----|-----|-----|
+| Linux/OS X | [![Build Status](https://travis-ci.org/CNugteren/CLTune.svg?branch=master)](https://travis-ci.org/CNugteren/CLTune/branches) | [![Build Status](https://travis-ci.org/CNugteren/CLTune.svg?branch=development)](https://travis-ci.org/CNugteren/CLTune/branches) |
+| Windows | [![Build Status](https://ci.appveyor.com/api/projects/status/github/cnugteren/cltune?branch=master&svg=true)](https://ci.appveyor.com/project/CNugteren/cltune) | [![Build Status](https://ci.appveyor.com/api/projects/status/github/cnugteren/cltune?branch=development&svg=true)](https://ci.appveyor.com/project/CNugteren/cltune) |
 
 CLTune is a C++ library which can be used to automatically tune your OpenCL and CUDA kernels. The only thing you'll need to provide is a tuneable kernel and a list of allowed parameters and values.
 

From 609ea4cf78ad9ddb2788f692f5802ae52e17a441 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Wed, 29 Jun 2016 17:49:52 +0200
Subject: [PATCH 5/9] Removed building of tests for AppVeyor CI

---
 .appveyor.yml | 2 +-
 .gitignore    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.appveyor.yml b/.appveyor.yml
index a9142b1..3e26ba8 100644
--- a/.appveyor.yml
+++ b/.appveyor.yml
@@ -48,7 +48,7 @@ before_build:
   - ps: mkdir $env:CLTUNE_ROOT
   - ps: pushd $env:CLTUNE_ROOT
   - ps: mkdir install_dir
-  - cmake -G "NMake Makefiles" -DCMAKE_INSTALL_PREFIX=install_dir -DCMAKE_BUILD_TYPE=%CONFIGURATION% -DSAMPLES=ON -DTESTS=ON %APPVEYOR_BUILD_FOLDER%
+  - cmake -G "NMake Makefiles" -DCMAKE_INSTALL_PREFIX=install_dir -DCMAKE_BUILD_TYPE=%CONFIGURATION% -DSAMPLES=ON %APPVEYOR_BUILD_FOLDER%
 
 build_script:
   - nmake
diff --git a/.gitignore b/.gitignore
index 53f935f..2502bdc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,2 @@
 build
-.*
+.*
\ No newline at end of file

From 6177c14d0828a5093e93640cbb6cb4fa893bb1fe Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Wed, 29 Jun 2016 17:50:12 +0200
Subject: [PATCH 6/9] Updated to version 6.0 of the CLCudaAPI header

---
 CHANGELOG                 |  1 +
 include/internal/clpp11.h | 51 ++++++++++++++++++++++++++++++---------
 include/internal/cupp11.h | 34 ++++++++++++++++++++------
 src/tuner_impl.cc         |  2 +-
 4 files changed, 68 insertions(+), 20 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index aa780d4..62d306f 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -2,6 +2,7 @@
 Development version (next release)
 - Made it possible to run the unit-tests independently of the provided OpenCL kernel samples
 - Added an option to compile in verbose mode for additional diagnostic messages (-DVERBOSE=ON)
+- Now using version 6.0 of the CLCudaAPI header
 - Fixed the RPATH settings on OSX
 - Added Appveyor continuous integration and increased coverage of the Travis builds
 
diff --git a/include/internal/clpp11.h b/include/internal/clpp11.h
index 91daa6b..e3bcb01 100644
--- a/include/internal/clpp11.h
+++ b/include/internal/clpp11.h
@@ -11,7 +11,7 @@
 // Portability here means that a similar header exists for CUDA with the same classes and
 // interfaces. In other words, moving from the OpenCL API to the CUDA API becomes a one-line change.
 //
-// This is version 4.0 of CLCudaAPI <https://github.com/CNugteren/CLCudaAPI>.
+// This is version 6.0 of CLCudaAPI <https://github.com/CNugteren/CLCudaAPI>.
 //
 // =================================================================================================
 //
@@ -214,6 +214,14 @@ class Device {
     return true;
   }
 
+  // Query for a specific type of device or brand
+  bool IsCPU() const { return Type() == "CPU"; }
+  bool IsGPU() const { return Type() == "GPU"; }
+  bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc."; }
+  bool IsNVIDIA() const { return Vendor() == "NVIDIA" || Vendor() == "NVIDIA Corporation"; }
+  bool IsIntel() const { return Vendor() == "Intel" || Vendor() == "GenuineIntel"; }
+  bool IsARM() const { return Vendor() == "ARM"; }
+
   // Accessor to the private data-member
   const cl_device_id& operator()() const { return device_; }
  private:
@@ -276,10 +284,14 @@ class Context {
 
   // Accessor to the private data-member
   const cl_context& operator()() const { return *context_; }
+  cl_context* pointer() const { return &(*context_); }
  private:
   std::shared_ptr<cl_context> context_;
 };
 
+// Pointer to an OpenCL context
+using ContextPointer = cl_context*;
+
 // =================================================================================================
 
 // Enumeration of build statuses of the run-time compilation process
@@ -290,7 +302,7 @@ class Program {
  public:
   // Note that there is no constructor based on the regular OpenCL data-type because of extra state
 
-  // Regular constructor with memory management
+  // Source-based constructor with memory management
   explicit Program(const Context &context, std::string source):
       program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }),
       length_(source.length()),
@@ -301,6 +313,22 @@ class Program {
     CheckError(status);
   }
 
+  // Binary-based constructor with memory management
+  explicit Program(const Device &device, const Context &context, const std::string& binary):
+      program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }),
+      length_(binary.length()),
+      source_(binary),
+      source_ptr_(&source_[0]) {
+    auto status1 = CL_SUCCESS;
+    auto status2 = CL_SUCCESS;
+    const cl_device_id dev = device();
+    *program_ = clCreateProgramWithBinary(context(), 1, &dev, &length_,
+                                          reinterpret_cast<const unsigned char**>(&source_ptr_),
+                                          &status1, &status2);
+    CheckError(status1);
+    CheckError(status2);
+  }
+
   // Compiles the device program and returns whether or not there where any warnings/errors
   BuildStatus Build(const Device &device, std::vector<std::string> &options) {
     auto options_string = std::accumulate(options.begin(), options.end(), std::string{" "});
@@ -329,7 +357,7 @@ class Program {
     return result;
   }
 
-  // Retrieves an intermediate representation of the compiled program
+  // Retrieves a binary or an intermediate representation of the compiled program
   std::string GetIR() const {
     auto bytes = size_t{0};
     CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bytes, nullptr));
@@ -345,7 +373,7 @@ class Program {
  private:
   std::shared_ptr<cl_program> program_;
   size_t length_;
-  std::string source_;
+  std::string source_; // Note: the source can also be a binary or IR
   const char* source_ptr_;
 };
 
@@ -627,15 +655,15 @@ class Kernel {
 
   // Launches a kernel onto the specified queue
   void Launch(const Queue &queue, const std::vector<size_t> &global,
-              const std::vector<size_t> &local, Event &event) {
+              const std::vector<size_t> &local, EventPointer event) {
     CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
                                       nullptr, global.data(), local.data(),
-                                      0, nullptr, &(event())));
+                                      0, nullptr, event));
   }
 
   // As above, but with an event waiting list
   void Launch(const Queue &queue, const std::vector<size_t> &global,
-              const std::vector<size_t> &local, Event &event,
+              const std::vector<size_t> &local, EventPointer event,
               std::vector<Event>& waitForEvents) {
     if (waitForEvents.size() == 0) { return Launch(queue, global, local, event); }
 
@@ -648,15 +676,16 @@ class Kernel {
     // Launches the kernel while waiting for other events
     CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
                                       nullptr, global.data(), local.data(),
-                                      waitForEventsPlain.size(), waitForEventsPlain.data(),
-                                      &(event())));
+                                      static_cast<cl_uint>(waitForEventsPlain.size()),
+                                      waitForEventsPlain.data(),
+                                      event));
   }
 
   // As above, but with the default local workgroup size
-  void Launch(const Queue &queue, const std::vector<size_t> &global, Event &event) {
+  void Launch(const Queue &queue, const std::vector<size_t> &global, EventPointer event) {
     CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
                                       nullptr, global.data(), nullptr,
-                                      0, nullptr, &(event())));
+                                      0, nullptr, event));
   }
 
   // Accessor to the private data-member
diff --git a/include/internal/cupp11.h b/include/internal/cupp11.h
index 515fee6..c1098f4 100644
--- a/include/internal/cupp11.h
+++ b/include/internal/cupp11.h
@@ -11,7 +11,7 @@
 // Portability here means that a similar header exists for OpenCL with the same classes and
 // interfaces. In other words, moving from the CUDA API to the OpenCL API becomes a one-line change.
 //
-// This is version 4.0 of CLCudaAPI <https://github.com/CNugteren/CLCudaAPI>.
+// This is version 6.0 of CLCudaAPI <https://github.com/CNugteren/CLCudaAPI>.
 //
 // =================================================================================================
 //
@@ -106,13 +106,14 @@ class Event {
   // Accessors to the private data-members
   const CUevent& start() const { return *start_; }
   const CUevent& end() const { return *end_; }
+  Event* pointer() { return this; }
  private:
   std::shared_ptr<CUevent> start_;
   std::shared_ptr<CUevent> end_;
 };
 
 // Pointer to a CUDA event
-using EventPointer = CUevent*;
+using EventPointer = Event*;
 
 // =================================================================================================
 
@@ -159,6 +160,11 @@ class Device {
     CheckError(cuDriverGetVersion(&result));
     return "CUDA driver "+std::to_string(result);
   }
+  size_t VersionNumber() const {
+    auto result = 0;
+    CheckError(cuDriverGetVersion(&result));
+    return static_cast<size_t>(result);
+  }
   std::string Vendor() const { return "NVIDIA Corporation"; }
   std::string Name() const {
     auto result = std::string{};
@@ -206,6 +212,14 @@ class Device {
     return true;
   }
 
+  // Query for a specific type of device or brand
+  bool IsCPU() const { return false; }
+  bool IsGPU() const { return true; }
+  bool IsAMD() const { return false; }
+  bool IsNVIDIA() const { return true; }
+  bool IsIntel() const { return false; }
+  bool IsARM() const { return false; }
+
   // Accessor to the private data-member
   const CUdevice& operator()() const { return device_; }
  private:
@@ -239,10 +253,14 @@ class Context {
 
   // Accessor to the private data-member
   const CUcontext& operator()() const { return *context_; }
+  CUcontext* pointer() const { return &(*context_); }
  private:
   std::shared_ptr<CUcontext> context_;
 };
 
+// Pointer to an OpenCL context
+using ContextPointer = CUcontext*;
+
 // =================================================================================================
 
 // Enumeration of build statuses of the run-time compilation process
@@ -253,7 +271,7 @@ class Program {
  public:
   // Note that there is no constructor based on the regular CUDA data-type because of extra state
 
-  // Regular constructor with memory management
+  // Source-based constructor with memory management
   explicit Program(const Context &, std::string source):
       program_(new nvrtcProgram, [](nvrtcProgram* p) { CheckError(nvrtcDestroyProgram(p));
                                                        delete p; }),
@@ -555,7 +573,7 @@ class Kernel {
 
   // Launches a kernel onto the specified queue
   void Launch(const Queue &queue, const std::vector<size_t> &global,
-              const std::vector<size_t> &local, Event &event) {
+              const std::vector<size_t> &local, EventPointer event) {
 
     // Creates the grid (number of threadblocks) and sets the block sizes (threads per block)
     auto grid = std::vector<size_t>{1, 1, 1};
@@ -571,16 +589,16 @@ class Kernel {
     }
 
     // Launches the kernel, its execution time is recorded by events
-    CheckError(cuEventRecord(event.start(), queue()));
+    CheckError(cuEventRecord(event->start(), queue()));
     CheckError(cuLaunchKernel(kernel_, grid[0], grid[1], grid[2], block[0], block[1], block[2],
                               0, queue(), pointers.data(), nullptr));
-    CheckError(cuEventRecord(event.end(), queue()));
+    CheckError(cuEventRecord(event->end(), queue()));
   }
 
   // As above, but with an event waiting list
   // TODO: Implement this function
   void Launch(const Queue &queue, const std::vector<size_t> &global,
-              const std::vector<size_t> &local, Event &event,
+              const std::vector<size_t> &local, EventPointer event,
               std::vector<Event>& waitForEvents) {
     if (waitForEvents.size() == 0) { return Launch(queue, global, local, event); }
     Error("launching with an event waiting list is not implemented for the CUDA back-end");
@@ -588,7 +606,7 @@ class Kernel {
 
   // As above, but with the default local workgroup size
   // TODO: Implement this function
-  void Launch(const Queue &, const std::vector<size_t> &, Event &) {
+  void Launch(const Queue &, const std::vector<size_t> &, EventPointer) {
     Error("launching with a default workgroup size is not implemented for the CUDA back-end");
   }
 
diff --git a/src/tuner_impl.cc b/src/tuner_impl.cc
index 5cb7908..4a67495 100644
--- a/src/tuner_impl.cc
+++ b/src/tuner_impl.cc
@@ -331,7 +331,7 @@ TunerImpl::TunerResult TunerImpl::RunKernel(const std::string &source, const Ker
         fprintf(stdout, "%s Launching kernel (%zu out of %zu for averaging)\n", kMessageVerbose.c_str(),
                 t + 1, kNumRuns);
       #endif
-      tune_kernel.Launch(queue_, global, local, events[t]);
+      tune_kernel.Launch(queue_, global, local, events[t].pointer());
       queue_.Finish(events[t]);
     }
     queue_.Finish();

From 0526f9ded2e3976abd68d618b66e7ee00601ee43 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Wed, 29 Jun 2016 18:08:10 +0200
Subject: [PATCH 7/9] Made it possible to run some of the GEMM kernels using
 CUDA (those without shared memory)

---
 samples/cl_to_cuda.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/samples/cl_to_cuda.h b/samples/cl_to_cuda.h
index 3f23d2e..badfa5a 100644
--- a/samples/cl_to_cuda.h
+++ b/samples/cl_to_cuda.h
@@ -17,6 +17,7 @@
 #define __local __shared__
 #define restrict __restrict__
 #define __constant const
+#define inline __device__ inline // assumes all device functions are annotated with inline in OpenCL
 
 // Replaces OpenCL synchronisation with CUDA synchronisation
 #define barrier(x) __syncthreads()

From 45b2c5262c19855b9ce8c4ee6589b47449d84312 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Wed, 29 Jun 2016 18:10:46 +0200
Subject: [PATCH 8/9] Updated to version 2.4.0

---
 .appveyor.yml  | 4 ++--
 CHANGELOG      | 2 +-
 CMakeLists.txt | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.appveyor.yml b/.appveyor.yml
index 3e26ba8..ca885b8 100644
--- a/.appveyor.yml
+++ b/.appveyor.yml
@@ -56,8 +56,8 @@ build_script:
 
 after_build:
   - ps: pushd $env:CLTUNE_ROOT
-  - 7z a CLTune-Windows-%PLATFORM%.zip .\install_dir\*
-  - ps: mv CLTune-Windows-%PLATFORM%.zip $env:APPVEYOR_BUILD_FOLDER
+  - 7z a CLTune-Windows-$env:PLATFORM.zip .\install_dir\*
+  - ps: mv CLTune-Windows-$env:PLATFORM.zip $env:APPVEYOR_BUILD_FOLDER
 
 artifacts:
   - path: '*.zip'
diff --git a/CHANGELOG b/CHANGELOG
index 62d306f..fd6ff46 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,5 +1,5 @@
 
-Development version (next release)
+Version 2.4.0
 - Made it possible to run the unit-tests independently of the provided OpenCL kernel samples
 - Added an option to compile in verbose mode for additional diagnostic messages (-DVERBOSE=ON)
 - Now using version 6.0 of the CLCudaAPI header
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8b03acf..831b421 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,8 +32,8 @@ set(CMAKE_USER_MAKE_RULES_OVERRIDE_CXX ${CMAKE_CURRENT_SOURCE_DIR}/cmake/cxx_fla
 # CMake project details
 project("cltune" CXX)
 set(cltune_VERSION_MAJOR 2)
-set(cltune_VERSION_MINOR 3)
-set(cltune_VERSION_PATCH 1)
+set(cltune_VERSION_MINOR 4)
+set(cltune_VERSION_PATCH 0)
 
 # Options
 option(SAMPLES "Enable compilation of sample programs" ON)

From a001605156e7e6f6ad88c624e8246ee85a2c40a4 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Wed, 29 Jun 2016 18:21:25 +0200
Subject: [PATCH 9/9] Minor fix to the AppVeyor CI build

---
 .appveyor.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.appveyor.yml b/.appveyor.yml
index ca885b8..508489e 100644
--- a/.appveyor.yml
+++ b/.appveyor.yml
@@ -56,8 +56,8 @@ build_script:
 
 after_build:
   - ps: pushd $env:CLTUNE_ROOT
-  - 7z a CLTune-Windows-$env:PLATFORM.zip .\install_dir\*
-  - ps: mv CLTune-Windows-$env:PLATFORM.zip $env:APPVEYOR_BUILD_FOLDER
+  - 7z a CLTune-Windows.zip .\install_dir\*
+  - ps: mv CLTune-Windows.zip $env:APPVEYOR_BUILD_FOLDER
 
 artifacts:
   - path: '*.zip'