From f6b81460a1f623634d8ace89bec277aa7a2fabaa Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <cedric.nugteren@surfsara.nl>
Date: Fri, 8 May 2015 16:25:49 +0200
Subject: [PATCH 01/10] Added support for the OPENCL_ROOT environment
 variable

---
 cmake/Modules/FindOpenCL.cmake | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/cmake/Modules/FindOpenCL.cmake b/cmake/Modules/FindOpenCL.cmake
index f31807b..9c0e7f2 100644
--- a/cmake/Modules/FindOpenCL.cmake
+++ b/cmake/Modules/FindOpenCL.cmake
@@ -50,15 +50,17 @@
 find_path(OPENCL_INCLUDE_DIRS
     NAMES OpenCL/cl.h CL/cl.h
     HINTS
-        ${OPENCL_ROOT}/include
-        $ENV{AMDAPPSDKROOT}/include
-        $ENV{CUDA_PATH}/include
+        ${OPENCL_ROOT}
+        $ENV{OPENCL_ROOT}
+        $ENV{AMDAPPSDKROOT}
+        $ENV{CUDA_PATH}
+    DOC "OpenCL header file path"
+    PATH_SUFFIXES include
     PATHS
         /usr/include
         /usr/local/include
         /usr/local/cuda/include
         /opt/cuda/include
-    DOC "OpenCL header file path"
 )
 mark_as_advanced( OPENCL_INCLUDE_DIRS )
 
@@ -69,11 +71,12 @@ if( LIB64 )
     find_library( OPENCL_LIBRARIES
         NAMES OpenCL
         HINTS
-            ${OPENCL_ROOT}/lib
-            $ENV{AMDAPPSDKROOT}/lib
-            $ENV{CUDA_PATH}/lib
+            ${OPENCL_ROOT}
+            $ENV{OPENCL_ROOT}
+            $ENV{AMDAPPSDKROOT}
+            $ENV{CUDA_PATH}
         DOC "OpenCL dynamic library path"
-        PATH_SUFFIXES x86_64 x64
+        PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64
         PATHS
             /usr/lib
             /usr/local/cuda/lib

From 032aa2c9fb95534722419c3efda08967c2238c4e Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <cedric.nugteren@surfsara.nl>
Date: Fri, 8 May 2015 16:28:18 +0200
Subject: [PATCH 02/10] Added support for 16-wide vectors to the GEMM sample

---
 samples/gemm/gemm.opencl | 87 ++++++++++++++++++++++++++++++----------
 1 file changed, 66 insertions(+), 21 deletions(-)

diff --git a/samples/gemm/gemm.opencl b/samples/gemm/gemm.opencl
index c974443..fce29c2 100644
--- a/samples/gemm/gemm.opencl
+++ b/samples/gemm/gemm.opencl
@@ -80,7 +80,8 @@
 #define NWB (NWG/NDIMB)               // Amount of loads-per-thread for matrix B (N-dimension)
 
 // Settings
-#define USE_MAD 0                     // Uses the non IEEE-754 compliant mad() function
+#define USE_VECTOR_MAD 1              // Don't unroll the vector MAD computation
+#define USE_CL_MAD 0                  // Uses the non-IEEE754 compliant OpenCL mad() (if above is 0)
 
 // =================================================================================================
 
@@ -90,6 +91,7 @@
   typedef float2 real2;
   typedef float4 real4;
   typedef float8 real8;
+  typedef float16 real16;
   #define ZERO 0.0f
 #elif PRECISION == 64
   #if __OPENCL_VERSION__ <= CL_VERSION_1_1 // This the default on OpenCL 1.2 or higher
@@ -99,6 +101,7 @@
   typedef double2 real2;
   typedef double4 real4;
   typedef double8 real8;
+  typedef double16 real16;
   #define ZERO 0.0
 #endif
 
@@ -113,6 +116,8 @@
     typedef real4 realM;
 #elif VWM == 8
     typedef real8 realM;
+#elif VWM == 16
+    typedef real16 realM;
 #endif
 
 // Data-widths in dimension N
@@ -124,6 +129,8 @@
     typedef real4 realN;
 #elif VWN == 8
     typedef real8 realN;
+#elif VWN == 16
+    typedef real16 realN;
 #endif
 
 // =================================================================================================
@@ -300,7 +307,7 @@ inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int
 // =================================================================================================
 
 // The basic scalar multiply-add function
-#if USE_MAD == 1
+#if USE_CL_MAD == 1
   #define MultiplyAdd(cval, aval, bval) (cval = mad(aval, bval, cval))
 #else
   #define MultiplyAdd(cval, aval, bval) (cval += (aval) * (bval))
@@ -308,25 +315,46 @@ inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int
 
 // The vectorised multiply-add function
 inline realM MultiplyAddVector(realM cvec, const realM avec, const real bval) {
-  #if VWM == 1
-    MultiplyAdd(cvec,    avec,    bval);
-  #elif VWM == 2
-    MultiplyAdd(cvec.x , avec.x,  bval);
-    MultiplyAdd(cvec.y , avec.y,  bval);
-  #elif VWM == 4
-    MultiplyAdd(cvec.x , avec.x,  bval);
-    MultiplyAdd(cvec.y , avec.y,  bval);
-    MultiplyAdd(cvec.z , avec.z,  bval);
-    MultiplyAdd(cvec.w , avec.w,  bval);
-  #elif VWM == 8
-    MultiplyAdd(cvec.s0, avec.s0, bval);
-    MultiplyAdd(cvec.s1, avec.s1, bval);
-    MultiplyAdd(cvec.s2, avec.s2, bval);
-    MultiplyAdd(cvec.s3, avec.s3, bval);
-    MultiplyAdd(cvec.s4, avec.s4, bval);
-    MultiplyAdd(cvec.s5, avec.s5, bval);
-    MultiplyAdd(cvec.s6, avec.s6, bval);
-    MultiplyAdd(cvec.s7, avec.s7, bval);
+  #if USE_VECTOR_MAD == 1
+    cvec += avec * bval;
+  #else
+    #if VWM == 1
+      MultiplyAdd(cvec,    avec,    bval);
+    #elif VWM == 2
+      MultiplyAdd(cvec.x , avec.x,  bval);
+      MultiplyAdd(cvec.y , avec.y,  bval);
+    #elif VWM == 4
+      MultiplyAdd(cvec.x , avec.x,  bval);
+      MultiplyAdd(cvec.y , avec.y,  bval);
+      MultiplyAdd(cvec.z , avec.z,  bval);
+      MultiplyAdd(cvec.w , avec.w,  bval);
+    #elif VWM == 8
+      MultiplyAdd(cvec.s0, avec.s0, bval);
+      MultiplyAdd(cvec.s1, avec.s1, bval);
+      MultiplyAdd(cvec.s2, avec.s2, bval);
+      MultiplyAdd(cvec.s3, avec.s3, bval);
+      MultiplyAdd(cvec.s4, avec.s4, bval);
+      MultiplyAdd(cvec.s5, avec.s5, bval);
+      MultiplyAdd(cvec.s6, avec.s6, bval);
+      MultiplyAdd(cvec.s7, avec.s7, bval);
+    #elif VWM == 16
+      MultiplyAdd(cvec.s0, avec.s0, bval);
+      MultiplyAdd(cvec.s1, avec.s1, bval);
+      MultiplyAdd(cvec.s2, avec.s2, bval);
+      MultiplyAdd(cvec.s3, avec.s3, bval);
+      MultiplyAdd(cvec.s4, avec.s4, bval);
+      MultiplyAdd(cvec.s5, avec.s5, bval);
+      MultiplyAdd(cvec.s6, avec.s6, bval);
+      MultiplyAdd(cvec.s7, avec.s7, bval);
+      MultiplyAdd(cvec.s8, avec.s8, bval);
+      MultiplyAdd(cvec.s9, avec.s9, bval);
+      MultiplyAdd(cvec.sA, avec.sA, bval);
+      MultiplyAdd(cvec.sB, avec.sB, bval);
+      MultiplyAdd(cvec.sC, avec.sC, bval);
+      MultiplyAdd(cvec.sD, avec.sD, bval);
+      MultiplyAdd(cvec.sE, avec.sE, bval);
+      MultiplyAdd(cvec.sF, avec.sF, bval);
+    #endif
   #endif
   return cvec;
 }
@@ -356,6 +384,23 @@ inline void MultiplyAccumulate(realM cpm[NWI][MWI/VWM], realM apm[MWI/VWM], real
         cpm[ni*VWN + 5][mi] = MultiplyAddVector(cpm[ni*VWN + 5][mi], apm[mi], bpm[ni].s5);
         cpm[ni*VWN + 6][mi] = MultiplyAddVector(cpm[ni*VWN + 6][mi], apm[mi], bpm[ni].s6);
         cpm[ni*VWN + 7][mi] = MultiplyAddVector(cpm[ni*VWN + 7][mi], apm[mi], bpm[ni].s7);
+      #elif VWN == 16
+        cpm[ni*VWN + 0 ][mi] = MultiplyAddVector(cpm[ni*VWN + 0 ][mi], apm[mi], bpm[ni].s0);
+        cpm[ni*VWN + 1 ][mi] = MultiplyAddVector(cpm[ni*VWN + 1 ][mi], apm[mi], bpm[ni].s1);
+        cpm[ni*VWN + 2 ][mi] = MultiplyAddVector(cpm[ni*VWN + 2 ][mi], apm[mi], bpm[ni].s2);
+        cpm[ni*VWN + 3 ][mi] = MultiplyAddVector(cpm[ni*VWN + 3 ][mi], apm[mi], bpm[ni].s3);
+        cpm[ni*VWN + 4 ][mi] = MultiplyAddVector(cpm[ni*VWN + 4 ][mi], apm[mi], bpm[ni].s4);
+        cpm[ni*VWN + 5 ][mi] = MultiplyAddVector(cpm[ni*VWN + 5 ][mi], apm[mi], bpm[ni].s5);
+        cpm[ni*VWN + 6 ][mi] = MultiplyAddVector(cpm[ni*VWN + 6 ][mi], apm[mi], bpm[ni].s6);
+        cpm[ni*VWN + 7 ][mi] = MultiplyAddVector(cpm[ni*VWN + 7 ][mi], apm[mi], bpm[ni].s7);
+        cpm[ni*VWN + 8 ][mi] = MultiplyAddVector(cpm[ni*VWN + 8 ][mi], apm[mi], bpm[ni].s8);
+        cpm[ni*VWN + 9 ][mi] = MultiplyAddVector(cpm[ni*VWN + 9 ][mi], apm[mi], bpm[ni].s9);
+        cpm[ni*VWN + 10][mi] = MultiplyAddVector(cpm[ni*VWN + 10][mi], apm[mi], bpm[ni].sA);
+        cpm[ni*VWN + 11][mi] = MultiplyAddVector(cpm[ni*VWN + 11][mi], apm[mi], bpm[ni].sB);
+        cpm[ni*VWN + 12][mi] = MultiplyAddVector(cpm[ni*VWN + 12][mi], apm[mi], bpm[ni].sC);
+        cpm[ni*VWN + 13][mi] = MultiplyAddVector(cpm[ni*VWN + 13][mi], apm[mi], bpm[ni].sD);
+        cpm[ni*VWN + 14][mi] = MultiplyAddVector(cpm[ni*VWN + 14][mi], apm[mi], bpm[ni].sE);
+        cpm[ni*VWN + 15][mi] = MultiplyAddVector(cpm[ni*VWN + 15][mi], apm[mi], bpm[ni].sF);
       #endif
     }
   }

From e0b16c10fdb2351b9f2e28ce5450b5dcd472f7d6 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <mail@cedricnugteren.nl>
Date: Thu, 14 May 2015 09:38:59 +0200
Subject: [PATCH 03/10] Updated compiler check and per-compiler flags

---
 CMakeLists.txt | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3285152..c90f3be 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -46,12 +46,30 @@ set(CMAKE_INSTALL_RPATH_USE_LINK_PATH false) # Don't add the automatically deter
 # Compiler-version check
 if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
     if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9)
-        message(FATAL_ERROR "GCC version must be at least 4.9 (for full C++11 compatibility)")
+        message(FATAL_ERROR "GCC version must be at least 4.9")
+    endif()
+elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") # Regex also matches "AppleClang" (its version scheme may differ)
+    if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.3)
+        message(FATAL_ERROR "Clang version must be at least 3.3")
+    endif()
+elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
+    if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 14.0)
+        message(FATAL_ERROR "ICC version must be at least 14.0")
+    endif()
+elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
+    if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 18.0)
+        message(FATAL_ERROR "Visual Studio version must be at least 18.0")
     endif()
 endif()
 
-# C++11 compiler settings
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -std=c++11 -Wall -Wno-comment")
+# C++ compiler settings (GCC-style flags; not passed to MSVC, which does not recognise them)
+set(FLAGS "")
+if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
+  set(FLAGS "-O3 -std=c++11 -Wall -Wno-comment")
+elseif(NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") # Clang, AppleClang, Intel, and others
+  set(FLAGS "-O3 -std=c++11")
+endif()
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}")
 
 # ==================================================================================================
 

From 517fc470aa434b5e70f75ec48114162458f02207 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <mail@cedricnugteren.nl>
Date: Thu, 14 May 2015 09:40:54 +0200
Subject: [PATCH 04/10] Added support for multiple input files, minor fixes to
 the samples

---
 include/cltune.h         |  6 +++---
 samples/conv/conv.cc     |  4 ++--
 samples/gemm/gemm.cc     |  4 ++--
 samples/simple/simple.cc | 20 ++++++++++++--------
 src/tuner.cc             | 17 ++++++++++++-----
 5 files changed, 31 insertions(+), 20 deletions(-)

diff --git a/include/cltune.h b/include/cltune.h
index 7b98d82..0454ab0 100644
--- a/include/cltune.h
+++ b/include/cltune.h
@@ -96,12 +96,12 @@ class Tuner {
 
   // Adds a new kernel to the list of tuning-kernels and returns a unique ID (to be used when
   // adding tuning parameters)
-  int AddKernel(const std::string &filename, const std::string &kernel_name,
-                const cl::NDRange &global, const cl::NDRange &local);
+  size_t AddKernel(const std::vector<std::string> &filenames, const std::string &kernel_name,
+                   const cl::NDRange &global, const cl::NDRange &local);
 
   // Sets the reference kernel. Same as the AddKernel function, but in this case there is only one
   // reference kernel. Calling this function again will overwrite the previous reference kernel.
-  void SetReference(const std::string &filename, const std::string &kernel_name,
+  void SetReference(const std::vector<std::string> &filenames, const std::string &kernel_name,
                     const cl::NDRange &global, const cl::NDRange &local);
 
   // Adds a new tuning parameter for a kernel with a specific ID. The parameter has a name, the
diff --git a/samples/conv/conv.cc b/samples/conv/conv.cc
index be047e4..ffcc7ce 100644
--- a/samples/conv/conv.cc
+++ b/samples/conv/conv.cc
@@ -121,7 +121,7 @@ int main(int argc, char* argv[]) {
   // ===============================================================================================
 
   // Adds a heavily tuneable kernel and some example parameter values
-  auto id = tuner.AddKernel("../samples/conv/conv.opencl", "conv", {kSizeX, kSizeY}, {1, 1});
+  auto id = tuner.AddKernel({"../samples/conv/conv.opencl"}, "conv", {kSizeX, kSizeY}, {1, 1});
   tuner.AddParameter(id, "TBX", {8, 16, 32, 64});
   tuner.AddParameter(id, "TBY", {8, 16, 32, 64});
   tuner.AddParameter(id, "LOCAL", {0, 1, 2});
@@ -181,7 +181,7 @@ int main(int argc, char* argv[]) {
   // Sets the tuner's golden reference function. This kernel contains the reference code to which
   // the output is compared. Supplying such a function is not required, but it is necessary for
   // correctness checks to be enabled.
-  tuner.SetReference("../samples/conv/conv_reference.opencl", "conv_reference", {kSizeX, kSizeY}, {8,8});
+  tuner.SetReference({"../samples/conv/conv_reference.opencl"}, "conv_reference", {kSizeX, kSizeY}, {8,8});
 
   // Sets the function's arguments. Note that all kernels have to accept (but not necessarily use)
   // all input arguments.
diff --git a/samples/gemm/gemm.cc b/samples/gemm/gemm.cc
index 3b68087..9821b5d 100644
--- a/samples/gemm/gemm.cc
+++ b/samples/gemm/gemm.cc
@@ -105,7 +105,7 @@ int main(int argc, char* argv[]) {
 
   // Adds a heavily tuneable kernel and some example parameter values. Others can be added, but for
   // this example this already leads to plenty of kernels to test.
-  auto id = tuner.AddKernel("../samples/gemm/gemm.opencl", "gemm_fast", {kSizeM, kSizeN}, {1, 1});
+  auto id = tuner.AddKernel({"../samples/gemm/gemm.opencl"}, "gemm_fast", {kSizeM, kSizeN}, {1, 1});
   tuner.AddParameter(id, "MWG", {16, 32, 64, 128});
   tuner.AddParameter(id, "NWG", {16, 32, 64, 128});
   tuner.AddParameter(id, "KWG", {16, 32});
@@ -164,7 +164,7 @@ int main(int argc, char* argv[]) {
   // Sets the tuner's golden reference function. This kernel contains the reference code to which
   // the output is compared. Supplying such a function is not required, but it is necessarily for
   // correctness checks to be enabled.
-  tuner.SetReference("../samples/gemm/gemm_reference.opencl", "gemm_reference", {kSizeM, kSizeN}, {8,8});
+  tuner.SetReference({"../samples/gemm/gemm_reference.opencl"}, "gemm_reference", {kSizeM, kSizeN}, {8,8});
 
   // Sets the function's arguments. Note that all kernels have to accept (but not necessarily use)
   // all input arguments.
diff --git a/samples/simple/simple.cc b/samples/simple/simple.cc
index f05cf24..a73d54d 100644
--- a/samples/simple/simple.cc
+++ b/samples/simple/simple.cc
@@ -25,9 +25,9 @@
 //
 // =================================================================================================
 
-#include <iostream>
-#include <sstream>
 #include <vector>
+#include <chrono>
+#include <random>
 
 // Includes the OpenCL tuner library
 #include "cltune.h"
@@ -48,10 +48,14 @@ int main() {
   std::vector<float> vec_x(kSizeN);
   std::vector<float> vec_y(kSizeM);
 
+  // Create a random number generator
+  const auto random_seed = std::chrono::system_clock::now().time_since_epoch().count();
+  std::default_random_engine generator(random_seed);
+  std::uniform_real_distribution<float> distribution(-2.0f, 2.0f);
+
   // Populates input data structures
-  srand(time(nullptr));
-  for (auto &item: mat_a) { item = (float)rand() / (float)RAND_MAX; }
-  for (auto &item: vec_x) { item = (float)rand() / (float)RAND_MAX; }
+  for (auto &item: mat_a) { item = distribution(generator); }
+  for (auto &item: vec_x) { item = distribution(generator); }
   for (auto &item: vec_y) { item = 0.0; }
 
   // Initializes the tuner (platform 0, device 1)
@@ -60,20 +64,20 @@ int main() {
   // Adds a kernel which supports unrolling through the UNROLL parameter. Note that the kernel
   // itself needs to implement the UNROLL parameter and (in this case) only accepts a limited
   // amount of values.
-  auto id = tuner.AddKernel("../samples/simple/simple_unroll.opencl", "matvec_unroll", {kSizeM}, {128});
+  auto id = tuner.AddKernel({"../samples/simple/simple_unroll.opencl"}, "matvec_unroll", {kSizeM}, {128});
   tuner.AddParameter(id, "UNROLL", {1, 2, 4});
 
   // Adds another kernel and its parameters. This kernel caches the input vector X into local
   // memory to save global memory accesses. Note that the kernel's workgroup size is determined by
   // the tile size parameter TS.
-  id = tuner.AddKernel("../samples/simple/simple_tiled.opencl", "matvec_tiled", {kSizeM}, {1});
+  id = tuner.AddKernel({"../samples/simple/simple_tiled.opencl"}, "matvec_tiled", {kSizeM}, {1});
   tuner.AddParameter(id, "TS", {32, 64, 128, 256, 512});
   tuner.MulLocalSize(id, {"TS"});
 
   // Sets the tuner's golden reference function. This kernel contains the reference code to which
   // the output is compared. Supplying such a function is not required, but it is necessarily for
   // correctness checks to be enabled.
-  tuner.SetReference("../samples/simple/simple_reference.opencl", "matvec_reference", {kSizeM}, {128});
+  tuner.SetReference({"../samples/simple/simple_reference.opencl"}, "matvec_reference", {kSizeM}, {128});
 
   // Sets the function's arguments. Note that all kernels have to accept (but not necessarily use)
   // all input arguments.
diff --git a/src/tuner.cc b/src/tuner.cc
index d09880a..d2dd86d 100644
--- a/src/tuner.cc
+++ b/src/tuner.cc
@@ -94,11 +94,14 @@ Tuner::~Tuner() {
 
 // Loads the OpenCL source-code from a file and creates a new variable of type KernelInfo to store
 // all the kernel-information.
-int Tuner::AddKernel(const std::string &filename, const std::string &kernel_name,
-                      const cl::NDRange &global, const cl::NDRange &local) {
+size_t Tuner::AddKernel(const std::vector<std::string> &filenames, const std::string &kernel_name,
+                        const cl::NDRange &global, const cl::NDRange &local) {
 
   // Loads the source-code and adds the kernel
-  auto source = LoadFile(filename);
+  auto source = std::string{};
+  for (auto &filename: filenames) {
+    source += LoadFile(filename);
+  }
   kernels_.push_back(KernelInfo(kernel_name, source, opencl_));
 
   // Sets the global and local thread sizes
@@ -113,10 +116,13 @@ int Tuner::AddKernel(const std::string &filename, const std::string &kernel_name
 // Sets the reference kernel (source-code location, kernel name, global/local thread-sizes) and
 // sets a flag to indicate that there is now a reference. Calling this function again will simply
 // overwrite the old reference.
-void Tuner::SetReference(const std::string &filename, const std::string &kernel_name,
+void Tuner::SetReference(const std::vector<std::string> &filenames, const std::string &kernel_name,
                          const cl::NDRange &global, const cl::NDRange &local) {
   has_reference_ = true;
-  auto source = LoadFile(filename);
+  auto source = std::string{};
+  for (auto &filename: filenames) {
+    source += LoadFile(filename);
+  }
   reference_kernel_.reset(new KernelInfo(kernel_name, source, opencl_));
   reference_kernel_->set_global_base(global);
   reference_kernel_->set_local_base(local);
@@ -211,6 +217,7 @@ void Tuner::AddArgumentScalar(const T argument) {
   arguments_scalar_.push_back({argument_counter_++, argument});
 }
 template void Tuner::AddArgumentScalar<int>(const int);
+template void Tuner::AddArgumentScalar<size_t>(const size_t);
 template void Tuner::AddArgumentScalar<float>(const float);
 template void Tuner::AddArgumentScalar<double>(const double);
 

From cce96129135f7b5b0d73b7130df46a61f44020b3 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <mail@cedricnugteren.nl>
Date: Thu, 14 May 2015 10:11:06 +0200
Subject: [PATCH 05/10] Added support for std::complex data-types

---
 include/cltune.h        |  8 +++++-
 include/cltune/memory.h |  9 +++++-
 src/memory.cc           |  6 +++-
 src/tuner.cc            | 63 ++++++++++++++++++++++++++++++++++-------
 4 files changed, 72 insertions(+), 14 deletions(-)

diff --git a/include/cltune.h b/include/cltune.h
index 0454ab0..25fbe32 100644
--- a/include/cltune.h
+++ b/include/cltune.h
@@ -173,6 +173,7 @@ class Tuner {
   // Downloads the output of a tuning run and compares it against the reference run
   bool VerifyOutput();
   template <typename T> bool DownloadAndCompare(const MemArgument &device_buffer, const size_t i);
+  template <typename T> double AbsoluteDifference(const T reference, const T result);
 
   // Prints results of a particular kernel run
   void PrintResult(FILE* fp, const TunerResult &result, const std::string &message) const;
@@ -201,7 +202,12 @@ class Tuner {
   std::vector<KernelInfo> kernels_;
   std::vector<MemArgument> arguments_input_;
   std::vector<MemArgument> arguments_output_;
-  std::vector<std::pair<int,int>> arguments_scalar_;
+  std::vector<std::pair<int,int>> arguments_int_;
+  std::vector<std::pair<int,size_t>> arguments_size_t_;
+  std::vector<std::pair<int,float>> arguments_float_;
+  std::vector<std::pair<int,double>> arguments_double_;
+  std::vector<std::pair<int,float2>> arguments_float2_;
+  std::vector<std::pair<int,double2>> arguments_double2_;
 
   // Storage for the reference kernel and output
   std::unique_ptr<KernelInfo> reference_kernel_;
diff --git a/include/cltune/memory.h b/include/cltune/memory.h
index 181d129..2919ab2 100644
--- a/include/cltune/memory.h
+++ b/include/cltune/memory.h
@@ -34,14 +34,21 @@
 #include <vector>
 #include <stdexcept>
 #include <memory>
+#include <complex>
 
 #include "cltune/opencl.h"
 
 namespace cltune {
 // =================================================================================================
 
+// Shorthands for complex data-types
+using float2 = std::complex<float>; // cl_float2;
+using double2 = std::complex<double>; // cl_double2;
+
+// =================================================================================================
+
 // Enumeration of currently supported data-types by this class
-enum class MemType { kInt, kFloat, kDouble };
+enum class MemType { kInt, kFloat, kDouble, kFloat2, kDouble2 };
 
 // See comment at top of file for a description of the class
 template <typename T>
diff --git a/src/memory.cc b/src/memory.cc
index 903fa63..af6b4db 100644
--- a/src/memory.cc
+++ b/src/memory.cc
@@ -35,6 +35,8 @@ namespace cltune {
 template <> const MemType Memory<int>::type = MemType::kInt;
 template <> const MemType Memory<float>::type = MemType::kFloat;
 template <> const MemType Memory<double>::type = MemType::kDouble;
+template <> const MemType Memory<float2>::type = MemType::kFloat2;
+template <> const MemType Memory<double2>::type = MemType::kDouble2;
 
 // Initializes the memory class, creating a host array with zeroes and an uninitialized device
 // buffer.
@@ -42,7 +44,7 @@ template <typename T>
 Memory<T>::Memory(const size_t size, cl::CommandQueue queue, const cl::Context &context,
                   const cl_mem_flags flags):
     size_(size),
-    host_(size, static_cast<T>(0)),
+    host_(size, T{0}),
     device_(new cl::Buffer(context, flags, size*sizeof(T))),
     queue_(queue) {
 }
@@ -81,6 +83,8 @@ void Memory<T>::DownloadFromDevice() {
 template class Memory<int>;
 template class Memory<float>;
 template class Memory<double>;
+template class Memory<float2>;
+template class Memory<double2>;
 
 // =================================================================================================
 } // namespace cltune
diff --git a/src/tuner.cc b/src/tuner.cc
index d2dd86d..592d5e8 100644
--- a/src/tuner.cc
+++ b/src/tuner.cc
@@ -198,6 +198,8 @@ void Tuner::AddArgumentInput(const std::vector<T> &source) {
 template void Tuner::AddArgumentInput<int>(const std::vector<int>&);
 template void Tuner::AddArgumentInput<float>(const std::vector<float>&);
 template void Tuner::AddArgumentInput<double>(const std::vector<double>&);
+template void Tuner::AddArgumentInput<float2>(const std::vector<float2>&);
+template void Tuner::AddArgumentInput<double2>(const std::vector<double2>&);
 
 // As above, but now marked as output buffer
 template <typename T>
@@ -210,16 +212,28 @@ void Tuner::AddArgumentOutput(const std::vector<T> &source) {
 template void Tuner::AddArgumentOutput<int>(const std::vector<int>&);
 template void Tuner::AddArgumentOutput<float>(const std::vector<float>&);
 template void Tuner::AddArgumentOutput<double>(const std::vector<double>&);
+template void Tuner::AddArgumentOutput<float2>(const std::vector<float2>&);
+template void Tuner::AddArgumentOutput<double2>(const std::vector<double2>&);
 
-// Sets a simple scalar value as an argument to the kernel
-template <typename T>
-void Tuner::AddArgumentScalar(const T argument) {
-  arguments_scalar_.push_back({argument_counter_++, argument});
+// Sets a scalar value as an argument to the kernel
+template <> void Tuner::AddArgumentScalar<int>(const int argument) {
+  arguments_int_.push_back({argument_counter_++, argument});
+}
+template <> void Tuner::AddArgumentScalar<size_t>(const size_t argument) {
+  arguments_size_t_.push_back({argument_counter_++, argument});
+}
+template <> void Tuner::AddArgumentScalar<float>(const float argument) {
+  arguments_float_.push_back({argument_counter_++, argument});
+}
+template <> void Tuner::AddArgumentScalar<double>(const double argument) {
+  arguments_double_.push_back({argument_counter_++, argument});
+}
+template <> void Tuner::AddArgumentScalar<float2>(const float2 argument) {
+  arguments_float2_.push_back({argument_counter_++, argument});
+}
+template <> void Tuner::AddArgumentScalar<double2>(const double2 argument) {
+  arguments_double2_.push_back({argument_counter_++, argument});
 }
-template void Tuner::AddArgumentScalar<int>(const int);
-template void Tuner::AddArgumentScalar<size_t>(const size_t);
-template void Tuner::AddArgumentScalar<float>(const float);
-template void Tuner::AddArgumentScalar<double>(const double);
 
 // =================================================================================================
 
@@ -495,6 +509,8 @@ Tuner::TunerResult Tuner::RunKernel(const std::string &source, const KernelInfo
       case MemType::kInt: ResetMemArgument<int>(output); break;
       case MemType::kFloat: ResetMemArgument<float>(output); break;
       case MemType::kDouble: ResetMemArgument<double>(output); break;
+      case MemType::kFloat2: ResetMemArgument<float2>(output); break;
+      case MemType::kDouble2: ResetMemArgument<double2>(output); break;
       default: throw Exception("Unsupported reference output data-type");
     }
   }
@@ -503,7 +519,12 @@ Tuner::TunerResult Tuner::RunKernel(const std::string &source, const KernelInfo
   auto tune_kernel = cl::Kernel(program, kernel.name().c_str());
   for (auto &i: arguments_input_)  { tune_kernel.setArg(i.index, i.buffer); }
   for (auto &i: arguments_output_) { tune_kernel.setArg(i.index, i.buffer); }
-  for (auto &i: arguments_scalar_) { tune_kernel.setArg(i.first, i.second); }
+  for (auto &i: arguments_int_) { tune_kernel.setArg(i.first, i.second); }
+  for (auto &i: arguments_size_t_) { tune_kernel.setArg(i.first, i.second); }
+  for (auto &i: arguments_float_) { tune_kernel.setArg(i.first, i.second); }
+  for (auto &i: arguments_double_) { tune_kernel.setArg(i.first, i.second); }
+  for (auto &i: arguments_float2_) { tune_kernel.setArg(i.first, i.second); }
+  for (auto &i: arguments_double2_) { tune_kernel.setArg(i.first, i.second); }
 
   // Sets the global and local thread-sizes
   auto global = kernel.global();
@@ -569,7 +590,7 @@ template <typename T>
 void Tuner::ResetMemArgument(MemArgument &argument) {
 
   // Create an array with zeroes
-  std::vector<T> buffer(argument.size, static_cast<T>(0));
+  std::vector<T> buffer(argument.size, T{0});
 
   // Copy the new array to the OpenCL buffer on the device
   auto bytes = sizeof(T)*argument.size;
@@ -589,6 +610,8 @@ void Tuner::StoreReferenceOutput() {
       case MemType::kInt: DownloadReference<int>(output_buffer); break;
       case MemType::kFloat: DownloadReference<float>(output_buffer); break;
       case MemType::kDouble: DownloadReference<double>(output_buffer); break;
+      case MemType::kFloat2: DownloadReference<float2>(output_buffer); break;
+      case MemType::kDouble2: DownloadReference<double2>(output_buffer); break;
       default: throw Exception("Unsupported reference output data-type");
     }
   }
@@ -615,6 +638,8 @@ bool Tuner::VerifyOutput() {
         case MemType::kInt: status &= DownloadAndCompare<int>(output_buffer, i); break;
         case MemType::kFloat: status &= DownloadAndCompare<float>(output_buffer, i); break;
         case MemType::kDouble: status &= DownloadAndCompare<double>(output_buffer, i); break;
+        case MemType::kFloat2: status &= DownloadAndCompare<float2>(output_buffer, i); break;
+        case MemType::kDouble2: status &= DownloadAndCompare<double2>(output_buffer, i); break;
         default: throw Exception("Unsupported output data-type");
       }
       ++i;
@@ -636,7 +661,7 @@ bool Tuner::DownloadAndCompare(const MemArgument &device_buffer, const size_t i)
   // Compares the results (L2 norm)
   T* reference_output = (T*)reference_outputs_[i];
   for (auto j=0UL; j<device_buffer.size; ++j) {
-    l2_norm += fabs((double)reference_output[j] - (double)host_buffer[j]);
+    l2_norm += AbsoluteDifference(reference_output[j], host_buffer[j]);
   }
 
   // Verifies if everything was OK, if not: print the L2 norm
@@ -648,6 +673,22 @@ bool Tuner::DownloadAndCompare(const MemArgument &device_buffer, const size_t i)
   return true;
 }
 
+// Returns |reference - result|; complex types sum the real and imaginary absolute differences
+template <typename T>
+double Tuner::AbsoluteDifference(const T reference, const T result) {
+  const auto delta = static_cast<double>(reference) - static_cast<double>(result);
+  return fabs(delta);
+}
+template <> double Tuner::AbsoluteDifference(const float2 reference, const float2 result) {
+  const auto diff_re = static_cast<double>(reference.real()) - static_cast<double>(result.real());
+  const auto diff_im = static_cast<double>(reference.imag()) - static_cast<double>(result.imag());
+  return fabs(diff_re) + fabs(diff_im);
+}
+template <> double Tuner::AbsoluteDifference(const double2 reference, const double2 result) {
+  const auto diff_re = reference.real() - result.real();
+  return fabs(diff_re) + fabs(reference.imag() - result.imag());
+}
+
 // =================================================================================================
 
 // Prints a result by looping over all its configuration parameters

From d2e118062079a129315fb36cce58fc0c365d92bc Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <mail@cedricnugteren.nl>
Date: Fri, 15 May 2015 09:33:26 +0200
Subject: [PATCH 06/10] Better size_t support

---
 include/cltune.h             | 26 +++++++++++++-------------
 include/cltune/kernel_info.h | 10 +++++-----
 samples/conv/conv.cc         | 24 +++++++++++++-----------
 samples/gemm/gemm.cc         | 10 +++++-----
 src/kernel_info.cc           |  8 ++++----
 src/tuner.cc                 | 12 ++++++------
 6 files changed, 46 insertions(+), 44 deletions(-)

diff --git a/include/cltune.h b/include/cltune.h
index 25fbe32..b73ec9d 100644
--- a/include/cltune.h
+++ b/include/cltune.h
@@ -67,7 +67,7 @@ class Tuner {
 
   // Helper structure to store an OpenCL memory argument for a kernel
   struct MemArgument {
-    int index;          // The OpenCL kernel-argument index
+    size_t index;       // The OpenCL kernel-argument index
     size_t size;        // The number of elements (not bytes)
     MemType type;       // The data-type (e.g. float)
     cl::Buffer buffer;  // The host memory and OpenCL buffer on the device
@@ -91,7 +91,7 @@ class Tuner {
 
   // Initialize either with platform 0 and device 0 or with a custom platform/device
   explicit Tuner();
-  explicit Tuner(int platform_id, int device_id);
+  explicit Tuner(size_t platform_id, size_t device_id);
   ~Tuner();
 
   // Adds a new kernel to the list of tuning-kernels and returns a unique ID (to be used when
@@ -108,8 +108,8 @@ class Tuner {
   // number of values, and a list of values.
   // TODO: Remove all following functions (those that take "const size_t id" as first argument) and
   // make the KernelInfo class publicly accessible instead.
-  void AddParameter(const size_t id, const std::string parameter_name,
-                    const std::initializer_list<int> values);
+  void AddParameter(const size_t id, const std::string &parameter_name,
+                    const std::initializer_list<size_t> &values);
 
   // Modifies the global or local thread-size (in NDRange form) by one of the parameters (in
   // StringRange form). The modifier can be multiplication or division.
@@ -137,8 +137,8 @@ class Tuner {
 
   // Configures a specific search method. The default search method is "FullSearch"
   void UseFullSearch();
-  void UseRandomSearch(const float fraction);
-  void UseAnnealing(const float fraction, const double max_temperature);
+  void UseRandomSearch(const double fraction);
+  void UseAnnealing(const double fraction, const double max_temperature);
   void UsePSO(const double fraction, const size_t swarm_size, const double influence_global,
               const double influence_local, const double influence_random);
 
@@ -198,16 +198,16 @@ class Tuner {
   std::vector<double> search_args_;
 
   // Storage of kernel sources, arguments, and parameters
-  int argument_counter_;
+  size_t argument_counter_;
   std::vector<KernelInfo> kernels_;
   std::vector<MemArgument> arguments_input_;
   std::vector<MemArgument> arguments_output_;
-  std::vector<std::pair<int,int>> arguments_int_;
-  std::vector<std::pair<int,size_t>> arguments_size_t_;
-  std::vector<std::pair<int,float>> arguments_float_;
-  std::vector<std::pair<int,double>> arguments_double_;
-  std::vector<std::pair<int,float2>> arguments_float2_;
-  std::vector<std::pair<int,double2>> arguments_double2_;
+  std::vector<std::pair<size_t,int>> arguments_int_;
+  std::vector<std::pair<size_t,size_t>> arguments_size_t_;
+  std::vector<std::pair<size_t,float>> arguments_float_;
+  std::vector<std::pair<size_t,double>> arguments_double_;
+  std::vector<std::pair<size_t,float2>> arguments_float2_;
+  std::vector<std::pair<size_t,double2>> arguments_double2_;
 
   // Storage for the reference kernel and output
   std::unique_ptr<KernelInfo> reference_kernel_;
diff --git a/include/cltune/kernel_info.h b/include/cltune/kernel_info.h
index dc806bb..a4457bf 100644
--- a/include/cltune/kernel_info.h
+++ b/include/cltune/kernel_info.h
@@ -55,14 +55,14 @@ class KernelInfo {
   // Helper structure holding a parameter name and a list of all values
   struct Parameter {
     std::string name;
-    std::vector<int> values;
+    std::vector<size_t> values;
   };
 
   // Helper structure holding a setting: a name and a value. Multiple settings combined make a
   // single configuration.
   struct Setting {
     std::string name;
-    int value;
+    size_t value;
     std::string GetDefine() const { return "#define "+name+" "+GetValueString()+"\n"; }
     std::string GetConfig() const { return name+" "+GetValueString(); }
     std::string GetDatabase() const { return "{\""+name+"\","+GetValueString()+"}"; }
@@ -78,14 +78,14 @@ class KernelInfo {
 
   // Helper structure holding a constraint on parameters. This constraint consists of a constraint
   // function object and a vector of paramater names represented as strings.
-  using ConstraintFunction = std::function<bool(std::vector<int>)>;
+  using ConstraintFunction = std::function<bool(std::vector<size_t>)>;
   struct Constraint {
     ConstraintFunction valid_if;
     std::vector<std::string> parameters;
   };
 
   // As above, but for local memory size.
-  using LocalMemoryFunction = std::function<size_t(std::vector<int>)>;
+  using LocalMemoryFunction = std::function<size_t(std::vector<size_t>)>;
   struct LocalMemory {
     LocalMemoryFunction amount;
     std::vector<std::string> parameters;
@@ -116,7 +116,7 @@ class KernelInfo {
   void set_local_base(cl::NDRange local) { local_base_ = local; local_ = local; }
 
   // Adds a new parameter with a name and a vector of possible values
-  void AddParameter(const std::string name, const std::vector<int> values);
+  void AddParameter(const std::string &name, const std::vector<size_t> &values);
 
   // Checks wheter a parameter exists, returns "true" if it does exist
   bool ParameterExists(const std::string parameter_name);
diff --git a/samples/conv/conv.cc b/samples/conv/conv.cc
index ffcc7ce..df1fd94 100644
--- a/samples/conv/conv.cc
+++ b/samples/conv/conv.cc
@@ -35,10 +35,10 @@
 #include "cltune.h"
 
 // Helper function to perform an integer division + ceiling (round-up)
-int CeilDiv(int a, int b) { return (a + b - 1)/b; }
+size_t CeilDiv(size_t a, size_t b) { return (a + b - 1)/b; }
 
 // Helper function to determine whether or not 'a' is a multiple of 'b'
-bool IsMultiple(int a, int b) {
+bool IsMultiple(size_t a, size_t b) {
   return ((a/b)*b == a) ? true : false;
 };
 
@@ -135,13 +135,15 @@ int main(int argc, char* argv[]) {
   // In this case, the workgroup size (TBX by TBY) is extra large (TBX_XL by TBY_XL) because it uses
   // extra threads to compute the halo threads. How many extra threads are needed is dependend on
   // the filter size. Here we support a the TBX and TBY size plus up to 10 extra threads.
-  auto integers = {8,9,10,11,12,13,14,15,
-                   16,17,18,19,20,21,22,23,24,25,26,
-                   32,33,34,35,36,37,38,39,40,41,42,
-                   64,65,66,67,68,69,70,71,72,73,74};
+  auto integers = std::initializer_list<size_t>{
+    8,9,10,11,12,13,14,15,
+    16,17,18,19,20,21,22,23,24,25,26,
+    32,33,34,35,36,37,38,39,40,41,42,
+    64,65,66,67,68,69,70,71,72,73,74
+  };
   tuner.AddParameter(id, "TBX_XL", integers);
   tuner.AddParameter(id, "TBY_XL", integers);
-  auto HaloThreads = [] (std::vector<int> v) {
+  auto HaloThreads = [] (std::vector<size_t> v) {
     if (v[0] == 2) { return (v[1] == v[2] + CeilDiv(2*HFS,v[3])); } // With halo threads
     else           { return (v[1] == v[2]); }                       // Without halo threads
   };
@@ -149,22 +151,22 @@ int main(int argc, char* argv[]) {
   tuner.AddConstraint(id, HaloThreads, {"LOCAL", "TBY_XL", "TBY", "WPTY"});
 
   // Sets the constrains on the vector size
-  auto VectorConstraint = [] (std::vector<int> v) {
+  auto VectorConstraint = [] (std::vector<size_t> v) {
     if (v[0] == 2) { return IsMultiple(v[2],v[1]) && IsMultiple(2*HFS,v[1]); }
     else           { return IsMultiple(v[2],v[1]); }
   };
   tuner.AddConstraint(id, VectorConstraint, {"LOCAL", "VECTOR", "WPTX"});
 
   // Makes sure the work per thread is not too high, otherwise too many registers would be used.
-  //auto WorkPerThreadConstraint = [] (std::vector<int> v) { return (v[0]*v[1] < 32); };
+  //auto WorkPerThreadConstraint = [] (std::vector<size_t> v) { return (v[0]*v[1] < 32); };
   //tuner.AddConstraint(id, WorkPerThreadConstraint, {"WPTX", "WPTY"});
 
   // Sets padding to zero in case local memory is not used
-  auto PaddingConstraint = [] (std::vector<int> v) { return (v[1] == 0 || v[0] != 0); };
+  auto PaddingConstraint = [] (std::vector<size_t> v) { return (v[1] == 0 || v[0] != 0); };
   tuner.AddConstraint(id, PaddingConstraint, {"LOCAL", "PADDING"});
 
   // Sets the constraints for local memory size limitations
-  auto LocalMemorySize = [] (std::vector<int> v) {
+  auto LocalMemorySize = [] (std::vector<size_t> v) {
     if (v[0] != 0) { return ((v[3]*v[4] + 2*HFS) * (v[1]*v[2] + 2*HFS + v[5]))*sizeof(float); }
     else           { return 0UL; }
   };
diff --git a/samples/gemm/gemm.cc b/samples/gemm/gemm.cc
index 9821b5d..dfa5f37 100644
--- a/samples/gemm/gemm.cc
+++ b/samples/gemm/gemm.cc
@@ -38,7 +38,7 @@
 #include "cltune.h"
 
 // Helper function to determine whether or not 'a' is a multiple of 'b'
-bool IsMultiple(int a, int b) {
+bool IsMultiple(size_t a, size_t b) {
   return ((a/b)*b == a) ? true : false;
 };
 
@@ -129,9 +129,9 @@ int main(int argc, char* argv[]) {
   // a boolean value whether or not the tuning configuration is legal. In this case, the helper
   // function 'IsMultiple' is employed for convenience. In the calls to 'AddConstraint' below, the
   // vector of parameter names (as strings) matches the input integer vector of the lambda's.
-  auto MultipleOfX = [] (std::vector<int> v) { return IsMultiple(v[0], v[1]); };
-  auto MultipleOfXMulY = [] (std::vector<int> v) { return IsMultiple(v[0], v[1]*v[2]); };
-  auto MultipleOfXMulYDivZ = [] (std::vector<int> v) { return IsMultiple(v[0], (v[1]*v[2])/v[3]); };
+  auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); };
+  auto MultipleOfXMulY = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]*v[2]); };
+  auto MultipleOfXMulYDivZ = [] (std::vector<size_t> v) { return IsMultiple(v[0], (v[1]*v[2])/v[3]); };
 
   // Sets constraints: Requirement for unrolling the KWG loop
   tuner.AddConstraint(id, MultipleOfX, {"KWG", "KWI"});
@@ -149,7 +149,7 @@ int main(int argc, char* argv[]) {
   tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "NDIMB"});
 
   // Sets the constraints for local memory size limitations
-  auto LocalMemorySize = [] (std::vector<int> v) {
+  auto LocalMemorySize = [] (std::vector<size_t> v) {
     return (((v[0]*v[1]*v[2]/v[3]) + (v[4]*v[5]*v[6]/v[7]))*sizeof(float));
   };
   tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"SA", "KWG", "MWG", "VWM", "SB", "KWG", "NWG", "VWN"});
diff --git a/src/kernel_info.cc b/src/kernel_info.cc
index 2dc407d..2947d1b 100644
--- a/src/kernel_info.cc
+++ b/src/kernel_info.cc
@@ -41,7 +41,7 @@ KernelInfo::KernelInfo(const std::string name, const std::string source,
   parameters_(),
   configurations_(),
   constraints_(),
-  local_memory_(LocalMemory{[] (std::vector<int> v) { return 0UL; }, std::vector<std::string>(0)}),
+  local_memory_(LocalMemory{[] (std::vector<size_t> v) { return 0UL; }, std::vector<std::string>(0)}),
   opencl_(opencl),
   global_base_(), local_base_(),
   global_(), local_(),
@@ -51,7 +51,7 @@ KernelInfo::KernelInfo(const std::string name, const std::string source,
 // =================================================================================================
 
 // Pushes a new parameter to the list of parameters
-void KernelInfo::AddParameter(const std::string name, const std::vector<int> values) {
+void KernelInfo::AddParameter(const std::string &name, const std::vector<size_t> &values) {
   Parameter parameter = {name, values};
   parameters_.push_back(parameter);
 }
@@ -196,7 +196,7 @@ bool KernelInfo::ValidConfiguration(const Configuration &config) {
   for (auto &constraint: constraints_) {
 
     // Finds the values of the parameters
-    std::vector<int> values(0);
+    std::vector<size_t> values(size_t{0});
     for (auto &parameter: constraint.parameters) {
       for (auto &setting: config) {
         if (setting.name == parameter) {
@@ -222,7 +222,7 @@ bool KernelInfo::ValidConfiguration(const Configuration &config) {
   if (!opencl_->ValidThreadSizes(global_, local_)) { return false; };
 
   // Verifies the local memory usage
-  std::vector<int> values_local_memory(0);
+  std::vector<size_t> values_local_memory(size_t{0});
   for (auto &parameter: local_memory_.parameters) {
     for (auto &setting: config) {
       if (setting.name == parameter) {
diff --git a/src/tuner.cc b/src/tuner.cc
index 592d5e8..007eca7 100644
--- a/src/tuner.cc
+++ b/src/tuner.cc
@@ -69,7 +69,7 @@ Tuner::Tuner():
 }
 
 // Initializes with a custom platform and device
-Tuner::Tuner(int platform_id, int device_id):
+Tuner::Tuner(size_t platform_id, size_t device_id):
     opencl_(new OpenCL(platform_id, device_id)),
     has_reference_(false),
     suppress_output_(false),
@@ -131,8 +131,8 @@ void Tuner::SetReference(const std::vector<std::string> &filenames, const std::s
 // =================================================================================================
 
 // Adds parameters for a kernel to tune. Also checks whether this parameter already exists.
-void Tuner::AddParameter(const size_t id, const std::string parameter_name,
-                         const std::initializer_list<int> values) {
+void Tuner::AddParameter(const size_t id, const std::string &parameter_name,
+                         const std::initializer_list<size_t> &values) {
   if (id >= kernels_.size()) { throw Exception("Invalid kernel ID"); }
   if (kernels_[id].ParameterExists(parameter_name)) {
     throw Exception("Parameter already exists");
@@ -243,13 +243,13 @@ void Tuner::UseFullSearch() {
 }
 
 // Use random search as a search strategy.
-void Tuner::UseRandomSearch(const float fraction) {
+void Tuner::UseRandomSearch(const double fraction) {
   search_method_ = SearchMethod::RandomSearch;
   search_args_.push_back(fraction);
 }
 
 // Use simulated annealing as a search strategy.
-void Tuner::UseAnnealing(const float fraction, const double max_temperature) {
+void Tuner::UseAnnealing(const double fraction, const double max_temperature) {
   search_method_ = SearchMethod::Annealing;
   search_args_.push_back(fraction);
   search_args_.push_back(max_temperature);
@@ -458,7 +458,7 @@ void Tuner::PrintToFile(const std::string &filename) const {
       fprintf(file, "%.2lf;", tuning_result.time);
       fprintf(file, "%lu;", tuning_result.threads);
       for (auto &setting: tuning_result.configuration) {
-        fprintf(file, "%d;", setting.value);
+        fprintf(file, "%zu;", setting.value); // 'value' is a size_t: %zu is its portable format
       }
       fprintf(file, "\n");
     }

From 8d989b002b4d1cc7f8aef45a8453efd7f032274a Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <cedric.nugteren@surfsara.nl>
Date: Mon, 18 May 2015 16:10:29 +0200
Subject: [PATCH 07/10] Updated the tests

---
 test/kernel_info.cc | 28 ++++++++++++++------------
 test/tuner.cc       | 48 ++++++++++++++++++++++-----------------------
 2 files changed, 39 insertions(+), 37 deletions(-)

diff --git a/test/kernel_info.cc b/test/kernel_info.cc
index 4999e7b..87efc17 100644
--- a/test/kernel_info.cc
+++ b/test/kernel_info.cc
@@ -37,24 +37,25 @@
 // Initializes a KernelInfo test fixture
 class KernelInfoTest : public testing::Test {
  protected:
-  static constexpr auto kNumParameters = 8;
-  static constexpr auto kNumRanges = 8;
+  static constexpr auto kNumParameters = size_t{8};
+  static constexpr auto kNumRanges = size_t{8};
 
   // Constructor
   explicit KernelInfoTest() :
-    kernel_{new cltune::KernelInfo("name", "source")} {
+    opencl_{new cltune::OpenCL(0, 0)},
+    kernel_{new cltune::KernelInfo("name", "source", opencl_)} {
   }
 
   // Initializes the tester
   virtual void SetUp() {
 
     // Sets a bunch of parameters to test
-    for (auto i=0; i<kNumParameters; ++i) {
+    for (auto i=size_t{0}; i<kNumParameters; ++i) {
 
        // Creates a pseudo-random name and values
       auto name = "TEST_PARAM_" + std::to_string(static_cast<long long>(i));
-      auto values = std::vector<int>{1, 6+i, 9, 1*i, 2000};
-      for (auto j=0; j<i; ++j) { values.push_back((j+3)*i); }
+      auto values = std::vector<size_t>{1, 6+i, 9, 1*i, 2000};
+      for (auto j=size_t{0}; j<i; ++j) { values.push_back((j+3)*i); }
 
       // Sets the name and value
       values_list_.push_back(values);
@@ -62,7 +63,7 @@ class KernelInfoTest : public testing::Test {
     }
 
     // Creates some example NDRanges and StringRanges
-    for (auto i=0; i<kNumRanges; ++i) {
+    for (auto i=size_t{0}; i<kNumRanges; ++i) {
 
       // Sets some example values
       auto v1 = static_cast<long long>(i*i);
@@ -100,9 +101,10 @@ class KernelInfoTest : public testing::Test {
   }
 
   // Member variables
+  std::shared_ptr<cltune::OpenCL> opencl_;
   std::unique_ptr<cltune::KernelInfo> kernel_;
   std::vector<std::string> names_;
-  std::vector<std::vector<int>> values_list_;
+  std::vector<std::vector<size_t>> values_list_;
   std::vector<cl::NDRange> ranges_;
   std::vector<cltune::StringRange> string_ranges_;
 };
@@ -111,7 +113,7 @@ class KernelInfoTest : public testing::Test {
 
 // Tests set_global_base for a number of example NDRange values
 TEST_F(KernelInfoTest, SetGlobalBase) {
-  for (auto i=0; i<kNumRanges; ++i) {
+  for (auto i=size_t{0}; i<kNumRanges; ++i) {
     kernel_->set_global_base(ranges_[i]);
     ASSERT_EQ(ranges_[i].dimensions(), kernel_->global_base().dimensions());
     for (auto j=static_cast<size_t>(0); j<kernel_->global_base().dimensions(); ++j) {
@@ -122,7 +124,7 @@ TEST_F(KernelInfoTest, SetGlobalBase) {
 
 // Tests set_local_base for a number of example NDRange values
 TEST_F(KernelInfoTest, SetLocalBase) {
-  for (auto i=0; i<kNumRanges; ++i) {
+  for (auto i=size_t{0}; i<kNumRanges; ++i) {
     kernel_->set_local_base(ranges_[i]);
     ASSERT_EQ(ranges_[i].dimensions(), kernel_->local_base().dimensions());
     for (auto j=static_cast<size_t>(0); j<kernel_->local_base().dimensions(); ++j) {
@@ -135,12 +137,12 @@ TEST_F(KernelInfoTest, SetLocalBase) {
 TEST_F(KernelInfoTest, AddParameter) {
 
   // Adds several parameters
-  for (auto i=0; i<kNumParameters; ++i) {
+  for (auto i=size_t{0}; i<kNumParameters; ++i) {
     kernel_->AddParameter(names_[i], values_list_[i]);
   }
 
   // Tests each parameter
-  for (auto i=0; i<kNumParameters; ++i) {
+  for (auto i=size_t{0}; i<kNumParameters; ++i) {
     ASSERT_EQ(values_list_[i].size(), kernel_->parameters()[i].values.size());
     EXPECT_EQ(names_[i], kernel_->parameters()[i].name);
     for (auto j=static_cast<size_t>(0); j<kernel_->parameters()[i].values.size(); ++j) {
@@ -157,7 +159,7 @@ TEST_F(KernelInfoTest, CreateLocalRange) {
   config.push_back(cltune::KernelInfo::Setting({"PARAM", 32}));
 
   // Tests a couple of different ranges against this configuration
-  for (auto i=0; i<kNumRanges; ++i) {
+  for (auto i=size_t{0}; i<kNumRanges; ++i) {
     kernel_->set_global_base(ranges_[i]);
     kernel_->set_local_base(ranges_[i]);
     kernel_->ComputeRanges(config);
diff --git a/test/tuner.cc b/test/tuner.cc
index f2152c5..60b5270 100644
--- a/test/tuner.cc
+++ b/test/tuner.cc
@@ -42,21 +42,21 @@ class TunerTest : public testing::Test {
   };
 
   // Test parameters
-  static constexpr auto kNumKernelAdditions = 1;
-  static constexpr auto kNumParameters = 2;
-  static constexpr auto kNumParameterAdditions = 3;
+  static constexpr auto kNumKernelAdditions = size_t{1};
+  static constexpr auto kNumParameters = size_t{2};
+  static constexpr auto kNumParameterAdditions = size_t{3};
 
   // Test kernels (taken from the samples folder)
-  static constexpr auto kNumKernels = 2;
+  static constexpr auto kNumKernels = size_t{2};
   const std::vector<CLKernel> kKernelFiles = {
-    {"../samples/simple_reference.opencl","matvec_reference"},
-    {"../samples/simple_unroll.opencl","matvec_unroll"}
+    {"../samples/simple/simple_reference.opencl","matvec_reference"},
+    {"../samples/simple/simple_unroll.opencl","matvec_unroll"}
   };
 
   // Test matrix sizes
-  static constexpr auto kSizeM = 128;
-  static constexpr auto kSizeN = 512;
-  static constexpr auto kSizeK = 256;
+  static constexpr auto kSizeM = size_t{128};
+  static constexpr auto kSizeN = size_t{512};
+  static constexpr auto kSizeK = size_t{256};
 
   // Constructor
   explicit TunerTest() :
@@ -72,9 +72,9 @@ class TunerTest : public testing::Test {
     local_ = cl::NDRange{8, 1};
 
     // Adds example parameters
-    for (auto k=0; k<kNumParameters; ++k) {
+    for (auto k=size_t{0}; k<kNumParameters; ++k) {
       auto parameter = "TEST_PARAM_"+std::to_string(static_cast<long long>(k));
-      auto values = {5, 1, 999};
+      auto values = {size_t{5}, size_t{1}, size_t{999}};
       auto string_range = cltune::StringRange{parameter, parameter};
       parameter_list_.push_back(parameter);
       values_list_.push_back(values);
@@ -90,7 +90,7 @@ class TunerTest : public testing::Test {
   cl::NDRange global_;
   cl::NDRange local_;
   std::vector<std::string> parameter_list_;
-  std::vector<std::initializer_list<int>> values_list_;
+  std::vector<std::initializer_list<size_t>> values_list_;
   std::vector<cltune::StringRange> string_ranges_;
 };
 
@@ -113,10 +113,10 @@ TEST_F(TunerTest, InitOpenCL) {
 
 // Checks whether AddKernel returns an incrementing ID
 TEST_F(TunerTest, AddKernel) {
-  auto counter = 0;
+  auto counter = size_t{0};
   for (auto &kernel_file: kKernelFiles) {
-    for (auto i=0; i<kNumKernelAdditions; ++i) {
-      auto id = tuner_->AddKernel(kernel_file.filename, kernel_file.kernel_name, global_, local_);
+    for (auto i=size_t{0}; i<kNumKernelAdditions; ++i) {
+      auto id = tuner_->AddKernel({kernel_file.filename}, kernel_file.kernel_name, global_, local_);
       EXPECT_EQ(counter, id);
       counter++;
     }
@@ -127,17 +127,17 @@ TEST_F(TunerTest, AddKernel) {
 TEST_F(TunerTest, AddParameter) {
 
   // Adds parameters for invalid kernels, expecting a crash
-  for (auto k=0; k<kNumParameters; ++k) {
+  for (auto k=size_t{0}; k<kNumParameters; ++k) {
     ASSERT_THROW(tuner_->AddParameter(k, parameter_list_[k], values_list_[k]),
                  cltune::Tuner::Exception);
   }
 
   // Adds a new kernel and then adds parameters
   for (auto &kernel_file: kKernelFiles) {
-    for (auto i=0; i<kNumKernelAdditions; ++i) {
-      auto id = tuner_->AddKernel(kernel_file.filename, kernel_file.kernel_name, global_, local_);
-      for (auto k=0; k<kNumParameters; ++k) {
-        for (auto j=0; j<kNumParameterAdditions; ++j) {
+    for (auto i=size_t{0}; i<kNumKernelAdditions; ++i) {
+      auto id = tuner_->AddKernel({kernel_file.filename}, kernel_file.kernel_name, global_, local_);
+      for (auto k=size_t{0}; k<kNumParameters; ++k) {
+        for (auto j=size_t{0}; j<kNumParameterAdditions; ++j) {
           if (j == 0) {
             tuner_->AddParameter(id, parameter_list_[k], values_list_[k]);
           }
@@ -156,7 +156,7 @@ TEST_F(TunerTest, AddParameter) {
 TEST_F(TunerTest, ModifyThreadSize) {
 
   // Modifies parameters for invalid kernels, expecting a crash
-  for (auto k=0; k<kNumParameters; ++k) {
+  for (auto k=size_t{0}; k<kNumParameters; ++k) {
     ASSERT_THROW(tuner_->MulGlobalSize(k, string_ranges_[k]), cltune::Tuner::Exception);
     ASSERT_THROW(tuner_->DivGlobalSize(k, string_ranges_[k]), cltune::Tuner::Exception);
     ASSERT_THROW(tuner_->MulLocalSize(k, string_ranges_[k]), cltune::Tuner::Exception);
@@ -165,9 +165,9 @@ TEST_F(TunerTest, ModifyThreadSize) {
 
   // Adds a new kernel and then modifies the thread-sizes
   for (auto &kernel_file: kKernelFiles) {
-    for (auto i=0; i<kNumKernelAdditions; ++i) {
-      auto id = tuner_->AddKernel(kernel_file.filename, kernel_file.kernel_name, global_, local_);
-      for (auto k=0; k<kNumParameters; ++k) {
+    for (auto i=size_t{0}; i<kNumKernelAdditions; ++i) {
+      auto id = tuner_->AddKernel({kernel_file.filename}, kernel_file.kernel_name, global_, local_);
+      for (auto k=size_t{0}; k<kNumParameters; ++k) {
         tuner_->MulGlobalSize(id, string_ranges_[k]);
         tuner_->DivGlobalSize(id, string_ranges_[k]);
         tuner_->MulLocalSize(id, string_ranges_[k]);

From 4bbd62687b2c9fc89da221d553de11d92367fc5f Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <cedric.nugteren@surfsara.nl>
Date: Mon, 18 May 2015 16:10:59 +0200
Subject: [PATCH 08/10] New custom FindOpenCL.cmake

---
 cmake/Modules/FindOpenCL.cmake | 159 ++++++++++++++-------------------
 1 file changed, 69 insertions(+), 90 deletions(-)

diff --git a/cmake/Modules/FindOpenCL.cmake b/cmake/Modules/FindOpenCL.cmake
index 9c0e7f2..3ca8fa8 100644
--- a/cmake/Modules/FindOpenCL.cmake
+++ b/cmake/Modules/FindOpenCL.cmake
@@ -1,107 +1,86 @@
-# ########################################################################
-# Copyright 2013 Advanced Micro Devices, Inc.
+# ==================================================================================================
+# This file is part of the CLTune project, which loosely follows the Google C++ styleguide and uses
+# a tab-size of two spaces and a max-width of 100 characters per line.
 #
+# Author: cedric.nugteren@surfsara.nl (Cedric Nugteren)
+#
+# Defines the following variables:
+#   OPENCL_FOUND          Boolean holding whether or not the OpenCL library was found
+#   OPENCL_INCLUDE_DIRS   The OpenCL include directory
+#   OPENCL_LIBRARIES      The OpenCL library
+#
+# In case OpenCL is not installed in the default directory, set the OPENCL_ROOT variable to point to
+# the root of OpenCL, such that 'OpenCL/cl.h' or 'CL/cl.h' can be found in $OPENCL_ROOT/include.
+# This can either be done using an environment variable (e.g. export OPENCL_ROOT=/path/to/opencl)
+# or using a CMake variable (e.g. cmake -DOPENCL_ROOT=/path/to/opencl ..).
+#
+# --------------------------------------------------------------------------------------------------
+#
+# Copyright 2014 SURFsara
+# 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
+# 
+#  http://www.apache.org/licenses/LICENSE-2.0
+# 
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ########################################################################
-
-
-# Locate an OpenCL implementation.
-# Currently supports AMD APP SDK (http://developer.amd.com/sdks/AMDAPPSDK/Pages/default.aspx/)
-#
-# Defines the following variables:
-#
-#   OPENCL_FOUND - Found the OPENCL framework
-#   OPENCL_INCLUDE_DIRS - Include directories
-#
-# Also defines the library variables below as normal
-# variables.  These contain debug/optimized keywords when
-# a debugging library is found.
-#
-#   OPENCL_LIBRARIES - libopencl
 #
-# Accepts the following variables as input:
-#
-#   OPENCL_ROOT - (as a CMake or environment variable)
-#                The root directory of the OpenCL implementation found
-#
-#   FIND_LIBRARY_USE_LIB64_PATHS - Global property that controls whether findOpenCL should search for
-#                              64bit or 32bit libs
-#-----------------------
-# Example Usage:
-#
-#    find_package(OPENCL REQUIRED)
-#    include_directories(${OPENCL_INCLUDE_DIRS})
-#
-#    add_executable(foo foo.cc)
-#    target_link_libraries(foo ${OPENCL_LIBRARIES})
-#
-#-----------------------
+# ==================================================================================================
 
-find_path(OPENCL_INCLUDE_DIRS
-    NAMES OpenCL/cl.h CL/cl.h
-    HINTS
-        ${OPENCL_ROOT}
-        $ENV{OPENCL_ROOT}
-        $ENV{AMDAPPSDKROOT}
-        $ENV{CUDA_PATH}
-    DOC "OpenCL header file path"
-    PATH_SUFFIXES include
-    PATHS
-        /usr/include
-        /usr/local/include
-        /usr/local/cuda/include
-        /opt/cuda/include
+# Sets the possible install locations
+set(OPENCL_HINTS
+  ${OPENCL_ROOT}
+  $ENV{OPENCL_ROOT}
+  $ENV{AMDAPPSDKROOT}
+  $ENV{CUDA_PATH}
+  $ENV{INTELOCLSDKROOT}
+  $ENV{NVSDKCOMPUTE_ROOT}
+  $ENV{ATISTREAMSDKROOT}
+)
+set(OPENCL_PATHS
+  /usr/local/cuda
+  /opt/cuda
+  /usr
+  /usr/local
 )
-mark_as_advanced( OPENCL_INCLUDE_DIRS )
 
-# Search for 64bit libs if FIND_LIBRARY_USE_LIB64_PATHS is set to true in the global environment, 32bit libs else
-get_property( LIB64 GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS )
+# Finds the include directories
+find_path(OPENCL_INCLUDE_DIRS
+  NAMES OpenCL/cl.h CL/cl.h
+  HINTS ${OPENCL_HINTS}
+  PATH_SUFFIXES include OpenCL/common/inc inc include/x86_64 include/x64
+  PATHS ${OPENCL_PATHS}
+  DOC "OpenCL include header OpenCL/cl.h or CL/cl.h"
+)
+mark_as_advanced(OPENCL_INCLUDE_DIRS)
 
-if( LIB64 )
-    find_library( OPENCL_LIBRARIES
-        NAMES OpenCL
-        HINTS
-            ${OPENCL_ROOT}
-            $ENV{OPENCL_ROOT}
-            $ENV{AMDAPPSDKROOT}
-            $ENV{CUDA_PATH}
-        DOC "OpenCL dynamic library path"
-        PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64
-        PATHS
-            /usr/lib
-            /usr/local/cuda/lib
-            /opt/cuda/lib
-    )
-else( )
-    find_library( OPENCL_LIBRARIES
-        NAMES OpenCL
-        HINTS
-            ${OPENCL_ROOT}/lib
-            $ENV{AMDAPPSDKROOT}/lib
-            $ENV{CUDA_PATH}/lib
-        DOC "OpenCL dynamic library path"
-        PATH_SUFFIXES x86 Win32
-        PATHS
-            /usr/lib
-            /usr/local/cuda/lib
-            /opt/cuda/lib
-    )
-endif( )
-mark_as_advanced( OPENCL_LIBRARIES )
+# Finds the library
+find_library(OPENCL_LIBRARIES
+  NAMES OpenCL
+  HINTS ${OPENCL_HINTS}
+  PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 OpenCL/common/lib/x64
+  PATHS ${OPENCL_PATHS}
+  DOC "OpenCL library"
+)
+mark_as_advanced(OPENCL_LIBRARIES)
 
-include( FindPackageHandleStandardArgs )
-FIND_PACKAGE_HANDLE_STANDARD_ARGS( OPENCL DEFAULT_MSG OPENCL_LIBRARIES OPENCL_INCLUDE_DIRS )
+# ==================================================================================================
 
-if( NOT OPENCL_FOUND )
-    message( STATUS "FindOpenCL looked for libraries named: OpenCL" )
+# Notification messages
+if(NOT OPENCL_INCLUDE_DIRS)
+  message(STATUS "Could NOT find 'OpenCL/cl.h' or 'CL/cl.h', install OpenCL or set OPENCL_ROOT")
+endif()
+if(NOT OPENCL_LIBRARIES)
+  message(STATUS "Could NOT find OpenCL library, install it or set OPENCL_ROOT")
+endif()
+
+# Determines whether or not OpenCL was found
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(OpenCL DEFAULT_MSG OPENCL_INCLUDE_DIRS OPENCL_LIBRARIES)
+
+# ==================================================================================================

From c7a2078a74b1952350f4b3a8f3d6bcfbe94e3776 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <cedric.nugteren@surfsara.nl>
Date: Mon, 18 May 2015 16:11:18 +0200
Subject: [PATCH 09/10] Updated the README w.r.t. the latest changes

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 7eaa022..a378634 100644
--- a/README.md
+++ b/README.md
@@ -47,9 +47,9 @@ Before we start using the tuner, we'll have to create one. The constructor takes
 
     cltune::Tuner my_tuner(0, 1); // Tuner on device 1 of OpenCL platform 0
 
-Now that we have a tuner, we can add a tuning kernel. This is done by providing the path to an OpenCL kernel (first argument), the name of the kernel (second argument), a list of global thread dimensions (third argument), and a list of local thread or workgroup dimensions (fourth argument). Here is an example:
+Now that we have a tuner, we can add a tuning kernel. This is done by providing a list of paths to OpenCL kernel files (first argument), the name of the kernel (second argument), a list of global thread dimensions (third argument), and a list of local thread or workgroup dimensions (fourth argument). Here is an example:
 
-    int id = my_tuner.AddKernel("path/to/kernel.opencl", "my_kernel", {1024,512}, {16,8});
+    size_t id = my_tuner.AddKernel({"path/to/kernel.opencl"}, "my_kernel", {1024,512}, {16,8});
 
 Notice that the AddKernel function returns an integer: it is the ID of the added kernel. We'll need this ID when we want to add tuning parameters to this kernel. Let's say that our kernel has two pre-processor parameters named `PARAM_1` and `PARAM_2`:
 
@@ -58,7 +58,7 @@ Notice that the AddKernel function returns an integer: it is the ID of the added
 
 Now that we've added a kernel and its parameters, we can add another one if we wish. When we're done, there are a couple of things left to be done. Let's start with adding an reference kernel. This reference kernel can provide the tuner with the ground-truth and is optional - only when it is provided will the tuner perform verification checks to ensure correctness.
 
-    my_tuner.SetReference("path/to/reference.opencl", "my_reference", {8192}, {128});
+    my_tuner.SetReference({"path/to/reference.opencl"}, "my_reference", {8192}, {128});
 
 The tuner also needs to know which arguments the kernels take. Scalar arguments can be provided as-is and are passed-by-value, whereas arrays have to be provided as C++ `std::vector`s. That's right, we won't have to create OpenCL buffers, CLTune will handle that for us! Here is an example:
 

From 2416b3b2d6d9af5b1870b8afc3f52a6547dc5995 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <cedric.nugteren@surfsara.nl>
Date: Mon, 18 May 2015 16:12:48 +0200
Subject: [PATCH 10/10] Updated to version 1.5.1

---
 CHANGELOG      |  8 ++++++++
 CMakeLists.txt | 10 +++++++---
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index c74b8d6..0612960 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,4 +1,12 @@
 
+Version 1.5.1
+- Improved the GEMM example to support the Intel MIC (Xeon Phi) accelerators
+- Updated compiler check and compiler flags
+- Added support for multiple OpenCL kernel files at once (e.g. when wanting to include a header file)
+- Added support for the std::complex data-types
+- Fixed some compilation warnings regarding size_t conversions
+- Updated the FindOpenCL.cmake file
+
 Version 1.5.0
 - OpenCL local work size and memory size constraints are now automatically handled
 - Greatly improved the new 2D convolution example:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c90f3be..2ade1e2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,7 +27,7 @@ cmake_minimum_required(VERSION 2.8)
 project("cltune" CXX)
 set(cltune_VERSION_MAJOR 1)
 set(cltune_VERSION_MINOR 5)
-set(cltune_VERSION_PATCH 0)
+set(cltune_VERSION_PATCH 1)
 
 # Options
 option(SAMPLES "Enable compilation of sample programs" ON)
@@ -48,10 +48,14 @@ if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
     if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9)
         message(FATAL_ERROR "GCC version must be at least 4.9")
     endif()
-elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") # Note: what about AppleClang?
+elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
     if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.3)
         message(FATAL_ERROR "Clang version must be at least 3.3")
     endif()
+elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang") # versioned separately from upstream Clang
+    if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0)
+        message(FATAL_ERROR "AppleClang version must be at least 5.0")
+    endif()
 elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
     if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 14.0)
         message(FATAL_ERROR "ICC version must be at least 14.0")
@@ -66,7 +70,7 @@ endif()
 set(FLAGS "-O3 -std=c++11")
 if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
   set(FLAGS "${FLAGS} -Wall -Wno-comment")
-elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") # Note: what about AppleClang?
+elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
   #set(FLAGS "${FLAGS} -Weverything -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded")
 endif()
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}")