Skip to content

Commit

Permalink
Merge pull request #42 from CNugteren/development
Browse files Browse the repository at this point in the history
Update to version 2.4.0
  • Loading branch information
CNugteren authored Jun 29, 2016
2 parents f1b0900 + a001605 commit 86dbb2e
Show file tree
Hide file tree
Showing 12 changed files with 209 additions and 45 deletions.
65 changes: 65 additions & 0 deletions .appveyor.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
environment:
global:
CLTUNE_ROOT: "%APPVEYOR_BUILD_FOLDER%\\bin\\cltune"
OPENCL_REGISTRY: "https://www.khronos.org/registry/cl"
OPENCL_ROOT: "%APPVEYOR_BUILD_FOLDER%\\bin\\opencl"

platform:
- x64
- x86

configuration:
- Release

init:
- cmake --version
- C:\"Program Files (x86)"\"Microsoft Visual Studio 14.0"\VC\vcvarsall.bat %PLATFORM%

# Creates an OpenCL library to link against. Taken from clMathLibraries/clBLAS
install:
- ps: mkdir $env:OPENCL_ROOT
- ps: pushd $env:OPENCL_ROOT
- ps: $opencl_registry = $env:OPENCL_REGISTRY
# This downloads the source to the Khronos ICD library
- git clone --depth 1 https://github.com/KhronosGroup/OpenCL-ICD-Loader.git
- ps: mv ./OpenCL-ICD-Loader/* .
# This downloads all the opencl header files
# The cmake build files expect a directory called inc
- ps: mkdir inc/CL
- git clone --depth 1 https://github.com/KhronosGroup/OpenCL-Headers.git inc/CL
- ps: wget $opencl_registry/api/2.1/cl.hpp -OutFile inc/CL/cl.hpp
# - ps: dir; if( $lastexitcode -eq 0 ){ dir include/CL } else { Write-Output boom }
# Create the static import lib in a directory called lib, so FindOpenCL() will find it
- ps: mkdir lib
- ps: pushd lib
- cmake -G "NMake Makefiles" ..
- nmake
- ps: popd
# Switch to OpenCL 1.2 headers
- ps: pushd inc/CL
- git fetch origin opencl12:opencl12
- git checkout opencl12
- ps: popd
# Rename the inc directory to include, so FindOpenCL() will find it
- ps: ren inc include
- ps: popd

before_build:
- ps: mkdir $env:CLTUNE_ROOT
- ps: pushd $env:CLTUNE_ROOT
- ps: mkdir install_dir
- cmake -G "NMake Makefiles" -DCMAKE_INSTALL_PREFIX=install_dir -DCMAKE_BUILD_TYPE=%CONFIGURATION% -DSAMPLES=ON %APPVEYOR_BUILD_FOLDER%

build_script:
- nmake
- nmake install

after_build:
- ps: pushd $env:CLTUNE_ROOT
- 7z a CLTune-Windows.zip .\install_dir\*
- ps: mv CLTune-Windows.zip $env:APPVEYOR_BUILD_FOLDER

artifacts:
- path: '*.zip'
name: release
type: zip
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
build/
build
.*
8 changes: 6 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@ language: cpp
sudo: required
dist: trusty

os:
- linux
- osx

compiler:
- gcc
- clang
Expand All @@ -16,7 +20,7 @@ addons:

env:
global:
- CLTUNE_ROOT=${TRAVIS_BUILD_DIR}/bin/make/release
- CLTUNE_ROOT=${TRAVIS_BUILD_DIR}/bin/cltune
- OPENCL_REGISTRY=https://www.khronos.org/registry/cl
- OPENCL_ROOT=${TRAVIS_BUILD_DIR}/bin/opencl

Expand Down Expand Up @@ -55,7 +59,7 @@ install:
before_script:
- mkdir -p ${CLTUNE_ROOT}
- pushd ${CLTUNE_ROOT}
- cmake -DSAMPLES=ON -DTESTS=ON -DOPENCL_ROOT=${OPENCL_ROOT} ${TRAVIS_BUILD_DIR}
- cmake -DOPENCL_ROOT=${OPENCL_ROOT} -DSAMPLES=ON -DTESTS=ON ${TRAVIS_BUILD_DIR}

script:
- make
Expand Down
7 changes: 7 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@

Version 2.4.0
- Made it possible to run the unit-tests independently of the provided OpenCL kernel samples
- Added an option to compile in verbose mode for additional diagnostic messages (-DVERBOSE=ON)
- Now using version 6.0 of the CLCudaAPI header
- Fixed the RPATH settings on OSX
- Added Appveyor continuous integration and increased coverage of the Travis builds

Version 2.3.1
- Fixed a bug where an output buffer could not be used as input at the same time
- Fixed computing the validation error for half-precision fp16 data-types
Expand Down
16 changes: 10 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ set(CMAKE_USER_MAKE_RULES_OVERRIDE_CXX ${CMAKE_CURRENT_SOURCE_DIR}/cmake/cxx_fla
# CMake project details
project("cltune" CXX)
set(cltune_VERSION_MAJOR 2)
set(cltune_VERSION_MINOR 3)
set(cltune_VERSION_PATCH 1)
set(cltune_VERSION_MINOR 4)
set(cltune_VERSION_PATCH 0)

# Options
option(SAMPLES "Enable compilation of sample programs" ON)
Expand All @@ -48,13 +48,17 @@ else()
message("-- Building with CUDA")
endif()

# Compile in verbose mode with additional diagnostic messages
option(VERBOSE "Compile in verbose mode for additional diagnostic messages" OFF)
if(VERBOSE)
message("-- Building in verbose mode")
add_definitions(-DVERBOSE)
endif()

# ==================================================================================================

# RPATH settings
set(CMAKE_SKIP_BUILD_RPATH false) # Use, i.e. don't skip the full RPATH for the build tree
set(CMAKE_BUILD_WITH_INSTALL_RPATH false) # When building, don't use the install RPATH already
set(CMAKE_INSTALL_RPATH "") # The RPATH to be used when installing
set(CMAKE_INSTALL_RPATH_USE_LINK_PATH false) # Don't add the automatically determined parts
set(CMAKE_MACOSX_RPATH 1)

# ==================================================================================================

Expand Down
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@
CLTune: Automatic OpenCL kernel tuning
================

[![Build Status](https://travis-ci.org/CNugteren/CLTune.svg?branch=master)](https://travis-ci.org/CNugteren/CLTune)
| | master | development |
|-----|-----|-----|
| Linux/OS X | [![Build Status](https://travis-ci.org/CNugteren/CLTune.svg?branch=master)](https://travis-ci.org/CNugteren/CLTune/branches) | [![Build Status](https://travis-ci.org/CNugteren/CLTune.svg?branch=development)](https://travis-ci.org/CNugteren/CLTune/branches) |
| Windows | [![Build Status](https://ci.appveyor.com/api/projects/status/github/cnugteren/cltune?branch=master&svg=true)](https://ci.appveyor.com/project/CNugteren/cltune) | [![Build Status](https://ci.appveyor.com/api/projects/status/github/cnugteren/cltune?branch=development&svg=true)](https://ci.appveyor.com/project/CNugteren/cltune) |

CLTune is a C++ library which can be used to automatically tune your OpenCL and CUDA kernels. The only thing you'll need to provide is a tuneable kernel and a list of allowed parameters and values.

Expand Down
51 changes: 40 additions & 11 deletions include/internal/clpp11.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
// Portability here means that a similar header exists for CUDA with the same classes and
// interfaces. In other words, moving from the OpenCL API to the CUDA API becomes a one-line change.
//
// This is version 4.0 of CLCudaAPI <https://github.com/CNugteren/CLCudaAPI>.
// This is version 6.0 of CLCudaAPI <https://github.com/CNugteren/CLCudaAPI>.
//
// =================================================================================================
//
Expand Down Expand Up @@ -214,6 +214,14 @@ class Device {
return true;
}

// Query for a specific type of device or brand
bool IsCPU() const { return Type() == "CPU"; }
bool IsGPU() const { return Type() == "GPU"; }
bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc."; }
bool IsNVIDIA() const { return Vendor() == "NVIDIA" || Vendor() == "NVIDIA Corporation"; }
bool IsIntel() const { return Vendor() == "Intel" || Vendor() == "GenuineIntel"; }
bool IsARM() const { return Vendor() == "ARM"; }

// Accessor to the private data-member
const cl_device_id& operator()() const { return device_; }
private:
Expand Down Expand Up @@ -276,10 +284,14 @@ class Context {

// Accessor to the private data-member
const cl_context& operator()() const { return *context_; }
cl_context* pointer() const { return &(*context_); }
private:
std::shared_ptr<cl_context> context_;
};

// Pointer to an OpenCL context
using ContextPointer = cl_context*;

// =================================================================================================

// Enumeration of build statuses of the run-time compilation process
Expand All @@ -290,7 +302,7 @@ class Program {
public:
// Note that there is no constructor based on the regular OpenCL data-type because of extra state

// Regular constructor with memory management
// Source-based constructor with memory management
explicit Program(const Context &context, std::string source):
program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }),
length_(source.length()),
Expand All @@ -301,6 +313,22 @@ class Program {
CheckError(status);
}

// Binary-based constructor with memory management
explicit Program(const Device &device, const Context &context, const std::string& binary):
program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }),
length_(binary.length()),
source_(binary),
source_ptr_(&source_[0]) {
auto status1 = CL_SUCCESS;
auto status2 = CL_SUCCESS;
const cl_device_id dev = device();
*program_ = clCreateProgramWithBinary(context(), 1, &dev, &length_,
reinterpret_cast<const unsigned char**>(&source_ptr_),
&status1, &status2);
CheckError(status1);
CheckError(status2);
}

// Compiles the device program and returns whether or not there were any warnings/errors
BuildStatus Build(const Device &device, std::vector<std::string> &options) {
auto options_string = std::accumulate(options.begin(), options.end(), std::string{" "});
Expand Down Expand Up @@ -329,7 +357,7 @@ class Program {
return result;
}

// Retrieves an intermediate representation of the compiled program
// Retrieves a binary or an intermediate representation of the compiled program
std::string GetIR() const {
auto bytes = size_t{0};
CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bytes, nullptr));
Expand All @@ -345,7 +373,7 @@ class Program {
private:
std::shared_ptr<cl_program> program_;
size_t length_;
std::string source_;
std::string source_; // Note: the source can also be a binary or IR
const char* source_ptr_;
};

Expand Down Expand Up @@ -627,15 +655,15 @@ class Kernel {

// Launches a kernel onto the specified queue
void Launch(const Queue &queue, const std::vector<size_t> &global,
const std::vector<size_t> &local, Event &event) {
const std::vector<size_t> &local, EventPointer event) {
CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
nullptr, global.data(), local.data(),
0, nullptr, &(event())));
0, nullptr, event));
}

// As above, but with an event waiting list
void Launch(const Queue &queue, const std::vector<size_t> &global,
const std::vector<size_t> &local, Event &event,
const std::vector<size_t> &local, EventPointer event,
std::vector<Event>& waitForEvents) {
if (waitForEvents.size() == 0) { return Launch(queue, global, local, event); }

Expand All @@ -648,15 +676,16 @@ class Kernel {
// Launches the kernel while waiting for other events
CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
nullptr, global.data(), local.data(),
waitForEventsPlain.size(), waitForEventsPlain.data(),
&(event())));
static_cast<cl_uint>(waitForEventsPlain.size()),
waitForEventsPlain.data(),
event));
}

// As above, but with the default local workgroup size
void Launch(const Queue &queue, const std::vector<size_t> &global, Event &event) {
void Launch(const Queue &queue, const std::vector<size_t> &global, EventPointer event) {
CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
nullptr, global.data(), nullptr,
0, nullptr, &(event())));
0, nullptr, event));
}

// Accessor to the private data-member
Expand Down
34 changes: 26 additions & 8 deletions include/internal/cupp11.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
// Portability here means that a similar header exists for OpenCL with the same classes and
// interfaces. In other words, moving from the CUDA API to the OpenCL API becomes a one-line change.
//
// This is version 4.0 of CLCudaAPI <https://github.com/CNugteren/CLCudaAPI>.
// This is version 6.0 of CLCudaAPI <https://github.com/CNugteren/CLCudaAPI>.
//
// =================================================================================================
//
Expand Down Expand Up @@ -106,13 +106,14 @@ class Event {
// Accessors to the private data-members
const CUevent& start() const { return *start_; }
const CUevent& end() const { return *end_; }
Event* pointer() { return this; }
private:
std::shared_ptr<CUevent> start_;
std::shared_ptr<CUevent> end_;
};

// Pointer to a CUDA event
using EventPointer = CUevent*;
using EventPointer = Event*;

// =================================================================================================

Expand Down Expand Up @@ -159,6 +160,11 @@ class Device {
CheckError(cuDriverGetVersion(&result));
return "CUDA driver "+std::to_string(result);
}
size_t VersionNumber() const {
auto result = 0;
CheckError(cuDriverGetVersion(&result));
return static_cast<size_t>(result);
}
std::string Vendor() const { return "NVIDIA Corporation"; }
std::string Name() const {
auto result = std::string{};
Expand Down Expand Up @@ -206,6 +212,14 @@ class Device {
return true;
}

// Query for a specific type of device or brand
bool IsCPU() const { return false; }
bool IsGPU() const { return true; }
bool IsAMD() const { return false; }
bool IsNVIDIA() const { return true; }
bool IsIntel() const { return false; }
bool IsARM() const { return false; }

// Accessor to the private data-member
const CUdevice& operator()() const { return device_; }
private:
Expand Down Expand Up @@ -239,10 +253,14 @@ class Context {

// Accessor to the private data-member
const CUcontext& operator()() const { return *context_; }
CUcontext* pointer() const { return &(*context_); }
private:
std::shared_ptr<CUcontext> context_;
};

// Pointer to a CUDA context
using ContextPointer = CUcontext*;

// =================================================================================================

// Enumeration of build statuses of the run-time compilation process
Expand All @@ -253,7 +271,7 @@ class Program {
public:
// Note that there is no constructor based on the regular CUDA data-type because of extra state

// Regular constructor with memory management
// Source-based constructor with memory management
explicit Program(const Context &, std::string source):
program_(new nvrtcProgram, [](nvrtcProgram* p) { CheckError(nvrtcDestroyProgram(p));
delete p; }),
Expand Down Expand Up @@ -555,7 +573,7 @@ class Kernel {

// Launches a kernel onto the specified queue
void Launch(const Queue &queue, const std::vector<size_t> &global,
const std::vector<size_t> &local, Event &event) {
const std::vector<size_t> &local, EventPointer event) {

// Creates the grid (number of threadblocks) and sets the block sizes (threads per block)
auto grid = std::vector<size_t>{1, 1, 1};
Expand All @@ -571,24 +589,24 @@ class Kernel {
}

// Launches the kernel, its execution time is recorded by events
CheckError(cuEventRecord(event.start(), queue()));
CheckError(cuEventRecord(event->start(), queue()));
CheckError(cuLaunchKernel(kernel_, grid[0], grid[1], grid[2], block[0], block[1], block[2],
0, queue(), pointers.data(), nullptr));
CheckError(cuEventRecord(event.end(), queue()));
CheckError(cuEventRecord(event->end(), queue()));
}

// As above, but with an event waiting list
// TODO: Implement this function
void Launch(const Queue &queue, const std::vector<size_t> &global,
const std::vector<size_t> &local, Event &event,
const std::vector<size_t> &local, EventPointer event,
std::vector<Event>& waitForEvents) {
if (waitForEvents.size() == 0) { return Launch(queue, global, local, event); }
Error("launching with an event waiting list is not implemented for the CUDA back-end");
}

// As above, but with the default local workgroup size
// TODO: Implement this function
void Launch(const Queue &, const std::vector<size_t> &, Event &) {
void Launch(const Queue &, const std::vector<size_t> &, EventPointer) {
Error("launching with a default workgroup size is not implemented for the CUDA back-end");
}

Expand Down
Loading

0 comments on commit 86dbb2e

Please sign in to comment.