Skip to content

Commit

Permalink
Merge pull request #45 from CNugteren/development
Browse files Browse the repository at this point in the history
Update to version 2.5.0
  • Loading branch information
CNugteren authored Sep 27, 2016
2 parents 86dbb2e + 82dd234 commit d0ec5a1
Show file tree
Hide file tree
Showing 10 changed files with 146 additions and 107 deletions.
32 changes: 2 additions & 30 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,49 +17,21 @@ addons:
- kubuntu-backports
packages:
- cmake
- ocl-icd-opencl-dev

env:
global:
- CLTUNE_ROOT=${TRAVIS_BUILD_DIR}/bin/cltune
- OPENCL_REGISTRY=https://www.khronos.org/registry/cl
- OPENCL_ROOT=${TRAVIS_BUILD_DIR}/bin/opencl

before_install:
- cmake --version;
- ${CC} --version;
- ${CXX} --version;

install:
# The following linux logic is necessary because of Travis's move to the GCE platform, which does not
# currently contain packages for fglrx: https://github.com/travis-ci/travis-ci/issues/5221
# We build our own linkable .so file
- if [ ${TRAVIS_OS_NAME} == "linux" ]; then
mkdir -p ${OPENCL_ROOT};
pushd ${OPENCL_ROOT};
travis_retry git clone --depth 1 https://github.com/KhronosGroup/OpenCL-ICD-Loader.git;
mv ./OpenCL-ICD-Loader/* .;
travis_retry git clone --depth 1 https://github.com/KhronosGroup/OpenCL-Headers.git inc/CL;
pushd inc/CL;
travis_retry wget -w 1 -np -nd -nv -A h,hpp ${OPENCL_REGISTRY}/api/2.1/cl.hpp;
popd;
mkdir -p lib;
pushd lib;
cmake -G "Unix Makefiles" ..;
make;
cp ./bin/libOpenCL.so .;
popd;
pushd inc/CL;
travis_retry git fetch origin opencl12:opencl12;
git checkout opencl12;
popd;
mv inc/ include/;
popd;
fi

before_script:
- mkdir -p ${CLTUNE_ROOT}
- pushd ${CLTUNE_ROOT}
- cmake -DOPENCL_ROOT=${OPENCL_ROOT} -DSAMPLES=ON -DTESTS=ON ${TRAVIS_BUILD_DIR}
- cmake -DSAMPLES=ON -DTESTS=ON ${TRAVIS_BUILD_DIR}

script:
- make
Expand Down
4 changes: 4 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@

Version 2.5.0
- Updated to version 8.0 of the CLCudaAPI header
- Made it possible to configure the number of times each kernel is run (to average results)

Version 2.4.0
- Made it possible to run the unit-tests independently of the provided OpenCL kernel samples
- Added an option to compile in verbose mode for additional diagnostic messages (-DVERBOSE=ON)
Expand Down
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ set(CMAKE_USER_MAKE_RULES_OVERRIDE_CXX ${CMAKE_CURRENT_SOURCE_DIR}/cmake/cxx_fla
# CMake project details
project("cltune" CXX)
set(cltune_VERSION_MAJOR 2)
set(cltune_VERSION_MINOR 4)
set(cltune_VERSION_MINOR 5)
set(cltune_VERSION_PATCH 0)

# Options
Expand Down
3 changes: 3 additions & 0 deletions include/cltune.h
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,9 @@ class Tuner {
// Disables all further printing to stdout
void PUBLIC_API SuppressOutput();

// Changes the number of times each kernel should be run. Used for averaging execution times.
void PUBLIC_API SetNumRuns(const size_t num_runs);

private:

// This implements the pointer to implementation idiom (pimpl) and hides all private functions and
Expand Down
130 changes: 77 additions & 53 deletions include/internal/clpp11.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
// Portability here means that a similar header exists for CUDA with the same classes and
// interfaces. In other words, moving from the OpenCL API to the CUDA API becomes a one-line change.
//
// This is version 6.0 of CLCudaAPI <https://github.com/CNugteren/CLCudaAPI>.
// This file is taken from the CLCudaAPI project <https://github.com/CNugteren/CLCudaAPI> and
// therefore contains the following header copyright notice:
//
// =================================================================================================
//
Expand Down Expand Up @@ -70,37 +71,46 @@ inline void CheckError(const cl_int status) {
class Event {
public:

// Constructor based on the regular OpenCL data-type
explicit Event(const cl_event event): event_(event) { }
// Constructor based on the regular OpenCL data-type: memory management is handled elsewhere
explicit Event(const cl_event event):
event_(new cl_event) {
*event_ = event;
}

// Regular constructor
explicit Event(): event_(nullptr) { }
// Regular constructor with memory management: the deleter attached to the shared storage calls
// clReleaseEvent on the stored event (if one was ever set) before freeing the storage itself.
explicit Event():
    // The storage is set to nullptr at allocation time, so the deleter can never observe an
    // indeterminate handle, not even when it runs before this constructor's body is reached.
    event_(new cl_event{nullptr}, [](cl_event* e) {
      if (*e) { CheckError(clReleaseEvent(*e)); }
      delete e;
    }) {
}

// Waits for completion of this event
void WaitForCompletion() const {
CheckError(clWaitForEvents(1, &event_));
CheckError(clWaitForEvents(1, &(*event_)));
}

// Retrieves the elapsed time of the last recorded event. Note that no error checking is done on
// the 'clGetEventProfilingInfo' function, since there is a bug in Apple's OpenCL implementation:
// http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx
float GetElapsedTime() const {
WaitForCompletion();
auto bytes = size_t{0};
clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes);
auto time_start = size_t{0};
clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr);
clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, 0, nullptr, &bytes);
auto time_end = size_t{0};
clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr);
return (time_end - time_start) * 1.0e-6f;
const auto bytes = sizeof(cl_ulong);
auto time_start = cl_ulong{0};
clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr);
auto time_end = cl_ulong{0};
clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr);
return static_cast<float>(time_end - time_start) * 1.0e-6f;
}

// Accessor to the private data-member
cl_event& operator()() { return event_; }
cl_event* pointer() { return &event_; }
cl_event& operator()() { return *event_; }
const cl_event& operator()() const { return *event_; }
cl_event* pointer() { return &(*event_); }
const cl_event* pointer() const { return &(*event_); }
private:
cl_event event_;
std::shared_ptr<cl_event> event_;
};

// Pointer to an OpenCL event
Expand Down Expand Up @@ -139,6 +149,17 @@ class Platform {
cl_platform_id platform_;
};

// Retrieves a vector with all platforms. The number of platforms is obtained from
// clGetPlatformIDs; entry 'i' of the result is constructed as Platform(i).
inline std::vector<Platform> GetAllPlatforms() {
  auto num_platforms = cl_uint{0};
  CheckError(clGetPlatformIDs(0, nullptr, &num_platforms));
  const auto platform_count = static_cast<size_t>(num_platforms);

  auto all_platforms = std::vector<Platform>();
  all_platforms.reserve(platform_count);  // final size is known: no reallocation while filling
  for (size_t platform_id = 0; platform_id < platform_count; ++platform_id) {
    all_platforms.push_back(Platform(platform_id));
  }
  return all_platforms;
}

// =================================================================================================

// C++11 version of 'cl_device_id'
Expand Down Expand Up @@ -183,24 +204,32 @@ class Device {
}
size_t MaxWorkGroupSize() const { return GetInfo<size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE); }
size_t MaxWorkItemDimensions() const {
return GetInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS);
return static_cast<size_t>(GetInfo<cl_uint>(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS));
}
// Returns the list of values reported for CL_DEVICE_MAX_WORK_ITEM_SIZES
std::vector<size_t> MaxWorkItemSizes() const {
  auto work_item_sizes = GetInfoVector<size_t>(CL_DEVICE_MAX_WORK_ITEM_SIZES);
  return work_item_sizes;
}
size_t LocalMemSize() const {
return static_cast<size_t>(GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE));
unsigned long LocalMemSize() const {
return static_cast<unsigned long>(GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE));
}
std::string Capabilities() const { return GetInfoString(CL_DEVICE_EXTENSIONS); }
size_t CoreClock() const { return GetInfo(CL_DEVICE_MAX_CLOCK_FREQUENCY); }
size_t ComputeUnits() const { return GetInfo(CL_DEVICE_MAX_COMPUTE_UNITS); }
size_t MemorySize() const { return GetInfo(CL_DEVICE_GLOBAL_MEM_SIZE); }
size_t MaxAllocSize() const { return GetInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE); }
// Returns the value reported for CL_DEVICE_MAX_CLOCK_FREQUENCY (queried as cl_uint) as a size_t
size_t CoreClock() const {
  const auto clock_frequency = GetInfo<cl_uint>(CL_DEVICE_MAX_CLOCK_FREQUENCY);
  return static_cast<size_t>(clock_frequency);
}
// Returns the value reported for CL_DEVICE_MAX_COMPUTE_UNITS (queried as cl_uint) as a size_t
size_t ComputeUnits() const {
  const auto unit_count = GetInfo<cl_uint>(CL_DEVICE_MAX_COMPUTE_UNITS);
  return static_cast<size_t>(unit_count);
}
// Returns the value reported for CL_DEVICE_GLOBAL_MEM_SIZE (queried as cl_ulong)
unsigned long MemorySize() const {
  const auto global_mem_size = GetInfo<cl_ulong>(CL_DEVICE_GLOBAL_MEM_SIZE);
  return static_cast<unsigned long>(global_mem_size);
}
// Returns the value reported for CL_DEVICE_MAX_MEM_ALLOC_SIZE (queried as cl_ulong)
unsigned long MaxAllocSize() const {
  const auto max_alloc_size = GetInfo<cl_ulong>(CL_DEVICE_MAX_MEM_ALLOC_SIZE);
  return static_cast<unsigned long>(max_alloc_size);
}
size_t MemoryClock() const { return 0; } // Not exposed in OpenCL
size_t MemoryBusWidth() const { return 0; } // Not exposed in OpenCL

// Configuration-validity checks
bool IsLocalMemoryValid(const size_t local_mem_usage) const {
bool IsLocalMemoryValid(const cl_ulong local_mem_usage) const {
return (local_mem_usage <= LocalMemSize());
}
bool IsThreadConfigValid(const std::vector<size_t> &local) const {
Expand All @@ -217,9 +246,11 @@ class Device {
// Query for a specific type of device or brand
bool IsCPU() const { return Type() == "CPU"; }
bool IsGPU() const { return Type() == "GPU"; }
bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc."; }
bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc." ||
Vendor() == "AuthenticAMD";; }
bool IsNVIDIA() const { return Vendor() == "NVIDIA" || Vendor() == "NVIDIA Corporation"; }
bool IsIntel() const { return Vendor() == "Intel" || Vendor() == "GenuineIntel"; }
// Returns true when the vendor name equals one of the Intel spellings checked below
bool IsIntel() const {
  if (Vendor() == "INTEL") { return true; }
  if (Vendor() == "Intel") { return true; }
  return Vendor() == "GenuineIntel";
}
bool IsARM() const { return Vendor() == "ARM"; }

// Accessor to the private data-member
Expand All @@ -236,13 +267,6 @@ class Device {
CheckError(clGetDeviceInfo(device_, info, bytes, &result, nullptr));
return result;
}
size_t GetInfo(const cl_device_info info) const {
auto bytes = size_t{0};
CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes));
auto result = cl_uint(0);
CheckError(clGetDeviceInfo(device_, info, bytes, &result, nullptr));
return static_cast<size_t>(result);
}
template <typename T>
std::vector<T> GetInfoVector(const cl_device_info info) const {
auto bytes = size_t{0};
Expand Down Expand Up @@ -592,8 +616,7 @@ class Buffer {

// Retrieves the actual allocated size in bytes
size_t GetSize() const {
auto bytes = size_t{0};
CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, 0, nullptr, &bytes));
const auto bytes = sizeof(size_t);
auto result = size_t{0};
CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, bytes, &result, nullptr));
return result;
Expand Down Expand Up @@ -644,13 +667,22 @@ class Kernel {
}

// Retrieves the amount of local memory used per work-group for this kernel
size_t LocalMemUsage(const Device &device) const {
auto bytes = size_t{0};
unsigned long LocalMemUsage(const Device &device) const {
const auto bytes = sizeof(cl_ulong);
auto query = cl_kernel_work_group_info{CL_KERNEL_LOCAL_MEM_SIZE};
CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, 0, nullptr, &bytes));
auto result = size_t{0};
auto result = cl_ulong{0};
CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, bytes, &result, nullptr));
return result;
return static_cast<unsigned long>(result);
}

// Retrieves the name of the kernel
std::string GetFunctionName() const {
  // First query: only asks how many bytes are needed to hold the name
  auto name_bytes = size_t{0};
  CheckError(clGetKernelInfo(*kernel_, CL_KERNEL_FUNCTION_NAME, 0, nullptr, &name_bytes));

  // Second query: fills a buffer of exactly that many bytes
  auto buffer = std::string(name_bytes, '\0');
  CheckError(clGetKernelInfo(*kernel_, CL_KERNEL_FUNCTION_NAME, name_bytes, &buffer[0],
                             nullptr));

  // Rebuilding from c_str() stops at the first '\0', which removes any trailing '\0'-characters
  return std::string{buffer.c_str()};
}

// Launches a kernel onto the specified queue
Expand All @@ -664,30 +696,22 @@ class Kernel {
// As above, but with an event waiting list
void Launch(const Queue &queue, const std::vector<size_t> &global,
const std::vector<size_t> &local, EventPointer event,
std::vector<Event>& waitForEvents) {
if (waitForEvents.size() == 0) { return Launch(queue, global, local, event); }
const std::vector<Event> &waitForEvents) {

// Builds a plain version of the events waiting list
auto waitForEventsPlain = std::vector<cl_event>();
for (auto &waitEvent : waitForEvents) {
waitForEventsPlain.push_back(waitEvent());
if (waitEvent()) { waitForEventsPlain.push_back(waitEvent()); }
}

// Launches the kernel while waiting for other events
CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
nullptr, global.data(), local.data(),
nullptr, global.data(), !local.empty() ? local.data() : nullptr,
static_cast<cl_uint>(waitForEventsPlain.size()),
waitForEventsPlain.data(),
!waitForEventsPlain.empty() ? waitForEventsPlain.data() : nullptr,
event));
}

// As above, but with the default local workgroup size
void Launch(const Queue &queue, const std::vector<size_t> &global, EventPointer event) {
CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
nullptr, global.data(), nullptr,
0, nullptr, event));
}

// Accessor to the private data-member
const cl_kernel& operator()() const { return *kernel_; }
private:
Expand Down
Loading

0 comments on commit d0ec5a1

Please sign in to comment.