Skip to content

Commit

Permalink
Cosmetics
Browse files Browse the repository at this point in the history
  • Loading branch information
ProjectPhysX committed Aug 3, 2024
1 parent d0110bb commit c7e8987
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 26 deletions.
10 changes: 7 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,13 @@ Works with any GPU in Windows, Linux, macOS and Android.


## Measurements
- compute performance (FP64, FP32, FP16, INT64, INT32, INT16, INT8)
- memory bandwidth (coalesced/misaligned read/write)
- PCIe bandwidth (send/receive/bidirectional)
- compute performance (`FP64`, `FP32`, `FP16`, `INT64`, `INT32`, `INT16`, `INT8`)
- the closest possible fraction/multiplier of `measured compute performance` divided by `reported theoretical FP32 performance` is shown in `(round brackets)`
- for example, when OpenCL reports `19.492` TFLOPs/s theoretical FP32 and the benchmark measures `9.512` TFLOPs/s for FP64, the ratio `(measured FP64)/(theoretical FP32) = 9.512/19.492 = 1/2.05` is rounded to the nearest possible value of `1/2` and reported as such
- for any GPU/CPU architecture, these ratios can only be one of `1/64`, `1/32`, `1/24`, `1/16`, `1/12`, `1/8`, `1/4`, `1/3`, `1/2`, `2/3`, `1x`, `2x`, `4x`, `8x`, `16x`, `32x`, or `64x`, and nothing in between
- memory bandwidth (`coalesced`/`misaligned` `read`/`write`)
- PCIe bandwidth (`send`/`receive`/`bidirectional`)
- PCIe Gen is estimated from the measured PCIe bandwidth, assuming an x16 link width



Expand Down
46 changes: 23 additions & 23 deletions src/opencl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,21 +54,21 @@ sudo shutdown -r now
)"+string("\033[96m")+R"(.-----------------------------------------------------------------------------.
| CPU Option 1: Intel CPU Runtime for OpenCL (works for both AMD/Intel CPUs) |
'-----------------------------------------------------------------------------'
export OCLCPUEXP_VERSION="2024.18.6.0.02_rel"
export ONEAPI_TBB_VERSION="2021.13.0"
export OCLV="2024.18.6.0.02_rel"
export TBBV="2021.13.0"
sudo apt update && sudo apt upgrade -y
sudo apt install -y g++ git make ocl-icd-libopencl1 ocl-icd-opencl-dev
sudo mkdir -p ~/cpuruntime /opt/intel/oclcpuexp_${OCLCPUEXP_VERSION} /etc/OpenCL/vendors /etc/ld.so.conf.d
sudo wget -P ~/cpuruntime https://github.com/intel/llvm/releases/download/2024-WW25/oclcpuexp-${OCLCPUEXP_VERSION}.tar.gz
sudo wget -P ~/cpuruntime https://github.com/oneapi-src/oneTBB/releases/download/v${ONEAPI_TBB_VERSION}/oneapi-tbb-${ONEAPI_TBB_VERSION}-lin.tgz
sudo tar -zxvf ~/cpuruntime/oclcpuexp-${OCLCPUEXP_VERSION}.tar.gz -C /opt/intel/oclcpuexp_${OCLCPUEXP_VERSION}
sudo tar -zxvf ~/cpuruntime/oneapi-tbb-${ONEAPI_TBB_VERSION}-lin.tgz -C /opt/intel
echo /opt/intel/oclcpuexp_${OCLCPUEXP_VERSION}/x64/libintelocl.so | sudo tee /etc/OpenCL/vendors/intel_expcpu.icd
echo /opt/intel/oclcpuexp_${OCLCPUEXP_VERSION}/x64 | sudo tee /etc/ld.so.conf.d/libintelopenclexp.conf
sudo ln -sf /opt/intel/oneapi-tbb-${ONEAPI_TBB_VERSION}/lib/intel64/gcc4.8/libtbb.so /opt/intel/oclcpuexp_${OCLCPUEXP_VERSION}/x64
sudo ln -sf /opt/intel/oneapi-tbb-${ONEAPI_TBB_VERSION}/lib/intel64/gcc4.8/libtbbmalloc.so /opt/intel/oclcpuexp_${OCLCPUEXP_VERSION}/x64
sudo ln -sf /opt/intel/oneapi-tbb-${ONEAPI_TBB_VERSION}/lib/intel64/gcc4.8/libtbb.so.12 /opt/intel/oclcpuexp_${OCLCPUEXP_VERSION}/x64
sudo ln -sf /opt/intel/oneapi-tbb-${ONEAPI_TBB_VERSION}/lib/intel64/gcc4.8/libtbbmalloc.so.2 /opt/intel/oclcpuexp_${OCLCPUEXP_VERSION}/x64
sudo mkdir -p ~/cpuruntime /opt/intel/oclcpuexp_${OCLV} /etc/OpenCL/vendors /etc/ld.so.conf.d
sudo wget -P ~/cpuruntime https://github.com/intel/llvm/releases/download/2024-WW25/oclcpuexp-${OCLV}.tar.gz
sudo wget -P ~/cpuruntime https://github.com/oneapi-src/oneTBB/releases/download/v${TBBV}/oneapi-tbb-${TBBV}-lin.tgz
sudo tar -zxvf ~/cpuruntime/oclcpuexp-${OCLV}.tar.gz -C /opt/intel/oclcpuexp_${OCLV}
sudo tar -zxvf ~/cpuruntime/oneapi-tbb-${TBBV}-lin.tgz -C /opt/intel
echo /opt/intel/oclcpuexp_${OCLV}/x64/libintelocl.so | sudo tee /etc/OpenCL/vendors/intel_expcpu.icd
echo /opt/intel/oclcpuexp_${OCLV}/x64 | sudo tee /etc/ld.so.conf.d/libintelopenclexp.conf
sudo ln -sf /opt/intel/oneapi-tbb-${TBBV}/lib/intel64/gcc4.8/libtbb.so /opt/intel/oclcpuexp_${OCLV}/x64
sudo ln -sf /opt/intel/oneapi-tbb-${TBBV}/lib/intel64/gcc4.8/libtbbmalloc.so /opt/intel/oclcpuexp_${OCLV}/x64
sudo ln -sf /opt/intel/oneapi-tbb-${TBBV}/lib/intel64/gcc4.8/libtbb.so.12 /opt/intel/oclcpuexp_${OCLV}/x64
sudo ln -sf /opt/intel/oneapi-tbb-${TBBV}/lib/intel64/gcc4.8/libtbbmalloc.so.2 /opt/intel/oclcpuexp_${OCLV}/x64
sudo ldconfig -f /etc/ld.so.conf.d/libintelopenclexp.conf
sudo rm -r ~/cpuruntime

Expand All @@ -85,20 +85,20 @@ struct Device_Info {
cl::Device cl_device; // OpenCL device
cl::Context cl_context; // multiple devices in the same context can communicate buffers
uint id = 0u; // unique device ID assigned by get_devices()
string name, vendor; // device name, vendor
string driver_version, opencl_c_version; // device driver version, OpenCL C version
uint memory=0u; // global memory in MB
uint memory_used=0u; // track global memory usage in MB
string name="", vendor=""; // device name, vendor
string driver_version="", opencl_c_version=""; // device driver version, OpenCL C version
uint memory = 0u; // global memory in MB
uint memory_used = 0u; // track global memory usage in MB
uint global_cache=0u, local_cache=0u; // global cache in KB, local cache in KB
uint max_global_buffer=0u, max_constant_buffer=0u; // maximum global buffer size in MB, maximum constant buffer size in KB
uint compute_units=0u; // compute units (CUs) can contain multiple cores depending on the microarchitecture
uint clock_frequency=0u; // in MHz
uint compute_units = 0u; // compute units (CUs) can contain multiple cores depending on the microarchitecture
uint clock_frequency = 0u; // in MHz
bool is_cpu=false, is_gpu=false;
bool intel_gpu_above_4gb_patch = false; // memory allocations greater than 4GB need to be specifically enabled on Intel GPUs
bool legacy_gpu_fma_patch = false; // some old GPUs have terrible fma performance, so replace with a*b+c
uint is_fp64_capable=0u, is_fp32_capable=0u, is_fp16_capable=0u, is_int64_capable=0u, is_int32_capable=0u, is_int16_capable=0u, is_int8_capable=0u;
uint cores=0u; // for CPUs, compute_units is the number of threads (twice the number of cores with hyperthreading)
float tflops=0.0f; // estimated device FP32 floating point performance in TeraFLOPs/s
uint cores = 0u; // for CPUs, compute_units is the number of threads (twice the number of cores with hyperthreading)
float tflops = 0.0f; // estimated device FP32 floating point performance in TeraFLOPs/s
inline Device_Info(const cl::Device& cl_device, const cl::Context& cl_context, const uint id) {
this->cl_device = cl_device; // see https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/clGetDeviceInfo.html
this->cl_context = cl_context;
Expand Down Expand Up @@ -565,14 +565,14 @@ class Kernel {
if(!device.is_initialized()) print_error("No OpenCL Device selected. Call Device constructor.");
this->name = name;
cl_kernel = cl::Kernel(device.get_cl_program(), name.c_str());
link_parameters(number_of_parameters, parameters...); // expand variadic template to link kernel parameters
link_parameters(0u, parameters...); // expand variadic template to link kernel parameters
set_ranges(N);
cl_queue = device.get_cl_queue();
}
// Constructs a kernel with an explicitly specified workgroup size.
// Reports an error through print_error() when `device` is not initialized, creates the cl::Kernel called `name`
// from the device's program, links `parameters` as kernel arguments, sets the ranges from N and workgroup_size,
// and takes the command queue from `device`.
// NOTE(review): the two link_parameters(...) lines below appear together only because this is a commit diff view;
// presumably the number_of_parameters variant is the removed line and the 0u variant its replacement — confirm against the actual file.
template<class... T> inline Kernel(const Device& device, const ulong N, const uint workgroup_size, const string& name, const T&... parameters) { // accepts Memory<T> objects and fundamental data type constants
if(!device.is_initialized()) print_error("No OpenCL Device selected. Call Device constructor.");
cl_kernel = cl::Kernel(device.get_cl_program(), name.c_str());
link_parameters(number_of_parameters, parameters...); // expand variadic template to link kernel parameters
link_parameters(0u, parameters...); // expand variadic template to link kernel parameters
set_ranges(N, (ulong)workgroup_size);
cl_queue = device.get_cl_queue();
}
Expand Down

0 comments on commit c7e8987

Please sign in to comment.