diff --git a/.gitmodules b/.gitmodules index 91e2b92ead7..c6f328bbd8a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,7 +1,7 @@ [submodule "llvm"] path = compiler url = https://github.com/RadeonOpenCompute/llvm.git - branch = amd-common + branch = feature_hc_next [submodule "lld"] path = lld url = https://github.com/RadeonOpenCompute/lld.git @@ -9,11 +9,11 @@ [submodule "clang"] path = clang url = https://github.com/RadeonOpenCompute/hcc-clang-upgrade.git - branch = clang_tot_upgrade + branch = feature_trampolines_are_for_babies [submodule "compiler-rt"] path = compiler-rt url = https://github.com/RadeonOpenCompute/compiler-rt - branch = amd-hcc + branch = amd-common [submodule "rocdl"] path = rocdl url = http://github.com/RadeonOpenCompute/ROCm-Device-Libs.git @@ -21,4 +21,4 @@ [submodule "clang-tools-extra"] path = clang-tools-extra url = https://github.com/RadeonOpenCompute/clang-tools-extra.git - branch = amd-hcc + branch = amd-common diff --git a/CMakeLists.txt b/CMakeLists.txt index b9e35332c39..65d0d20ead6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,30 +1,39 @@ -cmake_minimum_required( VERSION 3.0 ) -project (HCC) +cmake_minimum_required(VERSION 3.0) +project(HCC LANGUAGES CXX) + +set(CXX_EXTENSIONS OFF) include(GNUInstallDirs) -SET(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/scripts/cmake") -MESSAGE("Module path: ${CMAKE_MODULE_PATH}") +set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/scripts/cmake") +message("Module path: ${CMAKE_MODULE_PATH}") # set as release build by default -IF (NOT CMAKE_BUILD_TYPE) - SET(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: Release Debug" FORCE) -ENDIF(NOT CMAKE_BUILD_TYPE) - -# Use two different methods to determine host distribution: lsb_release and if that fails grep /etc/os-release -find_program( lsb_executable lsb_release ) +if (NOT CMAKE_BUILD_TYPE) + set( + CMAKE_BUILD_TYPE Release + CACHE STRING "Choose the type of build, options are: Release Debug" FORCE) +endif() -if( lsb_executable ) - execute_process( COMMAND ${lsb_executable} -is OUTPUT_VARIABLE DISTRO_ID OUTPUT_STRIP_TRAILING_WHITESPACE ) - execute_process( COMMAND ${lsb_executable} -rs OUTPUT_VARIABLE DISTRO_RELEASE OUTPUT_STRIP_TRAILING_WHITESPACE ) +# Use two different methods to determine host distribution: lsb_release and if +# that fails grep /etc/os-release +find_program(lsb_executable lsb_release) + +if (lsb_executable) + execute_process( + COMMAND ${lsb_executable} -is + OUTPUT_VARIABLE DISTRO_ID OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process( + COMMAND ${lsb_executable} -rs + OUTPUT_VARIABLE DISTRO_RELEASE OUTPUT_STRIP_TRAILING_WHITESPACE) else() - if( EXISTS "/etc/os-release" ) - file( STRINGS "/etc/os-release" DISTRO_ID REGEX "^ID=" ) - file( STRINGS "/etc/os-release" DISTRO_RELEASE REGEX "^VERSION_ID=" ) - string( REPLACE "ID=" "" DISTRO_ID "${DISTRO_ID}" ) - string( REPLACE "VERSION_ID=" "" DISTRO_RELEASE "${DISTRO_RELEASE}" ) - endif( ) -endif( ) + if (EXISTS "/etc/os-release") + file(STRINGS "/etc/os-release" DISTRO_ID REGEX "^ID=") + file(STRINGS "/etc/os-release" DISTRO_RELEASE REGEX "^VERSION_ID=") + string(REPLACE "ID=" "" DISTRO_ID "${DISTRO_ID}") + string(REPLACE "VERSION_ID=" "" DISTRO_RELEASE "${DISTRO_RELEASE}") + endif() +endif() # Accepted values for DISTRO_ID: trusty (Ubuntu 14.04), xenial (Ubuntu 16.06), fd23 (Fedora 23) string(TOLOWER "${DISTRO_ID}" DISTRO_ID ) @@ -213,9 +222,9 @@ if (NOT HCC_VERSION_STRING) endif() # Set HCC version string. The rule for version string is: -# HCC_VERSION_MAJOR . HCC_VERSION_MINOR . HCC_VERSION_PATCH-KALMAR_SDK_COMIT-KALMAR_FRONTEND_COMMIT-KALMAR_BACKEND_COMMIT +# HCC_VERSION_MAJOR . HCC_VERSION_MINOR . HCC_VERSION_PATCH-HC_SDK_COMIT-HC_FRONTEND_COMMIT-HC_BACKEND_COMMIT add_version_info_from_git(HCC_VERSION_STRING - HCC_VERSION_PATCH KALMAR_SDK_COMMIT KALMAR_FRONTEND_COMMIT KALMAR_BACKEND_COMMIT) + HCC_VERSION_PATCH HC_SDK_COMMIT HC_FRONTEND_COMMIT HC_BACKEND_COMMIT) # set default installation path set(INSTALL_DIR_NAME "hcc") @@ -290,7 +299,7 @@ MESSAGE(STATUS "HCC configured with AMDGPU targets: ${AMDGPU_TARGET}") # - AMDGPU : for HSA systems configured with Lightning backend ################# -set(KALMAR_BACKEND "HCC_BACKEND_AMDGPU") +set(HC_BACKEND "HCC_BACKEND_AMDGPU") ######################### # build target: world @@ -357,14 +366,16 @@ add_custom_command(TARGET clang_links POST_BUILD ) # install certain LLVM libraries needed by HIP +# TODO: why HIP needs this random soup is unclear; the HC specific passes are +# definitely not "certain LLVM libraries needed by HIP". install(PROGRAMS $ $ $ $ $ - $ $ - $ + $ + $ DESTINATION lib COMPONENT compiler ) @@ -383,6 +394,7 @@ add_custom_target(world DEPENDS clang_links) # move headers to build dir before building rocdl and hcc lib add_subdirectory(include) +add_subdirectory(third_party) # build the integrated ROCm Device Library set(AMDHSACOD ${ROCM_ROOT}/bin/amdhsacod CACHE FILEPATH "Specify the amdhsacod tool") diff --git a/Jenkinsfile b/Jenkinsfile index 0771b62b435..c5ec339393d 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -97,7 +97,7 @@ node( 'hcctest' ) -DHSA_AMDGPU_GPU_TARGET="gfx900;gfx803" \ -DNUM_TEST_THREADS="4" \ ../.. - make -j\$(nproc) + make -j2 """ // Cap the maximum amount of testing, in case of hangs diff --git a/benchmarks/AcceleratorViewCopy/avstress_0x18.cpp b/benchmarks/AcceleratorViewCopy/avstress_0x18.cpp index b4e4f0f27e7..b10ee0f3170 100644 --- a/benchmarks/AcceleratorViewCopy/avstress_0x18.cpp +++ b/benchmarks/AcceleratorViewCopy/avstress_0x18.cpp @@ -1,8 +1,8 @@ -// RUN: %hc %s -o %t.out -lhc_am -L/opt/rocm/lib -lhsa-runtime64 -DRUNMASK=0x18 && HCC_SERIALIZE_KERNEL=0x3 HCC_SERIALIZE_COPY=0x3 %t.out -#include -#include +// RUN: %hc %s -o %t.out -DRUNMASK=0x18 && HCC_SERIALIZE_KERNEL=0x3 HCC_SERIALIZE_COPY=0x3 %t.out +#include +#include -#include "/opt/rocm/include/hsa/hsa.h" +#include #include #include diff --git a/benchmarks/AcceleratorViewCopy/avstress_0xFF.cpp b/benchmarks/AcceleratorViewCopy/avstress_0xFF.cpp index b6ef2edcdb3..69b14eb873e 100644 --- a/benchmarks/AcceleratorViewCopy/avstress_0xFF.cpp +++ b/benchmarks/AcceleratorViewCopy/avstress_0xFF.cpp @@ -1,8 +1,8 @@ -// RUN: %hc %s -o %t.out -lhc_am -L/opt/rocm/lib -lhsa-runtime64 -DRUNMASK=0xff && HCC_SERIALIZE_KERNEL=0x3 HCC_SERIALIZE_COPY=0x3 %t.out -#include -#include +// RUN: %hc %s -o %t.out -DRUNMASK=0xff && HCC_SERIALIZE_KERNEL=0x3 HCC_SERIALIZE_COPY=0x3 %t.out +#include +#include -#include "/opt/rocm/include/hsa/hsa.h" +#include #include #include diff --git a/benchmarks/RuntimeOverheads/kernel_dispatch_latency.cpp b/benchmarks/RuntimeOverheads/kernel_dispatch_latency.cpp index 0fd6cb70014..c7f0f56ea17 100644 --- a/benchmarks/RuntimeOverheads/kernel_dispatch_latency.cpp +++ b/benchmarks/RuntimeOverheads/kernel_dispatch_latency.cpp @@ -1,7 +1,7 @@ -// RUN: %hc %s -lhc_am -o %t.out && %t.out +// RUN: %hc %s -o %t.out && %t.out -#include -#include +#include +#include #include #include @@ -23,10 +23,7 @@ bool test1() { // launch kernel hc::extent<1> e(1024); clock_gettime(CLOCK_REALTIME, &begin); - hc::completion_future fut = hc::parallel_for_each( - e, - [=](hc::index<1> idx) restrict(amp) { - }); + auto fut = hc::parallel_for_each(e, [=](hc::index<1>) [[hc]] {}); fut.wait(); ret &= (fut.is_ready() == true); @@ -77,11 +74,8 @@ bool test2() { // launch kernel hc::extent<1> e(vecSize); clock_gettime(CLOCK_REALTIME, &begin); - hc::completion_future fut = hc::parallel_for_each( - e, - [=](hc::index<1> idx) restrict(amp) { - p_c[idx[0]] = p_a[idx[0]] + p_b[idx[0]]; - + auto fut = hc::parallel_for_each(e, [=](hc::index<1> idx) [[hc]] { + p_c[idx[0]] = p_a[idx[0]] + p_b[idx[0]]; }); fut.wait(); ret &= (fut.is_ready() == true); @@ -104,10 +98,7 @@ bool test2() { void init() { // launch an empty kernel to initialize everything hc::extent<1> e(1024); - hc::completion_future fut = hc::parallel_for_each( - e, - [=](hc::index<1> idx) restrict(amp) { - }); + auto fut = hc::parallel_for_each(e, [=](hc::index<1> idx) [[hc]] {}); fut.wait(); } diff --git a/benchmarks/RuntimeOverheads/kernel_enqueue_overhead.cpp b/benchmarks/RuntimeOverheads/kernel_enqueue_overhead.cpp index 266708fab9f..ed7a47fb93a 100644 --- a/benchmarks/RuntimeOverheads/kernel_enqueue_overhead.cpp +++ b/benchmarks/RuntimeOverheads/kernel_enqueue_overhead.cpp @@ -1,7 +1,7 @@ -// RUN: %hc %s -lhc_am -o %t.out && %t.out +// RUN: %hc %s -o %t.out && %t.out -#include -#include +#include +#include #include #include @@ -23,10 +23,7 @@ bool test1() { // launch kernel hc::extent<1> e(1024); clock_gettime(CLOCK_REALTIME, &begin); - hc::completion_future fut = hc::parallel_for_each( - e, - [=](hc::index<1> idx) restrict(amp) { - }); + auto fut = hc::parallel_for_each(e, [=](hc::index<1> idx) [[hc]] {}); clock_gettime(CLOCK_REALTIME, &end); time_spent_once = ((end.tv_sec - begin.tv_sec) * 1000 * 1000) + ((end.tv_nsec - begin.tv_nsec) / 1000); time_spent += time_spent_once; @@ -77,11 +74,8 @@ bool test2() { // launch kernel hc::extent<1> e(vecSize); clock_gettime(CLOCK_REALTIME, &begin); - hc::completion_future fut = hc::parallel_for_each( - e, - [=](hc::index<1> idx) restrict(amp) { - p_c[idx[0]] = p_a[idx[0]] + p_b[idx[0]]; - + auto fut = hc::parallel_for_each(e, [=](hc::index<1> idx) [[hc]] { + p_c[idx[0]] = p_a[idx[0]] + p_b[idx[0]]; }); clock_gettime(CLOCK_REALTIME, &end); time_spent_once = ((end.tv_sec - begin.tv_sec) * 1000 * 1000) + ((end.tv_nsec - begin.tv_nsec) / 1000); @@ -104,10 +98,7 @@ bool test2() { void init() { // launch an empty kernel to initialize everything hc::extent<1> e(1024); - hc::completion_future fut = hc::parallel_for_each( - e, - [=](hc::index<1> idx) restrict(amp) { - }); + auto fut = hc::parallel_for_each(e, [=](hc::index<1> idx) [[hc]] {}); fut.wait(); } diff --git a/benchmarks/benchEmptyKernel/bench.cpp b/benchmarks/benchEmptyKernel/bench.cpp index 1c729024f8a..80e3f2758e7 100644 --- a/benchmarks/benchEmptyKernel/bench.cpp +++ b/benchmarks/benchEmptyKernel/bench.cpp @@ -22,10 +22,9 @@ #define BENCH_HSA 1 +#include +#include -#include "hc.hpp" -#include "hc_am.hpp" -#include "grid_launch.hpp" #include #include #include @@ -242,7 +241,7 @@ int main(int argc, char* argv[]) { // timing for null kernel launch appears later hc::parallel_for_each(av, hc::extent<3>(lp.grid_dim.x*lp.group_dim.x,1,1).tile(lp.group_dim.x,1,1), - [=](hc::index<3>& idx) __HC__ { + [=](hc::index<3>& idx) [[hc]] { }).wait(); // Setting lp.cf to completion_future so we can track completion: (NULL ignores all synchronization) @@ -258,7 +257,7 @@ int main(int argc, char* argv[]) { hc::completion_future cf; for (int j=0; j(lp.grid_dim.x*lp.group_dim.x,1,1).tile(lp.group_dim.x,1,1), - [=](hc::index<3>& idx) __HC__ { + [=](hc::index<3>& idx) [[hc]] { }); }; cf.wait(hc::hcWaitModeActive); @@ -281,7 +280,7 @@ int main(int argc, char* argv[]) { hc::completion_future cf; for (int j=0; j(lp.grid_dim.x*lp.group_dim.x,1,1).tile(lp.group_dim.x,1,1), - [=](hc::index<3>& idx) __HC__ { + [=](hc::index<3>& idx) [[hc]] { }); }; cf.wait(hc::hcWaitModeBlocked); diff --git a/benchmarks/benchEmptyKernel/hsacodelib.CPP b/benchmarks/benchEmptyKernel/hsacodelib.CPP index 208d8e8277a..c246dd68b42 100644 --- a/benchmarks/benchEmptyKernel/hsacodelib.CPP +++ b/benchmarks/benchEmptyKernel/hsacodelib.CPP @@ -2,8 +2,7 @@ #include #include -#include -#include +#include #include diff --git a/benchmarks/benchEmptyKernel/nullkernel.cpp b/benchmarks/benchEmptyKernel/nullkernel.cpp index f1ec520cfd4..44d186b7637 100644 --- a/benchmarks/benchEmptyKernel/nullkernel.cpp +++ b/benchmarks/benchEmptyKernel/nullkernel.cpp @@ -1,10 +1,8 @@ // RUN: %hc --amdgpu-target=gfx801 --amdgpu-target=gfx802 --amdgpu-target=gfx803 -fPIC -shared %S/nullkernel.cpp -o %T/nullkernel // RUN: HCC_HOME=%llvm_libs_dir/../../ %extractkernel -i %T/nullkernel -#include "hc.hpp" -#include "grid_launch.hpp" +#include -__attribute__((hc_grid_launch)) void nullkernel(const grid_launch_parm lp, float* Ad) { if (Ad) { Ad[0] = 42; diff --git a/clang b/clang index 4600645e1c6..37ff576f91b 160000 --- a/clang +++ b/clang @@ -1 +1 @@ -Subproject commit 4600645e1c652ab8324f7e6c1b99502ab036de78 +Subproject commit 37ff576f91b0f7ad0030171b9da310d3ced757e9 diff --git a/clang-tools-extra b/clang-tools-extra index f4b9e0b89f9..0254eba919e 160000 --- a/clang-tools-extra +++ b/clang-tools-extra @@ -1 +1 @@ -Subproject commit f4b9e0b89f99ffbe7bcf4e8d5ac08f61e65b5a98 +Subproject commit 0254eba919ec417ad27f4e475c758d4d10c1d77d diff --git a/cmake-tests/CMakeLists.txt b/cmake-tests/CMakeLists.txt index a851247e092..d8e5bb3a019 100644 --- a/cmake-tests/CMakeLists.txt +++ b/cmake-tests/CMakeLists.txt @@ -17,7 +17,7 @@ endif() set_target_properties(cmake-test PROPERTIES LINK_FLAGS ${new_cmake_test_link_flags}) if(TARGET hccrt) - add_dependencies(cmake-test clang_links rocdl_links mcwamp_hsa mcwamp) + add_dependencies(cmake-test clang_links rocdl_links) target_link_libraries(cmake-test hccrt hc_am) else() # Append default hcc installation diff --git a/cmake-tests/cmake-test.cpp b/cmake-tests/cmake-test.cpp index da33e9e8dfd..8c8fbc485ed 100644 --- a/cmake-tests/cmake-test.cpp +++ b/cmake-tests/cmake-test.cpp @@ -1,6 +1,6 @@ -#include -#include +#include +#include #include #include #include diff --git a/compiler b/compiler index 1474588ae16..d74f56ca4f3 160000 --- a/compiler +++ b/compiler @@ -1 +1 @@ -Subproject commit 1474588ae16aed5e8311d26335d8d41fa08ef0b6 +Subproject commit d74f56ca4f35917356bfbedcb153feea7647cd73 diff --git a/compiler-rt b/compiler-rt index ae38e94c812..2264b759866 160000 --- a/compiler-rt +++ b/compiler-rt @@ -1 +1 @@ -Subproject commit ae38e94c8126c896ddbb7aadf0644f35666e97ef +Subproject commit 2264b759866e950be23bb12c3ea50d515134b8ae diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt index b9e9da2463f..945f1f335da 100644 --- a/doc/CMakeLists.txt +++ b/doc/CMakeLists.txt @@ -34,15 +34,15 @@ execute_process(COMMAND date +%y%W # get commit information execute_process(COMMAND git rev-parse --short HEAD WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/.. - OUTPUT_VARIABLE KALMAR_DRIVER_COMMIT + OUTPUT_VARIABLE HC_DRIVER_COMMIT OUTPUT_STRIP_TRAILING_WHITESPACE) execute_process(COMMAND git rev-parse --short HEAD WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/../compiler/tools/clang - OUTPUT_VARIABLE KALMAR_COMPILER_COMMIT + OUTPUT_VARIABLE HC_COMPILER_COMMIT OUTPUT_STRIP_TRAILING_WHITESPACE) # set HCC version string -set(HCC_VERSION_STRING "${HCC_VERSION_MAJOR}.${HCC_VERSION_MINOR}.${HCC_VERSION_PATCH}-${KALMAR_DRIVER_COMMIT}-${KALMAR_COMPILER_COMMIT}") +set(HCC_VERSION_STRING "${HCC_VERSION_MAJOR}.${HCC_VERSION_MINOR}.${HCC_VERSION_PATCH}-${HC_DRIVER_COMMIT}-${HC_COMPILER_COMMIT}") # show HCC version string MESSAGE("========================================") diff --git a/hc2/external/elfio/elf_types.hpp b/hc2/external/elfio/elf_types.hpp index 1b90c4c57dd..1301cf4315f 100644 --- a/hc2/external/elfio/elf_types.hpp +++ b/hc2/external/elfio/elf_types.hpp @@ -460,53 +460,84 @@ typedef uint64_t Elf64_Off; #define STN_UNDEF 0 // Relocation types -#define R_386_NONE 0 -#define R_X86_64_NONE 0 -#define R_386_32 1 -#define R_X86_64_64 1 -#define R_386_PC32 2 -#define R_X86_64_PC32 2 -#define R_386_GOT32 3 -#define R_X86_64_GOT32 3 -#define R_386_PLT32 4 -#define R_X86_64_PLT32 4 -#define R_386_COPY 5 -#define R_X86_64_COPY 5 -#define R_386_GLOB_DAT 6 -#define R_X86_64_GLOB_DAT 6 -#define R_386_JMP_SLOT 7 -#define R_X86_64_JUMP_SLOT 7 -#define R_386_RELATIVE 8 -#define R_X86_64_RELATIVE 8 -#define R_386_GOTOFF 9 -#define R_X86_64_GOTPCREL 9 -#define R_386_GOTPC 10 -#define R_X86_64_32 10 -#define R_X86_64_32S 11 -#define R_X86_64_16 12 -#define R_X86_64_PC16 13 -#define R_X86_64_8 14 -#define R_X86_64_PC8 15 -#define R_X86_64_DTPMOD64 16 -#define R_X86_64_DTPOFF64 17 -#define R_X86_64_TPOFF64 18 -#define R_X86_64_TLSGD 19 -#define R_X86_64_TLSLD 20 -#define R_X86_64_DTPOFF32 21 -#define R_X86_64_GOTTPOFF 22 -#define R_X86_64_TPOFF32 23 -#define R_X86_64_PC64 24 -#define R_X86_64_GOTOFF64 25 -#define R_X86_64_GOTPC32 26 -#define R_X86_64_GOT64 27 -#define R_X86_64_GOTPCREL64 28 -#define R_X86_64_GOTPC64 29 -#define R_X86_64_GOTPLT64 30 -#define R_X86_64_PLTOFF64 31 -#define R_X86_64_GOTPC32_TLSDESC 34 -#define R_X86_64_TLSDESC_CALL 35 -#define R_X86_64_TLSDESC 36 -#define R_X86_64_IRELATIVE 37 +#define R_386_NONE 0 +#define R_X86_64_NONE 0 +#define R_386_32 1 +#define R_X86_64_64 1 +#define R_386_PC32 2 +#define R_X86_64_PC32 2 +#define R_386_GOT32 3 +#define R_X86_64_GOT32 3 +#define R_386_PLT32 4 +#define R_X86_64_PLT32 4 +#define R_386_COPY 5 +#define R_X86_64_COPY 5 +#define R_386_GLOB_DAT 6 +#define R_X86_64_GLOB_DAT 6 +#define R_386_JMP_SLOT 7 +#define R_X86_64_JUMP_SLOT 7 +#define R_386_RELATIVE 8 +#define R_X86_64_RELATIVE 8 +#define R_386_GOTOFF 9 +#define R_X86_64_GOTPCREL 9 +#define R_386_GOTPC 10 +#define R_X86_64_32 10 +#define R_386_32PLT 11 +#define R_X86_64_32S 11 +#define R_X86_64_16 12 +#define R_X86_64_PC16 13 +#define R_386_TLS_TPOFF 14 +#define R_X86_64_8 14 +#define R_386_TLS_IE 15 +#define R_X86_64_PC8 15 +#define R_386_TLS_GOTIE 16 +#define R_X86_64_DTPMOD64 16 +#define R_386_TLS_LE 17 +#define R_X86_64_DTPOFF64 17 +#define R_386_TLS_GD 18 +#define R_X86_64_TPOFF64 18 +#define R_386_TLS_LDM 19 +#define R_X86_64_TLSGD 19 +#define R_386_16 20 +#define R_X86_64_TLSLD 20 +#define R_386_PC16 21 +#define R_X86_64_DTPOFF32 21 +#define R_386_8 22 +#define R_X86_64_GOTTPOFF 22 +#define R_386_PC8 23 +#define R_X86_64_TPOFF32 23 +#define R_386_TLS_GD_32 24 +#define R_X86_64_PC64 24 +#define R_386_TLS_GD_PUSH 25 +#define R_X86_64_GOTOFF64 25 +#define R_386_TLS_GD_CALL 26 +#define R_X86_64_GOTPC32 26 +#define R_386_TLS_GD_POP 27 +#define R_X86_64_GOT64 27 +#define R_386_TLS_LDM_32 28 +#define R_X86_64_GOTPCREL64 28 +#define R_386_TLS_LDM_PUSH 29 +#define R_X86_64_GOTPC64 29 +#define R_386_TLS_LDM_CALL 30 +#define R_X86_64_GOTPLT64 30 +#define R_386_TLS_LDM_POP 31 +#define R_X86_64_PLTOFF64 31 +#define R_386_TLS_LDO_32 32 +#define R_386_TLS_IE_32 33 +#define R_386_TLS_LE_32 34 +#define R_X86_64_GOTPC32_TLSDESC 34 +#define R_386_TLS_DTPMOD32 35 +#define R_X86_64_TLSDESC_CALL 35 +#define R_386_TLS_DTPOFF32 36 +#define R_X86_64_TLSDESC 36 +#define R_386_TLS_TPOFF32 37 +#define R_X86_64_IRELATIVE 37 +#define R_386_SIZE32 38 +#define R_386_TLS_GOTDESC 39 +#define R_386_TLS_DESC_CALL 40 +#define R_386_TLS_DESC 41 +#define R_386_IRELATIVE 42 +#define R_386_GOT32X 43 #define R_X86_64_GNU_VTINHERIT 250 #define R_X86_64_GNU_VTENTRY 251 diff --git a/hc2/external/elfio/elfio.hpp b/hc2/external/elfio/elfio.hpp index b59295b342c..508f8e77d03 100644 --- a/hc2/external/elfio/elfio.hpp +++ b/hc2/external/elfio/elfio.hpp @@ -51,19 +51,21 @@ THE SOFTWARE. TYPE \ get_##FNAME() const \ { \ - return header->get_##FNAME(); \ + return header? header->get_##FNAME() : 0; \ } #define ELFIO_HEADER_ACCESS_GET_SET( TYPE, FNAME ) \ TYPE \ get_##FNAME() const \ { \ - return header->get_##FNAME(); \ + return header? header->get_##FNAME() : 0; \ } \ void \ set_##FNAME( TYPE val ) \ -{ \ - header->set_##FNAME( val ); \ +{ \ + if (header) { \ + header->set_##FNAME( val ); \ + } \ } \ namespace ELFIO { @@ -112,11 +114,9 @@ class elfio { clean(); - unsigned char e_ident[EI_NIDENT]; - - // Read ELF file signature - stream.seekg( 0 ); - stream.read( reinterpret_cast( &e_ident ), sizeof( e_ident ) ); + unsigned char e_ident[EI_NIDENT]; + // Read ELF file signature + stream.read( reinterpret_cast( &e_ident ), sizeof( e_ident ) ); // Is it ELF file? if ( stream.gcount() != sizeof( e_ident ) || @@ -133,7 +133,6 @@ class elfio } convertor.setup( e_ident[EI_DATA] ); - header = create_header( e_ident[EI_CLASS], e_ident[EI_DATA] ); if ( 0 == header ) { return false; @@ -143,9 +142,8 @@ class elfio } load_sections( stream ); - load_segments( stream ); - - return true; + bool is_still_good = load_segments( stream ); + return is_still_good; } //------------------------------------------------------------------------------ @@ -153,12 +151,11 @@ class elfio { std::ofstream f( file_name.c_str(), std::ios::out | std::ios::binary ); - if ( !f ) { + if ( !f || !header) { return false; } bool is_still_good = true; - // Define layout specific header fields // The position of the segment table is fixed after the header. // The position of the section table is variable and needs to be fixed @@ -172,6 +169,8 @@ class elfio current_file_pos = header->get_header_size() + header->get_segment_entry_size() * header->get_segments_num(); + calc_segment_alignment(); + is_still_good = layout_segments_and_their_sections(); is_still_good = is_still_good && layout_sections_without_segments(); is_still_good = is_still_good && layout_section_table(); @@ -248,6 +247,45 @@ class elfio } } +//------------------------------------------------------------------------------ + private: + bool is_offset_in_section( Elf64_Off offset, const section* sec ) const { + return offset >= sec->get_offset() && offset < sec->get_offset()+sec->get_size(); + } + +//------------------------------------------------------------------------------ + public: + + //! returns an empty string if no problems are detected, + //! or a string containing an error message if problems are found + std::string validate() const { + + // check for overlapping sections in the file + for ( int i = 0; i < sections.size(); ++i) { + for ( int j = i+1; j < sections.size(); ++j ) { + const section* a = sections[i]; + const section* b = sections[j]; + if ( !(a->get_type() & SHT_NOBITS) + && !(b->get_type() & SHT_NOBITS) + && (a->get_size() > 0) + && (b->get_size() > 0) + && (a->get_offset() > 0) + && (b->get_offset() > 0)) { + if ( is_offset_in_section( a->get_offset(), b ) + || is_offset_in_section( a->get_offset()+a->get_size()-1, b ) + || is_offset_in_section( b->get_offset(), a ) + || is_offset_in_section( b->get_offset()+b->get_size()-1, a )) { + return "Sections " + a->get_name() + " and " + b->get_name() + " overlap in file"; + } + } + } + } + + // more checks to be added here... + + return ""; + } + //------------------------------------------------------------------------------ private: //------------------------------------------------------------------------------ @@ -382,6 +420,18 @@ class elfio return num; } +//------------------------------------------------------------------------------ + //! Checks whether the addresses of the section entirely fall within the given segment. + //! It doesn't matter if the addresses are memory addresses, or file offsets, + //! they just need to be in the same address space + bool is_sect_in_seg ( Elf64_Off sect_begin, Elf_Xword sect_size, Elf64_Off seg_begin, Elf64_Off seg_end ) { + return seg_begin <= sect_begin + && sect_begin + sect_size <= seg_end + && sect_begin < seg_end; // this is important criteria when sect_size == 0 + // Example: seg_begin=10, seg_end=12 (-> covering the bytes 10 and 11) + // sect_begin=12, sect_size=0 -> shall return false! + } + //------------------------------------------------------------------------------ bool load_segments( std::istream& stream ) { @@ -417,14 +467,11 @@ class elfio // SHF_ALLOC sections are matched based on the virtual address // otherwise the file offset is matched if( psec->get_flags() & SHF_ALLOC - ? (segVBaseAddr <= psec->get_address() - && psec->get_address() + psec->get_size() - <= segVEndAddr) - : (segBaseOffset <= psec->get_offset() - && psec->get_offset() + psec->get_size() - <= segEndOffset)) { - seg->add_section_index( psec->get_index(), - psec->get_addr_align() ); + ? is_sect_in_seg( psec->get_address(), psec->get_size(), segVBaseAddr, segVEndAddr ) + : is_sect_in_seg( psec->get_offset(), psec->get_size(), segBaseOffset, segEndOffset )) { + // Alignment of segment shall not be updated, to preserve original value + // It will be re-calculated on saving. + seg->add_section_index( psec->get_index(), 0 ); } } @@ -517,6 +564,9 @@ class elfio for( size_t i = 0; i < worklist.size(); ++i ) { if( i != nextSlot && worklist[i]->is_offset_initialized() && worklist[i]->get_offset() == 0 ) { + if (worklist[nextSlot]->get_offset() == 0) { + ++nextSlot; + } std::swap(worklist[i],worklist[nextSlot]); ++nextSlot; } @@ -570,6 +620,20 @@ class elfio } +//------------------------------------------------------------------------------ + void calc_segment_alignment( ) + { + for( std::vector::iterator s = segments_.begin(); s != segments_.end(); ++s ) { + segment* seg = *s; + for ( int i = 0; i < seg->get_sections_num(); ++i ) { + section* sect = sections_[ seg->get_section_index_at(i) ]; + if ( sect->get_addr_align() > seg->get_align() ) { + seg->set_align( sect->get_addr_align() ); + } + } + } + } + //------------------------------------------------------------------------------ bool layout_segments_and_their_sections( ) { @@ -606,11 +670,12 @@ class elfio // have to be aligned else if ( seg->get_sections_num() && !section_generated[seg->get_section_index_at( 0 )] ) { - Elf64_Off cur_page_alignment = current_file_pos % seg->get_align(); - Elf64_Off req_page_alignment = seg->get_virtual_address() % seg->get_align(); + Elf_Xword align = seg->get_align() > 0 ? seg->get_align() : 1; + Elf64_Off cur_page_alignment = current_file_pos % align; + Elf64_Off req_page_alignment = seg->get_virtual_address() % align; Elf64_Off error = req_page_alignment - cur_page_alignment; - current_file_pos += ( seg->get_align() + error ) % seg->get_align(); + current_file_pos += ( seg->get_align() + error ) % align; seg_start_pos = current_file_pos; } else if ( seg->get_sections_num() ) { @@ -633,14 +698,20 @@ class elfio // Fix up the alignment if ( !section_generated[index] && sec->is_address_initialized() && SHT_NOBITS != sec->get_type() - && SHT_NULL != sec->get_type() ) { + && SHT_NULL != sec->get_type() + && 0 != sec->get_size() ) { // Align the sections based on the virtual addresses // when possible (this is what matters for execution) Elf64_Off req_offset = sec->get_address() - seg->get_virtual_address(); Elf64_Off cur_offset = current_file_pos - seg_start_pos; + if ( req_offset < cur_offset) { + // something has gone awfully wrong, abort! + // secAlign would turn out negative, seeking backwards and overwriting previous data + return false; + } secAlign = req_offset - cur_offset; } - else if (!section_generated[index]) { + else if (!section_generated[index] && !sec->is_address_initialized() ) { // If no address has been specified then only the section // alignment constraint has to be matched Elf_Xword align = sec->get_addr_align(); @@ -650,7 +721,7 @@ class elfio Elf64_Off error = current_file_pos % align; secAlign = ( align - error ) % align; } - else { + else if (section_generated[index] ) { // Alignment for already generated sections secAlign = sec->get_offset() - seg_start_pos - segment_filesize; } @@ -685,7 +756,15 @@ class elfio } seg->set_file_size( segment_filesize ); - seg->set_memory_size( segment_memory ); + + // If we already have a memory size from loading an elf file (value > 0), + // it must not shrink! + // Memory size may be bigger than file size and it is the loader's job to do something + // with the surplus bytes in memory, like initializing them with a defined value. + if ( seg->get_memory_size() < segment_memory ) { + seg->set_memory_size( segment_memory ); + } + seg->set_offset(seg_start_pos); } @@ -775,6 +854,16 @@ class elfio return parent->sections_.end(); } +//------------------------------------------------------------------------------ + std::vector::const_iterator begin() const { + return parent->sections_.cbegin(); + } + +//------------------------------------------------------------------------------ + std::vector::const_iterator end() const { + return parent->sections_.cend(); + } + //------------------------------------------------------------------------------ private: elfio* parent; @@ -820,6 +909,16 @@ class elfio return parent->segments_.end(); } +//------------------------------------------------------------------------------ + std::vector::const_iterator begin() const { + return parent->segments_.cbegin(); + } + +//------------------------------------------------------------------------------ + std::vector::const_iterator end() const { + return parent->segments_.cend(); + } + //------------------------------------------------------------------------------ private: elfio* parent; diff --git a/hc2/external/elfio/elfio_dump.hpp b/hc2/external/elfio/elfio_dump.hpp index 04948529603..d98c1ff1881 100644 --- a/hc2/external/elfio/elfio_dump.hpp +++ b/hc2/external/elfio/elfio_dump.hpp @@ -429,18 +429,22 @@ class dump //------------------------------------------------------------------------------ static void header( std::ostream& out, const elfio& reader ) - { - out << "ELF Header" << std::endl << std::endl - << " Class: " << str_class( reader.get_class() ) << std::endl - << " Encoding: " << str_endian( reader.get_encoding() ) << std::endl - << " ELFVersion: " << str_version( reader.get_elf_version() ) << std::endl - << " Type: " << str_type( reader.get_type() ) << std::endl - << " Machine: " << str_machine( reader.get_machine() ) << std::endl - << " Version: " << str_version( reader.get_version() ) << std::endl - << " Entry: " << "0x" << std::hex << reader.get_entry() << std::endl - << " Flags: " << "0x" << std::hex << reader.get_flags() << std::endl - << std::endl; - } + { + if (!reader.get_header_size()) + { + return; + } + out << "ELF Header" << std::endl << std::endl + << " Class: " << str_class( reader.get_class() ) << std::endl + << " Encoding: " << str_endian( reader.get_encoding() ) << std::endl + << " ELFVersion: " << str_version( reader.get_elf_version() ) << std::endl + << " Type: " << str_type( reader.get_type() ) << std::endl + << " Machine: " << str_machine( reader.get_machine() ) << std::endl + << " Version: " << str_version( reader.get_version() ) << std::endl + << " Entry: " << "0x" << std::hex << reader.get_entry() << std::endl + << " Flags: " << "0x" << std::hex << reader.get_flags() << std::endl + << std::endl; + } //------------------------------------------------------------------------------ static void @@ -728,7 +732,7 @@ class dump if ( dyn_no > 0 ) { out << "Dynamic section (" << sec->get_name() << ")" << std::endl; out << "[ Nr ] Tag Name/Value" << std::endl; - for ( int i = 0; i < dyn_no; ++i ) { + for ( Elf_Xword i = 0; i < dyn_no; ++i ) { Elf_Xword tag = 0; Elf_Xword value = 0; std::string str; diff --git a/hc2/external/elfio/elfio_dynamic.hpp b/hc2/external/elfio/elfio_dynamic.hpp index 6f2d041e0fc..64f13b9ce7a 100644 --- a/hc2/external/elfio/elfio_dynamic.hpp +++ b/hc2/external/elfio/elfio_dynamic.hpp @@ -26,13 +26,14 @@ THE SOFTWARE. namespace ELFIO { //------------------------------------------------------------------------------ -class dynamic_section_accessor +template< class S > +class dynamic_section_accessor_template { public: //------------------------------------------------------------------------------ - dynamic_section_accessor( const elfio& elf_file_, section* section_ ) : - elf_file( elf_file_ ), - dynamic_section( section_ ) + dynamic_section_accessor_template( const elfio& elf_file_, S* section_ ) : + elf_file( elf_file_ ), + dynamic_section( section_ ) { } @@ -245,9 +246,12 @@ class dynamic_section_accessor //------------------------------------------------------------------------------ private: const elfio& elf_file; - section* dynamic_section; + S* dynamic_section; }; +using dynamic_section_accessor = dynamic_section_accessor_template
; +using const_dynamic_section_accessor = dynamic_section_accessor_template; + } // namespace ELFIO #endif // ELFIO_DYNAMIC_HPP diff --git a/hc2/external/elfio/elfio_header.hpp b/hc2/external/elfio/elfio_header.hpp index d689a8899f7..e8713cd7894 100644 --- a/hc2/external/elfio/elfio_header.hpp +++ b/hc2/external/elfio/elfio_header.hpp @@ -38,11 +38,11 @@ class elf_header ELFIO_GET_ACCESS_DECL( unsigned char, class ); ELFIO_GET_ACCESS_DECL( unsigned char, elf_version ); ELFIO_GET_ACCESS_DECL( unsigned char, encoding ); - ELFIO_GET_ACCESS_DECL( Elf_Word, version ); ELFIO_GET_ACCESS_DECL( Elf_Half, header_size ); ELFIO_GET_ACCESS_DECL( Elf_Half, section_entry_size ); ELFIO_GET_ACCESS_DECL( Elf_Half, segment_entry_size ); + ELFIO_GET_SET_ACCESS_DECL( Elf_Word, version ); ELFIO_GET_SET_ACCESS_DECL( unsigned char, os_abi ); ELFIO_GET_SET_ACCESS_DECL( unsigned char, abi_version ); ELFIO_GET_SET_ACCESS_DECL( Elf_Half, type ); @@ -86,8 +86,6 @@ template< class T > class elf_header_impl : public elf_header header.e_ident[EI_CLASS] = elf_header_impl_types::file_class; header.e_ident[EI_DATA] = encoding; header.e_ident[EI_VERSION] = EV_CURRENT; - header.e_version = EV_CURRENT; - header.e_version = (*convertor)( header.e_version ); header.e_ehsize = ( sizeof( header ) ); header.e_ehsize = (*convertor)( header.e_ehsize ); header.e_shstrndx = (*convertor)( (Elf_Half)1 ); @@ -95,6 +93,8 @@ template< class T > class elf_header_impl : public elf_header header.e_shentsize = sizeof( typename elf_header_impl_types::Shdr_type ); header.e_phentsize = (*convertor)( header.e_phentsize ); header.e_shentsize = (*convertor)( header.e_shentsize ); + + set_version( EV_CURRENT ); } bool @@ -119,11 +119,11 @@ template< class T > class elf_header_impl : public elf_header ELFIO_GET_ACCESS( unsigned char, class, header.e_ident[EI_CLASS] ); ELFIO_GET_ACCESS( unsigned char, elf_version, header.e_ident[EI_VERSION] ); ELFIO_GET_ACCESS( unsigned char, encoding, header.e_ident[EI_DATA] ); - ELFIO_GET_ACCESS( Elf_Word, version, header.e_version ); ELFIO_GET_ACCESS( Elf_Half, header_size, header.e_ehsize ); ELFIO_GET_ACCESS( Elf_Half, section_entry_size, header.e_shentsize ); ELFIO_GET_ACCESS( Elf_Half, segment_entry_size, header.e_phentsize ); + ELFIO_GET_SET_ACCESS( Elf_Word, version, header.e_version); ELFIO_GET_SET_ACCESS( unsigned char, os_abi, header.e_ident[EI_OSABI] ); ELFIO_GET_SET_ACCESS( unsigned char, abi_version, header.e_ident[EI_ABIVERSION] ); ELFIO_GET_SET_ACCESS( Elf_Half, type, header.e_type ); diff --git a/hc2/external/elfio/elfio_note.hpp b/hc2/external/elfio/elfio_note.hpp index 35c6fe344cc..8619c7385db 100644 --- a/hc2/external/elfio/elfio_note.hpp +++ b/hc2/external/elfio/elfio_note.hpp @@ -38,12 +38,13 @@ namespace ELFIO { //------------------------------------------------------------------------------ //------------------------------------------------------------------------------ -class note_section_accessor +template< class S > +class note_section_accessor_template { public: //------------------------------------------------------------------------------ - note_section_accessor( const elfio& elf_file_, section* section_ ) : - elf_file( elf_file_ ), note_section( section_ ) + note_section_accessor_template( const elfio& elf_file_, S* section_ ) : + elf_file( elf_file_ ), note_section( section_ ) { process_section(); } @@ -71,10 +72,10 @@ class note_section_accessor int align = sizeof( Elf_Word ); const endianess_convertor& convertor = elf_file.get_convertor(); - type = convertor( *(Elf_Word*)( pData + 2*align ) ); - Elf_Word namesz = convertor( *(Elf_Word*)( pData ) ); - descSize = convertor( *(Elf_Word*)( pData + sizeof( namesz ) ) ); - Elf_Word max_name_size = note_section->get_size() - note_start_positions[index]; + type = convertor( *(const Elf_Word*)( pData + 2*align ) ); + Elf_Word namesz = convertor( *(const Elf_Word*)( pData ) ); + descSize = convertor( *(const Elf_Word*)( pData + sizeof( namesz ) ) ); + Elf_Xword max_name_size = note_section->get_size() - note_start_positions[index]; if ( namesz > max_name_size || namesz + descSize > max_name_size ) { return false; @@ -144,9 +145,9 @@ class note_section_accessor while ( current + 3*align <= size ) { note_start_positions.push_back( current ); Elf_Word namesz = convertor( - *(Elf_Word*)( data + current ) ); + *(const Elf_Word*)( data + current ) ); Elf_Word descsz = convertor( - *(Elf_Word*)( data + current + sizeof( namesz ) ) ); + *(const Elf_Word*)( data + current + sizeof( namesz ) ) ); current += 3*sizeof( Elf_Word ) + ( ( namesz + align - 1 ) / align ) * align + @@ -157,10 +158,13 @@ class note_section_accessor //------------------------------------------------------------------------------ private: const elfio& elf_file; - section* note_section; + S* note_section; std::vector note_start_positions; }; +using note_section_accessor = note_section_accessor_template
; +using const_note_section_accessor = note_section_accessor_template; + } // namespace ELFIO #endif // ELFIO_NOTE_HPP diff --git a/hc2/external/elfio/elfio_relocation.hpp b/hc2/external/elfio/elfio_relocation.hpp index d13d8b23c7f..238598e97ba 100644 --- a/hc2/external/elfio/elfio_relocation.hpp +++ b/hc2/external/elfio/elfio_relocation.hpp @@ -73,13 +73,14 @@ template<> struct get_sym_and_type< Elf64_Rela > //------------------------------------------------------------------------------ -class relocation_section_accessor +template< class S > +class relocation_section_accessor_template { public: //------------------------------------------------------------------------------ - relocation_section_accessor( const elfio& elf_file_, section* section_ ) : - elf_file( elf_file_ ), - relocation_section( section_ ) + relocation_section_accessor_template( const elfio& elf_file_, S* section_ ) : + elf_file( elf_file_ ), + relocation_section( section_ ) { } @@ -361,9 +362,12 @@ class relocation_section_accessor //------------------------------------------------------------------------------ private: const elfio& elf_file; - section* relocation_section; + S* relocation_section; }; +using relocation_section_accessor = relocation_section_accessor_template
; +using const_relocation_section_accessor = relocation_section_accessor_template; + } // namespace ELFIO #endif // ELFIO_RELOCATION_HPP diff --git a/hc2/external/elfio/elfio_section.hpp b/hc2/external/elfio/elfio_section.hpp index b2c9b456b55..cb188c14d08 100644 --- a/hc2/external/elfio/elfio_section.hpp +++ b/hc2/external/elfio/elfio_section.hpp @@ -45,6 +45,17 @@ class section ELFIO_GET_SET_ACCESS_DECL( Elf64_Addr, address ); ELFIO_GET_SET_ACCESS_DECL( Elf_Xword, size ); ELFIO_GET_SET_ACCESS_DECL( Elf_Word, name_string_offset ); + ELFIO_GET_ACCESS_DECL ( Elf64_Off, offset ); + size_t stream_size; + size_t get_stream_size() const + { + return stream_size; + } + + void set_stream_size(size_t value) + { + stream_size = value; + } virtual const char* get_data() const = 0; virtual void set_data( const char* pData, Elf_Word size ) = 0; @@ -53,7 +64,7 @@ class section virtual void append_data( const std::string& data ) = 0; protected: - ELFIO_GET_SET_ACCESS_DECL( Elf64_Off, offset ); + ELFIO_SET_ACCESS_DECL( Elf64_Off, offset ); ELFIO_SET_ACCESS_DECL( Elf_Half, index ); virtual void load( std::istream& f, @@ -223,23 +234,29 @@ class section_impl : public section std::streampos header_offset ) { std::fill_n( reinterpret_cast( &header ), sizeof( header ), '\0' ); + + stream.seekg ( 0, stream.end ); + set_stream_size ( stream.tellg() ); + stream.seekg( header_offset ); stream.read( reinterpret_cast( &header ), sizeof( header ) ); + Elf_Xword size = get_size(); - if ( 0 == data && SHT_NULL != get_type() && SHT_NOBITS != get_type() ) { - try { - data = new char[size]; - } catch (const std::bad_alloc&) { - data = 0; - data_size = 0; - } - if ( 0 != size ) { - stream.seekg( (*convertor)( header.sh_offset ) ); - stream.read( data, size ); - data_size = size; - } - } + if ( 0 == data && SHT_NULL != get_type() && SHT_NOBITS != get_type() && size < get_stream_size()) { + try { + data = new char[size + 1]; + } catch (const std::bad_alloc&) { + data = 0; + data_size = 0; + } + if ( 0 != size ) { + stream.seekg( (*convertor)( header.sh_offset ) ); + stream.read( data, size ); + data[size] = 0; //ensure data is ended with 0 to avoid oob read + data_size = size; + } + } } //------------------------------------------------------------------------------ diff --git a/hc2/external/elfio/elfio_segment.hpp b/hc2/external/elfio/elfio_segment.hpp index 35f17e939bc..02d752a90b0 100644 --- a/hc2/external/elfio/elfio_segment.hpp +++ b/hc2/external/elfio/elfio_segment.hpp @@ -92,6 +92,21 @@ class segment_impl : public segment ELFIO_GET_SET_ACCESS( Elf_Xword, file_size, ph.p_filesz ); ELFIO_GET_SET_ACCESS( Elf_Xword, memory_size, ph.p_memsz ); ELFIO_GET_ACCESS( Elf64_Off, offset, ph.p_offset ); + size_t stream_size; + +//------------------------------------------------------------------------------ + size_t + get_stream_size() const + { + return stream_size; + } + +//------------------------------------------------------------------------------ + void + set_stream_size(size_t value) + { + stream_size = value; + } //------------------------------------------------------------------------------ Elf_Half @@ -176,6 +191,10 @@ class segment_impl : public segment load( std::istream& stream, std::streampos header_offset ) { + + stream.seekg ( 0, stream.end ); + set_stream_size ( stream.tellg() ); + stream.seekg( header_offset ); stream.read( reinterpret_cast( &ph ), sizeof( ph ) ); is_offset_set = true; @@ -183,14 +202,19 @@ class segment_impl : public segment if ( PT_NULL != get_type() && 0 != get_file_size() ) { stream.seekg( (*convertor)( ph.p_offset ) ); Elf_Xword size = get_file_size(); - try { - data = new char[size]; - } catch (const std::bad_alloc&) { - data = 0; - } - if ( 0 != data ) { - stream.read( data, size ); - } + if ( size > get_stream_size() ) { + data = 0; + } else { + try { + data = new char[size + 1]; + } catch (const std::bad_alloc&) { + data = 0; + } + if ( 0 != data ) { + stream.read( data, size ); + data[size] = 0; + } + } } } diff --git a/hc2/external/elfio/elfio_strings.hpp b/hc2/external/elfio/elfio_strings.hpp index df952a2145d..552f000294f 100644 --- a/hc2/external/elfio/elfio_strings.hpp +++ b/hc2/external/elfio/elfio_strings.hpp @@ -30,12 +30,13 @@ THE SOFTWARE. namespace ELFIO { //------------------------------------------------------------------------------ -class string_section_accessor +template< class S > +class string_section_accessor_template { public: //------------------------------------------------------------------------------ - string_section_accessor( section* section_ ) : - string_section( section_ ) + string_section_accessor_template( S* section_ ) : + string_section( section_ ) { } @@ -88,9 +89,12 @@ class string_section_accessor //------------------------------------------------------------------------------ private: - section* string_section; + S* string_section; }; +using string_section_accessor = string_section_accessor_template
; +using const_string_section_accessor = string_section_accessor_template; + } // namespace ELFIO #endif // ELFIO_STRINGS_HPP diff --git a/hc2/external/elfio/elfio_symbols.hpp b/hc2/external/elfio/elfio_symbols.hpp index 80e498d8d59..d18756a9af9 100644 --- a/hc2/external/elfio/elfio_symbols.hpp +++ b/hc2/external/elfio/elfio_symbols.hpp @@ -26,13 +26,14 @@ THE SOFTWARE. namespace ELFIO { //------------------------------------------------------------------------------ -class symbol_section_accessor +template< class S > +class symbol_section_accessor_template { public: //------------------------------------------------------------------------------ - symbol_section_accessor( const elfio& elf_file_, section* symbol_section_ ) : - elf_file( elf_file_ ), - symbol_section( symbol_section_ ) + symbol_section_accessor_template( const elfio& elf_file_, S* symbol_section_ ) : + elf_file( elf_file_ ), + symbol_section( symbol_section_ ) { find_hash_section(); } @@ -87,17 +88,17 @@ class symbol_section_accessor bool ret = false; if ( 0 != get_hash_table_index() ) { - Elf_Word nbucket = *(Elf_Word*)hash_section->get_data(); - Elf_Word nchain = *(Elf_Word*)( hash_section->get_data() + + Elf_Word nbucket = *(const Elf_Word*)hash_section->get_data(); + Elf_Word nchain = *(const Elf_Word*)( hash_section->get_data() + sizeof( Elf_Word ) ); Elf_Word val = elf_hash( (const unsigned char*)name.c_str() ); - Elf_Word y = *(Elf_Word*)( hash_section->get_data() + + Elf_Word y = *(const Elf_Word*)( hash_section->get_data() + ( 2 + val % nbucket ) * sizeof( Elf_Word ) ); std::string str; get_symbol( y, str, value, size, bind, type, section_index, other ); while ( str != name && STN_UNDEF != y && y < nchain ) { - y = *(Elf_Word*)( hash_section->get_data() + + y = *(const Elf_Word*)( hash_section->get_data() + ( 2 + nbucket + y ) * sizeof( Elf_Word ) ); get_symbol( y, str, value, size, bind, type, section_index, other ); } @@ -268,11 +269,14 @@ class symbol_section_accessor //------------------------------------------------------------------------------ private: const elfio& elf_file; - section* symbol_section; + S* symbol_section; Elf_Half hash_section_index; const section* hash_section; }; +using symbol_section_accessor = symbol_section_accessor_template
; +using const_symbol_section_accessor = symbol_section_accessor_template; + } // namespace ELFIO #endif // ELFIO_SYMBOLS_HPP diff --git a/hc2/external/elfio/elfio_utils.hpp b/hc2/external/elfio/elfio_utils.hpp index f8423bd1475..2baf5a77ccb 100644 --- a/hc2/external/elfio/elfio_utils.hpp +++ b/hc2/external/elfio/elfio_utils.hpp @@ -174,7 +174,7 @@ class endianess_convertor { get_host_encoding() const { static const int tmp = 1; - if ( 1 == *(char*)&tmp ) { + if ( 1 == *(const char*)&tmp ) { return ELFDATA2LSB; } else { diff --git a/hc2/headers/types/program_state.hpp b/hc2/headers/types/program_state.hpp index 6ea79b2c20d..818db1b1f3c 100644 --- a/hc2/headers/types/program_state.hpp +++ b/hc2/headers/types/program_state.hpp @@ -15,7 +15,7 @@ #include -#include "../../external/elfio/elfio.hpp" +#include #include @@ -127,18 +127,20 @@ namespace hc2 static int copy_kernel_sections_(dl_phdr_info* x, size_t, void* kernels) { - static constexpr const char kernel[] = ".kernel"; - auto out = static_cast(kernels); ELFIO::elfio tmp; - if (tmp.load(x->dlpi_name)) { - for (auto&& y : tmp.sections) { - if (y->get_name() == kernel) { - out->emplace_back( - y->get_data(), y->get_data() + y->get_size()); - } - } + + if (!tmp.load(x->dlpi_name)) return 0; + + for (auto&& y : tmp.sections) { + static constexpr const char kernel[] = ".kernel"; + + if (y->get_name() != kernel) continue; + + out->emplace_back(y->get_data(), y->get_data() + y->get_size()); + + return 0; } return 0; diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt index 933af298f33..ca8a71e19d8 100644 --- a/include/CMakeLists.txt +++ b/include/CMakeLists.txt @@ -1,42 +1,47 @@ +# Handle HC Legacy, for HIP compatibility (forever?) # Put all hcc headers into the hcc-headers target # .h and .hpp headers -FILE(GLOB H_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.h*) -# .inl headers -FILE(GLOB INL_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.inl) -# N4494 headers -FILE(GLOB N4494 ${CMAKE_CURRENT_SOURCE_DIR}/coordinate - ${CMAKE_CURRENT_SOURCE_DIR}/array_view) - - -set(HCC_HEADERS) -#Obtain the names of each Header File -foreach(InFName ${H_HEADERS} ${INL_HEADERS} ${N4494}) - STRING(REGEX REPLACE ${CMAKE_CURRENT_SOURCE_DIR}/ "" OutFName ${InFName}) - set(HCC_HEADERS ${HCC_HEADERS} "${OutFName}") -endforeach(InFName) - +set(HCC_headers + array_view + coordinate + hc_am_internal.hpp + hc_am.hpp + hc_defines.h + hc_math.hpp + hc_printf.hpp + hc_rt_debug.h + hc.hpp + hcc_features.hpp + kalmar_aligned_alloc.h + kalmar_exception.h + kalmar_runtime.h) # Set location for output directory set(output_dir "${PROJECT_BINARY_DIR}/include") set(out_files) -foreach( f ${HCC_HEADERS} ) - set( src ${CMAKE_CURRENT_SOURCE_DIR}/${f} ) - set( dst ${output_dir}/${f} ) - add_custom_command(OUTPUT ${dst} - DEPENDS ${src} - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${src} ${dst} - COMMENT "Copying HCC's ${f}...") - list(APPEND out_files ${dst}) -endforeach( f ) +foreach(f ${HCC_headers}) + set(src ${CMAKE_CURRENT_SOURCE_DIR}/${f}) + set(dst ${output_dir}/${f}) + add_custom_command( + OUTPUT ${dst} + DEPENDS ${src} + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${src} ${dst} + COMMENT "Copying HCC's ${f}...") + list(APPEND out_files ${dst}) +endforeach() # Create target for hcc-headers and set dependencies add_custom_target(hcc-headers ALL DEPENDS ${out_files}) add_dependencies(world hcc-headers) # Install command for headers -install(FILES ${HCC_HEADERS} - PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ - DESTINATION include) +install( + FILES ${HCC_headers} + PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ + DESTINATION include) + +# Handle pSTL +add_subdirectory(experimental) -# PSTL headers -ADD_SUBDIRECTORY(experimental) +# Handle HC Next (post ROCm 2.0) +add_subdirectory(hc) \ No newline at end of file diff --git a/include/coordinate b/include/coordinate index 959e6d624e0..6b326682424 100644 --- a/include/coordinate +++ b/include/coordinate @@ -59,34 +59,34 @@ class __coordinate_leaf { ptrdiff_t __idx; int dummy; public: - explicit __coordinate_leaf(ptrdiff_t __t) restrict(amp,cpu) : __idx(__t) {} + explicit __coordinate_leaf(ptrdiff_t __t) [[cpu, hc]] : __idx(__t) {} - __coordinate_leaf& operator=(const ptrdiff_t __t) restrict(amp,cpu) { + __coordinate_leaf& operator=(const ptrdiff_t __t) [[cpu, hc]] { __idx = __t; return *this; } - __coordinate_leaf& operator+=(const ptrdiff_t __t) restrict(amp,cpu) { + __coordinate_leaf& operator+=(const ptrdiff_t __t) [[cpu, hc]] { __idx += __t; return *this; } - __coordinate_leaf& operator-=(const ptrdiff_t __t) restrict(amp,cpu) { + __coordinate_leaf& operator-=(const ptrdiff_t __t) [[cpu, hc]] { __idx -= __t; return *this; } - __coordinate_leaf& operator*=(const ptrdiff_t __t) restrict(amp,cpu) { + __coordinate_leaf& operator*=(const ptrdiff_t __t) [[cpu, hc]] { __idx *= __t; return *this; } - __coordinate_leaf& operator/=(const ptrdiff_t __t) restrict(amp,cpu) { + __coordinate_leaf& operator/=(const ptrdiff_t __t) [[cpu, hc]] { __idx /= __t; return *this; } - ptrdiff_t& get() restrict(amp,cpu) { return __idx; } - const ptrdiff_t& get() const restrict(amp,cpu) { return __idx; } + ptrdiff_t& get() [[cpu, hc]] { return __idx; } + const ptrdiff_t& get() const [[cpu, hc]] { return __idx; } }; template -inline void __std_swallow(_Tp&&...) /*noexcept*/ restrict(amp,cpu) {} +inline void __std_swallow(_Tp&&...) /*noexcept*/ [[cpu, hc]] {} inline const ptrdiff_t coordinate_mul() { @@ -111,61 +111,61 @@ struct __coordinate_impl<__std_indices> { private: template - explicit __coordinate_impl(_Up... __u) restrict(amp,cpu) + explicit __coordinate_impl(_Up... __u) [[cpu, hc]] : __coordinate_leaf(__u)... {} public: - __coordinate_impl() restrict(amp,cpu) + __coordinate_impl() [[cpu, hc]] : __coordinate_leaf(0)... {} - __coordinate_impl(initializer_list il) restrict(amp,cpu) : + __coordinate_impl(initializer_list il) [[cpu, hc]] : __coordinate_leaf(*(il.begin() + N))... {} - __coordinate_impl(const __coordinate_impl& other) restrict(amp,cpu) + __coordinate_impl(const __coordinate_impl& other) [[cpu, hc]] : __coordinate_impl(static_cast&>(other).get()...) {} - __coordinate_impl(ptrdiff_t component) restrict(amp,cpu) + __coordinate_impl(ptrdiff_t component) [[cpu, hc]] : __coordinate_leaf(component)... {} - const ptrdiff_t& operator[] (size_t c) const restrict(amp,cpu) { + const ptrdiff_t& operator[] (size_t c) const [[cpu, hc]] { return static_cast&>(*((const __coordinate_leaf<0> *)this + c)).get(); } - ptrdiff_t& operator[] (size_t c) restrict(amp,cpu) { + ptrdiff_t& operator[] (size_t c) [[cpu, hc]] { return static_cast<__coordinate_leaf<0>&>(*((__coordinate_leaf<0> *)this + c)).get(); } - __coordinate_impl& operator=(const __coordinate_impl& __t) restrict(amp,cpu) { + __coordinate_impl& operator=(const __coordinate_impl& __t) [[cpu, hc]] { __std_swallow(__coordinate_leaf::operator=(static_cast&>(__t).get())...); return *this; } - __coordinate_impl& operator+=(const __coordinate_impl& __t) restrict(amp,cpu) { + __coordinate_impl& operator+=(const __coordinate_impl& __t) [[cpu, hc]] { __std_swallow(__coordinate_leaf::operator+=(static_cast&>(__t).get())...); return *this; } - __coordinate_impl& operator-=(const __coordinate_impl& __t) restrict(amp,cpu) { + __coordinate_impl& operator-=(const __coordinate_impl& __t) [[cpu, hc]] { __std_swallow(__coordinate_leaf::operator-=(static_cast&>(__t).get())...); return *this; } - __coordinate_impl& operator*=(const __coordinate_impl& __t) restrict(amp,cpu) { + __coordinate_impl& operator*=(const __coordinate_impl& __t) [[cpu, hc]] { __std_swallow(__coordinate_leaf::operator*=(static_cast&>(__t).get())...); return *this; } - __coordinate_impl& operator/=(const __coordinate_impl& __t) restrict(amp,cpu) { + __coordinate_impl& operator/=(const __coordinate_impl& __t) [[cpu, hc]] { __std_swallow(__coordinate_leaf::operator/=(static_cast&>(__t).get())...); return *this; } - __coordinate_impl& operator+=(const ptrdiff_t __t) restrict(amp,cpu) { + __coordinate_impl& operator+=(const ptrdiff_t __t) [[cpu, hc]] { __std_swallow(__coordinate_leaf::operator+=(__t)...); return *this; } - __coordinate_impl& operator-=(const ptrdiff_t __t) restrict(amp,cpu) { + __coordinate_impl& operator-=(const ptrdiff_t __t) [[cpu, hc]] { __std_swallow(__coordinate_leaf::operator-=(__t)...); return *this; } - __coordinate_impl& operator*=(const ptrdiff_t __t) restrict(amp,cpu) { + __coordinate_impl& operator*=(const ptrdiff_t __t) [[cpu, hc]] { __std_swallow(__coordinate_leaf::operator*=(__t)...); return *this; } - __coordinate_impl& operator/=(const ptrdiff_t __t) restrict(amp,cpu) { + __coordinate_impl& operator/=(const ptrdiff_t __t) [[cpu, hc]] { __std_swallow(__coordinate_leaf::operator/=(__t)...); return *this; } @@ -186,14 +186,14 @@ public: } }; -extern "C" __attribute__((const)) uint32_t amp_get_global_id(unsigned int n) restrict(amp); +extern "C" __attribute__((const)) uint32_t amp_get_global_id(unsigned int n) [[hc]]; template class offset; template struct offset_helper { - static inline void set(_Tp& now) restrict(amp,cpu) { + static inline void set(_Tp& now) [[cpu, hc]] { now[N - 1] = static_cast(amp_get_global_id(_Tp::rank - N)); offset_helper::set(now); } @@ -201,7 +201,7 @@ struct offset_helper template struct offset_helper<1, _Tp> { - static inline void set(_Tp& now) restrict(amp,cpu) { + static inline void set(_Tp& now) [[cpu, hc]] { now[0] = static_cast(amp_get_global_id(_Tp::rank - 1)); } }; @@ -216,75 +216,75 @@ public: using size_type = size_t; using value_type = ptrdiff_t; - offset() /*noexcept*/ restrict(amp,cpu) : base_() {} + offset() /*noexcept*/ [[cpu, hc]] : base_() {} template ::type> - offset(value_type v) /*noexcept*/ restrict(amp,cpu) : base_(v) {} + offset(value_type v) /*noexcept*/ [[cpu, hc]] : base_(v) {} - offset(initializer_list il) restrict(amp,cpu) : base_(il) + offset(initializer_list il) [[cpu, hc]] : base_(il) { #if __KALMAR_ACCELERATOR__ != 1 assert(il.size() == N); #endif }; - reference operator[](size_type n) restrict(amp,cpu) { + reference operator[](size_type n) [[cpu, hc]] { #if __KALMAR_ACCELERATOR__ != 1 assert(n < N); #endif return base_[n]; } - const_reference operator[](size_type n) const restrict(amp,cpu) { + const_reference operator[](size_type n) const [[cpu, hc]] { #if __KALMAR_ACCELERATOR__ != 1 assert(n < N); #endif return base_[n]; } - offset& operator+=(const offset& rhs) restrict(amp,cpu) { + offset& operator+=(const offset& rhs) [[cpu, hc]] { base_ += rhs.base_; return *this; } - offset& operator-=(const offset& rhs) restrict(amp,cpu) { + offset& operator-=(const offset& rhs) [[cpu, hc]] { base_ -= rhs.base_; return *this; } template ::type> - offset& operator++() restrict(amp,cpu) { + offset& operator++() [[cpu, hc]] { base_ += 1; return *this; } template ::type> - offset operator++(int) restrict(amp,cpu) { + offset operator++(int) [[cpu, hc]] { offset ret = *this; base_ += 1; return ret; }; template ::type> - offset& operator--() restrict(amp,cpu) { + offset& operator--() [[cpu, hc]] { base_ -= 1; return *this; } template ::type> - offset operator--(int) restrict(amp,cpu) { + offset operator--(int) [[cpu, hc]] { offset ret = *this; base_ -= 1; return ret; } - offset operator+() const /*noexcept*/ restrict(amp,cpu) { return *this; } - offset operator-() const restrict(amp,cpu) { + offset operator+() const /*noexcept*/ [[cpu, hc]] { return *this; } + offset operator-() const [[cpu, hc]] { offset __r; __r -= *this; return __r; } - offset& operator*=(value_type v) restrict(amp,cpu) { + offset& operator*=(value_type v) [[cpu, hc]] { base_ *= v; return *this; } - offset& operator/=(value_type v) restrict(amp,cpu) { + offset& operator/=(value_type v) [[cpu, hc]] { base_ /= v; return *this; } @@ -300,7 +300,7 @@ private: public: __attribute__((annotate("__cxxamp_opencl_index"))) - void __cxxamp_opencl_index() restrict(amp,cpu) + void __cxxamp_opencl_index() [[cpu, hc]] #if __KALMAR_ACCELERATOR__ == 1 { offset_helper>::set(*this); @@ -394,7 +394,7 @@ class bounds_iterator : public std::iterator friend class bounds; ptrdiff_t stride; bounds bnd_; // exposition only - explicit bounds_iterator(const bounds& bnd_, ptrdiff_t stride_ = 0) restrict(amp,cpu) + explicit bounds_iterator(const bounds& bnd_, ptrdiff_t stride_ = 0) [[cpu, hc]] : bnd_(bnd_), stride(stride_) {} public: using value_type = offset; @@ -523,16 +523,16 @@ public: using size_type = size_t; using value_type = ptrdiff_t; - bounds() restrict(amp,cpu) : base_() {} + bounds() [[cpu, hc]] : base_() {} template ::type> - bounds(value_type v) restrict(amp,cpu) : base_(v) { + bounds(value_type v) [[cpu, hc]] : base_(v) { #if __KALMAR_ACCELERATOR__ != 1 assert(v >= 0 && v <= numeric_limits::max()); #endif } - bounds(initializer_list il) restrict(amp,cpu) : base_(il) { + bounds(initializer_list il) [[cpu, hc]] : base_(il) { #if __KALMAR_ACCELERATOR__ != 1 assert(il.size() == N); #endif diff --git a/include/experimental/algorithm b/include/experimental/algorithm index 408bcd8047f..eb7a8f35015 100644 --- a/include/experimental/algorithm +++ b/include/experimental/algorithm @@ -19,7 +19,7 @@ */ #pragma once -#include "../hc.hpp" +#include #include "execution_policy" diff --git a/include/experimental/impl/algorithm_impl.inl b/include/experimental/impl/algorithm_impl.inl index b2f9a5867b1..a261653b647 100644 --- a/include/experimental/impl/algorithm_impl.inl +++ b/include/experimental/impl/algorithm_impl.inl @@ -46,12 +46,12 @@ void generate_impl(ForwardIterator first, ForwardIterator last, } // FIXME: [[hc]] will cause g() having ambient context, - // use restrict(amp) temporarily + // use [[hc]] temporarily using _Ty = typename std::iterator_traits::value_type; auto first_ = utils::get_pointer(first); hc::array_view<_Ty> av(hc::extent<1>(N), first_); av.discard_data(); - kernel_launch(N, [av, g](hc::index<1> idx) restrict(amp) { + kernel_launch(N, [av, g](hc::index<1> idx) [[hc]] { av(idx) = g(); }); } diff --git a/include/grid_launch.h b/include/grid_launch.h deleted file mode 100644 index f91d23341a3..00000000000 --- a/include/grid_launch.h +++ /dev/null @@ -1,69 +0,0 @@ -#pragma once - -#include - -#include - -#define GRID_LAUNCH_VERSION 20 - -// Extern definitions -namespace hc{ -class completion_future; -class accelerator_view; -} - - -// 3 dim structure for groups and grids. -typedef struct gl_dim3 -{ - int x,y,z; - gl_dim3(uint32_t _x=1, uint32_t _y=1, uint32_t _z=1) : x(_x), y(_y), z(_z) {}; -} gl_dim3; - -typedef enum gl_barrier_bit { - barrier_bit_queue_default, - barrier_bit_none, - barrier_bit_wait, -} gl_barrier_bit; - - -// grid_launch_parm contains information used to launch the kernel. -typedef struct grid_launch_parm -{ - //! Grid dimensions - gl_dim3 grid_dim; - - //! Group dimensions - gl_dim3 group_dim;; - - //! Amount of dynamic group memory to use with the kernel launch. - //! This memory is in addition to the amount used statically in the kernel. - unsigned int dynamic_group_mem_bytes;; - - //! Control setting of barrier bit on per-packet basis: - //! See gl_barrier_bit description. - //! Placeholder, is not used to control packet dispatch yet - enum gl_barrier_bit barrier_bit; - - //! Value of packet fences to apply to launch. - //! The correspond to the value of bits 9:14 in the AQL packet, - //! see HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE and hsa_fence_scope_t. - //! Set to -1 for conservative defaults. - //! Placeholder, is not used to control packet dispatch yet - unsigned int launch_fence; - - //! Pointer to the accelerator_view where the kernel should execute. - //! If NULL, the default view on the default accelerator is used. - hc::accelerator_view *av; - - //! Pointe to the completion_future used to track the status of the command. - //! If NULL, the command does not write status. In this case, - //! synchronization can be enforced with queue-level waits or - //! waiting on younger commands. - hc::completion_future *cf; - - grid_launch_parm() = default; -} grid_launch_parm; - - -extern void init_grid_launch(grid_launch_parm *gl); diff --git a/include/grid_launch.hpp b/include/grid_launch.hpp deleted file mode 100644 index 04ce7e03664..00000000000 --- a/include/grid_launch.hpp +++ /dev/null @@ -1,50 +0,0 @@ -#pragma once - -#include "grid_launch.h" -#include "hc.hpp" - -class grid_launch_parm_cxx : public grid_launch_parm -{ -public: - grid_launch_parm_cxx() = default; - - // customized serialization: don't need av and cf in kernel - __attribute__((annotate("serialize"))) - void __cxxamp_serialize(Kalmar::Serialize& s) const { - s.Append(sizeof(int), &grid_dim.x); - s.Append(sizeof(int), &grid_dim.y); - s.Append(sizeof(int), &grid_dim.z); - s.Append(sizeof(int), &group_dim.x); - s.Append(sizeof(int), &group_dim.y); - s.Append(sizeof(int), &group_dim.z); - } - - __attribute__((annotate("user_deserialize"))) - grid_launch_parm_cxx(int grid_dim_x, int grid_dim_y, int grid_dim_z, - int group_dim_x, int group_dim_y, int group_dim_z) { - grid_dim.x = grid_dim_x; - grid_dim.y = grid_dim_y; - grid_dim.z = grid_dim_z; - group_dim.x = group_dim_x; - group_dim.y = group_dim_y; - group_dim.z = group_dim_z; - } -}; - - -extern inline void grid_launch_init(grid_launch_parm *lp) { - lp->grid_dim.x = lp->grid_dim.y = lp->grid_dim.z = 1; - - lp->group_dim.x = lp->group_dim.y = lp->group_dim.z = 1; - - lp->dynamic_group_mem_bytes = 0; - - lp->barrier_bit = barrier_bit_queue_default; - lp->launch_fence = -1; - - // TODO - set to NULL? - static hc::accelerator_view av = hc::accelerator().get_default_view(); - lp->av = &av; - lp->cf = NULL; -} - diff --git a/include/hc.hpp b/include/hc.hpp index 5817321b902..b456e633b41 100644 --- a/include/hc.hpp +++ b/include/hc.hpp @@ -12,17 +12,14 @@ #pragma once +#if !defined(__HIPCC__) + #warning "This header is only intended for HIP usage, and not for direct inclusion." +#endif + #include "hc_defines.h" #include "kalmar_exception.h" -#include "kalmar_index.h" #include "kalmar_runtime.h" -#include "kalmar_serialize.h" -#include "kalmar_launch.h" -#include "kalmar_buffer.h" -#include "kalmar_math.h" -#include "hsa_atomic.h" -#include "kalmar_cpu_launch.h" #include "hcc_features.hpp" #ifndef __HC__ @@ -50,32 +47,10 @@ class AmPointerInfo; using namespace Kalmar::enums; using namespace Kalmar::CLAMP; - // forward declaration class accelerator; class accelerator_view; class completion_future; -template class extent; -template class tiled_extent; -template class array_view; -template class array; - - - -// namespace alias -// namespace hc::fast_math is an alias of namespace Kalmar::fast_math -namespace fast_math = Kalmar::fast_math; - -// namespace hc::precise_math is an alias of namespace Kalmar::precise_math -namespace precise_math = Kalmar::precise_math; - -// type alias - -/** - * Represents a unique position in N-dimensional space. - */ -template -using index = Kalmar::index; using runtime_exception = Kalmar::runtime_exception; using invalid_compute_domain = Kalmar::invalid_compute_domain; @@ -661,64 +636,6 @@ class accelerator_view { std::shared_ptr pQueue; friend class accelerator; - template friend class array; - template friend class array_view; - - template friend - void* Kalmar::mcw_cxxamp_get_kernel(const std::shared_ptr&, const Kernel&); - template friend - void Kalmar::mcw_cxxamp_execute_kernel_with_dynamic_group_memory(const std::shared_ptr&, size_t *, size_t *, const Kernel&, void*, size_t); - template friend - std::shared_ptr Kalmar::mcw_cxxamp_execute_kernel_with_dynamic_group_memory_async(const std::shared_ptr&, size_t *, size_t *, const Kernel&, void*, size_t); - template friend - void Kalmar::mcw_cxxamp_launch_kernel(const std::shared_ptr&, size_t *, size_t *, const Kernel&); - template friend - std::shared_ptr Kalmar::mcw_cxxamp_launch_kernel_async(const std::shared_ptr&, size_t *, size_t *, const Kernel&); - -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - template friend - completion_future launch_cpu_task_async(const std::shared_ptr&, Kernel const&, extent const&); -#endif - - // non-tiled parallel_for_each - // generic version - template friend - completion_future parallel_for_each(const accelerator_view&, const extent&, const Kernel&); - - // 1D specialization - template friend - completion_future parallel_for_each(const accelerator_view&, const extent<1>&, const Kernel&); - - // 2D specialization - template friend - completion_future parallel_for_each(const accelerator_view&, const extent<2>&, const Kernel&); - - // 3D specialization - template friend - completion_future parallel_for_each(const accelerator_view&, const extent<3>&, const Kernel&); - - // tiled parallel_for_each, 3D version - template friend - completion_future parallel_for_each(const accelerator_view&, const tiled_extent<3>&, const Kernel&); - - // tiled parallel_for_each, 2D version - template friend - completion_future parallel_for_each(const accelerator_view&, const tiled_extent<2>&, const Kernel&); - - // tiled parallel_for_each, 1D version - template friend - completion_future parallel_for_each(const accelerator_view&, const tiled_extent<1>&, const Kernel&); - - -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 -public: -#endif - __attribute__((annotate("user_deserialize"))) - accelerator_view() __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 - throw runtime_exception("errorMsg_throw", 0); -#endif - } }; // ------------------------------------------------------------------------ @@ -1310,7 +1227,7 @@ class completion_future { // void then(const functor& func) const; template void then(const functor & func) { -#if __KALMAR_ACCELERATOR__ != 1 +#if __HCC_ACCELERATOR__ != 1 // could only assign once if (__thread_then == nullptr) { // spawn a new thread to wait on the future and then execute the callback functor @@ -1424,63 +1341,6 @@ class completion_future { : __amp_future(__future), __thread_then(nullptr), __asyncOp(nullptr) {} friend class Kalmar::HSAQueue; - - // non-tiled parallel_for_each - // generic version - template friend - completion_future parallel_for_each(const accelerator_view&, const extent&, const Kernel&); - - // 1D specialization - template friend - completion_future parallel_for_each(const accelerator_view&, const extent<1>&, const Kernel&); - - // 2D specialization - template friend - completion_future parallel_for_each(const accelerator_view&, const extent<2>&, const Kernel&); - - // 3D specialization - template friend - completion_future parallel_for_each(const accelerator_view&, const extent<3>&, const Kernel&); - - // tiled parallel_for_each, 3D version - template friend - completion_future parallel_for_each(const accelerator_view&, const tiled_extent<3>&, const Kernel&); - - // tiled parallel_for_each, 2D version - template friend - completion_future parallel_for_each(const accelerator_view&, const tiled_extent<2>&, const Kernel&); - - // tiled parallel_for_each, 1D version - template friend - completion_future parallel_for_each(const accelerator_view&, const tiled_extent<1>&, const Kernel&); - - // copy_async - template friend - completion_future copy_async(const array_view& src, const array_view& dest); - template friend - completion_future copy_async(const array& src, array& dest); - template friend - completion_future copy_async(const array& src, const array_view& dest); - template friend - completion_future copy_async(const array_view& src, const array_view& dest); - template friend - completion_future copy_async(const array_view& src, array& dest); - - template friend - completion_future copy_async(InputIter srcBegin, InputIter srcEnd, array& dest); - template friend - completion_future copy_async(InputIter srcBegin, InputIter srcEnd, const array_view& dest); - template friend - completion_future copy_async(InputIter srcBegin, array& dest); - template friend - completion_future copy_async(InputIter srcBegin, const array_view& dest); - template friend - completion_future copy_async(const array& src, OutputIter destBegin); - template friend - completion_future copy_async(const array_view& src, OutputIter destBegin); - - // array_view - template friend class array_view; // accelerator_view friend class accelerator_view; @@ -1609,1291 +1469,482 @@ accelerator_view::copy2d_async_ext(const void *src, void *dst, size_t width, siz }; // ------------------------------------------------------------------------ -// extent +// Intrinsic functions for HSAIL instructions // ------------------------------------------------------------------------ /** - * Represents a unique position in N-dimensional space. + * Fetch the size of a wavefront * - * @tparam N The dimension to this extent applies. Special constructors are - * supplied for the cases where @f$N \in \{ 1,2,3 \}@f$, but N can - * be any integer greater than or equal to 1. + * @return The size of a wavefront. */ -template -class extent { -public: - /** - * A static member of extent that contains the rank of this extent. - */ - static const int rank = N; - - /** - * The element type of extent. - */ - typedef int value_type; - - /** - * Default constructor. The value at each dimension is initialized to zero. - * Thus, "extent<3> ix;" initializes the variable to the position (0,0,0). - */ - extent() __CPU__ __HC__ : base_() { - static_assert(N > 0, "Dimensionality must be positive"); - }; - - /** - * Copy constructor. Constructs a new extent from the supplied argument. - * - * @param other An object of type extent from which to initialize this - * new extent. - */ - extent(const extent& other) __CPU__ __HC__ - : base_(other.base_) {} - - /** @{ */ - /** - * Constructs an extent with the coordinate values provided by @f$e_{0..2}@f$. - * These are specialized constructors that are only valid when the rank of - * the extent @f$N \in \{1,2,3\}@f$. Invoking a specialized constructor - * whose argument @f$count \ne N@f$ will result in a compilation error. - * - * @param[in] e0 The component values of the extent vector. - */ - explicit extent(int e0) __CPU__ __HC__ - : base_(e0) {} - - template - explicit extent(_Tp ... __t) __CPU__ __HC__ - : base_(__t...) { - static_assert(sizeof...(__t) <= 3, "Can only supply at most 3 individual coordinates in the constructor"); - static_assert(sizeof...(__t) == N, "rank should be consistency"); - } - - /** @} */ - - /** - * Constructs an extent with the coordinate values provided the array of - * int component values. If the coordinate array length @f$\ne@f$ N, the - * behavior is undefined. If the array value is NULL or not a valid - * pointer, the behavior is undefined. - * - * @param[in] components An array of N int values. - */ - explicit extent(const int components[]) __CPU__ __HC__ - : base_(components) {} - - /** - * Constructs an extent with the coordinate values provided the array of - * int component values. If the coordinate array length @f$\ne@f$ N, the - * behavior is undefined. If the array value is NULL or not a valid - * pointer, the behavior is undefined. - * - * @param[in] components An array of N int values. - */ - explicit extent(int components[]) __CPU__ __HC__ - : base_(components) {} - - /** - * Assigns the component values of "other" to this extent object. - * - * @param[in] other An object of type extent from which to copy into - * this extent. - * @return Returns *this. - */ - extent& operator=(const extent& other) __CPU__ __HC__ { - base_.operator=(other.base_); - return *this; - } - - /** @{ */ - /** - * Returns the extent component value at position c. - * - * @param[in] c The dimension axis whose coordinate is to be accessed. - * @return A the component value at position c. - */ - int operator[] (unsigned int c) const __CPU__ __HC__ { - return base_[c]; - } - int& operator[] (unsigned int c) __CPU__ __HC__ { - return base_[c]; - } - - /** @} */ +#define __HSA_WAVEFRONT_SIZE__ (64) +extern "C" unsigned int __wavesize() __HC__; - /** - * Tests whether the index "idx" is properly contained within this extent - * (with an assumed origin of zero). - * - * @param[in] idx An object of type index - * @return Returns true if the "idx" is contained within the space defined - * by this extent (with an assumed origin of zero). - */ - bool contains(const index& idx) const __CPU__ __HC__ { - return Kalmar::amp_helper, extent>::contains(idx, *this); - } - /** - * This member function returns the total linear size of this extent (in - * units of elements), which is computed as: - * extent[0] * extent[1] ... * extent[N-1] - */ - unsigned int size() const __CPU__ __HC__ { - return Kalmar::index_helper>::count_size(*this); - } +#if __hcc_backend__==HCC_BACKEND_AMDGPU +extern "C" inline unsigned int __wavesize() __HC__ { + return __HSA_WAVEFRONT_SIZE__; +} +#endif - /** @{ */ - /** - * Produces a tiled_extent object with the tile extents given by t0, t1, - * and t2. - * - * tile(t0, t1, t2) is only supported on extent<1>. It will produce a - * compile-time error if used on an extent where N @f$\ne@f$ 3. - * tile(t0, t1) is only supported on extent<2>. It will produce a - * compile-time error if used on an extent where N @f$\ne@f$ 2. - * tile(t0) is only supported on extent<1>. It will produce a - * compile-time error if used on an extent where N @f$\ne@f$ 1. - */ - tiled_extent<1> tile(int t0) const; - tiled_extent<2> tile(int t0, int t1) const; - tiled_extent<3> tile(int t0, int t1, int t2) const; +/** + * Count number of 1 bits in the input + * + * @param[in] input An unsinged 32-bit integer. + * @return Number of 1 bits in the input. + */ +extern "C" inline unsigned int __popcount_u32_b32(unsigned int input) __HC__ { + return __builtin_popcount(input); +} - /** @} */ +/** + * Count number of 1 bits in the input + * + * @param[in] input An unsinged 64-bit integer. + * @return Number of 1 bits in the input. + */ +extern "C" inline unsigned int __popcount_u32_b64(unsigned long long int input) __HC__ { + return __builtin_popcountl(input); +} - /** @{ */ - /** - * Produces a tiled_extent object with the tile extents given by t0, t1, - * and t2, plus a certain amount of dynamic group segment. - */ - tiled_extent<1> tile_with_dynamic(int t0, int dynamic_size) const; - tiled_extent<2> tile_with_dynamic(int t0, int t1, int dynamic_size) const; - tiled_extent<3> tile_with_dynamic(int t0, int t1, int t2, int dynamic_size) const; +/** @{ */ +/** + * Extract a range of bits + * + * Please refer to HSA PRM 5.7 for more detailed specification of these functions. + */ +extern "C" inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__ { + uint32_t offset = src1 & 31; + uint32_t width = src2 & 31; + return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width); +} - /** @} */ +extern "C" inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) __HC__ { + uint64_t offset = src1 & 63; + uint64_t width = src2 & 63; + return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width); +} - /** @{ */ - /** - * Compares two objects of extent. - * - * The expression - * leftExt @f$\oplus@f$ rightExt - * is true if leftExt[i] @f$\oplus@f$ rightExt[i] for every i from 0 to N-1. - * - * @param[in] other The right-hand extent to be compared. - */ - bool operator==(const extent& other) const __CPU__ __HC__ { - return Kalmar::index_helper >::equal(*this, other); - } - bool operator!=(const extent& other) const __CPU__ __HC__ { - return !(*this == other); - } +extern "C" int __bitextract_s32(int src0, unsigned int src1, unsigned int src2) __HC__; - /** @} */ +extern "C" int64_t __bitextract_s64(int64_t src0, unsigned int src1, unsigned int src2) __HC__; +/** @} */ - /** @{ */ - /** - * Adds (or subtracts) an object of type extent from this extent to form - * a new extent. The result extent is such that for a given operator @f$\oplus@f$, - * result[i] = this[i] @f$\oplus@f$ ext[i] - * - * @param[in] ext The right-hand extent to be added or subtracted. - */ - extent& operator+=(const extent& __r) __CPU__ __HC__ { - base_.operator+=(__r.base_); - return *this; - } - extent& operator-=(const extent& __r) __CPU__ __HC__ { - base_.operator-=(__r.base_); - return *this; - } - extent& operator*=(const extent& __r) __CPU__ __HC__ { - base_.operator*=(__r.base_); - return *this; - } - extent& operator/=(const extent& __r) __CPU__ __HC__ { - base_.operator/=(__r.base_); - return *this; - } - extent& operator%=(const extent& __r) __CPU__ __HC__ { - base_.operator%=(__r.base_); - return *this; - } +/** @{ */ +/** + * Replace a range of bits + * + * Please refer to HSA PRM 5.7 for more detailed specification of these functions. + */ +extern "C" inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) __HC__ { + uint32_t offset = src2 & 31; + uint32_t width = src3 & 31; + uint32_t mask = (1 << width) - 1; + return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset)); +} - /** @} */ +extern "C" inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) __HC__ { + uint64_t offset = src2 & 63; + uint64_t width = src3 & 63; + uint64_t mask = (1 << width) - 1; + return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset)); +} - /** @{ */ - /** - * Adds (or subtracts) an object of type index from this extent to form - * a new extent. The result extent is such that for a given operator @f$\oplus@f$, - * result[i] = this[i] @f$\oplus@f$ idx[i] - * - * @param[in] idx The right-hand index to be added or subtracted. - */ - extent operator+(const index& idx) __CPU__ __HC__ { - extent __r = *this; - __r += idx; - return __r; - } - extent operator-(const index& idx) __CPU__ __HC__ { - extent __r = *this; - __r -= idx; - return __r; - } - extent& operator+=(const index& idx) __CPU__ __HC__ { - base_.operator+=(idx.base_); - return *this; - } - extent& operator-=(const index& idx) __CPU__ __HC__ { - base_.operator-=(idx.base_); - return *this; - } +extern "C" int __bitinsert_s32(int src0, int src1, unsigned int src2, unsigned int src3) __HC__; - /** @} */ +extern "C" int64_t __bitinsert_s64(int64_t src0, int64_t src1, unsigned int src2, unsigned int src3) __HC__; +/** @} */ - /** @{ */ - /** - * For a given operator @f$\oplus@f$, produces the same effect as - * (*this) = (*this) @f$\oplus@f$ value - * - * The return value is "*this". - * - * @param[in] value The right-hand int of the arithmetic operation. - */ - extent& operator+=(int value) __CPU__ __HC__ { - base_.operator+=(value); - return *this; - } - extent& operator-=(int value) __CPU__ __HC__ { - base_.operator-=(value); - return *this; - } - extent& operator*=(int value) __CPU__ __HC__ { - base_.operator*=(value); - return *this; - } - extent& operator/=(int value) __CPU__ __HC__ { - base_.operator/=(value); - return *this; - } - extent& operator%=(int value) __CPU__ __HC__ { - base_.operator%=(value); - return *this; - } +/** @{ */ +/** + * Create a bit mask that can be used with bitselect + * + * Please refer to HSA PRM 5.7 for more detailed specification of these functions. + */ +extern "C" unsigned int __bitmask_b32(unsigned int src0, unsigned int src1) __HC__; - /** @} */ +extern "C" uint64_t __bitmask_b64(unsigned int src0, unsigned int src1) __HC__; +/** @} */ - /** @{ */ - /** - * For a given operator @f$\oplus@f$, produces the same effect as - * (*this) = (*this) @f$\oplus@f$ 1 - * - * For prefix increment and decrement, the return value is "*this". - * Otherwise a new extent is returned. - */ - extent& operator++() __CPU__ __HC__ { - base_.operator+=(1); - return *this; - } - extent operator++(int) __CPU__ __HC__ { - extent ret = *this; - base_.operator+=(1); - return ret; - } - extent& operator--() __CPU__ __HC__ { - base_.operator-=(1); - return *this; - } - extent operator--(int) __CPU__ __HC__ { - extent ret = *this; - base_.operator-=(1); - return ret; - } +/** @{ */ +/** + * Reverse the bits + * + * Please refer to HSA PRM 5.7 for more detailed specification of these functions. + */ - /** @} */ +unsigned int __bitrev_b32(unsigned int src0) [[hc]] __asm("llvm.bitreverse.i32"); -private: - typedef Kalmar::index_impl::type> base; - base base_; - template friend struct Kalmar::index_helper; - template friend struct Kalmar::amp_helper; -}; +uint64_t __bitrev_b64(uint64_t src0) [[hc]] __asm("llvm.bitreverse.i64"); -// ------------------------------------------------------------------------ -// global functions for extent -// ------------------------------------------------------------------------ +/** @} */ /** @{ */ /** - * Adds (or subtracts) two objects of extent to form a new extent. The - * result extent is such that for a given operator @f$\oplus@f$, - * result[i] = leftExt[i] @f$\oplus@f$ rightExt[i] - * for every i from 0 to N-1. + * Do bit field selection * - * @param[in] lhs The left-hand extent to be compared. - * @param[in] rhs The right-hand extent to be compared. + * Please refer to HSA PRM 5.7 for more detailed specification of these functions. */ -// FIXME: the signature is not entirely the same as defined in: -// C++AMP spec v1.2 #1253 -template -extent operator+(const extent& lhs, const extent& rhs) __CPU__ __HC__ { - extent __r = lhs; - __r += rhs; - return __r; -} -template -extent operator-(const extent& lhs, const extent& rhs) __CPU__ __HC__ { - extent __r = lhs; - __r -= rhs; - return __r; +extern "C" inline unsigned int __bitselect_b32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__ { + return (src1 & src0) | (src2 & ~src0); } +extern "C" inline uint64_t __bitselect_b64(uint64_t src0, uint64_t src1, uint64_t src2) __HC__ { + return (src1 & src0) | (src2 & ~src0); +} /** @} */ -/** @{ */ /** - * Binary arithmetic operations that produce a new extent that is the result - * of performing the corresponding binary arithmetic operation on the elements - * of the extent operands. The result extent is such that for a given - * operator @f$\oplus@f$, - * result[i] = ext[i] @f$\oplus@f$ value - * or - * result[i] = value @f$\oplus@f$ ext[i] - * for every i from 0 to N-1. + * Count leading zero bits in the input * - * @param[in] ext The extent operand - * @param[in] value The integer operand + * @param[in] input An unsigned 32-bit integer. + * @return Number of 0 bits until a 1 bit is found, counting start from the + * most significant bit. -1 if there is no 0 bit. */ -// FIXME: the signature is not entirely the same as defined in: -// C++AMP spec v1.2 #1259 -template -extent operator+(const extent& ext, int value) __CPU__ __HC__ { - extent __r = ext; - __r += value; - return __r; -} -template -extent operator+(int value, const extent& ext) __CPU__ __HC__ { - extent __r = ext; - __r += value; - return __r; -} -template -extent operator-(const extent& ext, int value) __CPU__ __HC__ { - extent __r = ext; - __r -= value; - return __r; -} -template -extent operator-(int value, const extent& ext) __CPU__ __HC__ { - extent __r(value); - __r -= ext; - return __r; -} -template -extent operator*(const extent& ext, int value) __CPU__ __HC__ { - extent __r = ext; - __r *= value; - return __r; -} -template -extent operator*(int value, const extent& ext) __CPU__ __HC__ { - extent __r = ext; - __r *= value; - return __r; -} -template -extent operator/(const extent& ext, int value) __CPU__ __HC__ { - extent __r = ext; - __r /= value; - return __r; -} -template -extent operator/(int value, const extent& ext) __CPU__ __HC__ { - extent __r(value); - __r /= ext; - return __r; -} -template -extent operator%(const extent& ext, int value) __CPU__ __HC__ { - extent __r = ext; - __r %= value; - return __r; -} -template -extent operator%(int value, const extent& ext) __CPU__ __HC__ { - extent __r(value); - __r %= ext; - return __r; +extern "C" inline unsigned int __firstbit_u32_u32(unsigned int input) __HC__ { + return input == 0 ? -1 : __builtin_clz(input); } -/** @} */ - -// ------------------------------------------------------------------------ -// tiled_extent -// ------------------------------------------------------------------------ /** - * Represents an extent subdivided into tiles. - * Tile sizes can be specified at runtime. + * Count leading zero bits in the input * - * @tparam N The dimension of the extent and the tile. + * @param[in] input An unsigned 64-bit integer. + * @return Number of 0 bits until a 1 bit is found, counting start from the + * most significant bit. -1 if there is no 0 bit. */ -template -class tiled_extent : public extent { -public: - static const int rank = N; - - /** - * Tile size for each dimension. - */ - int tile_dim[N]; - - /** - * Default constructor. The origin and extent is default-constructed and - * thus zero. - */ - tiled_extent() __CPU__ __HC__ : extent(), tile_dim{0} {} - - /** - * Copy constructor. Constructs a new tiled_extent from the supplied - * argument "other". - * - * @param[in] other An object of type tiled_extent from which to initialize - * this new extent. - */ - tiled_extent(const tiled_extent& other) __CPU__ __HC__ : extent(other) { - for (int i = 0; i < N; ++i) { - tile_dim[i] = other.tile_dim[i]; - } - } -}; +extern "C" inline unsigned int __firstbit_u32_u64(unsigned long long int input) __HC__ { + return input == 0 ? -1 : __builtin_clzl(input); +} /** - * Represents an extent subdivided into tiles. - * Tile sizes can be specified at runtime. - * This class is 1D specialization of tiled_extent. + * Count leading zero bits in the input + * + * @param[in] input An signed 32-bit integer. + * @return Finds the first bit set in a positive integer starting from the + * most significant bit, or finds the first bit clear in a negative + * integer from the most significant bit. + * If no bits in the input are set, then dest is set to -1. */ -template <> -class tiled_extent<1> : public extent<1> { -private: - /** - * Size of dynamic group segment. - */ - unsigned int dynamic_group_segment_size; - -public: - static const int rank = 1; - - /** - * Tile size for each dimension. - */ - int tile_dim[1]; - - /** - * Default constructor. The origin and extent is default-constructed and - * thus zero. - */ - tiled_extent() __CPU__ __HC__ : extent(0), dynamic_group_segment_size(0), tile_dim{0} {} +extern "C" inline unsigned int __firstbit_u32_s32(int input) __HC__ { + if (input == 0) { + return -1; + } - /** - * Construct an tiled extent with the size of extent and the size of tile - * specified. - * - * @param[in] e0 Size of extent. - * @param[in] t0 Size of tile. - */ - tiled_extent(int e0, int t0) __CPU__ __HC__ : extent(e0), dynamic_group_segment_size(0), tile_dim{t0} {} + return input > 0 ? __firstbit_u32_u32(input) : __firstbit_u32_u32(~input); +} - /** - * Construct an tiled extent with the size of extent and the size of tile - * specified. - * - * @param[in] e0 Size of extent. - * @param[in] t0 Size of tile. - * @param[in] size Size of dynamic group segment. - */ - tiled_extent(int e0, int t0, int size) __CPU__ __HC__ : extent(e0), dynamic_group_segment_size(size), tile_dim{t0} {} - /** - * Copy constructor. Constructs a new tiled_extent from the supplied - * argument "other". - * - * @param[in] other An object of type tiled_extent from which to initialize - * this new extent. - */ - tiled_extent(const tiled_extent<1>& other) __CPU__ __HC__ : extent(other[0]), dynamic_group_segment_size(other.dynamic_group_segment_size), tile_dim{other.tile_dim[0]} {} +/** + * Count leading zero bits in the input + * + * @param[in] input An signed 64-bit integer. + * @return Finds the first bit set in a positive integer starting from the + * most significant bit, or finds the first bit clear in a negative + * integer from the most significant bit. + * If no bits in the input are set, then dest is set to -1. + */ +extern "C" inline unsigned int __firstbit_u32_s64(long long int input) __HC__ { + if (input == 0) { + return -1; + } + return input > 0 ? __firstbit_u32_u64(input) : __firstbit_u32_u64(~input); +} - /** - * Constructs a tiled_extent with the extent "ext". - * - * @param[in] ext The extent of this tiled_extent - * @param[in] t0 Size of tile. - */ - tiled_extent(const extent<1>& ext, int t0) __CPU__ __HC__ : extent(ext), dynamic_group_segment_size(0), tile_dim{t0} {} +/** @{ */ +/** + * Find the first bit set to 1 in a number starting from the + * least significant bit + * + * Please refer to HSA PRM 5.7 for more detailed specification of these functions. + */ +extern "C" inline unsigned int __lastbit_u32_u32(unsigned int input) __HC__ { + return input == 0 ? -1 : __builtin_ctz(input); +} - /** - * Constructs a tiled_extent with the extent "ext". - * - * @param[in] ext The extent of this tiled_extent - * @param[in] t0 Size of tile. - * @param[in] size Size of dynamic group segment - */ - tiled_extent(const extent<1>& ext, int t0, int size) __CPU__ __HC__ : extent(ext), dynamic_group_segment_size(size), tile_dim{t0} {} +extern "C" inline unsigned int __lastbit_u32_u64(unsigned long long int input) __HC__ { + return input == 0 ? -1 : __builtin_ctzl(input); +} - /** - * Set the size of dynamic group segment. The function should be called - * in host code, prior to a kernel is dispatched. - * - * @param[in] size The amount of dynamic group segment needed. - */ - void set_dynamic_group_segment_size(unsigned int size) __CPU__ { - dynamic_group_segment_size = size; - } +extern "C" inline unsigned int __lastbit_u32_s32(int input) __HC__ { + return __lastbit_u32_u32(input); +} - /** - * Return the size of dynamic group segment in bytes. - */ - unsigned int get_dynamic_group_segment_size() const __CPU__ { - return dynamic_group_segment_size; - } -}; +extern "C" inline unsigned int __lastbit_u32_s64(unsigned long long input) __HC__ { + return __lastbit_u32_u64(input); +} +/** @} */ +/** @{ */ /** - * Represents an extent subdivided into tiles. - * Tile sizes can be specified at runtime. - * This class is 2D specialization of tiled_extent. + * Copy and interleave the lower half of the elements from + * each source into the desitionation + * + * Please refer to HSA PRM 5.9 for more detailed specification of these functions. */ -template <> -class tiled_extent<2> : public extent<2> { -private: - /** - * Size of dynamic group segment. - */ - unsigned int dynamic_group_segment_size; - -public: - static const int rank = 2; +extern "C" unsigned int __unpacklo_u8x4(unsigned int src0, unsigned int src1) __HC__; - /** - * Tile size for each dimension. - */ - int tile_dim[2]; +extern "C" uint64_t __unpacklo_u8x8(uint64_t src0, uint64_t src1) __HC__; - /** - * Default constructor. The origin and extent is default-constructed and - * thus zero. - */ - tiled_extent() __CPU__ __HC__ : extent(0, 0), dynamic_group_segment_size(0), tile_dim{0, 0} {} +extern "C" unsigned int __unpacklo_u16x2(unsigned int src0, unsigned int src1) __HC__; - /** - * Construct an tiled extent with the size of extent and the size of tile - * specified. - * - * @param[in] e0 Size of extent in the 1st dimension. - * @param[in] e1 Size of extent in the 2nd dimension. - * @param[in] t0 Size of tile in the 1st dimension. - * @param[in] t1 Size of tile in the 2nd dimension. - */ - tiled_extent(int e0, int e1, int t0, int t1) __CPU__ __HC__ : extent(e0, e1), dynamic_group_segment_size(0), tile_dim{t0, t1} {} +extern "C" uint64_t __unpacklo_u16x4(uint64_t src0, uint64_t src1) __HC__; - /** - * Construct an tiled extent with the size of extent and the size of tile - * specified. - * - * @param[in] e0 Size of extent in the 1st dimension. - * @param[in] e1 Size of extent in the 2nd dimension. - * @param[in] t0 Size of tile in the 1st dimension. - * @param[in] t1 Size of tile in the 2nd dimension. - * @param[in] size Size of dynamic group segment. - */ - tiled_extent(int e0, int e1, int t0, int t1, int size) __CPU__ __HC__ : extent(e0, e1), dynamic_group_segment_size(size), tile_dim{t0, t1} {} +extern "C" uint64_t __unpacklo_u32x2(uint64_t src0, uint64_t src1) __HC__; - /** - * Copy constructor. Constructs a new tiled_extent from the supplied - * argument "other". - * - * @param[in] other An object of type tiled_extent from which to initialize - * this new extent. - */ - tiled_extent(const tiled_extent<2>& other) __CPU__ __HC__ : extent(other[0], other[1]), dynamic_group_segment_size(other.dynamic_group_segment_size), tile_dim{other.tile_dim[0], other.tile_dim[1]} {} +extern "C" int __unpacklo_s8x4(int src0, int src1) __HC__; - /** - * Constructs a tiled_extent with the extent "ext". - * - * @param[in] ext The extent of this tiled_extent - * @param[in] t0 Size of tile in the 1st dimension. - * @param[in] t1 Size of tile in the 2nd dimension. - */ - tiled_extent(const extent<2>& ext, int t0, int t1) __CPU__ __HC__ : extent(ext), dynamic_group_segment_size(0), tile_dim{t0, t1} {} +extern "C" int64_t __unpacklo_s8x8(int64_t src0, int64_t src1) __HC__; - /** - * Constructs a tiled_extent with the extent "ext". - * - * @param[in] ext The extent of this tiled_extent - * @param[in] t0 Size of tile in the 1st dimension. - * @param[in] t1 Size of tile in the 2nd dimension. - * @param[in] size Size of dynamic group segment. - */ - tiled_extent(const extent<2>& ext, int t0, int t1, int size) __CPU__ __HC__ : extent(ext), dynamic_group_segment_size(size), tile_dim{t0, t1} {} +extern "C" int __unpacklo_s16x2(int src0, int src1) __HC__; - /** - * Set the size of dynamic group segment. The function should be called - * in host code, prior to a kernel is dispatched. - * - * @param[in] size The amount of dynamic group segment needed. - */ - void set_dynamic_group_segment_size(unsigned int size) __CPU__ { - dynamic_group_segment_size = size; - } +extern "C" int64_t __unpacklo_s16x4(int64_t src0, int64_t src1) __HC__; - /** - * Return the size of dynamic group segment in bytes. - */ - unsigned int get_dynamic_group_segment_size() const __CPU__ { - return dynamic_group_segment_size; - } -}; +extern "C" int64_t __unpacklo_s32x2(int64_t src0, int64_t src1) __HC__; +/** @} */ +/** @{ */ /** - * Represents an extent subdivided into tiles. - * Tile sizes can be specified at runtime. - * This class is 3D specialization of tiled_extent. + * Copy and interleave the upper half of the elements from + * each source into the desitionation + * + * Please refer to HSA PRM 5.9 for more detailed specification of these functions. */ -template <> -class tiled_extent<3> : public extent<3> { -private: - /** - * Size of dynamic group segment. - */ - unsigned int dynamic_group_segment_size; +extern "C" unsigned int __unpackhi_u8x4(unsigned int src0, unsigned int src1) __HC__; -public: - static const int rank = 3; +extern "C" uint64_t __unpackhi_u8x8(uint64_t src0, uint64_t src1) __HC__; - /** - * Tile size for each dimension. - */ - int tile_dim[3]; +extern "C" unsigned int __unpackhi_u16x2(unsigned int src0, unsigned int src1) __HC__; - /** - * Default constructor. The origin and extent is default-constructed and - * thus zero. - */ - tiled_extent() __CPU__ __HC__ : extent(0, 0, 0), dynamic_group_segment_size(0), tile_dim{0, 0, 0} {} +extern "C" uint64_t __unpackhi_u16x4(uint64_t src0, uint64_t src1) __HC__; - /** - * Construct an tiled extent with the size of extent and the size of tile - * specified. - * - * @param[in] e0 Size of extent in the 1st dimension. - * @param[in] e1 Size of extent in the 2nd dimension. - * @param[in] e2 Size of extent in the 3rd dimension. - * @param[in] t0 Size of tile in the 1st dimension. - * @param[in] t1 Size of tile in the 2nd dimension. - * @param[in] t2 Size of tile in the 3rd dimension. - */ - tiled_extent(int e0, int e1, int e2, int t0, int t1, int t2) __CPU__ __HC__ : extent(e0, e1, e2), dynamic_group_segment_size(0), tile_dim{t0, t1, t2} {} +extern "C" uint64_t __unpackhi_u32x2(uint64_t src0, uint64_t src1) __HC__; - /** - * Construct an tiled extent with the size of extent and the size of tile - * specified. - * - * @param[in] e0 Size of extent in the 1st dimension. - * @param[in] e1 Size of extent in the 2nd dimension. - * @param[in] e2 Size of extent in the 3rd dimension. - * @param[in] t0 Size of tile in the 1st dimension. - * @param[in] t1 Size of tile in the 2nd dimension. - * @param[in] t2 Size of tile in the 3rd dimension. - * @param[in] size Size of dynamic group segment. - */ - tiled_extent(int e0, int e1, int e2, int t0, int t1, int t2, int size) __CPU__ __HC__ : extent(e0, e1, e2), dynamic_group_segment_size(size), tile_dim{t0, t1, t2} {} +extern "C" int __unpackhi_s8x4(int src0, int src1) __HC__; - /** - * Copy constructor. Constructs a new tiled_extent from the supplied - * argument "other". - * - * @param[in] other An object of type tiled_extent from which to initialize - * this new extent. - */ - tiled_extent(const tiled_extent<3>& other) __CPU__ __HC__ : extent(other[0], other[1], other[2]), dynamic_group_segment_size(other.dynamic_group_segment_size), tile_dim{other.tile_dim[0], other.tile_dim[1], other.tile_dim[2]} {} +extern "C" int64_t __unpackhi_s8x8(int64_t src0, int64_t src1) __HC__; - /** - * Constructs a tiled_extent with the extent "ext". - * - * @param[in] ext The extent of this tiled_extent - * @param[in] t0 Size of tile in the 1st dimension. - * @param[in] t1 Size of tile in the 2nd dimension. - * @param[in] t2 Size of tile in the 3rd dimension. - */ - tiled_extent(const extent<3>& ext, int t0, int t1, int t2) __CPU__ __HC__ : extent(ext), dynamic_group_segment_size(0), tile_dim{t0, t1, t2} {} +extern "C" int __unpackhi_s16x2(int src0, int src1) __HC__; - /** - * Constructs a tiled_extent with the extent "ext". - * - * @param[in] ext The extent of this tiled_extent - * @param[in] t0 Size of tile in the 1st dimension. - * @param[in] t1 Size of tile in the 2nd dimension. - * @param[in] t2 Size of tile in the 3rd dimension. - * @param[in] size Size of dynamic group segment. - */ - tiled_extent(const extent<3>& ext, int t0, int t1, int t2, int size) __CPU__ __HC__ : extent(ext), dynamic_group_segment_size(size), tile_dim{t0, t1, t2} {} +extern "C" int64_t __unpackhi_s16x4(int64_t src0, int64_t src1) __HC__; - /** - * Set the size of dynamic group segment. The function should be called - * in host code, prior to a kernel is dispatched. - * - * @param[in] size The amount of dynamic group segment needed. - */ - void set_dynamic_group_segment_size(unsigned int size) __CPU__ { - dynamic_group_segment_size = size; - } +extern "C" int64_t __unpackhi_s32x2(int64_t src0, int64_t src1) __HC__; +/** @} */ - /** - * Return the size of dynamic group segment in bytes. - */ - unsigned int get_dynamic_group_segment_size() const __CPU__ { - return dynamic_group_segment_size; - } -}; +/** @{ */ +/** + * Assign the elements of the packed value in src0, replacing + * the element specified by src2 with the value from src1 + * + * Please refer to HSA PRM 5.9 for more detailed specification of these functions. + */ +extern "C" unsigned int __pack_u8x4_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__; -// ------------------------------------------------------------------------ -// implementation of extent::tile() -// ------------------------------------------------------------------------ +extern "C" uint64_t __pack_u8x8_u32(uint64_t src0, unsigned int src1, unsigned int src2) __HC__; -template -inline -tiled_extent<1> extent::tile(int t0) const __CPU__ __HC__ { - static_assert(N == 1, "One-dimensional tile() method only available on extent<1>"); - return tiled_extent<1>(*this, t0); -} +extern "C" unsigned __pack_u16x2_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__; -template -inline -tiled_extent<2> extent::tile(int t0, int t1) const __CPU__ __HC__ { - static_assert(N == 2, "Two-dimensional tile() method only available on extent<2>"); - return tiled_extent<2>(*this, t0, t1); -} +extern "C" uint64_t __pack_u16x4_u32(uint64_t src0, unsigned int src1, unsigned int src2) __HC__; -template -inline -tiled_extent<3> extent::tile(int t0, int t1, int t2) const __CPU__ __HC__ { - static_assert(N == 3, "Three-dimensional tile() method only available on extent<3>"); - return tiled_extent<3>(*this, t0, t1, t2); -} +extern "C" uint64_t __pack_u32x2_u32(uint64_t src0, unsigned int src1, unsigned int src2) __HC__; -// ------------------------------------------------------------------------ -// implementation of extent::tile_with_dynamic() -// ------------------------------------------------------------------------ +extern "C" int __pack_s8x4_s32(int src0, int src1, unsigned int src2) __HC__; -template -inline -tiled_extent<1> extent::tile_with_dynamic(int t0, int dynamic_size) const __CPU__ __HC__ { - static_assert(N == 1, "One-dimensional tile() method only available on extent<1>"); - return tiled_extent<1>(*this, t0, dynamic_size); -} +extern "C" int64_t __pack_s8x8_s32(int64_t src0, int src1, unsigned int src2) __HC__; -template -inline -tiled_extent<2> extent::tile_with_dynamic(int t0, int t1, int dynamic_size) const __CPU__ __HC__ { - static_assert(N == 2, "Two-dimensional tile() method only available on extent<2>"); - return tiled_extent<2>(*this, t0, t1, dynamic_size); -} +extern "C" int __pack_s16x2_s32(int src0, int src1, unsigned int src2) __HC__; -template -inline -tiled_extent<3> extent::tile_with_dynamic(int t0, int t1, int t2, int dynamic_size) const __CPU__ __HC__ { - static_assert(N == 3, "Three-dimensional tile() method only available on extent<3>"); - return tiled_extent<3>(*this, t0, t1, t2, dynamic_size); -} +extern "C" int64_t __pack_s16x4_s32(int64_t src0, int src1, unsigned int src2) __HC__; -// ------------------------------------------------------------------------ -// Intrinsic functions for HSAIL instructions -// ------------------------------------------------------------------------ +extern "C" int64_t __pack_s32x2_s32(int64_t src0, int src1, unsigned int src2) __HC__; + +extern "C" double __pack_f32x2_f32(double src0, float src1, unsigned int src2) __HC__; +/** @} */ +/** @{ */ /** - * Fetch the size of a wavefront + * Assign the elements specified by src1 from the packed value in src0 * - * @return The size of a wavefront. + * Please refer to HSA PRM 5.9 for more detailed specification of these functions. */ -#define __HSA_WAVEFRONT_SIZE__ (64) -extern "C" unsigned int __wavesize() __HC__; +extern "C" unsigned int __unpack_u32_u8x4(unsigned int src0, unsigned int src1) __HC__; +extern "C" unsigned int __unpack_u32_u8x8(uint64_t src0, unsigned int src1) __HC__; -#if __hcc_backend__==HCC_BACKEND_AMDGPU -extern "C" inline unsigned int __wavesize() __HC__ { - return __HSA_WAVEFRONT_SIZE__; -} -#endif +extern "C" unsigned int __unpack_u32_u16x2(unsigned int src0, unsigned int src1) __HC__; + +extern "C" unsigned int __unpack_u32_u16x4(uint64_t src0, unsigned int src1) __HC__; + +extern "C" unsigned int __unpack_u32_u32x2(uint64_t src0, unsigned int src1) __HC__; + +extern "C" int __unpack_s32_s8x4(int src0, unsigned int src1) __HC__; + +extern "C" int __unpack_s32_s8x8(int64_t src0, unsigned int src1) __HC__; + +extern "C" int __unpack_s32_s16x2(int src0, unsigned int src1) __HC__; + +extern "C" int __unpack_s32_s16x4(int64_t src0, unsigned int src1) __HC__; + +extern "C" int __unpack_s32_s3x2(int64_t src0, unsigned int src1) __HC__; + +extern "C" float __unpack_f32_f32x2(double src0, unsigned int src1) __HC__; +/** @} */ /** - * Count number of 1 bits in the input + * Align 32 bits within 64 bits of data on an arbitrary bit boundary * - * @param[in] input An unsinged 32-bit integer. - * @return Number of 1 bits in the input. + * Please refer to HSA PRM 5.15 for more detailed specification. */ -extern "C" inline unsigned int __popcount_u32_b32(unsigned int input) __HC__ { - return __builtin_popcount(input); -} +extern "C" unsigned int __bitalign_b32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__; /** - * Count number of 1 bits in the input + * Align 32 bits within 64 bis of data on an arbitrary byte boundary * - * @param[in] input An unsinged 64-bit integer. - * @return Number of 1 bits in the input. + * Please refer to HSA PRM 5.15 for more detailed specification. */ -extern "C" inline unsigned int __popcount_u32_b64(unsigned long long int input) __HC__ { - return __builtin_popcountl(input); -} +extern "C" unsigned int __bytealign_b32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__; -/** @{ */ /** - * Extract a range of bits + * Do linear interpolation and computes the unsigned 8-bit average of packed + * data * - * Please refer to HSA PRM 5.7 for more detailed specification of these functions. + * Please refer to HSA PRM 5.15 for more detailed specification. */ -extern "C" inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__ { - uint32_t offset = src1 & 31; - uint32_t width = src2 & 31; - return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width); -} - -extern "C" inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) __HC__ { - uint64_t offset = src1 & 63; - uint64_t width = src2 & 63; - return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width); -} - -extern "C" int __bitextract_s32(int src0, unsigned int src1, unsigned int src2) __HC__; - -extern "C" int64_t __bitextract_s64(int64_t src0, unsigned int src1, unsigned int src2) __HC__; -/** @} */ +extern "C" unsigned int __lerp_u8x4(unsigned int src0, unsigned int src1, unsigned int src2) __HC__; -/** @{ */ /** - * Replace a range of bits + * Takes four floating-point number, convers them to + * unsigned integer values, and packs them into a packed u8x4 value * - * Please refer to HSA PRM 5.7 for more detailed specification of these functions. + * Please refer to HSA PRM 5.15 for more detailed specification. */ -extern "C" inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) __HC__ { - uint32_t offset = src2 & 31; - uint32_t width = src3 & 31; - uint32_t mask = (1 << width) - 1; - return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset)); -} - -extern "C" inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) __HC__ { - uint64_t offset = src2 & 63; - uint64_t width = src3 & 63; - uint64_t mask = (1 << width) - 1; - return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset)); -} - -extern "C" int __bitinsert_s32(int src0, int src1, unsigned int src2, unsigned int src3) __HC__; - -extern "C" int64_t __bitinsert_s64(int64_t src0, int64_t src1, unsigned int src2, unsigned int src3) __HC__; -/** @} */ +extern "C" unsigned int __packcvt_u8x4_f32(float src0, float src1, float src2, float src3) __HC__; -/** @{ */ /** - * Create a bit mask that can be used with bitselect + * Unpacks a single element from a packed u8x4 value and converts it to an f32. * - * Please refer to HSA PRM 5.7 for more detailed specification of these functions. + * Please refer to HSA PRM 5.15 for more detailed specification. */ -extern "C" unsigned int __bitmask_b32(unsigned int src0, unsigned int src1) __HC__; - -extern "C" uint64_t __bitmask_b64(unsigned int src0, unsigned int src1) __HC__; -/** @} */ +extern "C" float __unpackcvt_f32_u8x4(unsigned int src0, unsigned int src1) __HC__; /** @{ */ /** - * Reverse the bits + * Computes the sum of the absolute differences of src0 and + * src1 and then adds src2 to the result * - * Please refer to HSA PRM 5.7 for more detailed specification of these functions. + * Please refer to HSA PRM 5.15 for more detailed specification. */ +extern "C" unsigned int __sad_u32_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__; -unsigned int __bitrev_b32(unsigned int src0) [[hc]] __asm("llvm.bitreverse.i32"); - -uint64_t __bitrev_b64(uint64_t src0) [[hc]] __asm("llvm.bitreverse.i64"); +extern "C" unsigned int __sad_u32_u16x2(unsigned int src0, unsigned int src1, unsigned int src2) __HC__; +extern "C" unsigned int __sad_u32_u8x4(unsigned int src0, unsigned int src1, unsigned int src2) __HC__; /** @} */ -/** @{ */ /** - * Do bit field selection + * This function is mostly the same as sad except the sum of absolute + * differences is added to the most significant 16 bits of the result * - * Please refer to HSA PRM 5.7 for more detailed specification of these functions. + * Please refer to HSA PRM 5.15 for more detailed specification. */ -extern "C" inline unsigned int __bitselect_b32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__ { - return (src1 & src0) | (src2 & ~src0); -} - -extern "C" inline uint64_t __bitselect_b64(uint64_t src0, uint64_t src1, uint64_t src2) __HC__ { - return (src1 & src0) | (src2 & ~src0); -} -/** @} */ +extern "C" unsigned int __sadhi_u16x2_u8x4(unsigned int src0, unsigned int src1, unsigned int src2) __HC__; /** - * Count leading zero bits in the input - * - * @param[in] input An unsigned 32-bit integer. - * @return Number of 0 bits until a 1 bit is found, counting start from the - * most significant bit. -1 if there is no 0 bit. + * Get system timestamp */ -extern "C" inline unsigned int __firstbit_u32_u32(unsigned int input) __HC__ { - return input == 0 ? -1 : __builtin_clz(input); -} - +extern "C" uint64_t __clock_u64() __HC__; /** - * Count leading zero bits in the input + * Get hardware cycle count * - * @param[in] input An unsigned 64-bit integer. - * @return Number of 0 bits until a 1 bit is found, counting start from the - * most significant bit. -1 if there is no 0 bit. + * Notice the return value of this function is implementation defined. */ -extern "C" inline unsigned int __firstbit_u32_u64(unsigned long long int input) __HC__ { - return input == 0 ? -1 : __builtin_clzl(input); -} +extern "C" uint64_t __cycle_u64() __HC__; /** - * Count leading zero bits in the input + * Get the count of the number of earlier (in flattened + * work-item order) active work-items within the same wavefront. * - * @param[in] input An signed 32-bit integer. - * @return Finds the first bit set in a positive integer starting from the - * most significant bit, or finds the first bit clear in a negative - * integer from the most significant bit. - * If no bits in the input are set, then dest is set to -1. + * @return The result will be in the range 0 to WAVESIZE - 1. */ -extern "C" inline unsigned int __firstbit_u32_s32(int input) __HC__ { - if (input == 0) { - return -1; - } - - return input > 0 ? __firstbit_u32_u32(input) : __firstbit_u32_u32(~input); -} - +extern "C" unsigned int __activelaneid_u32() __HC__; /** - * Count leading zero bits in the input + * Return a bit mask shows which active work-items in the + * wavefront have a non-zero input. The affected bit position within the + * registers of dest corresponds to each work-item's lane ID. * - * @param[in] input An signed 64-bit integer. - * @return Finds the first bit set in a positive integer starting from the - * most significant bit, or finds the first bit clear in a negative - * integer from the most significant bit. - * If no bits in the input are set, then dest is set to -1. + * The HSAIL instruction would return 4 64-bit registers but the current + * implementation would only return the 1st one and ignore the other 3 as + * right now all HSA agents have wavefront of size 64. + * + * @param[in] input An unsigned 32-bit integer. + * @return The bitmask calculated. */ -extern "C" inline unsigned int __firstbit_u32_s64(long long int input) __HC__ { - if (input == 0) { - return -1; - } - - return input > 0 ? __firstbit_u32_u64(input) : __firstbit_u32_u64(~input); -} +extern "C" uint64_t __activelanemask_v4_b64_b1(unsigned int input) __HC__; -/** @{ */ /** - * Find the first bit set to 1 in a number starting from the - * least significant bit + * Count the number of active work-items in the current + * wavefront that have a non-zero input. * - * Please refer to HSA PRM 5.7 for more detailed specification of these functions. + * @param[in] input An unsigned 32-bit integer. + * @return The number of active work-items in the current wavefront that have + * a non-zero input. */ -extern "C" inline unsigned int __lastbit_u32_u32(unsigned int input) __HC__ { - return input == 0 ? -1 : __builtin_ctz(input); +extern "C" inline unsigned int __activelanecount_u32_b1(unsigned int input) __HC__ { + return __popcount_u32_b64(__activelanemask_v4_b64_b1(input)); } -extern "C" inline unsigned int __lastbit_u32_u64(unsigned long long int input) __HC__ { - return input == 0 ? -1 : __builtin_ctzl(input); -} +// ------------------------------------------------------------------------ +// Wavefront Vote Functions +// ------------------------------------------------------------------------ -extern "C" inline unsigned int __lastbit_u32_s32(int input) __HC__ { - return __lastbit_u32_u32(input); +/** + * Evaluate predicate for all active work-items in the + * wavefront and return non-zero if and only if predicate evaluates to non-zero + * for any of them. + */ +extern "C" bool __ockl_wfany_i32(int) __HC__; +extern "C" inline int __any(int predicate) __HC__ { + return __ockl_wfany_i32(predicate); } -extern "C" inline unsigned int __lastbit_u32_s64(unsigned long long input) __HC__ { - return __lastbit_u32_u64(input); +/** + * Evaluate predicate for all active work-items in the + * wavefront and return non-zero if and only if predicate evaluates to non-zero + * for all of them. + */ +extern "C" bool __ockl_wfall_i32(int) __HC__; +extern "C" inline int __all(int predicate) __HC__ { + return __ockl_wfall_i32(predicate); } -/** @} */ -/** @{ */ /** - * Copy and interleave the lower half of the elements from - * each source into the desitionation - * - * Please refer to HSA PRM 5.9 for more detailed specification of these functions. + * Evaluate predicate for all active work-items in the + * wavefront and return an integer whose Nth bit is set if and only if + * predicate evaluates to non-zero for the Nth work-item of the wavefront and + * the Nth work-item is active. */ -extern "C" unsigned int __unpacklo_u8x4(unsigned int src0, unsigned int src1) __HC__; - -extern "C" uint64_t __unpacklo_u8x8(uint64_t src0, uint64_t src1) __HC__; - -extern "C" unsigned int __unpacklo_u16x2(unsigned int src0, unsigned int src1) __HC__; -extern "C" uint64_t __unpacklo_u16x4(uint64_t src0, uint64_t src1) __HC__; +// XXX from llvm/include/llvm/IR/InstrTypes.h +#define ICMP_NE 33 +__attribute__((convergent)) +unsigned long long __llvm_amdgcn_icmp_i32(uint x, uint y, uint z) [[hc]] __asm("llvm.amdgcn.icmp.i32"); +extern "C" inline uint64_t __ballot(int predicate) __HC__ { + return __llvm_amdgcn_icmp_i32(predicate, 0, ICMP_NE); +} -extern "C" uint64_t __unpacklo_u32x2(uint64_t src0, uint64_t src1) __HC__; +// ------------------------------------------------------------------------ +// Wavefront Shuffle Functions +// ------------------------------------------------------------------------ -extern "C" int __unpacklo_s8x4(int src0, int src1) __HC__; - -extern "C" int64_t __unpacklo_s8x8(int64_t src0, int64_t src1) __HC__; - -extern "C" int __unpacklo_s16x2(int src0, int src1) __HC__; - -extern "C" int64_t __unpacklo_s16x4(int64_t src0, int64_t src1) __HC__; - -extern "C" int64_t __unpacklo_s32x2(int64_t src0, int64_t src1) __HC__; -/** @} */ - -/** @{ */ -/** - * Copy and interleave the upper half of the elements from - * each source into the desitionation - * - * Please refer to HSA PRM 5.9 for more detailed specification of these functions. - */ -extern "C" unsigned int __unpackhi_u8x4(unsigned int src0, unsigned int src1) __HC__; - -extern "C" uint64_t __unpackhi_u8x8(uint64_t src0, uint64_t src1) __HC__; - -extern "C" unsigned int __unpackhi_u16x2(unsigned int src0, unsigned int src1) __HC__; - -extern "C" uint64_t __unpackhi_u16x4(uint64_t src0, uint64_t src1) __HC__; - -extern "C" uint64_t __unpackhi_u32x2(uint64_t src0, uint64_t src1) __HC__; - -extern "C" int __unpackhi_s8x4(int src0, int src1) __HC__; - -extern "C" int64_t __unpackhi_s8x8(int64_t src0, int64_t src1) __HC__; - -extern "C" int __unpackhi_s16x2(int src0, int src1) __HC__; - -extern "C" int64_t __unpackhi_s16x4(int64_t src0, int64_t src1) __HC__; - -extern "C" int64_t __unpackhi_s32x2(int64_t src0, int64_t src1) __HC__; -/** @} */ - -/** @{ */ -/** - * Assign the elements of the packed value in src0, replacing - * the element specified by src2 with the value from src1 - * - * Please refer to HSA PRM 5.9 for more detailed specification of these functions. - */ -extern "C" unsigned int __pack_u8x4_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__; - -extern "C" uint64_t __pack_u8x8_u32(uint64_t src0, unsigned int src1, unsigned int src2) __HC__; - -extern "C" unsigned __pack_u16x2_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__; - -extern "C" uint64_t __pack_u16x4_u32(uint64_t src0, unsigned int src1, unsigned int src2) __HC__; - -extern "C" uint64_t __pack_u32x2_u32(uint64_t src0, unsigned int src1, unsigned int src2) __HC__; - -extern "C" int __pack_s8x4_s32(int src0, int src1, unsigned int src2) __HC__; - -extern "C" int64_t __pack_s8x8_s32(int64_t src0, int src1, unsigned int src2) __HC__; - -extern "C" int __pack_s16x2_s32(int src0, int src1, unsigned int src2) __HC__; - -extern "C" int64_t __pack_s16x4_s32(int64_t src0, int src1, unsigned int src2) __HC__; - -extern "C" int64_t __pack_s32x2_s32(int64_t src0, int src1, unsigned int src2) __HC__; - -extern "C" double __pack_f32x2_f32(double src0, float src1, unsigned int src2) __HC__; -/** @} */ - -/** @{ */ -/** - * Assign the elements specified by src1 from the packed value in src0 - * - * Please refer to HSA PRM 5.9 for more detailed specification of these functions. - */ -extern "C" unsigned int __unpack_u32_u8x4(unsigned int src0, unsigned int src1) __HC__; - -extern "C" unsigned int __unpack_u32_u8x8(uint64_t src0, unsigned int src1) __HC__; - -extern "C" unsigned int __unpack_u32_u16x2(unsigned int src0, unsigned int src1) __HC__; - -extern "C" unsigned int __unpack_u32_u16x4(uint64_t src0, unsigned int src1) __HC__; - -extern "C" unsigned int __unpack_u32_u32x2(uint64_t src0, unsigned int src1) __HC__; - -extern "C" int __unpack_s32_s8x4(int src0, unsigned int src1) __HC__; - -extern "C" int __unpack_s32_s8x8(int64_t src0, unsigned int src1) __HC__; - -extern "C" int __unpack_s32_s16x2(int src0, unsigned int src1) __HC__; - -extern "C" int __unpack_s32_s16x4(int64_t src0, unsigned int src1) __HC__; - -extern "C" int __unpack_s32_s3x2(int64_t src0, unsigned int src1) __HC__; - -extern "C" float __unpack_f32_f32x2(double src0, unsigned int src1) __HC__; -/** @} */ - -/** - * Align 32 bits within 64 bits of data on an arbitrary bit boundary - * - * Please refer to HSA PRM 5.15 for more detailed specification. - */ -extern "C" unsigned int __bitalign_b32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__; - -/** - * Align 32 bits within 64 bis of data on an arbitrary byte boundary - * - * Please refer to HSA PRM 5.15 for more detailed specification. - */ -extern "C" unsigned int __bytealign_b32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__; - -/** - * Do linear interpolation and computes the unsigned 8-bit average of packed - * data - * - * Please refer to HSA PRM 5.15 for more detailed specification. - */ -extern "C" unsigned int __lerp_u8x4(unsigned int src0, unsigned int src1, unsigned int src2) __HC__; - -/** - * Takes four floating-point number, convers them to - * unsigned integer values, and packs them into a packed u8x4 value - * - * Please refer to HSA PRM 5.15 for more detailed specification. - */ -extern "C" unsigned int __packcvt_u8x4_f32(float src0, float src1, float src2, float src3) __HC__; - -/** - * Unpacks a single element from a packed u8x4 value and converts it to an f32. - * - * Please refer to HSA PRM 5.15 for more detailed specification. - */ -extern "C" float __unpackcvt_f32_u8x4(unsigned int src0, unsigned int src1) __HC__; - -/** @{ */ -/** - * Computes the sum of the absolute differences of src0 and - * src1 and then adds src2 to the result - * - * Please refer to HSA PRM 5.15 for more detailed specification. - */ -extern "C" unsigned int __sad_u32_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__; - -extern "C" unsigned int __sad_u32_u16x2(unsigned int src0, unsigned int src1, unsigned int src2) __HC__; - -extern "C" unsigned int __sad_u32_u8x4(unsigned int src0, unsigned int src1, unsigned int src2) __HC__; -/** @} */ - -/** - * This function is mostly the same as sad except the sum of absolute - * differences is added to the most significant 16 bits of the result - * - * Please refer to HSA PRM 5.15 for more detailed specification. - */ -extern "C" unsigned int __sadhi_u16x2_u8x4(unsigned int src0, unsigned int src1, unsigned int src2) __HC__; - -/** - * Get system timestamp - */ -extern "C" __attribute__((always_inline)) -std::uint64_t __ockl_memrealtime_u64(void); - -extern "C" inline __attribute((always_inline)) std::uint64_t __clock_u64() __HC__ { - return __ockl_memrealtime_u64(); -} - - -/** - * Get hardware cycle count - * - * Notice the return value of this function is implementation defined. - */ -extern "C" __attribute__((always_inline)) -std::uint64_t __ockl_memtime_u64(void); - -extern "C" inline __attribute((always_inline)) std::uint64_t __cycle_u64() __HC__ { - return __ockl_memtime_u64(); -} - -/** - * Get the count of the number of earlier (in flattened - * work-item order) active work-items within the same wavefront. - * - * @return The result will be in the range 0 to WAVESIZE - 1. - */ -extern "C" unsigned int __activelaneid_u32() __HC__; - -/** - * Return a bit mask shows which active work-items in the - * wavefront have a non-zero input. The affected bit position within the - * registers of dest corresponds to each work-item's lane ID. - * - * The HSAIL instruction would return 4 64-bit registers but the current - * implementation would only return the 1st one and ignore the other 3 as - * right now all HSA agents have wavefront of size 64. - * - * @param[in] input An unsigned 32-bit integer. - * @return The bitmask calculated. - */ -extern "C" uint64_t __activelanemask_v4_b64_b1(unsigned int input) __HC__; - -/** - * Count the number of active work-items in the current - * wavefront that have a non-zero input. - * - * @param[in] input An unsigned 32-bit integer. - * @return The number of active work-items in the current wavefront that have - * a non-zero input. - */ -extern "C" inline unsigned int __activelanecount_u32_b1(unsigned int input) __HC__ { - return __popcount_u32_b64(__activelanemask_v4_b64_b1(input)); -} - -// ------------------------------------------------------------------------ -// Wavefront Vote Functions -// ------------------------------------------------------------------------ - -/** - * Evaluate predicate for all active work-items in the - * wavefront and return non-zero if and only if predicate evaluates to non-zero - * for any of them. - */ -extern "C" bool __ockl_wfany_i32(int) __HC__; -extern "C" inline int __any(int predicate) __HC__ { - return __ockl_wfany_i32(predicate); -} - -/** - * Evaluate predicate for all active work-items in the - * wavefront and return non-zero if and only if predicate evaluates to non-zero - * for all of them. - */ -extern "C" bool __ockl_wfall_i32(int) __HC__; -extern "C" inline int __all(int predicate) __HC__ { - return __ockl_wfall_i32(predicate); -} - -/** - * Evaluate predicate for all active work-items in the - * wavefront and return an integer whose Nth bit is set if and only if - * predicate evaluates to non-zero for the Nth work-item of the wavefront and - * the Nth work-item is active. - */ - -// XXX from llvm/include/llvm/IR/InstrTypes.h -#define ICMP_NE 33 -__attribute__((convergent)) -unsigned long long __llvm_amdgcn_icmp_i32(uint x, uint y, uint z) [[hc]] __asm("llvm.amdgcn.icmp.i32"); -extern "C" inline uint64_t __ballot(int predicate) __HC__ { - return __llvm_amdgcn_icmp_i32(predicate, 0, ICMP_NE); -} - -// ------------------------------------------------------------------------ -// Wavefront Shuffle Functions -// ------------------------------------------------------------------------ - -// utility union type -union __u { - int i; - unsigned int u; - float f; -}; +// utility union type +union __u { + int i; + unsigned int u; + float f; +}; /** @{ */ /** @@ -3085,8 +2136,6 @@ inline int __shfl(int var, int srcLane, int width=__HSA_WAVEFRONT_SIZE__) __HC__ return __amdgcn_ds_bpermute(index<<2, var); } -#endif - inline unsigned int __shfl(unsigned int var, int srcLane, int width=__HSA_WAVEFRONT_SIZE__) __HC__ { __u tmp; tmp.u = var; tmp.i = __shfl(tmp.i, srcLane, width); @@ -3100,6 +2149,8 @@ inline float __shfl(float var, int srcLane, int width=__HSA_WAVEFRONT_SIZE__) __ return tmp.f; } +#endif + // FIXME: support half type /** @} */ @@ -3134,8 +2185,6 @@ inline int __shfl_up(int var, const unsigned int delta, const int width=__HSA_WA return __amdgcn_ds_bpermute(index<<2, var); } -#endif - inline unsigned int __shfl_up(unsigned int var, const unsigned int delta, const int width=__HSA_WAVEFRONT_SIZE__) __HC__ { __u tmp; tmp.u = var; tmp.i = __shfl_up(tmp.i, delta, width); @@ -3148,6 +2197,8 @@ inline float __shfl_up(float var, const unsigned int delta, const int width=__HS return tmp.f; } +#endif + // FIXME: support half type /** @} */ @@ -3183,8 +2234,6 @@ inline int __shfl_down(int var, const unsigned int delta, const int width=__HSA_ return __amdgcn_ds_bpermute(index<<2, var); } -#endif - inline unsigned int __shfl_down(unsigned int var, const unsigned int delta, const int width=__HSA_WAVEFRONT_SIZE__) __HC__ { __u tmp; tmp.u = var; tmp.i = __shfl_down(tmp.i, delta, width); @@ -3197,6 +2246,7 @@ inline float __shfl_down(float var, const unsigned int delta, const int width=__ return tmp.f; } +#endif // FIXME: support half type /** @} */ @@ -3229,8 +2279,6 @@ inline int __shfl_xor(int var, int laneMask, int width=__HSA_WAVEFRONT_SIZE__) _ return __amdgcn_ds_bpermute(index<<2, var); } -#endif - inline float __shfl_xor(float var, int laneMask, int width=__HSA_WAVEFRONT_SIZE__) __HC__ { __u tmp; tmp.f = var; tmp.i = __shfl_xor(tmp.i, laneMask, width); @@ -3246,6 +2294,8 @@ inline unsigned int __shfl_xor(unsigned int var, int laneMask, int width=__HSA_W return tmp.u; } +#endif + /** * Multiply two unsigned integers (x,y) but only the lower 24 bits will be used in the multiplication. * @@ -3327,4616 +2377,4 @@ extern "C" void* get_group_segment_base_pointer() __HC__; * Fetch the address of the beginning of dynamic group segment. */ extern "C" void* get_dynamic_group_segment_base_pointer() __HC__; - -// ------------------------------------------------------------------------ -// utility class for tiled_barrier -// ------------------------------------------------------------------------ - -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 -template -void bar_wrapper(Ker *f, Ti *t) -{ - (*f)(*t); -} - -struct barrier_t { - std::unique_ptr ctx; - int idx; - barrier_t (int a) : - ctx(new ucontext_t[a + 1]) {} - template - void setctx(int x, char *stack, Ker& f, Ti* tidx, int S) { - getcontext(&ctx[x]); - ctx[x].uc_stack.ss_sp = stack; - ctx[x].uc_stack.ss_size = S; - ctx[x].uc_link = &ctx[x - 1]; - makecontext(&ctx[x], (void (*)(void))bar_wrapper, 2, &f, tidx); - } - void swap(int a, int b) { - swapcontext(&ctx[a], &ctx[b]); - } - void wait() __HC__ { - --idx; - swapcontext(&ctx[idx + 1], &ctx[idx]); - } -}; -#endif - - -// ------------------------------------------------------------------------ -// tiled_barrier -// ------------------------------------------------------------------------ - -/** - * The tile_barrier class is a capability class that is only creatable by the - * system, and passed to a tiled parallel_for_each function object as part of - * the tiled_index parameter. It provides member functions, such as wait, whose - * purpose is to synchronize execution of threads running within the thread - * tile. - */ -class tile_barrier { -public: -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - using pb_t = std::shared_ptr; - tile_barrier(pb_t pb) : pbar(pb) {} - - /** - * Copy constructor. Constructs a new tile_barrier from the supplied - * argument "other". - * - * @param[in] other An object of type tile_barrier from which to initialize - * this. - */ - tile_barrier(const tile_barrier& other) __CPU__ __HC__ : pbar(other.pbar) {} -#else - - /** - * Copy constructor. Constructs a new tile_barrier from the supplied - * argument "other". - * - * @param[in] other An object of type tile_barrier from which to initialize - * this. - */ - tile_barrier(const tile_barrier& other) __CPU__ __HC__ {} -#endif - - /** - * Blocks execution of all threads in the thread tile until all threads in - * the tile have reached this call. Establishes a memory fence on all - * tile_static and global memory operations executed by the threads in the - * tile such that all memory operations issued prior to hitting the barrier - * are visible to all other threads after the barrier has completed and - * none of the memory operations occurring after the barrier are executed - * before hitting the barrier. This is identical to - * wait_with_all_memory_fence(). - */ - void wait() const __HC__ { -#if __KALMAR_ACCELERATOR__ == 1 - wait_with_all_memory_fence(); -#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - pbar->wait(); -#endif - } - - /** - * Blocks execution of all threads in the thread tile until all threads in - * the tile have reached this call. Establishes a memory fence on all - * tile_static and global memory operations executed by the threads in the - * tile such that all memory operations issued prior to hitting the barrier - * are visible to all other threads after the barrier has completed and - * none of the memory operations occurring after the barrier are executed - * before hitting the barrier. This is identical to wait(). - */ - void wait_with_all_memory_fence() const __HC__ { -#if __KALMAR_ACCELERATOR__ == 1 - amp_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); -#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - pbar->wait(); -#endif - } - - /** - * Blocks execution of all threads in the thread tile until all threads in - * the tile have reached this call. Establishes a memory fence on global - * memory operations (but not tile-static memory operations) executed by - * the threads in the tile such that all global memory operations issued - * prior to hitting the barrier are visible to all other threads after the - * barrier has completed and none of the global memory operations occurring - * after the barrier are executed before hitting the barrier. - */ - void wait_with_global_memory_fence() const __HC__ { -#if __KALMAR_ACCELERATOR__ == 1 - amp_barrier(CLK_GLOBAL_MEM_FENCE); -#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - pbar->wait(); -#endif - } - - /** - * Blocks execution of all threads in the thread tile until all threads in - * the tile have reached this call. Establishes a memory fence on - * tile-static memory operations (but not global memory operations) - * executed by the threads in the tile such that all tile_static memory - * operations issued prior to hitting the barrier are visible to all other - * threads after the barrier has completed and none of the tile-static - * memory operations occurring after the barrier are executed before - * hitting the barrier. - */ - void wait_with_tile_static_memory_fence() const __HC__ { -#if __KALMAR_ACCELERATOR__ == 1 - amp_barrier(CLK_LOCAL_MEM_FENCE); -#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - pbar->wait(); -#endif - } - -private: -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - tile_barrier() __CPU__ __HC__ = default; - pb_t pbar; -#else - tile_barrier() __HC__ {} -#endif - - template friend - class tiled_index; -}; - -// ------------------------------------------------------------------------ -// other memory fences -// ------------------------------------------------------------------------ - -/** - * Establishes a thread-tile scoped memory fence for both global and - * tile-static memory operations. This function does not imply a barrier and - * is therefore permitted in divergent code. - */ -// FIXME: this functions has not been implemented. -void all_memory_fence(const tile_barrier&) __HC__; - -/** - * Establishes a thread-tile scoped memory fence for global (but not - * tile-static) memory operations. This function does not imply a barrier and - * is therefore permitted in divergent code. - */ -// FIXME: this functions has not been implemented. -void global_memory_fence(const tile_barrier&) __HC__; - -/** - * Establishes a thread-tile scoped memory fence for tile-static (but not - * global) memory operations. This function does not imply a barrier and is - * therefore permitted in divergent code. - */ -// FIXME: this functions has not been implemented. -void tile_static_memory_fence(const tile_barrier&) __HC__; - -// ------------------------------------------------------------------------ -// tiled_index -// ------------------------------------------------------------------------ - -/** - * Represents a set of related indices subdivided into 1-, 2-, or 3-dimensional - * tiles. - * - * @tparam N Tile dimension. - */ -template -class tiled_index { -public: - /** - * A static member of tiled_index that contains the rank of this tiled - * extent, and is either 1, 2, or 3 depending on the specialization used. - */ - static const int rank = 3; - - /** - * Copy constructor. Constructs a new tiled_index from the supplied - * argument "other". - * - * @param[in] other An object of type tiled_index from which to initialize - * this. - */ - tiled_index(const tiled_index& other) __CPU__ __HC__ : global(other.global), local(other.local), tile(other.tile), tile_origin(other.tile_origin), barrier(other.barrier), tile_dim(other.tile_dim) {} - - /** - * An index of rank 1, 2, or 3 that represents the global index within an - * extent. - */ - const index<3> global; - - /** - * An index of rank 1, 2, or 3 that represents the relative index within - * the current tile of a tiled extent. - */ - const index<3> local; - - /** - * An index of rank 1, 2, or 3 that represents the coordinates of the - * current tile of a tiled extent. - */ - const index<3> tile; - - /** - * An index of rank 1, 2, or 3 that represents the global coordinates of - * the origin of the current tile within a tiled extent. - */ - const index<3> tile_origin; - - /** - * An object which represents a barrier within the current tile of threads. - */ - const tile_barrier barrier; - - /** - * An index of rank 1, 2, 3 that represents the size of the tile. - */ - const index<3> tile_dim; - - /** - * Implicit conversion operator that converts a tiled_index into - * an index. The implicit conversion converts to the .global index - * member. - */ - operator const index<3>() const __CPU__ __HC__ { - return global; - } - - tiled_index(const index<3>& g) __CPU__ __HC__ : global(g) {} - -private: -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - __attribute__((always_inline)) tiled_index(int a0, int a1, int a2, int b0, int b1, int b2, int c0, int c1, int c2, tile_barrier& pb, int D0, int D1, int D2) __CPU__ __HC__ - : global(a2, a1, a0), local(b2, b1, b0), tile(c2, c1, c0), tile_origin(a2 - b2, a1 - b1, a0 - b0), barrier(pb), tile_dim(D0, D1, D2) {} -#endif - - __attribute__((annotate("__cxxamp_opencl_index"))) -#if __KALMAR_ACCELERATOR__ == 1 - __attribute__((always_inline)) tiled_index() __HC__ - : global(index<3>(amp_get_global_id(2), amp_get_global_id(1), amp_get_global_id(0))), - local(index<3>(amp_get_local_id(2), amp_get_local_id(1), amp_get_local_id(0))), - tile(index<3>(amp_get_group_id(2), amp_get_group_id(1), amp_get_group_id(0))), - tile_origin(index<3>(amp_get_global_id(2) - amp_get_local_id(2), - amp_get_global_id(1) - amp_get_local_id(1), - amp_get_global_id(0) - amp_get_local_id(0))), - tile_dim(index<3>(amp_get_local_size(2), amp_get_local_size(1), amp_get_local_size(0))) -#elif __KALMAR__ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - __attribute__((always_inline)) tiled_index() __CPU__ __HC__ -#else - __attribute__((always_inline)) tiled_index() __HC__ -#endif // __KALMAR_ACCELERATOR__ - {} - - template friend - completion_future parallel_for_each(const accelerator_view&, const tiled_extent&, const Kernel&); - -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - template friend - void partitioned_task_tile_3D(K const&, tiled_extent<3> const&, int); -#endif -}; - - -/** - * Represents a set of related indices subdivided into 1-, 2-, or 3-dimensional - * tiles. - * This class is 1D specialization of tiled_index. - */ -template<> -class tiled_index<1> { -public: - /** - * A static member of tiled_index that contains the rank of this tiled - * extent, and is either 1, 2, or 3 depending on the specialization used. - */ - static const int rank = 1; - - /** - * Copy constructor. Constructs a new tiled_index from the supplied - * argument "other". - * - * @param[in] other An object of type tiled_index from which to initialize - * this. - */ - tiled_index(const tiled_index& other) __CPU__ __HC__ : global(other.global), local(other.local), tile(other.tile), tile_origin(other.tile_origin), barrier(other.barrier), tile_dim(other.tile_dim) {} - - /** - * An index of rank 1, 2, or 3 that represents the global index within an - * extent. - */ - const index<1> global; - - /** - * An index of rank 1, 2, or 3 that represents the relative index within - * the current tile of a tiled extent. - */ - const index<1> local; - - /** - * An index of rank 1, 2, or 3 that represents the coordinates of the - * current tile of a tiled extent. - */ - const index<1> tile; - - /** - * An index of rank 1, 2, or 3 that represents the global coordinates of - * the origin of the current tile within a tiled extent. - */ - const index<1> tile_origin; - - /** - * An object which represents a barrier within the current tile of threads. - */ - const tile_barrier barrier; - - /** - * An index of rank 1, 2, 3 that represents the size of the tile. - */ - const index<1> tile_dim; - - /** - * Implicit conversion operator that converts a tiled_index into - * an index. The implicit conversion converts to the .global index - * member. - */ - operator const index<1>() const __CPU__ __HC__ { - return global; - } - - tiled_index(const index<1>& g) __CPU__ __HC__ : global(g) {} - -private: -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - __attribute__((always_inline)) tiled_index(int a, int b, int c, tile_barrier& pb, int D0) __CPU__ __HC__ - : global(a), local(b), tile(c), tile_origin(a - b), barrier(pb), tile_dim(D0) {} -#endif - - __attribute__((annotate("__cxxamp_opencl_index"))) -#if __KALMAR_ACCELERATOR__ == 1 - __attribute__((always_inline)) tiled_index() __HC__ - : global(index<1>(amp_get_global_id(0))), - local(index<1>(amp_get_local_id(0))), - tile(index<1>(amp_get_group_id(0))), - tile_origin(index<1>(amp_get_global_id(0) - amp_get_local_id(0))), - tile_dim(index<1>(amp_get_local_size(0))) -#elif __KALMAR__ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - __attribute__((always_inline)) tiled_index() __CPU__ __HC__ -#else - __attribute__((always_inline)) tiled_index() __HC__ -#endif // __KALMAR_ACCELERATOR__ - {} - - template friend - completion_future parallel_for_each(const accelerator_view&, const tiled_extent<1>&, const Kernel&); - -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - template friend - void partitioned_task_tile_1D(K const&, tiled_extent<1> const&, int); -#endif -}; - -/** - * Represents a set of related indices subdivided into 1-, 2-, or 3-dimensional - * tiles. - * This class is 2D specialization of tiled_index. - */ -template<> -class tiled_index<2> { -public: - /** - * A static member of tiled_index that contains the rank of this tiled - * extent, and is either 1, 2, or 3 depending on the specialization used. - */ - static const int rank = 2; - - /** - * Copy constructor. Constructs a new tiled_index from the supplied - * argument "other". - * - * @param[in] other An object of type tiled_index from which to initialize - * this. - */ - tiled_index(const tiled_index& other) __CPU__ __HC__ : global(other.global), local(other.local), tile(other.tile), tile_origin(other.tile_origin), barrier(other.barrier), tile_dim(other.tile_dim) {} - - /** - * An index of rank 1, 2, or 3 that represents the global index within an - * extent. - */ - const index<2> global; - - /** - * An index of rank 1, 2, or 3 that represents the relative index within - * the current tile of a tiled extent. - */ - const index<2> local; - - /** - * An index of rank 1, 2, or 3 that represents the coordinates of the - * current tile of a tiled extent. - */ - const index<2> tile; - - /** - * An index of rank 1, 2, or 3 that represents the global coordinates of - * the origin of the current tile within a tiled extent. - */ - const index<2> tile_origin; - - /** - * An object which represents a barrier within the current tile of threads. - */ - const tile_barrier barrier; - - /** - * An index of rank 1, 2, 3 that represents the size of the tile. - */ - const index<2> tile_dim; - - /** - * Implicit conversion operator that converts a tiled_index into - * an index. The implicit conversion converts to the .global index - * member. - */ - operator const index<2>() const __CPU__ __HC__ { - return global; - } - - tiled_index(const index<2>& g) __CPU__ __HC__ : global(g) {} - -private: -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - __attribute__((always_inline)) tiled_index(int a0, int a1, int b0, int b1, int c0, int c1, tile_barrier& pb, int D0, int D1) __CPU__ __HC__ - : global(a1, a0), local(b1, b0), tile(c1, c0), tile_origin(a1 - b1, a0 - b0), barrier(pb), tile_dim(D0, D1) {} -#endif - - __attribute__((annotate("__cxxamp_opencl_index"))) -#if __KALMAR_ACCELERATOR__ == 1 - __attribute__((always_inline)) tiled_index() __HC__ - : global(index<2>(amp_get_global_id(1), amp_get_global_id(0))), - local(index<2>(amp_get_local_id(1), amp_get_local_id(0))), - tile(index<2>(amp_get_group_id(1), amp_get_group_id(0))), - tile_origin(index<2>(amp_get_global_id(1) - amp_get_local_id(1), - amp_get_global_id(0) - amp_get_local_id(0))), - tile_dim(index<2>(amp_get_local_size(1), amp_get_local_size(0))) -#elif __KALMAR__ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - __attribute__((always_inline)) tiled_index() __CPU__ __HC__ -#else - __attribute__((always_inline)) tiled_index() __HC__ -#endif // __KALMAR_ACCELERATOR__ - {} - - template friend - completion_future parallel_for_each(const accelerator_view&, const tiled_extent<2>&, const Kernel&); - -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 - template friend - void partitioned_task_tile_2D(K const&, tiled_extent<2> const&, int); -#endif -}; - -#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2 -#define SSIZE 1024 * 10 -template -struct cpu_helper -{ - static inline void call(const Kernel& k, index& idx, const extent& ext) __CPU__ __HC__ { - int i; - for (i = 0; i < ext[N]; ++i) { - idx[N] = i; - cpu_helper::call(k, idx, ext); - } - } -}; -template -struct cpu_helper -{ - static inline void call(const Kernel& k, const index& idx, const extent& ext) __CPU__ __HC__ { - (const_cast(k))(idx); - } -}; - -template -void partitioned_task(const Kernel& ker, const extent& ext, int part) { - index idx; - int start = ext[0] * part / Kalmar::NTHREAD; - int end = ext[0] * (part + 1) / Kalmar::NTHREAD; - for (int i = start; i < end; i++) { - idx[0] = i; - cpu_helper<1, Kernel, N>::call(ker, idx, ext); - } -} - -template -void partitioned_task_tile_1D(Kernel const& f, tiled_extent<1> const& ext, int part) { - int D0 = ext.tile_dim[0]; - int start = (ext[0] / D0) * part / Kalmar::NTHREAD; - int end = (ext[0] / D0) * (part + 1) / Kalmar::NTHREAD; - int stride = end - start; - if (stride == 0) - return; - char *stk = new char[D0 * SSIZE]; - tiled_index<1> *tidx = new tiled_index<1>[D0]; - tile_barrier::pb_t hc_bar = std::make_shared(D0); - tile_barrier tbar(hc_bar); - for (int tx = start; tx < end; tx++) { - int id = 0; - char *sp = stk; - tiled_index<1> *tip = tidx; - for (int x = 0; x < D0; x++) { - new (tip) tiled_index<1>(tx * D0 + x, x, tx, tbar, D0); - hc_bar->setctx(++id, sp, f, tip, SSIZE); - sp += SSIZE; - ++tip; - } - hc_bar->idx = 0; - while (hc_bar->idx == 0) { - hc_bar->idx = id; - hc_bar->swap(0, id); - } - } - delete [] stk; - delete [] tidx; -} - -template -void partitioned_task_tile_2D(Kernel const& f, tiled_extent<2> const& ext, int part) { - int D0 = ext.tile_dim[0]; - int D1 = ext.tile_dim[1]; - int start = (ext[0] / D0) * part / Kalmar::NTHREAD; - int end = (ext[0] / D0) * (part + 1) / Kalmar::NTHREAD; - int stride = end - start; - if (stride == 0) - return; - char *stk = new char[D1 * D0 * SSIZE]; - tiled_index<2> *tidx = new tiled_index<2>[D0 * D1]; - tile_barrier::pb_t hc_bar = std::make_shared(D0 * D1); - tile_barrier tbar(hc_bar); - - for (int tx = 0; tx < ext[1] / D1; tx++) - for (int ty = start; ty < end; ty++) { - int id = 0; - char *sp = stk; - tiled_index<2> *tip = tidx; - for (int x = 0; x < D1; x++) - for (int y = 0; y < D0; y++) { - new (tip) tiled_index<2>(D1 * tx + x, D0 * ty + y, x, y, tx, ty, tbar, D0, D1); - hc_bar->setctx(++id, sp, f, tip, SSIZE); - ++tip; - sp += SSIZE; - } - hc_bar->idx = 0; - while (hc_bar->idx == 0) { - hc_bar->idx = id; - hc_bar->swap(0, id); - } - } - delete [] stk; - delete [] tidx; -} - -template -void partitioned_task_tile_3D(Kernel const& f, tiled_extent<3> const& ext, int part) { - int D0 = ext.tile_dim[0]; - int D1 = ext.tile_dim[1]; - int D2 = ext.tile_dim[2]; - int start = (ext[0] / D0) * part / Kalmar::NTHREAD; - int end = (ext[0] / D0) * (part + 1) / Kalmar::NTHREAD; - int stride = end - start; - if (stride == 0) - return; - char *stk = new char[D2 * D1 * D0 * SSIZE]; - tiled_index<3> *tidx = new tiled_index<3>[D0 * D1 * D2]; - tile_barrier::pb_t hc_bar = std::make_shared(D0 * D1 * D2); - tile_barrier tbar(hc_bar); - - for (int i = 0; i < ext[2] / D2; i++) - for (int j = 0; j < ext[1] / D1; j++) - for(int k = start; k < end; k++) { - int id = 0; - char *sp = stk; - tiled_index<3> *tip = tidx; - for (int x = 0; x < D2; x++) - for (int y = 0; y < D1; y++) - for (int z = 0; z < D0; z++) { - new (tip) tiled_index<3>(D2 * i + x, - D1 * j + y, - D0 * k + z, - x, y, z, i, j, k, tbar, D0, D1, D2); - hc_bar->setctx(++id, sp, f, tip, SSIZE); - ++tip; - sp += SSIZE; - } - hc_bar->idx = 0; - while (hc_bar->idx == 0) { - hc_bar->idx = id; - hc_bar->swap(0, id); - } - } - delete [] stk; - delete [] tidx; -} - -template -completion_future launch_cpu_task_async(const std::shared_ptr& pQueue, Kernel const& f, - extent const& compute_domain) -{ - Kalmar::CPUKernelRAII obj(pQueue, f); - for (int i = 0; i < Kalmar::NTHREAD; ++i) - obj[i] = std::thread(partitioned_task, std::cref(f), std::cref(compute_domain), i); - // FIXME wrap the above operation into the completion_future object - return completion_future(); -} - -template -completion_future launch_cpu_task_async(const std::shared_ptr& pQueue, Kernel const& f, - tiled_extent<1> const& compute_domain) -{ - Kalmar::CPUKernelRAII obj(pQueue, f); - for (int i = 0; i < Kalmar::NTHREAD; ++i) - obj[i] = std::thread(partitioned_task_tile_1D, - std::cref(f), std::cref(compute_domain), i); - // FIXME wrap the above operation into the completion_future object - return completion_future(); -} - -template -completion_future launch_cpu_task_async(const std::shared_ptr& pQueue, Kernel const& f, - tiled_extent<2> const& compute_domain) -{ - Kalmar::CPUKernelRAII obj(pQueue, f); - for (int i = 0; i < Kalmar::NTHREAD; ++i) - obj[i] = std::thread(partitioned_task_tile_2D, - std::cref(f), std::cref(compute_domain), i); - // FIXME wrap the above operation into the completion_future object - return completion_future(); -} - -template -completion_future launch_cpu_task_async(const std::shared_ptr& pQueue, Kernel const& f, - tiled_extent<3> const& compute_domain) -{ - Kalmar::CPUKernelRAII obj(pQueue, f); - for (int i = 0; i < Kalmar::NTHREAD; ++i) - obj[i] = std::thread(partitioned_task_tile_3D, - std::cref(f), std::cref(compute_domain), i); - // FIXME wrap the above operation into the completion_future object - return completion_future(); -} - -#endif - -// ------------------------------------------------------------------------ -// utility helper classes for array_view -// ------------------------------------------------------------------------ - -template -struct projection_helper -{ - // array_view, where N>1 - // array_view operator[](int i) const __CPU__ __HC__ - static_assert(N > 1, "projection_helper is only supported on array_view with a rank of 2 or higher"); - typedef array_view result_type; - static result_type project(array_view& now, int stride) __CPU__ __HC__ { - int ext[N - 1], i, idx[N - 1], ext_o[N - 1]; - for (i = N - 1; i > 0; --i) { - ext_o[i - 1] = now.extent[i]; - ext[i - 1] = now.extent_base[i]; - idx[i - 1] = now.index_base[i]; - } - stride += now.index_base[0]; - extent ext_now(ext_o); - extent ext_base(ext); - index idx_base(idx); - return result_type (now.cache, ext_now, ext_base, idx_base, - now.offset + ext_base.size() * stride); - } - static result_type project(const array_view& now, int stride) __CPU__ __HC__ { - int ext[N - 1], i, idx[N - 1], ext_o[N - 1]; - for (i = N - 1; i > 0; --i) { - ext_o[i - 1] = now.extent[i]; - ext[i - 1] = now.extent_base[i]; - idx[i - 1] = now.index_base[i]; - } - stride += now.index_base[0]; - extent ext_now(ext_o); - extent ext_base(ext); - index idx_base(idx); - return result_type (now.cache, ext_now, ext_base, idx_base, - now.offset + ext_base.size() * stride); - } -}; - -template -struct projection_helper -{ - // array_view - // T& operator[](int i) const __CPU__ __HC__; - typedef T& result_type; - static result_type project(array_view& now, int i) __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 - now.cache.get_cpu_access(true); -#endif - T *ptr = reinterpret_cast(now.cache.get() + i + now.offset + now.index_base[0]); - return *ptr; - } - static result_type project(const array_view& now, int i) __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 - now.cache.get_cpu_access(true); -#endif - T *ptr = reinterpret_cast(now.cache.get() + i + now.offset + now.index_base[0]); - return *ptr; - } -}; - -template -struct projection_helper -{ - // array_view, where N>1 - // array_view operator[](int i) const __CPU__ __HC__; - static_assert(N > 1, "projection_helper is only supported on array_view with a rank of 2 or higher"); - typedef array_view const_result_type; - static const_result_type project(array_view& now, int stride) __CPU__ __HC__ { - int ext[N - 1], i, idx[N - 1], ext_o[N - 1]; - for (i = N - 1; i > 0; --i) { - ext_o[i - 1] = now.extent[i]; - ext[i - 1] = now.extent_base[i]; - idx[i - 1] = now.index_base[i]; - } - stride += now.index_base[0]; - extent ext_now(ext_o); - extent ext_base(ext); - index idx_base(idx); - auto ret = const_result_type (now.cache, ext_now, ext_base, idx_base, - now.offset + ext_base.size() * stride); - return ret; - } - static const_result_type project(const array_view& now, int stride) __CPU__ __HC__ { - int ext[N - 1], i, idx[N - 1], ext_o[N - 1]; - for (i = N - 1; i > 0; --i) { - ext_o[i - 1] = now.extent[i]; - ext[i - 1] = now.extent_base[i]; - idx[i - 1] = now.index_base[i]; - } - stride += now.index_base[0]; - extent ext_now(ext_o); - extent ext_base(ext); - index idx_base(idx); - auto ret = const_result_type (now.cache, ext_now, ext_base, idx_base, - now.offset + ext_base.size() * stride); - return ret; - } -}; - -template -struct projection_helper -{ - // array_view - // const T& operator[](int i) const __CPU__ __HC__; - typedef const T& const_result_type; - static const_result_type project(array_view& now, int i) __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 - now.cache.get_cpu_access(); -#endif - const T *ptr = reinterpret_cast(now.cache.get() + i + now.offset + now.index_base[0]); - return *ptr; - } - static const_result_type project(const array_view& now, int i) __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 - now.cache.get_cpu_access(); -#endif - const T *ptr = reinterpret_cast(now.cache.get() + i + now.offset + now.index_base[0]); - return *ptr; - } -}; - -// ------------------------------------------------------------------------ -// utility helper classes for array_view -// ------------------------------------------------------------------------ - -template -struct __has_data -{ -private: - struct two {char __lx; char __lxx;}; - template static char test(decltype(std::declval().data())); - template static two test(...); -public: - static const bool value = sizeof(test(0)) == 1; -}; - -template -struct __has_size -{ -private: - struct two {char __lx; char __lxx;}; - template static char test(decltype(&C::size)); - template static two test(...); -public: - static const bool value = sizeof(test(0)) == 1; -}; - -template -struct __is_container -{ - using _T = typename std::remove_reference::type; - static const bool value = __has_size<_T>::value && __has_data<_T>::value; -}; - - -// ------------------------------------------------------------------------ -// utility helper classes for array -// ------------------------------------------------------------------------ - -template -struct array_projection_helper -{ - // array, where N>1 - // array_view operator[](int i0) __CPU__ __HC__; - // array_view operator[](int i0) const __CPU__ __HC__; - static_assert(N > 1, "projection_helper is only supported on array with a rank of 2 or higher"); - typedef array_view result_type; - typedef array_view const_result_type; - static result_type project(array& now, int stride) __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 - if( stride < 0) - throw runtime_exception("errorMsg_throw", 0); -#endif - int comp[N - 1], i; - for (i = N - 1; i > 0; --i) - comp[i - 1] = now.extent[i]; - extent ext(comp); - int offset = ext.size() * stride; -#if __KALMAR_ACCELERATOR__ != 1 - if( offset >= now.extent.size()) - throw runtime_exception("errorMsg_throw", 0); -#endif - return result_type(now.m_device, ext, ext, index(), offset); - } - static const_result_type project(const array& now, int stride) __CPU__ __HC__ { - int comp[N - 1], i; - for (i = N - 1; i > 0; --i) - comp[i - 1] = now.extent[i]; - extent ext(comp); - int offset = ext.size() * stride; - return const_result_type(now.m_device, ext, ext, index(), offset); - } -}; - -template -struct array_projection_helper -{ - // array - // T& operator[](int i0) __CPU__ __HC__; - // const T& operator[](int i0) const __CPU__ __HC__; - typedef T& result_type; - typedef const T& const_result_type; - static result_type project(array& now, int i) __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 - now.m_device.synchronize(true); -#endif - T *ptr = reinterpret_cast(now.m_device.get() + i); - return *ptr; - } - static const_result_type project(const array& now, int i) __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 - now.m_device.synchronize(); -#endif - const T *ptr = reinterpret_cast(now.m_device.get() + i); - return *ptr; - } -}; - -template -const extent& check(const extent& ext) -{ -#if __KALMAR_ACCELERATOR__ != 1 - for (int i = 0; i < N; i++) - { - if(ext[i] <=0) - throw runtime_exception("errorMsg_throw", 0); - } -#endif - return ext; -} - -// ------------------------------------------------------------------------ -// forward declarations of copy routines used by array / array_view -// ------------------------------------------------------------------------ - -template -void copy(const array_view& src, const array_view& dest); - -template -void copy(const array_view& src, const array_view& dest); - -template -void copy(const array& src, const array_view& dest); - -template -void copy(const array& src, array& dest); - -template -void copy(const array_view& src, array& dest); - -template -void copy(const array_view& src, array& dest); - -template -void copy(InputIter srcBegin, InputIter srcEnd, const array_view& dest); - -template -void copy(InputIter srcBegin, InputIter srcEnd, array& dest); - -template -void copy(InputIter srcBegin, const array_view& dest); - -template -void copy(InputIter srcBegin, array& dest); - -template -void copy(const array_view &src, OutputIter destBegin); - -template -void copy(const array &src, OutputIter destBegin); - -// ------------------------------------------------------------------------ -// array -// ------------------------------------------------------------------------ - -/** - * Represents an N-dimensional region of memory (with type T) located on an - * accelerator. - * - * @tparam T The element type of this array - * @tparam N The dimensionality of the array, defaults to 1 if elided. - */ -template -class array { - static_assert(!std::is_const::value, "array is not supported"); -public: -#if __KALMAR_ACCELERATOR__ == 1 - typedef Kalmar::_data acc_buffer_t; -#else - typedef Kalmar::_data_host acc_buffer_t; -#endif - - /** - * The rank of this array. - */ - static const int rank = N; - - /** - * The element type of this array. - */ - typedef T value_type; - - /** - * There is no default constructor for array. - */ - array() = delete; - - /** - * Copy constructor. Constructs a new array from the supplied argument - * other. The new array is located on the same accelerator_view as the - * source array. A deep copy is performed. - * - * @param[in] other An object of type array from which to initialize - * this new array. - */ - array(const array& other) - : array(other.get_extent(), other.get_accelerator_view()) - { copy(other, *this); } - - /** - * Move constructor. Constructs a new array by moving from the - * supplied argument other. - * - * @param[in] other An object of type array from which to initialize - * this new array. - */ - array(array&& other) - : m_device(other.m_device), extent(other.extent) - { other.m_device.reset(); } - - /** - * Constructs a new array with the supplied extent, located on the default - * view of the default accelerator. If any components of the extent are - * non-positive, an exception will be thrown. - * - * @param[in] ext The extent in each dimension of this array. - */ - explicit array(const extent& ext) - : array(ext, accelerator(L"default").get_default_view()) {} - - /** @{ */ - /** - * Equivalent to construction using "array(extent(e0 [, e1 [, e2 ]]))". - * - * @param[in] e0,e1,e2 The component values that will form the extent of - * this array. - */ - explicit array(int e0) - : array(hc::extent(e0)) { static_assert(N == 1, "illegal"); } - explicit array(int e0, int e1) - : array(hc::extent(e0, e1)) {} - explicit array(int e0, int e1, int e2) - : array(hc::extent(e0, e1, e2)) {} - - /** @} */ - - /** @{ */ - /** - * Constructs a new array with the supplied extent, located on the default - * accelerator, initialized with the contents of a source container - * specified by a beginning and optional ending iterator. The source data - * is copied by value into this array as if by calling "copy()". - * - * If the number of available container elements is less than - * this->extent.size(), undefined behavior results. - * - * @param[in] ext The extent in each dimension of this array. - * @param[in] srcBegin A beginning iterator into the source container. - * @param[in] srcEnd An ending iterator into the source container. - */ - template - array(const extent& ext, InputIter srcBegin) - : array(ext, srcBegin, accelerator(L"default").get_default_view()) {} - template - array(const extent& ext, InputIter srcBegin, InputIter srcEnd) - : array(ext, srcBegin, srcEnd, accelerator(L"default").get_default_view()) {} - - /** @} */ - - /** @{ */ - /** - * Equivalent to construction using - * "array(extent(e0 [, e1 [, e2 ]]), src)". - * - * @param[in] e0,e1,e2 The component values that will form the extent of - * this array. - * @param[in] srcBegin A beginning iterator into the source container. - * @param[in] srcEnd An ending iterator into the source container. - */ - template - array(int e0, InputIter srcBegin) - : array(hc::extent(e0), srcBegin) {} - template - array(int e0, InputIter srcBegin, InputIter srcEnd) - : array(hc::extent(e0), srcBegin, srcEnd) {} - template - array(int e0, int e1, InputIter srcBegin) - : array(hc::extent(e0, e1), srcBegin) {} - template - array(int e0, int e1, InputIter srcBegin, InputIter srcEnd) - : array(hc::extent(e0, e1), srcBegin, srcEnd) {} - template - array(int e0, int e1, int e2, InputIter srcBegin) - : array(hc::extent(e0, e1, e2), srcBegin) {} - template - array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd) - : array(hc::extent(e0, e1, e2), srcBegin, srcEnd) {} - - /** @} */ - - /** - * Constructs a new array, located on the default view of the default - * accelerator, initialized with the contents of the array_view "src". The - * extent of this array is taken from the extent of the source array_view. - * The "src" is copied by value into this array as if by calling - * "copy(src, *this)". - * - * @param[in] src An array_view object from which to copy the data into - * this array (and also to determine the extent of this - * array). - */ - explicit array(const array_view& src) - : array(src.get_extent(), accelerator(L"default").get_default_view()) - { copy(src, *this); } - - /** - * Constructs a new array with the supplied extent, located on the - * accelerator bound to the accelerator_view "av". - * - * Users can optionally specify the type of CPU access desired for "this" - * array thus requesting creation of an array that is accessible both on - * the specified accelerator_view "av" as well as the CPU (with the - * specified CPU access_type). If a value other than access_type_auto or - * access_type_none is specified for the cpu_access_type parameter and the - * accelerator corresponding to the accelerator_view "av" does not support - * cpu_shared_memory, a runtime_exception is thrown. The cpu_access_type - * parameter has a default value of access_type_auto which leaves it up to - * the implementation to decide what type of allowed CPU access should the - * array be created with. The actual CPU access_type allowed for the - * created array can be queried using the get_cpu_access_type member - * method. - * - * @param[in] ext The extent in each dimension of this array. - * @param[in] av An accelerator_view object which specifies the location of - * this array. - * @param[in] access_type The type of CPU access desired for this array. - */ - array(const extent& ext, accelerator_view av, access_type cpu_access_type = access_type_auto) -#if __KALMAR_ACCELERATOR__ == 1 - : m_device(ext.size()), extent(ext) {} -#else - : m_device(av.pQueue, av.pQueue, check(ext).size(), cpu_access_type), extent(ext) {} -#endif - - /** @{ */ - /** - * Constructs an array instance based on the given pointer on the device memory. - */ - explicit array(int e0, void* accelerator_pointer) - : array(hc::extent(e0), accelerator(L"default").get_default_view(), accelerator_pointer) {} - explicit array(int e0, int e1, void* accelerator_pointer) - : array(hc::extent(e0, e1), accelerator(L"default").get_default_view(), accelerator_pointer) {} - explicit array(int e0, int e1, int e2, void* accelerator_pointer) - : array(hc::extent(e0, e1, e2), accelerator(L"default").get_default_view(), accelerator_pointer) {} - - explicit array(const extent& ext, void* accelerator_pointer) - : array(ext, accelerator(L"default").get_default_view(), accelerator_pointer) {} - /** @} */ - - /** - * Constructs an array instance based on the given pointer on the device memory. - * - * @param[in] ext The extent in each dimension of this array. - * @param[in] av An accelerator_view object which specifies the location of - * this array. - * @param[in] accelerator_pointer The pointer to the device memory. - * @param[in] access_type The type of CPU access desired for this array. - */ - explicit array(const extent& ext, accelerator_view av, void* accelerator_pointer, access_type cpu_access_type = access_type_auto) -#if __KALMAR_ACCELERATOR__ == 1 - : m_device(ext.size(), accelerator_pointer), extent(ext) {} -#else - : m_device(av.pQueue, av.pQueue, check(ext).size(), accelerator_pointer, cpu_access_type), extent(ext) {} -#endif - - /** @{ */ - /** - * Equivalent to construction using - * "array(extent(e0 [, e1 [, e2 ]]), av, cpu_access_type)". - * - * @param[in] e0,e1,e2 The component values that will form the extent of - * this array. - * @param[in] av An accelerator_view object which specifies the location of - * this array. - * @param[in] access_type The type of CPU access desired for this array. - */ - array(int e0, accelerator_view av, access_type cpu_access_type = access_type_auto) - : array(hc::extent(e0), av, cpu_access_type) {} - array(int e0, int e1, accelerator_view av, access_type cpu_access_type = access_type_auto) - : array(hc::extent(e0, e1), av, cpu_access_type) {} - array(int e0, int e1, int e2, accelerator_view av, access_type cpu_access_type = access_type_auto) - : array(hc::extent(e0, e1, e2), av, cpu_access_type) {} - - /** @} */ - - /** - * Constructs a new array with the supplied extent, located on the - * accelerator bound to the accelerator_view "av", initialized with the - * contents of the source container specified by a beginning and optional - * ending iterator. The data is copied by value into this array as if by - * calling "copy()". - * - * Users can optionally specify the type of CPU access desired for "this" - * array thus requesting creation of an array that is accessible both on - * the specified accelerator_view "av" as well as the CPU (with the - * specified CPU access_type). If a value other than access_type_auto or - * access_type_none is specified for the cpu_access_type parameter and the - * accelerator corresponding to the accelerator_view "av" does not support - * cpu_shared_memory, a runtime_exception is thrown. The cpu_access_type - * parameter has a default value of access_type_auto which leaves it upto - * the implementation to decide what type of allowed CPU access should the - * array be created with. The actual CPU access_type allowed for the - * created array can be queried using the get_cpu_access_type member - * method. - * - * @param[in] ext The extent in each dimension of this array. - * @param[in] srcBegin A beginning iterator into the source container. - * @param[in] srcEnd An ending iterator into the source container. - * @param[in] av An accelerator_view object which specifies the home - * location of this array. - * @param[in] access_type The type of CPU access desired for this array. - */ - template - array(const extent& ext, InputIter srcBegin, accelerator_view av, - access_type cpu_access_type = access_type_auto) - : array(ext, av, cpu_access_type) { copy(srcBegin, *this); } - template - array(const extent& ext, InputIter srcBegin, InputIter srcEnd, - accelerator_view av, access_type cpu_access_type = access_type_auto) - : array(ext, av, cpu_access_type) { - if (ext.size() < std::distance(srcBegin, srcEnd)) - throw runtime_exception("errorMsg_throw", 0); - copy(srcBegin, srcEnd, *this); - } - - /** @} */ - - /** - * Constructs a new array initialized with the contents of the array_view - * "src". The extent of this array is taken from the extent of the source - * array_view. The "src" is copied by value into this array as if by - * calling "copy(src, *this)". The new array is located on the accelerator - * bound to the accelerator_view "av". - * - * Users can optionally specify the type of CPU access desired for "this" - * array thus requesting creation of an array that is accessible both on - * the specified accelerator_view "av" as well as the CPU (with the - * specified CPU access_type). If a value other than access_type_auto or - * access_type_none is specified for the cpu_access_type parameter and the - * accelerator corresponding to the accelerator_view “av†does not support - * cpu_shared_memory, a runtime_exception is thrown. The cpu_access_type - * parameter has a default value of access_type_auto which leaves it upto - * the implementation to decide what type of allowed CPU access should the - * array be created with. The actual CPU access_type allowed for the - * created array can be queried using the get_cpu_access_type member - * method. - * - * @param[in] src An array_view object from which to copy the data into - * this array (and also to determine the extent of this array). - * @param[in] av An accelerator_view object which specifies the home - * location of this array. - * @param[in] access_type The type of CPU access desired for this array. - */ - array(const array_view& src, accelerator_view av, access_type cpu_access_type = access_type_auto) - : array(src.get_extent(), av, cpu_access_type) { copy(src, *this); } - - /** @{ */ - /** - * Equivalent to construction using - * "array(extent(e0 [, e1 [, e2 ]]), srcBegin [, srcEnd], av, cpu_access_type)". - * - * @param[in] e0,e1,e2 The component values that will form the extent of - * this array. - * @param[in] srcBegin A beginning iterator into the source container. - * @param[in] srcEnd An ending iterator into the source container. - * @param[in] av An accelerator_view object which specifies the home - * location of this array. - * @param[in] access_type The type of CPU access desired for this array. - */ - template - array(int e0, InputIter srcBegin, accelerator_view av, access_type cpu_access_type = access_type_auto) - : array(extent(e0), srcBegin, av, cpu_access_type) {} - template - array(int e0, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type = access_type_auto) - : array(extent(e0), srcBegin, srcEnd, av, cpu_access_type) {} - template - array(int e0, int e1, InputIter srcBegin, accelerator_view av, access_type cpu_access_type = access_type_auto) - : array(hc::extent(e0, e1), srcBegin, av, cpu_access_type) {} - template - array(int e0, int e1, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type = access_type_auto) - : array(hc::extent(e0, e1), srcBegin, srcEnd, av, cpu_access_type) {} - template - array(int e0, int e1, int e2, InputIter srcBegin, accelerator_view av, access_type cpu_access_type = access_type_auto) - : array(hc::extent(e0, e1, e2), srcBegin, av, cpu_access_type) {} - template - array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type = access_type_auto) - : array(hc::extent(e0, e1, e2), srcBegin, srcEnd, av, cpu_access_type) {} - - /** @} */ - - /** - * Constructs a staging array with the given extent, which acts as a - * staging area between accelerator views "av" and "associated_av". If "av" - * is a cpu accelerator view, this will construct a staging array which is - * optimized for data transfers between the CPU and "associated_av". - * - * @param[in] ext The extent in each dimension of this array. - * @param[in] av An accelerator_view object which specifies the home - * location of this array. - * @param[in] associated_av An accelerator_view object which specifies a - * target device accelerator. - */ - array(const extent& ext, accelerator_view av, accelerator_view associated_av) -#if __KALMAR_ACCELERATOR__ == 1 - : m_device(ext.size()), extent(ext) {} -#else - : m_device(av.pQueue, associated_av.pQueue, check(ext).size(), access_type_auto), extent(ext) {} -#endif - - /** @{ */ - /** - * Equivalent to construction using - * "array(extent(e0 [, e1 [, e2 ]]), av, associated_av)". - * - * @param[in] e0,e1,e2 The component values that will form the extent of - * this array. - * @param[in] av An accelerator_view object which specifies the home - * location of this array. - * @param[in] associated_av An accelerator_view object which specifies a - * target device accelerator. - */ - array(int e0, accelerator_view av, accelerator_view associated_av) - : array(hc::extent(e0), av, associated_av) {} - array(int e0, int e1, accelerator_view av, accelerator_view associated_av) - : array(hc::extent(e0, e1), av, associated_av) {} - array(int e0, int e1, int e2, accelerator_view av, accelerator_view associated_av) - : array(hc::extent(e0, e1, e2), av, associated_av) {} - - /** @} */ - - /** @{ */ - /** - * Constructs a staging array with the given extent, which acts as a - * staging area between accelerator_views "av" (which must be the CPU - * accelerator) and "associated_av". The staging array will be initialized - * with the data specified by "src" as if by calling "copy(src, *this)". - * - * @param[in] ext The extent in each dimension of this array. - * @param[in] srcBegin A beginning iterator into the source container. - * @param[in] srcEnd An ending iterator into the source container. - * @param[in] av An accelerator_view object which specifies the home - * location of this array. - * @param[in] associated_av An accelerator_view object which specifies a - * target device accelerator. - */ - template - array(const extent& ext, InputIter srcBegin, accelerator_view av, accelerator_view associated_av) - : array(ext, av, associated_av) { copy(srcBegin, *this); } - template - array(const extent& ext, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av) - : array(ext, av, associated_av) { - if (ext.size() < std::distance(srcBegin, srcEnd)) - throw runtime_exception("errorMsg_throw", 0); - copy(srcBegin, srcEnd, *this); - } - - /** @} */ - - /** - * Constructs a staging array initialized with the array_view given by - * "src", which acts as a staging area between accelerator_views "av" - * (which must be the CPU accelerator) and "associated_av". The extent of - * this array is taken from the extent of the source array_view. The - * staging array will be initialized from "src" as if by calling - * "copy(src, *this)". - * - * @param[in] src An array_view object from which to copy the data into - * this array (and also to determine the extent of this - * array). - * @param[in] av An accelerator_view object which specifies the home - * location of this array. - * @param[in] associated_av An accelerator_view object which specifies a - * target device accelerator. - */ - array(const array_view& src, accelerator_view av, accelerator_view associated_av) - : array(src.get_extent(), av, associated_av) - { copy(src, *this); } - - /** @{ */ - /** - * Equivalent to construction using - * "array(extent(e0 [, e1 [, e2 ]]), src, av, associated_av)". - * - * @param[in] e0,e1,e2 The component values that will form the extent of - * this array. - * @param[in] srcBegin A beginning iterator into the source container. - * @param[in] srcEnd An ending iterator into the source container. - * @param[in] av An accelerator_view object which specifies the home - * location of this array. - * @param[in] associated_av An accelerator_view object which specifies a - * target device accelerator. - */ - template - array(int e0, InputIter srcBegin, accelerator_view av, accelerator_view associated_av) - : array(extent(e0), srcBegin, av, associated_av) {} - template - array(int e0, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av) - : array(extent(e0), srcBegin, srcEnd, av, associated_av) {} - template - array(int e0, int e1, InputIter srcBegin, accelerator_view av, accelerator_view associated_av) - : array(hc::extent(e0, e1), srcBegin, av, associated_av) {} - template - array(int e0, int e1, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av) - : array(hc::extent(e0, e1), srcBegin, srcEnd, av, associated_av) {} - template - array(int e0, int e1, int e2, InputIter srcBegin, accelerator_view av, accelerator_view associated_av) - : array(hc::extent(e0, e1, e2), srcBegin, av, associated_av) {} - template - array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av) - : array(hc::extent(e0, e1, e2), srcBegin, srcEnd, av, associated_av) {} - - /** @} */ - - /** - * Access the extent that defines the shape of this array. - */ - extent get_extent() const __CPU__ __HC__ { return extent; } - - /** - * This property returns the accelerator_view representing the location - * where this array has been allocated. - */ - accelerator_view get_accelerator_view() const { return m_device.get_av(); } - - /** - * This property returns the accelerator_view representing the preferred - * target where this array can be copied. - */ - accelerator_view get_associated_accelerator_view() const { return m_device.get_stage(); } - - /** - * This property returns the CPU "access_type" allowed for this array. - */ - access_type get_cpu_access_type() const { return m_device.get_access(); } - - /** - * Assigns the contents of the array "other" to this array, using a deep - * copy. - * - * @param[in] other An object of type array from which to copy into - * this array. - * @return Returns *this. - */ - array& operator=(const array& other) { - if (this != &other) { - array arr(other); - *this = std::move(arr); - } - return *this; - } - - /** - * Moves the contents of the array "other" to this array. - * - * @param[in] other An object of type array from which to move into - * this array. - * @return Returns *this. - */ - array& operator=(array&& other) { - if (this != &other) { - extent = other.extent; - m_device = other.m_device; - other.m_device.reset(); - } - return *this; - } - - /** - * Assigns the contents of the array_view "src", as if by calling - * "copy(src, *this)". - * - * @param[in] src An object of type array_view from which to copy into - * this array. - * @return Returns *this. - */ - array& operator=(const array_view& src) { - array arr(src); - *this = std::move(arr); - return *this; - } - - /** - * Copies the contents of this array to the array given by "dest", as - * if by calling "copy(*this, dest)". - * - * @param[out] dest An object of type array to which to copy data - * from this array. - */ - void copy_to(array& dest) const { -#if __KALMAR_ACCELERATOR__ != 1 - for(int i = 0 ; i < N ; i++) - { - if (dest.extent[i] < this->extent[i] ) - throw runtime_exception("errorMsg_throw", 0); - } -#endif - copy(*this, dest); - } - - /** - * Copies the contents of this array to the array_view given by "dest", as - * if by calling "copy(*this, dest)". - * - * @param[out] dest An object of type array_view to which to copy data - * from this array. - */ - void copy_to(const array_view& dest) const { copy(*this, dest); } - - /** - * Returns a pointer to the raw data underlying this array. - * - * @return A (const) pointer to the first element in the linearized array. - */ - T* data() const __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 - if (!m_device.get()) - return nullptr; - m_device.synchronize(true); -#endif - return reinterpret_cast(m_device.get()); - } - - /** - * Returns a pointer to the device memory underlying this array. - * - * @return A (const) pointer to the first element in the array on the - * device memory. - */ - T* accelerator_pointer() const __CPU__ __HC__ { - return reinterpret_cast(m_device.get_device_pointer()); - } - - /** - * Implicitly converts an array to a std::vector, as if by - * "copy(*this, vector)". - * - * @return An object of type vector which contains a copy of the data - * contained on the array. - */ - operator std::vector() const { - std::vector vec(extent.size()); - copy(*this, vec.data()); - return std::move(vec); - } - - /** @{ */ - /** - * Returns a reference to the element of this array that is at the location - * in N-dimensional space specified by "idx". Accessing array data on a - * location where it is not resident (e.g. from the CPU when it is resident - * on a GPU) results in an exception (in cpu context) or - * undefined behavior (in GPU context). - * - * @param[in] idx An object of type index from that specifies the - * location of the element. - */ - T& operator[](const index& idx) __CPU__ __HC__ { -#ifndef __KALMAR_ACCELERATOR__ - if (!m_device.get()) - throw runtime_exception("The array is not accessible on CPU.", 0); - m_device.synchronize(true); -#endif - T *ptr = reinterpret_cast(m_device.get()); - return ptr[Kalmar::amp_helper, hc::extent>::flatten(idx, extent)]; - } - T& operator()(const index& idx) __CPU__ __HC__ { - return (*this)[idx]; - } - - /** @} */ - - /** @{ */ - /** - * Returns a const reference to the element of this array that is at the - * location in N-dimensional space specified by "idx". Accessing array data - * on a location where it is not resident (e.g. from the CPU when it is - * resident on a GPU) results in an exception (in cpu context) - * or undefined behavior (in GPU context). - * - * @param[in] idx An object of type index from that specifies the - * location of the element. - */ - const T& operator[](const index& idx) const __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 - if (!m_device.get()) - throw runtime_exception("The array is not accessible on CPU.", 0); - m_device.synchronize(); -#endif - T *ptr = reinterpret_cast(m_device.get()); - return ptr[Kalmar::amp_helper, hc::extent>::flatten(idx, extent)]; - } - const T& operator()(const index& idx) const __CPU__ __HC__ { - return (*this)[idx]; - } - - /** @} */ - - /** @{ */ - /** - * Equivalent to - * "array::operator()(index(i0 [, i1 [, i2 ]]))". - * - * @param[in] i0,i1,i2 The component values that will form the index into - * this array. - */ - T& operator()(int i0, int i1) __CPU__ __HC__ { - return (*this)[index<2>(i0, i1)]; - } - T& operator()(int i0, int i1, int i2) __CPU__ __HC__ { - return (*this)[index<3>(i0, i1, i2)]; - } - - /** @} */ - - /** @{ */ - /** - * Equivalent to - * "array::operator()(index(i0 [, i1 [, i2 ]])) const". - * - * @param[in] i0,i1,i2 The component values that will form the index into - * this array. - */ - const T& operator()(int i0, int i1) const __CPU__ __HC__ { - return (*this)[index<2>(i0, i1)]; - } - const T& operator()(int i0, int i1, int i2) const __CPU__ __HC__ { - return (*this)[index<3>(i0, i1, i2)]; - } - - /** @{ */ - /** - * This overload is defined for array where @f$N \ge 2@f$. - * This mode of indexing is equivalent to projecting on the - * most-significant dimension. It allows C-style indexing. For example: - * - * @code{.cpp} - * array myArray(myExtents, …); - * myArray[index<4>(5,4,3,2)] = 7; - * assert(myArray[5][4][3][2] == 7); - * @endcode - * - * @param i0 An integer that is the index into the most-significant - * dimension of this array. - * @return Returns an array_view whose dimension is one lower than that of - * this array. - */ - typename array_projection_helper::result_type - operator[] (int i) __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 - if (!m_device.get()) - throw runtime_exception("The array is not accessible on CPU.", 0); - m_device.synchronize(); -#endif - return array_projection_helper::project(*this, i); - } - typename array_projection_helper::result_type - operator()(int i0) __CPU__ __HC__ { - return (*this)[i0]; - } - typename array_projection_helper::const_result_type - operator[] (int i) const __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 - if (!m_device.get()) - throw runtime_exception("The array is not accessible on CPU.", 0); - m_device.synchronize(); -#endif - return array_projection_helper::project(*this, i); - } - typename array_projection_helper::const_result_type - operator()(int i0) const __CPU__ __HC__ { - return (*this)[i0]; - } - - /** @} */ - - /** @{ */ - /** - * Returns a subsection of the source array view at the origin specified by - * "idx" and with the extent specified by "ext". - * - * Example: - * @code{.cpp} - * array a(extent<2>(200,100)); - * array_view v1(a); // v1.extent = <200,100> - * array_view v2 = v1.section(index<2>(15,25), extent<2>(40,50)); - * assert(v2(0,0) == v1(15,25)); - * @endcode - * - * @param[in] origin Provides the offset/origin of the resulting section. - * @param[in] ext Provides the extent of the resulting section. - * @return Returns a subsection of the source array at specified origin, - * and with the specified extent. - */ - array_view section(const index& origin, const extent& ext) __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 - if ( !Kalmar::amp_helper, hc::extent>::contains(origin, ext ,this->extent) ) - throw runtime_exception("errorMsg_throw", 0); -#endif - array_view av(*this); - return av.section(origin, ext); - } - array_view section(const index& origin, const extent& ext) const __CPU__ __HC__ { - array_view av(*this); - return av.section(origin, ext); - } - - /** @} */ - - /** @{ */ - /** - * Equivalent to "section(idx, this->extent – idx)". - */ - array_view section(const index& idx) __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 - if ( !Kalmar::amp_helper, hc::extent>::contains(idx, this->extent ) ) - throw runtime_exception("errorMsg_throw", 0); -#endif - array_view av(*this); - return av.section(idx); - } - array_view section(const index& idx) const __CPU__ __HC__ { - array_view av(*this); - return av.section(idx); - } - - /** @} */ - - /** @{ */ - /** - * Equivalent to "section(index(), ext)". - */ - array_view section(const extent& ext) __CPU__ __HC__ { - array_view av(*this); - return av.section(ext); - } - array_view section(const extent& ext) const __CPU__ __HC__ { - array_view av(*this); - return av.section(ext); - } - - /** @} */ - - /** @{ */ - /** - * Equivalent to - * "array::section(index(i0 [, i1 [, i2 ]]), extent(e0 [, e1 [, e2 ]])) const". - * - * @param[in] i0,i1,i2 The component values that will form the origin of - * the section - * @param[in] e0,e1,e2 The component values that will form the extent of - * the section - */ - array_view section(int i0, int e0) __CPU__ __HC__ { - static_assert(N == 1, "Rank must be 1"); - return section(index<1>(i0), hc::extent<1>(e0)); - } - array_view section(int i0, int e0) const __CPU__ __HC__ { - static_assert(N == 1, "Rank must be 1"); - return section(index<1>(i0), hc::extent<1>(e0)); - } - array_view section(int i0, int i1, int e0, int e1) const __CPU__ __HC__ { - static_assert(N == 2, "Rank must be 2"); - return section(index<2>(i0, i1), hc::extent<2>(e0, e1)); - } - array_view section(int i0, int i1, int e0, int e1) __CPU__ __HC__ { - static_assert(N == 2, "Rank must be 2"); - return section(index<2>(i0, i1), hc::extent<2>(e0, e1)); - } - array_view section(int i0, int i1, int i2, int e0, int e1, int e2) __CPU__ __HC__ { - static_assert(N == 3, "Rank must be 3"); - return section(index<3>(i0, i1, i2), hc::extent<3>(e0, e1, e2)); - } - array_view section(int i0, int i1, int i2, int e0, int e1, int e2) const __CPU__ __HC__ { - static_assert(N == 3, "Rank must be 3"); - return section(index<3>(i0, i1, i2), hc::extent<3>(e0, e1, e2)); - } - - /** @} */ - - /** @{ */ - /** - * Sometimes it is desirable to view the data of an N-dimensional array as - * a linear array, possibly with a (unsafe) reinterpretation of the element - * type. This can be achieved through the reinterpret_as member function. - * Example: - * - * @code{.cpp} - * struct RGB { float r; float g; float b; }; - * array a = ...; - * array_view v = a.reinterpret_as(); - * assert(v.extent == 3*a.extent); - * @endcode - * - * The size of the reinterpreted ElementType must evenly divide into the - * total size of this array. - * - * @return Returns an array_view from this array with the element type - * reinterpreted from T to ElementType, and the rank reduced from N - * to 1. - */ - template - array_view reinterpret_as() __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 - static_assert( ! (std::is_pointer::value ),"can't use pointer in the kernel"); - static_assert( ! (std::is_same::value ),"can't use short in the kernel"); - if( (extent.size() * sizeof(T)) % sizeof(ElementType)) - throw runtime_exception("errorMsg_throw", 0); -#endif - int size = extent.size() * sizeof(T) / sizeof(ElementType); - using buffer_type = typename array_view::acc_buffer_t; - array_view av(buffer_type(m_device), extent<1>(size), 0); - return av; - } - template - array_view reinterpret_as() const __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 - static_assert( ! (std::is_pointer::value ),"can't use pointer in the kernel"); - static_assert( ! (std::is_same::value ),"can't use short in the kernel"); -#endif - int size = extent.size() * sizeof(T) / sizeof(ElementType); - using buffer_type = typename array_view::acc_buffer_t; - array_view av(buffer_type(m_device), extent<1>(size), 0); - return av; - } - - /** @} */ - - /** @{ */ - /** - * An array of higher rank can be reshaped into an array of lower rank, or - * vice versa, using the view_as member function. Example: - * - * @code{.cpp} - * array a(100); - * array_view av = a.view_as(extent<2>(2,50)); - * @endcode - * - * @return Returns an array_view from this array with the rank changed - * to K from N. - */ - template array_view - view_as(const extent& viewExtent) __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 - if( viewExtent.size() > extent.size()) - throw runtime_exception("errorMsg_throw", 0); -#endif - array_view av(m_device, viewExtent, 0); - return av; - } - template array_view - view_as(const extent& viewExtent) const __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 - if( viewExtent.size() > extent.size()) - throw runtime_exception("errorMsg_throw", 0); -#endif - const array_view av(m_device, viewExtent, 0); - return av; - } - - /** @} */ - - ~array() {} - - // FIXME: functions below may be considered to move to private - const acc_buffer_t& internal() const __CPU__ __HC__ { return m_device; } - int get_offset() const __CPU__ __HC__ { return 0; } - index get_index_base() const __CPU__ __HC__ { return index(); } -private: - template friend struct projection_helper; - template friend struct array_projection_helper; - acc_buffer_t m_device; - extent extent; - - template friend - void copy(const array&, const array_view&); - template friend - void copy(const array_view&, array&); -}; - -// ------------------------------------------------------------------------ -// array_view -// ------------------------------------------------------------------------ - -/** - * The array_view type represents a possibly cached view into the data - * held in an array, or a section thereof. It also provides such views - * over native CPU data. It exposes an indexing interface congruent to that of - * array. - */ -template -class array_view -{ -public: - typedef typename std::remove_const::type nc_T; -#if __KALMAR_ACCELERATOR__ == 1 - typedef Kalmar::_data acc_buffer_t; -#else - typedef Kalmar::_data_host acc_buffer_t; -#endif - - /** - * The rank of this array. - */ - static const int rank = N; - - /** - * The element type of this array. - */ - typedef T value_type; - - /** - * There is no default constructor for array_view. - */ - array_view() = delete; - - /** - * Constructs an array_view which is bound to the data contained in the - * "src" array. The extent of the array_view is that of the src array, and - * the origin of the array view is at zero. - * - * @param[in] src An array which contains the data that this array_view is - * bound to. - */ - array_view(array& src) __CPU__ __HC__ - : cache(src.internal()), extent(src.get_extent()), extent_base(extent), index_base(), offset(0) {} - - // FIXME: following interfaces were not implemented yet - // template - // explicit array_view::array_view(Container& src); - // template - // explicit array_view::array_view(value_type (&src) [Size]) __CPU__ __HC__; - - /** - * Constructs an array_view which is bound to the data contained in the - * "src" container. The extent of the array_view is that given by the - * "extent" argument, and the origin of the array view is at zero. - * - * @param[in] src A template argument that must resolve to a linear - * container that supports .data() and .size() members (such - * as std::vector or std::array) - * @param[in] extent The extent of this array_view. - */ - template ::value>::type> - array_view(const extent& extent, Container& src) - : array_view(extent, src.data()) - { static_assert( std::is_same::value, "container element type and array view element type must match"); } - - /** - * Constructs an array_view which is bound to the data contained in the - * "src" container. The extent of the array_view is that given by the - * "extent" argument, and the origin of the array view is at zero. - * - * @param[in] src A pointer to the source data this array_view will bind - * to. If the number of elements pointed to is less than the - * size of extent, the behavior is undefined. - * @param[in] ext The extent of this array_view. - */ - array_view(const extent& ext, value_type* src) __CPU__ __HC__ -#if __KALMAR_ACCELERATOR__ == 1 - : cache((T *)(src)), extent(ext), extent_base(ext), offset(0) {} -#else - : cache(ext.size(), (T *)(src)), extent(ext), extent_base(ext), offset(0) {} -#endif - - /** - * Constructs an array_view which is not bound to a data source. The extent - * of the array_view is that given by the "extent" argument, and the origin - * of the array view is at zero. An array_view thus constructed represents - * uninitialized data and the underlying allocations are created lazily as - * the array_view is accessed on different locations (on an - * accelerator_view or on the CPU). - * - * @param[in] ext The extent of this array_view. - */ - explicit array_view(const extent& ext) - : cache(ext.size()), extent(ext), extent_base(ext), offset(0) {} - - /** - * Equivalent to construction using - * "array_view(extent(e0 [, e1 [, e2 ]]), src)". - * - * @param[in] e0,e1,e2 The component values that will form the extent of - * this array_view. - * @param[in] src A template argument that must resolve to a contiguousi - * container that supports .data() and .size() members (such - * as std::vector or std::array) - */ - template ::value>::type> - array_view(int e0, Container& src) - : array_view(hc::extent(e0), src) {} - template ::value>::type> - array_view(int e0, int e1, Container& src) - : array_view(hc::extent(e0, e1), src) {} - template ::value>::type> - array_view(int e0, int e1, int e2, Container& src) - : array_view(hc::extent(e0, e1, e2), src) {} - - /** - * Equivalent to construction using - * "array_view(extent(e0 [, e1 [, e2 ]]), src)". - * - * @param[in] e0,e1,e2 The component values that will form the extent of - * this array_view. - * @param[in] src A pointer to the source data this array_view will bind - * to. If the number of elements pointed to is less than - * the size of extent, the behavior is undefined. - */ - array_view(int e0, value_type *src) __CPU__ __HC__ - : array_view(hc::extent(e0), src) {} - array_view(int e0, int e1, value_type *src) __CPU__ __HC__ - : array_view(hc::extent(e0, e1), src) {} - array_view(int e0, int e1, int e2, value_type *src) __CPU__ __HC__ - : array_view(hc::extent(e0, e1, e2), src) {} - - /** - * Equivalent to construction using - * "array_view(extent(e0 [, e1 [, e2 ]]))". - * - * @param[in] e0,e1,e2 The component values that will form the extent of - * this array_view. - */ - explicit array_view(int e0) : array_view(hc::extent(e0)) {} - explicit array_view(int e0, int e1) - : array_view(hc::extent(e0, e1)) {} - explicit array_view(int e0, int e1, int e2) - : array_view(hc::extent(e0, e1, e2)) {} - - /** - * Copy constructor. Constructs an array_view from the supplied argument - * other. A shallow copy is performed. - * - * @param[in] other An object of type array_view or - * array_view from which to initialize this - * new array_view. - */ - array_view(const array_view& other) __CPU__ __HC__ - : cache(other.cache), extent(other.extent), extent_base(other.extent_base), index_base(other.index_base), offset(other.offset) {} - - /** - * Access the extent that defines the shape of this array_view. - */ - extent get_extent() const __CPU__ __HC__ { return extent; } - - /** - * Access the accelerator_view where the data source of the array_view is - * located. - * - * When the data source of the array_view is native CPU memory, the method - * returns accelerator(accelerator::cpu_accelerator).default_view. When the - * data source underlying the array_view is an array, the method returns - * the accelerator_view where the source array is located. - */ - accelerator_view get_source_accelerator_view() const { return cache.get_av(); } - - /** - * Assigns the contents of the array_view "other" to this array_view, using - * a shallow copy. Both array_views will refer to the same data. - * - * @param[in] other An object of type array_view from which to copy - * into this array. - * @return Returns *this. - */ - array_view& operator=(const array_view& other) __CPU__ __HC__ { - if (this != &other) { - cache = other.cache; - extent = other.extent; - index_base = other.index_base; - extent_base = other.extent_base; - offset = other.offset; - } - return *this; - } - - /** - * Copies the data referred to by this array_view to the array given by - * "dest", as if by calling "copy(*this, dest)" - * - * @param[in] dest An object of type array to which to copy data from - * this array. - */ - void copy_to(array& dest) const { -#if __KALMAR_ACCELERATOR__ != 1 - for(int i= 0 ;i< N;i++) - { - if (dest.get_extent()[i] < this->extent[i]) - throw runtime_exception("errorMsg_throw", 0); - } -#endif - copy(*this, dest); - } - - /** - * Copies the contents of this array_view to the array_view given by - * "dest", as if by calling "copy(*this, dest)" - * - * @param[in] dest An object of type array_view to which to copy data - * from this array. - */ - void copy_to(const array_view& dest) const { copy(*this, dest); } - - /** - * Returns a pointer to the first data element underlying this array_view. - * This is only available on array_views of rank 1. - * - * When the data source of the array_view is native CPU memory, the pointer - * returned by data() is valid for the lifetime of the data source. - * - * When the data source underlying the array_view is an array, or the array - * view is created without a data source, the pointer returned by data() in - * CPU context is ephemeral and is invalidated when the original data - * source or any of its views are accessed on an accelerator_view through a - * parallel_for_each or a copy operation. - * - * @return A pointer to the first element in the linearized array. - */ - T* data() const __CPU__ __HC__ { - -#if __KALMAR_ACCELERATOR__ != 1 - cache.get_cpu_access(true); -#endif - static_assert(N == 1, "data() is only permissible on array views of rank 1"); - return reinterpret_cast(cache.get() + offset + index_base[0]); - } - - /** - * Returns a pointer to the device memory underlying this array_view. - * - * @return A (const) pointer to the first element in the array_view on the - * device memory. - */ - T* accelerator_pointer() const __CPU__ __HC__ { - return reinterpret_cast(cache.get_device_pointer() + offset + index_base[0]); - } - - /** - * Calling this member function informs the array_view that its bound - * memory has been modified outside the array_view interface. This will - * render all cached information stale. - */ - void refresh() const { cache.refresh(); } - - /** - * Calling this member function synchronizes any modifications made to the - * data underlying "this" array_view to its source data container. For - * example, for an array_view on system memory, if the data underlying the - * view are modified on a remote accelerator_view through a - * parallel_for_each invocation, calling synchronize ensures that the - * modifications are synchronized to the source data and will be visible - * through the system memory pointer which the array_view was created over. - * - * For writable array_view objects, callers of this functional can - * optionally specify the type of access desired on the source data - * container through the "type" parameter. For example specifying a - * "access_type_read" (which is also the default value of the parameter) - * indicates that the data has been synchronized to its source location - * only for reading. On the other hand, specifying an access_type of - * "access_type_read_write" synchronizes the data to its source location - * both for reading and writing; i.e. any modifications to the source data - * directly through the source data container are legal after synchronizing - * the array_view with write access and before subsequently accessing the - * array_view on another remote location. - * - * It is advisable to be precise about the access_type specified in the - * synchronize call; i.e. if only write access it required, specifying - * access_type_write may yield better performance that calling synchronize - * with "access_type_read_write" since the later may require any - * modifications made to the data on remote locations to be synchronized to - * the source location, which is unnecessary if the contents are intended - * to be overwritten without reading. - * - * @param[in] type An argument of type "access_type" which specifies the - * type of access on the data source that the array_view is - * synchronized for. - */ - // FIXME: type parameter is not implemented - void synchronize() const { cache.get_cpu_access(); } - - /** - * An asynchronous version of synchronize, which returns a completion - * future object. When the future is ready, the synchronization operation - * is complete. - * - * @return An object of type completion_future that can be used to - * determine the status of the asynchronous operation or can be - * used to chain other operations to be executed after the - * completion of the asynchronous operation. - */ - // FIXME: type parameter is not implemented - completion_future synchronize_async() const { - std::future fut = std::async([&]() mutable { synchronize(); }); - return completion_future(fut.share()); - } - - /** - * Calling this member function synchronizes any modifications made to the - * data underlying "this" array_view to the specified accelerator_view - * "av". For example, for an array_view on system memory, if the data - * underlying the view is modified on the CPU, and synchronize_to is called - * on "this" array_view, then the array_view contents are cached on the - * specified accelerator_view location. - * - * For writable array_view objects, callers of this functional can - * optionally specify the type of access desired on the specified target - * accelerator_view "av", through the "type" parameter. For example - * specifying a "access_type_read" (which is also the default value of the - * parameter) indicates that the data has been synchronized to "av" only - * for reading. On the other hand, specifying an access_type of - * "access_type_read_write" synchronizes the data to "av" both for reading - * and writing; i.e. any modifications to the data on "av" are legal after - * synchronizing the array_view with write access and before subsequently - * accessing the array_view on a location other than "av". - * - * It is advisable to be precise about the access_type specified in the - * synchronize call; i.e. if only write access it required, specifying - * access_type_write may yield better performance that calling synchronize - * with "access_type_read_write" since the later may require any - * modifications made to the data on remote locations to be synchronized to - * "av", which is unnecessary if the contents are intended to be - * immediately overwritten without reading. - * - * @param[in] av The target accelerator_view that "this" array_view is - * synchronized for access on. - * @param[in] type An argument of type "access_type" which specifies the - * type of access on the data source that the array_view is - * synchronized for. - */ - // FIXME: type parameter is not implemented - void synchronize_to(const accelerator_view& av) const { -#if __KALMAR_ACCELERATOR__ != 1 - cache.sync_to(av.pQueue); -#endif - } - - /** - * An asynchronous version of synchronize_to, which returns a completion - * future object. When the future is ready, the synchronization operation - * is complete. - * - * @param[in] av The target accelerator_view that "this" array_view is - * synchronized for access on. - * @param[in] type An argument of type "access_type" which specifies the - * type of access on the data source that the array_view is - * synchronized for. - * @return An object of type completion_future that can be used to - * determine the status of the asynchronous operation or can be - * used to chain other operations to be executed after the - * completion of the asynchronous operation. - */ - // FIXME: this method is not implemented yet - completion_future synchronize_to_async(const accelerator_view& av) const; - - /** - * Indicates to the runtime that it may discard the current logical - * contents of this array_view. This is an optimization hint to the runtime - * used to avoid copying the current contents of the view to a target - * accelerator_view, and its use is recommended if the existing content is - * not needed. - */ - void discard_data() const { -#if __KALMAR_ACCELERATOR__ != 1 - cache.discard(); -#endif - } - - /** @{ */ - /** - * Returns a reference to the element of this array_view that is at the - * location in N-dimensional space specified by "idx". - * - * @param[in] idx An object of type index that specifies the location of - * the element. - */ - T& operator[] (const index& idx) const __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 - cache.get_cpu_access(true); -#endif - T *ptr = reinterpret_cast(cache.get() + offset); - return ptr[Kalmar::amp_helper, hc::extent>::flatten(idx + index_base, extent_base)]; - } - - T& operator()(const index& idx) const __CPU__ __HC__ { - return (*this)[idx]; - } - - /** @} */ - - /** - * Returns a reference to the element of this array_view that is at the - * location in N-dimensional space specified by "idx". - * - * Unlike the other indexing operators for accessing the array_view on the - * CPU, this method does not implicitly synchronize this array_view's - * contents to the CPU. After accessing the array_view on a remote location - * or performing a copy operation involving this array_view, users are - * responsible to explicitly synchronize the array_view to the CPU before - * calling this method. Failure to do so results in undefined behavior. - */ - // FIXME: this method is not implemented - T& get_ref(const index& idx) const __CPU__ __HC__; - - /** @{ */ - /** - * Equivalent to - * "array_view::operator()(index(i0 [, i1 [, i2 ]]))". - * - * @param[in] i0,i1,i2 The component values that will form the index into - * this array. - */ - T& operator() (int i0, int i1) const __CPU__ __HC__ { - static_assert(N == 2, "T& array_view::operator()(int,int) is only permissible on array_view"); - return (*this)[index<2>(i0, i1)]; - } - T& operator() (int i0, int i1, int i2) const __CPU__ __HC__ { - static_assert(N == 3, "T& array_view::operator()(int,int, int) is only permissible on array_view"); - return (*this)[index<3>(i0, i1, i2)]; - } - - /** @} */ - - /** @{ */ - /** - * This overload is defined for array_view where @f$N \ge 2@f$. - * - * This mode of indexing is equivalent to projecting on the - * most-significant dimension. It allows C-style indexing. For example: - * - * @code{.cpp} - * array myArray(myExtents, ...); - * - * myArray[index<4>(5,4,3,2)] = 7; - * assert(myArray[5][4][3][2] == 7); - * @endcode - * - * @param[in] i0 An integer that is the index into the most-significant - * dimension of this array. - * @return Returns an array_view whose dimension is one lower than that of - * this array_view. - */ - typename projection_helper::result_type - operator[] (int i) const __CPU__ __HC__ { - return projection_helper::project(*this, i); - } - typename projection_helper::result_type - operator() (int i0) const __CPU__ __HC__ { return (*this)[i0]; } - - /** @} */ - - /** - * Returns a subsection of the source array view at the origin specified by - * "idx" and with the extent specified by "ext". - * - * Example: - * - * @code{.cpp} - * array a(extent<2>(200,100)); - * array_view v1(a); // v1.extent = <200,100> - * array_view v2 = v1.section(index<2>(15,25), extent<2>(40,50)); - * assert(v2(0,0) == v1(15,25)); - * @endcode - * - * @param[in] idx Provides the offset/origin of the resulting section. - * @param[in] ext Provides the extent of the resulting section. - * @return Returns a subsection of the source array at specified origin, - * and with the specified extent. - */ - array_view section(const index& idx, - const extent& ext) const __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 - if ( !Kalmar::amp_helper, hc::extent>::contains(idx, ext,this->extent ) ) - throw runtime_exception("errorMsg_throw", 0); -#endif - array_view av(cache, ext, extent_base, idx + index_base, offset); - return av; - } - - /** - * Equivalent to "section(idx, this->extent – idx)". - */ - array_view section(const index& idx) const __CPU__ __HC__ { - hc::extent ext(extent); - Kalmar::amp_helper, hc::extent>::minus(idx, ext); - return section(idx, ext); - } - - /** - * Equivalent to "section(index(), ext)". - */ - array_view section(const extent& ext) const __CPU__ __HC__ { - index idx; - return section(idx, ext); - } - - /** @{ */ - /** - * Equivalent to - * "section(index(i0 [, i1 [, i2 ]]), extent(e0 [, e1 [, e2 ]]))". - * - * @param[in] i0,i1,i2 The component values that will form the origin of - * the section - * @param[in] e0,e1,e2 The component values that will form the extent of - * the section - */ - array_view section(int i0, int e0) const __CPU__ __HC__ { - static_assert(N == 1, "Rank must be 1"); - return section(index<1>(i0), hc::extent<1>(e0)); - } - - array_view section(int i0, int i1, int e0, int e1) const __CPU__ __HC__ { - static_assert(N == 2, "Rank must be 2"); - return section(index<2>(i0, i1), hc::extent<2>(e0, e1)); - } - - array_view section(int i0, int i1, int i2, int e0, int e1, int e2) const __CPU__ __HC__ { - static_assert(N == 3, "Rank must be 3"); - return section(index<3>(i0, i1, i2), hc::extent<3>(e0, e1, e2)); - } - - /** @} */ - - /** - * This member function is similar to "array::reinterpret_as", - * although it only supports array_views of rank 1 (only those guarantee - * that all elements are laid out contiguously). - * - * The size of the reinterpreted ElementType must evenly divide into the - * total size of this array_view. - * - * @return Returns an array_view from this array_view with the element - * type reinterpreted from T to ElementType. - */ - template - array_view reinterpret_as() const __CPU__ __HC__ { - static_assert(N == 1, "reinterpret_as is only permissible on array views of rank 1"); -#if __KALMAR_ACCELERATOR__ != 1 - static_assert( ! (std::is_pointer::value ),"can't use pointer in the kernel"); - static_assert( ! (std::is_same::value ),"can't use short in the kernel"); - if ( (extent.size() * sizeof(T)) % sizeof(ElementType)) - throw runtime_exception("errorMsg_throw", 0); -#endif - int size = extent.size() * sizeof(T) / sizeof(ElementType); - using buffer_type = typename array_view::acc_buffer_t; - array_view av(buffer_type(cache), - extent<1>(size), - (offset + index_base[0])* sizeof(T) / sizeof(ElementType)); - return av; - } - - /** - * This member function is similar to "array::view_as", although it - * only supports array_views of rank 1 (only those guarantee that all - * elements are laid out contiguously). - * - * @return Returns an array_view from this array_view with the rank - * changed to K from 1. - */ - template - array_view view_as(extent viewExtent) const __CPU__ __HC__ { - static_assert(N == 1, "view_as is only permissible on array views of rank 1"); -#if __KALMAR_ACCELERATOR__ != 1 - if ( viewExtent.size() > extent.size()) - throw runtime_exception("errorMsg_throw", 0); -#endif - array_view av(cache, viewExtent, offset + index_base[0]); - return av; - } - - ~array_view() __CPU__ __HC__ {} - - // FIXME: the following functions could be considered to move to private - const acc_buffer_t& internal() const __CPU__ __HC__ { return cache; } - - int get_offset() const __CPU__ __HC__ { return offset; } - - index get_index_base() const __CPU__ __HC__ { return index_base; } - -private: - template friend struct projection_helper; - template friend struct array_projection_helper; - template friend class array; - template friend class array_view; - - template friend - bool is_flat(const array_view&) noexcept; - template friend - void copy(const array&, const array_view&); - template friend - void copy(InputIter, InputIter, const array_view&); - template friend - void copy(const array_view&, array&); - template friend - void copy(const array_view&, OutputIter); - template friend - void copy(const array_view& src, const array_view& dest); - - // used by view_as and reinterpret_as - array_view(const acc_buffer_t& cache, const hc::extent& ext, - int offset) __CPU__ __HC__ - : cache(cache), extent(ext), extent_base(ext), offset(offset) {} - - // used by section and projection - array_view(const acc_buffer_t& cache, const hc::extent& ext_now, - const hc::extent& ext_b, - const index& idx_b, int off) __CPU__ __HC__ - : cache(cache), extent(ext_now), extent_base(ext_b), index_base(idx_b), - offset(off) {} - - acc_buffer_t cache; - hc::extent extent; - hc::extent extent_base; - index index_base; - int offset; -}; - -// ------------------------------------------------------------------------ -// array_view (read-only) -// ------------------------------------------------------------------------ - -/** - * The partial specialization array_view represents a view over - * elements of type const T with rank N. The elements are readonly. At the - * boundary of a call site (such as parallel_for_each), this form of array_view - * need only be copied to the target accelerator if it isn't already there. It - * will not be copied out. - */ -template -class array_view -{ -public: - typedef typename std::remove_const::type nc_T; - -#if __KALMAR_ACCELERATOR__ == 1 - typedef Kalmar::_data acc_buffer_t; -#else - typedef Kalmar::_data_host acc_buffer_t; -#endif - - /** - * The rank of this array. - */ - static const int rank = N; - - /** - * The element type of this array. - */ - typedef const T value_type; - - /** - * There is no default constructor for array_view. - */ - array_view() = delete; - - /** - * Constructs an array_view which is bound to the data contained in the - * "src" array. The extent of the array_view is that of the src array, and - * the origin of the array view is at zero. - * - * @param[in] src An array which contains the data that this array_view is - * bound to. - */ - array_view(const array& src) __CPU__ __HC__ - : cache(src.internal()), extent(src.get_extent()), extent_base(extent), index_base(), offset(0) {} - - // FIXME: following interfaces were not implemented yet - // template - // explicit array_view::array_view(const Container& src); - // template - // explicit array_view::array_view(const value_type (&src) [Size]) __CPU__ __HC__; - - /** - * Constructs an array_view which is bound to the data contained in the - * "src" container. The extent of the array_view is that given by the - * "extent" argument, and the origin of the array view is at zero. - * - * @param[in] src A template argument that must resolve to a linear - * container that supports .data() and .size() members (such - * as std::vector or std::array) - * @param[in] extent The extent of this array_view. - */ - template ::value>::type> - array_view(const extent& extent, const Container& src) - : array_view(extent, src.data()) - { static_assert( std::is_same::type>::type, T>::value, "container element type and array view element type must match"); } - - /** - * Constructs an array_view which is bound to the data contained in the - * "src" container. The extent of the array_view is that given by the - * "extent" argument, and the origin of the array view is at zero. - * - * @param[in] src A pointer to the source data this array_view will bind - * to. If the number of elements pointed to is less than the - * size of extent, the behavior is undefined. - * @param[in] ext The extent of this array_view. - */ - array_view(const extent& ext, const value_type* src) __CPU__ __HC__ -#if __KALMAR_ACCELERATOR__ == 1 - : cache((nc_T*)(src)), extent(ext), extent_base(ext), offset(0) {} -#else - : cache(ext.size(), src), extent(ext), extent_base(ext), offset(0) {} -#endif - - /** - * Equivalent to construction using - * "array_view(extent(e0 [, e1 [, e2 ]]), src)". - * - * @param[in] e0,e1,e2 The component values that will form the extent of - * this array_view. - * @param[in] src A template argument that must resolve to a contiguousi - * container that supports .data() and .size() members (such - * as std::vector or std::array) - */ - template ::value>::type> - array_view(int e0, Container& src) : array_view(hc::extent<1>(e0), src) {} - template ::value>::type> - array_view(int e0, int e1, Container& src) - : array_view(hc::extent(e0, e1), src) {} - template ::value>::type> - array_view(int e0, int e1, int e2, Container& src) - : array_view(hc::extent(e0, e1, e2), src) {} - - /** - * Equivalent to construction using - * "array_view(extent(e0 [, e1 [, e2 ]]), src)". - * - * @param[in] e0,e1,e2 The component values that will form the extent of - * this array_view. - * @param[in] src A pointer to the source data this array_view will bind - * to. If the number of elements pointed to is less than - * the size of extent, the behavior is undefined. - */ - array_view(int e0, const value_type *src) __CPU__ __HC__ - : array_view(hc::extent<1>(e0), src) {} - array_view(int e0, int e1, const value_type *src) __CPU__ __HC__ - : array_view(hc::extent<2>(e0, e1), src) {} - array_view(int e0, int e1, int e2, const value_type *src) __CPU__ __HC__ - : array_view(hc::extent<3>(e0, e1, e2), src) {} - - /** - * Copy constructor. Constructs an array_view from the supplied argument - * other. A shallow copy is performed. - * - * @param[in] other An object of type array_view or - * array_view from which to initialize this - * new array_view. - */ - array_view(const array_view& other) __CPU__ __HC__ - : cache(other.cache), extent(other.extent), extent_base(other.extent_base), index_base(other.index_base), offset(other.offset) {} - - /** - * Copy constructor. Constructs an array_view from the supplied argument - * other. A shallow copy is performed. - * - * @param[in] other An object of type array_view from which to - * initialize this new array_view. - */ - array_view(const array_view& other) __CPU__ __HC__ - : cache(other.cache), extent(other.extent), extent_base(other.extent_base), index_base(other.index_base), offset(other.offset) {} - - /** - * Access the extent that defines the shape of this array_view. - */ - extent get_extent() const __CPU__ __HC__ { return extent; } - - /** - * Access the accelerator_view where the data source of the array_view is - * located. - * - * When the data source of the array_view is native CPU memory, the method - * returns accelerator(accelerator::cpu_accelerator).default_view. When the - * data source underlying the array_view is an array, the method returns - * the accelerator_view where the source array is located. - */ - accelerator_view get_source_accelerator_view() const { return cache.get_av(); } - - /** @{ */ - /** - * Assigns the contents of the array_view "other" to this array_view, using - * a shallow copy. Both array_views will refer to the same data. - * - * @param[in] other An object of type array_view from which to copy - * into this array. - * @return Returns *this. - */ - array_view& operator=(const array_view& other) __CPU__ __HC__ { - cache = other.cache; - extent = other.extent; - index_base = other.index_base; - extent_base = other.extent_base; - offset = other.offset; - return *this; - } - - array_view& operator=(const array_view& other) __CPU__ __HC__ { - if (this != &other) { - cache = other.cache; - extent = other.extent; - index_base = other.index_base; - extent_base = other.extent_base; - offset = other.offset; - } - return *this; - } - - /** @} */ - - /** - * Copies the data referred to by this array_view to the array given by - * "dest", as if by calling "copy(*this, dest)" - * - * @param[in] dest An object of type array to which to copy data from - * this array. - */ - void copy_to(array& dest) const { copy(*this, dest); } - - /** - * Copies the contents of this array_view to the array_view given by - * "dest", as if by calling "copy(*this, dest)" - * - * @param[in] dest An object of type array_view to which to copy data - * from this array. - */ - void copy_to(const array_view& dest) const { copy(*this, dest); } - - /** - * Returns a pointer to the first data element underlying this array_view. - * This is only available on array_views of rank 1. - * - * When the data source of the array_view is native CPU memory, the pointer - * returned by data() is valid for the lifetime of the data source. - * - * When the data source underlying the array_view is an array, or the array - * view is created without a data source, the pointer returned by data() in - * CPU context is ephemeral and is invalidated when the original data - * source or any of its views are accessed on an accelerator_view through a - * parallel_for_each or a copy operation. - * - * @return A const pointer to the first element in the linearized array. - */ - const T* data() const __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 - cache.get_cpu_access(); -#endif - static_assert(N == 1, "data() is only permissible on array views of rank 1"); - return reinterpret_cast(cache.get() + offset + index_base[0]); - } - - /** - * Returns a pointer to the device memory underlying this array_view. - * - * @return A (const) pointer to the first element in the array_view on the - * device memory. - */ - T* accelerator_pointer() const __CPU__ __HC__ { - return reinterpret_cast(cache.get_device_pointer() + offset + index_base[0]); - } - - /** - * Calling this member function informs the array_view that its bound - * memory has been modified outside the array_view interface. This will - * render all cached information stale. - */ - void refresh() const { cache.refresh(); } - - /** - * Calling this member function synchronizes any modifications made to the - * data underlying "this" array_view to its source data container. For - * example, for an array_view on system memory, if the data underlying the - * view are modified on a remote accelerator_view through a - * parallel_for_each invocation, calling synchronize ensures that the - * modifications are synchronized to the source data and will be visible - * through the system memory pointer which the array_view was created over. - * - * For writable array_view objects, callers of this functional can - * optionally specify the type of access desired on the source data - * container through the "type" parameter. For example specifying a - * "access_type_read" (which is also the default value of the parameter) - * indicates that the data has been synchronized to its source location - * only for reading. On the other hand, specifying an access_type of - * "access_type_read_write" synchronizes the data to its source location - * both for reading and writing; i.e. any modifications to the source data - * directly through the source data container are legal after synchronizing - * the array_view with write access and before subsequently accessing the - * array_view on another remote location. - * - * It is advisable to be precise about the access_type specified in the - * synchronize call; i.e. if only write access it required, specifying - * access_type_write may yield better performance that calling synchronize - * with "access_type_read_write" since the later may require any - * modifications made to the data on remote locations to be synchronized to - * the source location, which is unnecessary if the contents are intended - * to be overwritten without reading. - */ - void synchronize() const { cache.get_cpu_access(); } - - /** - * An asynchronous version of synchronize, which returns a completion - * future object. When the future is ready, the synchronization operation - * is complete. - * - * @return An object of type completion_future that can be used to - * determine the status of the asynchronous operation or can be - * used to chain other operations to be executed after the - * completion of the asynchronous operation. - */ - completion_future synchronize_async() const { - std::future fut = std::async([&]() mutable { synchronize(); }); - return completion_future(fut.share()); - } - - /** - * Calling this member function synchronizes any modifications made to the - * data underlying "this" array_view to the specified accelerator_view - * "av". For example, for an array_view on system memory, if the data - * underlying the view is modified on the CPU, and synchronize_to is called - * on "this" array_view, then the array_view contents are cached on the - * specified accelerator_view location. - * - * @param[in] av The target accelerator_view that "this" array_view is - * synchronized for access on. - */ - void synchronize_to(const accelerator_view& av) const { -#if __KALMAR_ACCELERATOR__ != 1 - cache.sync_to(av.pQueue); -#endif - } - - /** - * An asynchronous version of synchronize_to, which returns a completion - * future object. When the future is ready, the synchronization operation - * is complete. - * - * @param[in] av The target accelerator_view that "this" array_view is - * synchronized for access on. - * @param[in] type An argument of type "access_type" which specifies the - * type of access on the data source that the array_view is - * synchronized for. - * @return An object of type completion_future that can be used to - * determine the status of the asynchronous operation or can be - * used to chain other operations to be executed after the - * completion of the asynchronous operation. - */ - // FIXME: this method is not implemented yet - completion_future synchronize_to_async(const accelerator_view& av) const; - - /** @{ */ - /** - * Returns a const reference to the element of this array_view that is at - * the location in N-dimensional space specified by "idx". - * - * @param[in] idx An object of type index that specifies the location of - * the element. - */ - const T& operator[](const index& idx) const __CPU__ __HC__ { -#if __KALMAR_ACCELERATOR__ != 1 - cache.get_cpu_access(); -#endif - const T *ptr = reinterpret_cast(cache.get() + offset); - return ptr[Kalmar::amp_helper, hc::extent>::flatten(idx + index_base, extent_base)]; - } - const T& operator()(const index& idx) const __CPU__ __HC__ { - return (*this)[idx]; - } - - /** @} */ - - /** - * Returns a reference to the element of this array_view that is at the - * location in N-dimensional space specified by "idx". - * - * Unlike the other indexing operators for accessing the array_view on the - * CPU, this method does not implicitly synchronize this array_view's - * contents to the CPU. After accessing the array_view on a remote location - * or performing a copy operation involving this array_view, users are - * responsible to explicitly synchronize the array_view to the CPU before - * calling this method. Failure to do so results in undefined behavior. - */ - // FIXME: this method is not implemented - const T& get_ref(const index& idx) const __CPU__ __HC__; - - /** @{ */ - /** - * Equivalent to - * "array_view::operator()(index(i0 [, i1 [, i2 ]]))". - * - * @param[in] i0,i1,i2 The component values that will form the index into - * this array. - */ - const T& operator()(int i0) const __CPU__ __HC__ { - static_assert(N == 1, "const T& array_view::operator()(int) is only permissible on array_view"); - return (*this)[index<1>(i0)]; - } - - const T& operator()(int i0, int i1) const __CPU__ __HC__ { - static_assert(N == 2, "const T& array_view::operator()(int,int) is only permissible on array_view"); - return (*this)[index<2>(i0, i1)]; - } - const T& operator()(int i0, int i1, int i2) const __CPU__ __HC__ { - static_assert(N == 3, "const T& array_view::operator()(int,int, int) is only permissible on array_view"); - return (*this)[index<3>(i0, i1, i2)]; - } - - /** @} */ - - /** @{ */ - /** - * This overload is defined for array_view where @f$N \ge 2@f$. - * - * This mode of indexing is equivalent to projecting on the - * most-significant dimension. It allows C-style indexing. For example: - * - * @code{.cpp} - * array myArray(myExtents, ...); - * - * myArray[index<4>(5,4,3,2)] = 7; - * assert(myArray[5][4][3][2] == 7); - * @endcode - * - * @param[in] i0 An integer that is the index into the most-significant - * dimension of this array. - * @return Returns an array_view whose dimension is one lower than that of - * this array_view. - */ - typename projection_helper::const_result_type - operator[] (int i) const __CPU__ __HC__ { - return projection_helper::project(*this, i); - } - - // FIXME: typename projection_helper::const_result_type - // operator() (int i0) const __CPU__ __HC__ - // is not implemented - - /** @} */ - - /** - * Returns a subsection of the source array view at the origin specified by - * "idx" and with the extent specified by "ext". - * - * Example: - * - * @code{.cpp} - * array a(extent<2>(200,100)); - * array_view v1(a); // v1.extent = <200,100> - * array_view v2 = v1.section(index<2>(15,25), extent<2>(40,50)); - * assert(v2(0,0) == v1(15,25)); - * @endcode - * - * @param[in] idx Provides the offset/origin of the resulting section. - * @param[in] ext Provides the extent of the resulting section. - * @return Returns a subsection of the source array at specified origin, - * and with the specified extent. - */ - array_view section(const index& idx, - const extent& ext) const __CPU__ __HC__ { - array_view av(cache, ext, extent_base, idx + index_base, offset); - return av; - } - - /** - * Equivalent to "section(idx, this->extent – idx)". - */ - array_view section(const index& idx) const __CPU__ __HC__ { - hc::extent ext(extent); - Kalmar::amp_helper, hc::extent>::minus(idx, ext); - return section(idx, ext); - } - - /** - * Equivalent to "section(index(), ext)". - */ - array_view section(const extent& ext) const __CPU__ __HC__ { - index idx; - return section(idx, ext); - } - - /** @{ */ - /** - * Equivalent to - * "section(index(i0 [, i1 [, i2 ]]), extent(e0 [, e1 [, e2 ]]))". - * - * @param[in] i0,i1,i2 The component values that will form the origin of - * the section - * @param[in] e0,e1,e2 The component values that will form the extent of - * the section - */ - array_view section(int i0, int e0) const __CPU__ __HC__ { - static_assert(N == 1, "Rank must be 1"); - return section(index<1>(i0), hc::extent<1>(e0)); - } - - array_view section(int i0, int i1, int e0, int e1) const __CPU__ __HC__ { - static_assert(N == 2, "Rank must be 2"); - return section(index<2>(i0, i1), hc::extent<2>(e0, e1)); - } - - array_view section(int i0, int i1, int i2, int e0, int e1, int e2) const __CPU__ __HC__ { - static_assert(N == 3, "Rank must be 3"); - return section(index<3>(i0, i1, i2), hc::extent<3>(e0, e1, e2)); - } - - /** @} */ - - /** - * This member function is similar to "array::reinterpret_as", - * although it only supports array_views of rank 1 (only those guarantee - * that all elements are laid out contiguously). - * - * The size of the reinterpreted ElementType must evenly divide into the - * total size of this array_view. - * - * @return Returns an array_view from this array_view with the element - * type reinterpreted from T to ElementType. - */ - template - array_view reinterpret_as() const __CPU__ __HC__ { - static_assert(N == 1, "reinterpret_as is only permissible on array views of rank 1"); -#if __KALMAR_ACCELERATOR__ != 1 - static_assert( ! (std::is_pointer::value ),"can't use pointer in the kernel"); - static_assert( ! (std::is_same::value ),"can't use short in the kernel"); -#endif - int size = extent.size() * sizeof(T) / sizeof(ElementType); - using buffer_type = typename array_view::acc_buffer_t; - array_view av(buffer_type(cache), - extent<1>(size), - (offset + index_base[0])* sizeof(T) / sizeof(ElementType)); - return av; - } - - /** - * This member function is similar to "array::view_as", although it - * only supports array_views of rank 1 (only those guarantee that all - * elements are laid out contiguously). - * - * @return Returns an array_view from this array_view with the rank - * changed to K from 1. - */ - template - array_view view_as(extent viewExtent) const __CPU__ __HC__ { - static_assert(N == 1, "view_as is only permissible on array views of rank 1"); -#if __KALMAR_ACCELERATOR__ != 1 - if ( viewExtent.size() > extent.size()) - throw runtime_exception("errorMsg_throw", 0); -#endif - array_view av(cache, viewExtent, offset + index_base[0]); - return av; - } - - ~array_view() __CPU__ __HC__ {} - - // FIXME: the following functions may be considered to move to private - const acc_buffer_t& internal() const __CPU__ __HC__ { return cache; } - - int get_offset() const __CPU__ __HC__ { return offset; } - - index get_index_base() const __CPU__ __HC__ { return index_base; } - -private: - template friend struct projection_helper; - template friend struct array_projection_helper; - template friend class array; - template friend class array_view; - - template friend - bool is_flat(const array_view&) noexcept; - template friend - void copy(const array&, const array_view&); - template - void copy(InputIter, InputIter, const array_view&); - template friend - void copy(const array_view&, array&); - template friend - void copy(const array_view&, OutputIter); - template friend - void copy(const array_view& src, const array_view& dest); - - // used by view_as and reinterpret_as - array_view(const acc_buffer_t& cache, const hc::extent& ext, - int offset) __CPU__ __HC__ - : cache(cache), extent(ext), extent_base(ext), offset(offset) {} - - // used by section and projection - array_view(const acc_buffer_t& cache, const hc::extent& ext_now, - const extent& ext_b, - const index& idx_b, int off) __CPU__ __HC__ - : cache(cache), extent(ext_now), extent_base(ext_b), index_base(idx_b), - offset(off) {} - - acc_buffer_t cache; - hc::extent extent; - hc::extent extent_base; - index index_base; - int offset; -}; - -// ------------------------------------------------------------------------ -// utility functions for copy -// ------------------------------------------------------------------------ - -template -static inline bool is_flat(const array_view& av) noexcept { - return av.extent == av.extent_base && av.index_base == index(); -} - -template -static inline bool is_flat(const array_view& av) noexcept { return true; } - -template -struct copy_input -{ - void operator()(InputIter& It, T* ptr, const extent& ext, - const extent& base, const index& idx) - { - size_t stride = 1; - for (int i = dim; i < N; i++) - stride *= base[i]; - ptr += stride * idx[dim - 1]; - for (int i = 0; i < ext[dim - 1]; i++) { - copy_input()(It, ptr, ext, base, idx); - ptr += stride; - } - } -}; - -template -struct copy_input -{ - void operator()(InputIter& It, T* ptr, const extent& ext, - const extent& base, const index& idx) - { - InputIter end = It; - std::advance(end, ext[N - 1]); - std::copy(It, end, ptr + idx[N - 1]); - It = end; - } -}; - -template -struct copy_output -{ - void operator()(const T* ptr, OutputIter& It, const extent& ext, - const extent& base, const index& idx) - { - size_t stride = 1; - for (int i = dim; i < N; i++) - stride *= base[i]; - ptr += stride * idx[dim - 1]; - for (int i = 0; i < ext[dim - 1]; i++) { - copy_output()(ptr, It, ext, base, idx); - ptr += stride; - } - } -}; - -template -struct copy_output -{ - void operator()(const T* ptr, OutputIter& It, const extent& ext, - const extent& base, const index& idx) - { - ptr += idx[N - 1]; - It = std::copy(ptr, ptr + ext[N - 1], It); - } -}; - -template -struct copy_bidir -{ - void operator()(const T* src, T* dst, const extent& ext, - const extent& base1, const index& idx1, - const extent& base2, const index& idx2) - { - size_t stride1 = 1; - for (int i = dim; i < N; i++) - stride1 *= base1[i]; - src += stride1 * idx1[dim - 1]; - - size_t stride2 = 1; - for (int i = dim; i < N; i++) - stride2 *= base2[i]; - dst += stride2 * idx2[dim - 1]; - - for (int i = 0; i < ext[dim - 1]; i++) { - copy_bidir()(src, dst, ext, base1, idx1, base2, idx2); - src += stride1; - dst += stride2; - } - } -}; - -template -struct copy_bidir -{ - void operator()(const T* src, T* dst, const extent& ext, - const extent& base1, const index& idx1, - const extent& base2, const index& idx2) - { - src += idx1[N - 1]; - dst += idx2[N - 1]; - std::copy(src, src + ext[N - 1], dst); - } -}; - -template -struct do_copy -{ - template