diff --git a/.gitmodules b/.gitmodules
index 91e2b92ead7..c6f328bbd8a 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,7 +1,7 @@
 [submodule "llvm"]
 	path = compiler
 	url = https://github.com/RadeonOpenCompute/llvm.git
-	branch = amd-common
+	branch = feature_hc_next
 [submodule "lld"]
 	path = lld
 	url = https://github.com/RadeonOpenCompute/lld.git
@@ -9,11 +9,11 @@
 [submodule "clang"]
 	path = clang
 	url = https://github.com/RadeonOpenCompute/hcc-clang-upgrade.git
-	branch = clang_tot_upgrade
+	branch = feature_trampolines_are_for_babies
 [submodule "compiler-rt"]
 	path = compiler-rt
 	url = https://github.com/RadeonOpenCompute/compiler-rt
-    branch = amd-hcc
+    branch = amd-common
 [submodule "rocdl"]
 	path = rocdl
 	url = http://github.com/RadeonOpenCompute/ROCm-Device-Libs.git
@@ -21,4 +21,4 @@
 [submodule "clang-tools-extra"]
 	path = clang-tools-extra
 	url = https://github.com/RadeonOpenCompute/clang-tools-extra.git
-	branch = amd-hcc
+	branch = amd-common
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b9e35332c39..65d0d20ead6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,30 +1,39 @@
-cmake_minimum_required( VERSION 3.0 )
-project (HCC)
+cmake_minimum_required(VERSION 3.0)
+project(HCC LANGUAGES CXX)
+
+set(CXX_EXTENSIONS OFF)
 
 include(GNUInstallDirs)
 
-SET(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/scripts/cmake")
-MESSAGE("Module path: ${CMAKE_MODULE_PATH}")
+set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/scripts/cmake")
+message("Module path: ${CMAKE_MODULE_PATH}")
 
 # set as release build by default
-IF (NOT CMAKE_BUILD_TYPE)
-  SET(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: Release Debug" FORCE)
-ENDIF(NOT CMAKE_BUILD_TYPE)
-
-# Use two different methods to determine host distribution: lsb_release and if that fails grep /etc/os-release
-find_program( lsb_executable lsb_release )
+if (NOT CMAKE_BUILD_TYPE)
+  set(
+    CMAKE_BUILD_TYPE Release
+    CACHE STRING "Choose the type of build, options are: Release Debug" FORCE)
+endif()
 
-if( lsb_executable )
-  execute_process( COMMAND ${lsb_executable} -is OUTPUT_VARIABLE DISTRO_ID OUTPUT_STRIP_TRAILING_WHITESPACE )
-  execute_process( COMMAND ${lsb_executable} -rs OUTPUT_VARIABLE DISTRO_RELEASE OUTPUT_STRIP_TRAILING_WHITESPACE )
+# Use two different methods to determine host distribution: lsb_release and if
+# that fails grep /etc/os-release
+find_program(lsb_executable lsb_release)
+
+if (lsb_executable)
+  execute_process(
+    COMMAND ${lsb_executable} -is
+    OUTPUT_VARIABLE DISTRO_ID OUTPUT_STRIP_TRAILING_WHITESPACE)
+  execute_process(
+    COMMAND ${lsb_executable} -rs
+    OUTPUT_VARIABLE DISTRO_RELEASE OUTPUT_STRIP_TRAILING_WHITESPACE)
 else()
-  if( EXISTS "/etc/os-release" )
-    file( STRINGS "/etc/os-release" DISTRO_ID REGEX "^ID=" )
-    file( STRINGS "/etc/os-release" DISTRO_RELEASE REGEX "^VERSION_ID=" )
-    string( REPLACE "ID=" "" DISTRO_ID "${DISTRO_ID}" )
-    string( REPLACE "VERSION_ID=" "" DISTRO_RELEASE "${DISTRO_RELEASE}" )
-  endif( )
-endif( )
+  if (EXISTS "/etc/os-release")
+    file(STRINGS "/etc/os-release" DISTRO_ID REGEX "^ID=")
+    file(STRINGS "/etc/os-release" DISTRO_RELEASE REGEX "^VERSION_ID=")
+    string(REPLACE "ID=" "" DISTRO_ID "${DISTRO_ID}")
+    string(REPLACE "VERSION_ID=" "" DISTRO_RELEASE "${DISTRO_RELEASE}")
+  endif()
+endif()
 
 # Accepted values for DISTRO_ID: trusty (Ubuntu 14.04), xenial (Ubuntu 16.06), fd23 (Fedora 23)
 string(TOLOWER "${DISTRO_ID}" DISTRO_ID )
@@ -213,9 +222,9 @@ if (NOT HCC_VERSION_STRING)
 endif()
 
 # Set HCC version string. The rule for version string is:
-# HCC_VERSION_MAJOR . HCC_VERSION_MINOR . HCC_VERSION_PATCH-KALMAR_SDK_COMIT-KALMAR_FRONTEND_COMMIT-KALMAR_BACKEND_COMMIT
+# HCC_VERSION_MAJOR . HCC_VERSION_MINOR . HCC_VERSION_PATCH-HC_SDK_COMIT-HC_FRONTEND_COMMIT-HC_BACKEND_COMMIT
 add_version_info_from_git(HCC_VERSION_STRING
-  HCC_VERSION_PATCH KALMAR_SDK_COMMIT KALMAR_FRONTEND_COMMIT KALMAR_BACKEND_COMMIT)
+  HCC_VERSION_PATCH HC_SDK_COMMIT HC_FRONTEND_COMMIT HC_BACKEND_COMMIT)
 
 # set default installation path
 set(INSTALL_DIR_NAME "hcc")
@@ -290,7 +299,7 @@ MESSAGE(STATUS "HCC configured with AMDGPU targets: ${AMDGPU_TARGET}")
 # - AMDGPU : for HSA systems configured with Lightning backend
 #################
 
-set(KALMAR_BACKEND "HCC_BACKEND_AMDGPU")
+set(HC_BACKEND "HCC_BACKEND_AMDGPU")
 
 #########################
 # build target: world
@@ -357,14 +366,16 @@ add_custom_command(TARGET clang_links POST_BUILD
 )
 
 # install certain LLVM libraries needed by HIP
+# TODO: why HIP needs this random soup is unclear; the HC specific passes are
+#       definitely not "certain LLVM libraries needed by HIP".
 install(PROGRAMS $<TARGET_FILE:LLVMAMDGPUDesc>
                  $<TARGET_FILE:LLVMAMDGPUUtils>
                  $<TARGET_FILE:LLVMMC>
                  $<TARGET_FILE:LLVMCore>
                  $<TARGET_FILE:LLVMSupport>
-                 $<TARGET_FILE:LLVMSelectAcceleratorCode>
                  $<TARGET_FILE:LLVMPromotePointerKernArgsToGlobal>
-                 $<TARGET_FILE:LLVMHello>
+                 $<TARGET_FILE:LLVMSelectAcceleratorCode>
+                 $<TARGET_FILE:LLVMUndefineGlobalsInAcceleratorCode>
         DESTINATION lib
         COMPONENT compiler
 )
@@ -383,6 +394,7 @@ add_custom_target(world DEPENDS clang_links)
 
 # move headers to build dir before building rocdl and hcc lib
 add_subdirectory(include)
+add_subdirectory(third_party)
 
 # build the integrated ROCm Device Library
 set(AMDHSACOD ${ROCM_ROOT}/bin/amdhsacod CACHE FILEPATH "Specify the amdhsacod tool")
diff --git a/Jenkinsfile b/Jenkinsfile
index 0771b62b435..c5ec339393d 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -97,7 +97,7 @@ node( 'hcctest' )
             -DHSA_AMDGPU_GPU_TARGET="gfx900;gfx803" \
             -DNUM_TEST_THREADS="4" \
             ../..
-          make -j\$(nproc)
+          make -j2
         """
 
       // Cap the maximum amount of testing, in case of hangs
diff --git a/benchmarks/AcceleratorViewCopy/avstress_0x18.cpp b/benchmarks/AcceleratorViewCopy/avstress_0x18.cpp
index b4e4f0f27e7..b10ee0f3170 100644
--- a/benchmarks/AcceleratorViewCopy/avstress_0x18.cpp
+++ b/benchmarks/AcceleratorViewCopy/avstress_0x18.cpp
@@ -1,8 +1,8 @@
-// RUN: %hc %s -o %t.out -lhc_am -L/opt/rocm/lib -lhsa-runtime64 -DRUNMASK=0x18 && HCC_SERIALIZE_KERNEL=0x3 HCC_SERIALIZE_COPY=0x3 %t.out
-#include <hc.hpp>
-#include <hc_am.hpp>
+// RUN: %hc %s -o %t.out -DRUNMASK=0x18 && HCC_SERIALIZE_KERNEL=0x3 HCC_SERIALIZE_COPY=0x3 %t.out
+#include <hc/hc.hpp>
+#include <hc/hc_am.hpp>
 
-#include "/opt/rocm/include/hsa/hsa.h"
+#include <hsa/hsa.h>
 
 #include <algorithm>
 #include <cstdlib>
diff --git a/benchmarks/AcceleratorViewCopy/avstress_0xFF.cpp b/benchmarks/AcceleratorViewCopy/avstress_0xFF.cpp
index b6ef2edcdb3..69b14eb873e 100644
--- a/benchmarks/AcceleratorViewCopy/avstress_0xFF.cpp
+++ b/benchmarks/AcceleratorViewCopy/avstress_0xFF.cpp
@@ -1,8 +1,8 @@
-// RUN: %hc %s -o %t.out -lhc_am -L/opt/rocm/lib -lhsa-runtime64 -DRUNMASK=0xff && HCC_SERIALIZE_KERNEL=0x3 HCC_SERIALIZE_COPY=0x3 %t.out
-#include <hc.hpp>
-#include <hc_am.hpp>
+// RUN: %hc %s -o %t.out -DRUNMASK=0xff && HCC_SERIALIZE_KERNEL=0x3 HCC_SERIALIZE_COPY=0x3 %t.out
+#include <hc/hc.hpp>
+#include <hc/hc_am.hpp>
 
-#include "/opt/rocm/include/hsa/hsa.h"
+#include <hsa/hsa.h>
 
 #include <algorithm>
 #include <cstddef>
diff --git a/benchmarks/RuntimeOverheads/kernel_dispatch_latency.cpp b/benchmarks/RuntimeOverheads/kernel_dispatch_latency.cpp
index 0fd6cb70014..c7f0f56ea17 100644
--- a/benchmarks/RuntimeOverheads/kernel_dispatch_latency.cpp
+++ b/benchmarks/RuntimeOverheads/kernel_dispatch_latency.cpp
@@ -1,7 +1,7 @@
-// RUN: %hc %s -lhc_am -o %t.out && %t.out
+// RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
-#include <hc_am.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_am.hpp>
 
 #include <iostream>
 #include <random>
@@ -23,10 +23,7 @@ bool test1() {
     // launch kernel
     hc::extent<1> e(1024);
     clock_gettime(CLOCK_REALTIME, &begin);
-    hc::completion_future fut = hc::parallel_for_each(
-      e,
-      [=](hc::index<1> idx) restrict(amp) {
-    });
+    auto fut = hc::parallel_for_each(e, [=](hc::index<1>) [[hc]] {});
     fut.wait();
     ret &= (fut.is_ready() == true);
 
@@ -77,11 +74,8 @@ bool test2() {
     // launch kernel
     hc::extent<1> e(vecSize);
     clock_gettime(CLOCK_REALTIME, &begin);
-    hc::completion_future fut = hc::parallel_for_each(
-      e,
-      [=](hc::index<1> idx) restrict(amp) {
-        p_c[idx[0]] = p_a[idx[0]] + p_b[idx[0]];
-  
+    auto fut = hc::parallel_for_each(e, [=](hc::index<1> idx) [[hc]] {
+      p_c[idx[0]] = p_a[idx[0]] + p_b[idx[0]];
     });
     fut.wait();
     ret &= (fut.is_ready() == true);
@@ -104,10 +98,7 @@ bool test2() {
 void init() {
     // launch an empty kernel to initialize everything
     hc::extent<1> e(1024);
-    hc::completion_future fut = hc::parallel_for_each(
-      e,
-      [=](hc::index<1> idx) restrict(amp) {
-    });
+    auto fut = hc::parallel_for_each(e, [=](hc::index<1> idx) [[hc]] {});
     fut.wait();
 }
 
diff --git a/benchmarks/RuntimeOverheads/kernel_enqueue_overhead.cpp b/benchmarks/RuntimeOverheads/kernel_enqueue_overhead.cpp
index 266708fab9f..ed7a47fb93a 100644
--- a/benchmarks/RuntimeOverheads/kernel_enqueue_overhead.cpp
+++ b/benchmarks/RuntimeOverheads/kernel_enqueue_overhead.cpp
@@ -1,7 +1,7 @@
-// RUN: %hc %s -lhc_am -o %t.out && %t.out
+// RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
-#include <hc_am.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_am.hpp>
 
 #include <iostream>
 #include <random>
@@ -23,10 +23,7 @@ bool test1() {
     // launch kernel
     hc::extent<1> e(1024);
     clock_gettime(CLOCK_REALTIME, &begin);
-    hc::completion_future fut = hc::parallel_for_each(
-      e,
-      [=](hc::index<1> idx) restrict(amp) {
-    });
+    auto fut = hc::parallel_for_each(e, [=](hc::index<1> idx) [[hc]] {});
     clock_gettime(CLOCK_REALTIME, &end);
     time_spent_once = ((end.tv_sec - begin.tv_sec) * 1000 * 1000) + ((end.tv_nsec - begin.tv_nsec) / 1000);
     time_spent += time_spent_once;
@@ -77,11 +74,8 @@ bool test2() {
     // launch kernel
     hc::extent<1> e(vecSize);
     clock_gettime(CLOCK_REALTIME, &begin);
-    hc::completion_future fut = hc::parallel_for_each(
-      e,
-      [=](hc::index<1> idx) restrict(amp) {
-        p_c[idx[0]] = p_a[idx[0]] + p_b[idx[0]];
-  
+    auto fut = hc::parallel_for_each(e, [=](hc::index<1> idx) [[hc]] {
+      p_c[idx[0]] = p_a[idx[0]] + p_b[idx[0]];
     });
     clock_gettime(CLOCK_REALTIME, &end);
     time_spent_once = ((end.tv_sec - begin.tv_sec) * 1000 * 1000) + ((end.tv_nsec - begin.tv_nsec) / 1000);
@@ -104,10 +98,7 @@ bool test2() {
 void init() {
     // launch an empty kernel to initialize everything
     hc::extent<1> e(1024);
-    hc::completion_future fut = hc::parallel_for_each(
-      e,
-      [=](hc::index<1> idx) restrict(amp) {
-    });
+    auto fut = hc::parallel_for_each(e, [=](hc::index<1> idx) [[hc]] {});
     fut.wait();
 }
 
diff --git a/benchmarks/benchEmptyKernel/bench.cpp b/benchmarks/benchEmptyKernel/bench.cpp
index 1c729024f8a..80e3f2758e7 100644
--- a/benchmarks/benchEmptyKernel/bench.cpp
+++ b/benchmarks/benchEmptyKernel/bench.cpp
@@ -22,10 +22,9 @@
 
 #define BENCH_HSA 1
 
+#include <hc/hc.hpp>
+#include <hc/hc_am.hpp>
 
-#include "hc.hpp"
-#include "hc_am.hpp"
-#include "grid_launch.hpp"
 #include <iostream>
 #include <fstream>
 #include <string>
@@ -242,7 +241,7 @@ int main(int argc, char* argv[]) {
   // timing for null kernel launch appears later
 
   hc::parallel_for_each(av, hc::extent<3>(lp.grid_dim.x*lp.group_dim.x,1,1).tile(lp.group_dim.x,1,1),
-  [=](hc::index<3>& idx) __HC__ {
+  [=](hc::index<3>& idx) [[hc]] {
   }).wait();
 
   // Setting lp.cf to completion_future so we can track completion: (NULL ignores all synchronization)
@@ -258,7 +257,7 @@ int main(int argc, char* argv[]) {
         hc::completion_future cf;
         for (int j=0; j<p_burst_count ;j++) {
             cf = hc::parallel_for_each(av, hc::extent<3>(lp.grid_dim.x*lp.group_dim.x,1,1).tile(lp.group_dim.x,1,1),
-            [=](hc::index<3>& idx) __HC__ {
+            [=](hc::index<3>& idx) [[hc]] {
             });
         };
         cf.wait(hc::hcWaitModeActive);
@@ -281,7 +280,7 @@ int main(int argc, char* argv[]) {
         hc::completion_future cf;
         for (int j=0; j<p_burst_count ;j++) {
             cf = hc::parallel_for_each(av, hc::extent<3>(lp.grid_dim.x*lp.group_dim.x,1,1).tile(lp.group_dim.x,1,1),
-            [=](hc::index<3>& idx) __HC__ {
+            [=](hc::index<3>& idx) [[hc]] {
             });
         };
         cf.wait(hc::hcWaitModeBlocked);
diff --git a/benchmarks/benchEmptyKernel/hsacodelib.CPP b/benchmarks/benchEmptyKernel/hsacodelib.CPP
index 208d8e8277a..c246dd68b42 100644
--- a/benchmarks/benchEmptyKernel/hsacodelib.CPP
+++ b/benchmarks/benchEmptyKernel/hsacodelib.CPP
@@ -2,8 +2,7 @@
 #include <fstream>
 #include <assert.h>
 
-#include <hc.hpp>
-#include <grid_launch.h>
+#include <hc/hc.hpp>
 
 #include <hsa/hsa.h>
 
diff --git a/benchmarks/benchEmptyKernel/nullkernel.cpp b/benchmarks/benchEmptyKernel/nullkernel.cpp
index f1ec520cfd4..44d186b7637 100644
--- a/benchmarks/benchEmptyKernel/nullkernel.cpp
+++ b/benchmarks/benchEmptyKernel/nullkernel.cpp
@@ -1,10 +1,8 @@
 // RUN: %hc --amdgpu-target=gfx801 --amdgpu-target=gfx802 --amdgpu-target=gfx803 -fPIC -shared %S/nullkernel.cpp -o %T/nullkernel
 // RUN: HCC_HOME=%llvm_libs_dir/../../ %extractkernel -i %T/nullkernel
 
-#include "hc.hpp"
-#include "grid_launch.hpp"
+#include <hc/hc.hpp>
 
-__attribute__((hc_grid_launch))
 void nullkernel(const grid_launch_parm lp, float* Ad) {
     if (Ad) {
         Ad[0] = 42;
diff --git a/clang b/clang
index 4600645e1c6..37ff576f91b 160000
--- a/clang
+++ b/clang
@@ -1 +1 @@
-Subproject commit 4600645e1c652ab8324f7e6c1b99502ab036de78
+Subproject commit 37ff576f91b0f7ad0030171b9da310d3ced757e9
diff --git a/clang-tools-extra b/clang-tools-extra
index f4b9e0b89f9..0254eba919e 160000
--- a/clang-tools-extra
+++ b/clang-tools-extra
@@ -1 +1 @@
-Subproject commit f4b9e0b89f99ffbe7bcf4e8d5ac08f61e65b5a98
+Subproject commit 0254eba919ec417ad27f4e475c758d4d10c1d77d
diff --git a/cmake-tests/CMakeLists.txt b/cmake-tests/CMakeLists.txt
index a851247e092..d8e5bb3a019 100644
--- a/cmake-tests/CMakeLists.txt
+++ b/cmake-tests/CMakeLists.txt
@@ -17,7 +17,7 @@ endif()
 set_target_properties(cmake-test PROPERTIES LINK_FLAGS ${new_cmake_test_link_flags})
 
 if(TARGET hccrt)
-    add_dependencies(cmake-test clang_links rocdl_links mcwamp_hsa mcwamp)
+    add_dependencies(cmake-test clang_links rocdl_links)
     target_link_libraries(cmake-test hccrt hc_am)
 else()
     # Append default hcc installation
diff --git a/cmake-tests/cmake-test.cpp b/cmake-tests/cmake-test.cpp
index da33e9e8dfd..8c8fbc485ed 100644
--- a/cmake-tests/cmake-test.cpp
+++ b/cmake-tests/cmake-test.cpp
@@ -1,6 +1,6 @@
 
-#include <hc.hpp>
-#include <hc_am.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_am.hpp>
 #include <iostream>
 #include <string>
 #include <cmath>
diff --git a/compiler b/compiler
index 1474588ae16..d74f56ca4f3 160000
--- a/compiler
+++ b/compiler
@@ -1 +1 @@
-Subproject commit 1474588ae16aed5e8311d26335d8d41fa08ef0b6
+Subproject commit d74f56ca4f35917356bfbedcb153feea7647cd73
diff --git a/compiler-rt b/compiler-rt
index ae38e94c812..2264b759866 160000
--- a/compiler-rt
+++ b/compiler-rt
@@ -1 +1 @@
-Subproject commit ae38e94c8126c896ddbb7aadf0644f35666e97ef
+Subproject commit 2264b759866e950be23bb12c3ea50d515134b8ae
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index b9e9da2463f..945f1f335da 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -34,15 +34,15 @@ execute_process(COMMAND date +%y%W
 # get commit information
 execute_process(COMMAND git rev-parse --short HEAD
                 WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/..
-                OUTPUT_VARIABLE KALMAR_DRIVER_COMMIT
+                OUTPUT_VARIABLE HC_DRIVER_COMMIT
                 OUTPUT_STRIP_TRAILING_WHITESPACE)
 execute_process(COMMAND git rev-parse --short HEAD
                 WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/../compiler/tools/clang
-                OUTPUT_VARIABLE KALMAR_COMPILER_COMMIT
+                OUTPUT_VARIABLE HC_COMPILER_COMMIT
                 OUTPUT_STRIP_TRAILING_WHITESPACE)
 
 # set HCC version string
-set(HCC_VERSION_STRING "${HCC_VERSION_MAJOR}.${HCC_VERSION_MINOR}.${HCC_VERSION_PATCH}-${KALMAR_DRIVER_COMMIT}-${KALMAR_COMPILER_COMMIT}")
+set(HCC_VERSION_STRING "${HCC_VERSION_MAJOR}.${HCC_VERSION_MINOR}.${HCC_VERSION_PATCH}-${HC_DRIVER_COMMIT}-${HC_COMPILER_COMMIT}")
 
 # show HCC version string
 MESSAGE("========================================")
diff --git a/hc2/external/elfio/elf_types.hpp b/hc2/external/elfio/elf_types.hpp
index 1b90c4c57dd..1301cf4315f 100644
--- a/hc2/external/elfio/elf_types.hpp
+++ b/hc2/external/elfio/elf_types.hpp
@@ -460,53 +460,84 @@ typedef uint64_t Elf64_Off;
 #define STN_UNDEF 0
 
 // Relocation types
-#define R_386_NONE         0
-#define R_X86_64_NONE      0
-#define R_386_32           1
-#define R_X86_64_64        1
-#define R_386_PC32         2
-#define R_X86_64_PC32      2
-#define R_386_GOT32        3
-#define R_X86_64_GOT32     3
-#define R_386_PLT32        4
-#define R_X86_64_PLT32     4
-#define R_386_COPY         5
-#define R_X86_64_COPY      5
-#define R_386_GLOB_DAT     6
-#define R_X86_64_GLOB_DAT  6
-#define R_386_JMP_SLOT     7
-#define R_X86_64_JUMP_SLOT 7
-#define R_386_RELATIVE     8
-#define R_X86_64_RELATIVE  8
-#define R_386_GOTOFF       9
-#define R_X86_64_GOTPCREL  9
-#define R_386_GOTPC       10
-#define R_X86_64_32       10
-#define R_X86_64_32S      11
-#define R_X86_64_16       12
-#define R_X86_64_PC16     13
-#define R_X86_64_8        14
-#define R_X86_64_PC8      15
-#define R_X86_64_DTPMOD64 16
-#define R_X86_64_DTPOFF64 17
-#define R_X86_64_TPOFF64  18
-#define R_X86_64_TLSGD    19
-#define R_X86_64_TLSLD    20
-#define R_X86_64_DTPOFF32 21
-#define R_X86_64_GOTTPOFF 22
-#define R_X86_64_TPOFF32  23
-#define R_X86_64_PC64     24
-#define R_X86_64_GOTOFF64 25
-#define R_X86_64_GOTPC32  26
-#define R_X86_64_GOT64    27
-#define R_X86_64_GOTPCREL64      28
-#define R_X86_64_GOTPC64  29
-#define R_X86_64_GOTPLT64 30
-#define R_X86_64_PLTOFF64 31
-#define R_X86_64_GOTPC32_TLSDESC 34
-#define R_X86_64_TLSDESC_CALL    35
-#define R_X86_64_TLSDESC         36
-#define R_X86_64_IRELATIVE       37
+#define R_386_NONE             0
+#define R_X86_64_NONE          0
+#define R_386_32               1
+#define R_X86_64_64            1
+#define R_386_PC32             2
+#define R_X86_64_PC32          2
+#define R_386_GOT32            3
+#define R_X86_64_GOT32         3
+#define R_386_PLT32            4
+#define R_X86_64_PLT32         4
+#define R_386_COPY             5
+#define R_X86_64_COPY          5
+#define R_386_GLOB_DAT         6
+#define R_X86_64_GLOB_DAT      6
+#define R_386_JMP_SLOT         7
+#define R_X86_64_JUMP_SLOT     7
+#define R_386_RELATIVE         8
+#define R_X86_64_RELATIVE      8
+#define R_386_GOTOFF           9
+#define R_X86_64_GOTPCREL      9
+#define R_386_GOTPC           10
+#define R_X86_64_32           10
+#define R_386_32PLT           11
+#define R_X86_64_32S          11
+#define R_X86_64_16           12
+#define R_X86_64_PC16         13
+#define R_386_TLS_TPOFF       14
+#define R_X86_64_8            14
+#define R_386_TLS_IE          15
+#define R_X86_64_PC8          15
+#define R_386_TLS_GOTIE       16
+#define R_X86_64_DTPMOD64     16
+#define R_386_TLS_LE          17
+#define R_X86_64_DTPOFF64     17
+#define R_386_TLS_GD          18
+#define R_X86_64_TPOFF64      18
+#define R_386_TLS_LDM         19
+#define R_X86_64_TLSGD        19
+#define R_386_16              20
+#define R_X86_64_TLSLD        20
+#define R_386_PC16            21
+#define R_X86_64_DTPOFF32     21
+#define R_386_8               22
+#define R_X86_64_GOTTPOFF     22
+#define R_386_PC8             23
+#define R_X86_64_TPOFF32      23
+#define R_386_TLS_GD_32       24
+#define R_X86_64_PC64         24
+#define R_386_TLS_GD_PUSH     25
+#define R_X86_64_GOTOFF64     25
+#define R_386_TLS_GD_CALL     26
+#define R_X86_64_GOTPC32      26
+#define R_386_TLS_GD_POP      27
+#define R_X86_64_GOT64        27
+#define R_386_TLS_LDM_32      28
+#define R_X86_64_GOTPCREL64   28
+#define R_386_TLS_LDM_PUSH    29
+#define R_X86_64_GOTPC64      29
+#define R_386_TLS_LDM_CALL    30
+#define R_X86_64_GOTPLT64     30
+#define R_386_TLS_LDM_POP     31
+#define R_X86_64_PLTOFF64     31
+#define R_386_TLS_LDO_32      32
+#define R_386_TLS_IE_32       33
+#define R_386_TLS_LE_32       34
+#define R_X86_64_GOTPC32_TLSDESC  34
+#define R_386_TLS_DTPMOD32    35
+#define R_X86_64_TLSDESC_CALL 35
+#define R_386_TLS_DTPOFF32    36
+#define R_X86_64_TLSDESC      36
+#define R_386_TLS_TPOFF32     37
+#define R_X86_64_IRELATIVE    37
+#define R_386_SIZE32          38
+#define R_386_TLS_GOTDESC     39
+#define R_386_TLS_DESC_CALL   40
+#define R_386_TLS_DESC        41
+#define R_386_IRELATIVE       42
+#define R_386_GOT32X          43
 #define R_X86_64_GNU_VTINHERIT  250
 #define R_X86_64_GNU_VTENTRY    251
 
diff --git a/hc2/external/elfio/elfio.hpp b/hc2/external/elfio/elfio.hpp
index b59295b342c..508f8e77d03 100644
--- a/hc2/external/elfio/elfio.hpp
+++ b/hc2/external/elfio/elfio.hpp
@@ -51,19 +51,21 @@ THE SOFTWARE.
 TYPE                                           \
 get_##FNAME() const                            \
 {                                              \
-    return header->get_##FNAME();              \
+  return header? header->get_##FNAME() : 0;    \
 }
 
 #define ELFIO_HEADER_ACCESS_GET_SET( TYPE, FNAME ) \
 TYPE                                               \
 get_##FNAME() const                                \
 {                                                  \
-    return header->get_##FNAME();                  \
+  return header? header->get_##FNAME() : 0;        \
 }                                                  \
 void                                               \
 set_##FNAME( TYPE val )                            \
-{                                                  \
-    header->set_##FNAME( val );                    \
+{ 						   \
+  if (header) { 			    	   \
+      header->set_##FNAME( val );                  \
+  } 						   \
 }                                                  \
 
 namespace ELFIO {
@@ -112,11 +114,9 @@ class elfio
     {
         clean();
 
-        unsigned char e_ident[EI_NIDENT];
-
-        // Read ELF file signature
-        stream.seekg( 0 );
-        stream.read( reinterpret_cast<char*>( &e_ident ), sizeof( e_ident ) );
+	unsigned char e_ident[EI_NIDENT];
+	// Read ELF file signature
+	stream.read( reinterpret_cast<char*>( &e_ident ), sizeof( e_ident ) );
 
         // Is it ELF file?
         if ( stream.gcount() != sizeof( e_ident ) ||
@@ -133,7 +133,6 @@ class elfio
         }
 
         convertor.setup( e_ident[EI_DATA] );
-
         header = create_header( e_ident[EI_CLASS], e_ident[EI_DATA] );
         if ( 0 == header ) {
             return false;
@@ -143,9 +142,8 @@ class elfio
         }
 
         load_sections( stream );
-        load_segments( stream );
-
-        return true;
+        bool is_still_good = load_segments( stream );
+        return is_still_good;
     }
 
 //------------------------------------------------------------------------------
@@ -153,12 +151,11 @@ class elfio
     {
         std::ofstream f( file_name.c_str(), std::ios::out | std::ios::binary );
 
-        if ( !f ) {
+        if ( !f || !header) {
             return false;
         }
 
         bool is_still_good = true;
-
         // Define layout specific header fields
         // The position of the segment table is fixed after the header.
         // The position of the section table is variable and needs to be fixed
@@ -172,6 +169,8 @@ class elfio
         current_file_pos = header->get_header_size() +
                     header->get_segment_entry_size() * header->get_segments_num();
 
+        calc_segment_alignment();
+
         is_still_good = layout_segments_and_their_sections();
         is_still_good = is_still_good && layout_sections_without_segments();
         is_still_good = is_still_good && layout_section_table();
@@ -248,6 +247,45 @@ class elfio
         }
     }
 
+//------------------------------------------------------------------------------
+  private:
+      bool is_offset_in_section( Elf64_Off offset, const section* sec ) const {
+          return offset >= sec->get_offset() && offset < sec->get_offset()+sec->get_size();
+      }
+
+//------------------------------------------------------------------------------
+  public:
+
+      //! returns an empty string if no problems are detected,
+      //! or a string containing an error message if problems are found
+      std::string validate() const {
+
+          // check for overlapping sections in the file
+          for ( int i = 0; i < sections.size(); ++i) {
+              for ( int j = i+1; j < sections.size(); ++j ) {
+                  const section* a = sections[i];
+                  const section* b = sections[j];
+                  if (   !(a->get_type() & SHT_NOBITS)
+                      && !(b->get_type() & SHT_NOBITS)
+                      && (a->get_size() > 0)
+                      && (b->get_size() > 0)
+                      && (a->get_offset() > 0)
+                      && (b->get_offset() > 0)) {
+                      if (   is_offset_in_section( a->get_offset(), b )
+                          || is_offset_in_section( a->get_offset()+a->get_size()-1, b )
+                          || is_offset_in_section( b->get_offset(), a )
+                          || is_offset_in_section( b->get_offset()+b->get_size()-1, a )) {
+                          return "Sections " + a->get_name() + " and " + b->get_name() + " overlap in file";
+                      }
+                  }
+              }
+          }
+
+          // more checks to be added here...
+
+          return "";
+      }
+
 //------------------------------------------------------------------------------
   private:
 //------------------------------------------------------------------------------
@@ -382,6 +420,18 @@ class elfio
         return num;
     }
 
+//------------------------------------------------------------------------------
+    //! Checks whether the addresses of the section entirely fall within the given segment.
+    //! It doesn't matter if the addresses are memory addresses, or file offsets,
+    //!  they just need to be in the same address space
+    bool is_sect_in_seg ( Elf64_Off sect_begin, Elf_Xword sect_size, Elf64_Off seg_begin, Elf64_Off seg_end ) {
+        return seg_begin <= sect_begin
+                && sect_begin + sect_size <= seg_end
+                && sect_begin < seg_end;  // this is important criteria when sect_size == 0
+                                          // Example:  seg_begin=10, seg_end=12 (-> covering the bytes 10 and 11)
+                                          //           sect_begin=12, sect_size=0  -> shall return false!
+    }
+
 //------------------------------------------------------------------------------
     bool load_segments( std::istream& stream )
     {
@@ -417,14 +467,11 @@ class elfio
                 // SHF_ALLOC sections are matched based on the virtual address
                 // otherwise the file offset is matched
                 if( psec->get_flags() & SHF_ALLOC
-                      ? (segVBaseAddr <= psec->get_address()
-                          && psec->get_address() + psec->get_size()
-                           <= segVEndAddr)
-                      : (segBaseOffset <= psec->get_offset()
-                          && psec->get_offset() + psec->get_size()
-                           <= segEndOffset)) {
-                      seg->add_section_index( psec->get_index(),
-                                              psec->get_addr_align() );
+                      ? is_sect_in_seg( psec->get_address(), psec->get_size(), segVBaseAddr,  segVEndAddr )
+                      : is_sect_in_seg( psec->get_offset(),  psec->get_size(), segBaseOffset, segEndOffset )) {
+                      // Alignment of segment shall not be updated, to preserve original value
+                      // It will be re-calculated on saving.
+                      seg->add_section_index( psec->get_index(), 0 );
                 }
             }
 
@@ -517,6 +564,9 @@ class elfio
         for( size_t i = 0; i < worklist.size(); ++i ) {
             if( i != nextSlot && worklist[i]->is_offset_initialized()
                 && worklist[i]->get_offset() == 0 ) {
+                if (worklist[nextSlot]->get_offset() == 0) {
+                    ++nextSlot;
+                }
                 std::swap(worklist[i],worklist[nextSlot]);
                 ++nextSlot;
             }
@@ -570,6 +620,20 @@ class elfio
     }
 
 
+//------------------------------------------------------------------------------
+    void calc_segment_alignment( )
+    {
+        for( std::vector<segment*>::iterator s = segments_.begin(); s != segments_.end(); ++s ) {
+            segment* seg = *s;
+            for ( int i = 0; i < seg->get_sections_num(); ++i ) {
+                section* sect = sections_[ seg->get_section_index_at(i) ];
+                if ( sect->get_addr_align() > seg->get_align() ) {
+                    seg->set_align( sect->get_addr_align() );
+                }
+            }
+        }
+    }
+
 //------------------------------------------------------------------------------
     bool layout_segments_and_their_sections( )
     {
@@ -606,11 +670,12 @@ class elfio
             // have to be aligned
             else if ( seg->get_sections_num()
                      && !section_generated[seg->get_section_index_at( 0 )] ) {
-                Elf64_Off cur_page_alignment = current_file_pos % seg->get_align();
-                Elf64_Off req_page_alignment = seg->get_virtual_address() % seg->get_align();
+                Elf_Xword align = seg->get_align() > 0 ? seg->get_align() : 1;
+                Elf64_Off cur_page_alignment = current_file_pos % align;
+                Elf64_Off req_page_alignment = seg->get_virtual_address() % align;
                 Elf64_Off error              = req_page_alignment - cur_page_alignment;
 
-                current_file_pos += ( seg->get_align() + error ) % seg->get_align();
+                current_file_pos += ( seg->get_align() + error ) % align;
                 seg_start_pos = current_file_pos;
             }
             else if ( seg->get_sections_num() ) {
@@ -633,14 +698,20 @@ class elfio
                 // Fix up the alignment
                 if ( !section_generated[index] && sec->is_address_initialized()
                     && SHT_NOBITS != sec->get_type()
-                    && SHT_NULL != sec->get_type() ) {
+                    && SHT_NULL != sec->get_type()
+                    && 0 != sec->get_size() ) {
                     // Align the sections based on the virtual addresses
                     // when possible (this is what matters for execution)
                     Elf64_Off req_offset = sec->get_address() - seg->get_virtual_address();
                     Elf64_Off cur_offset = current_file_pos - seg_start_pos;
+                    if ( req_offset < cur_offset) {
+                         // something has gone awfully wrong, abort!
+                         // secAlign would turn out negative, seeking backwards and overwriting previous data
+                         return false;
+                    }
                     secAlign             = req_offset - cur_offset;
                 }
-                else if (!section_generated[index]) {
+                else if (!section_generated[index] && !sec->is_address_initialized() ) {
                     // If no address has been specified then only the section
                     // alignment constraint has to be matched
 					Elf_Xword align = sec->get_addr_align();
@@ -650,7 +721,7 @@ class elfio
                     Elf64_Off error = current_file_pos % align;
                     secAlign = ( align - error ) % align;
                 }
-                else {
+                else if (section_generated[index] ) {
                     // Alignment for already generated sections
                     secAlign = sec->get_offset() - seg_start_pos - segment_filesize;
                 }
@@ -685,7 +756,15 @@ class elfio
             }
 
             seg->set_file_size( segment_filesize );
-            seg->set_memory_size( segment_memory );
+
+            // If we already have a memory size from loading an elf file (value > 0),
+            // it must not shrink!
+            // Memory size may be bigger than file size and it is the loader's job to do something
+            // with the surplus bytes in memory, like initializing them with a defined value.
+            if ( seg->get_memory_size() < segment_memory ) {
+                seg->set_memory_size( segment_memory );
+            }
+
             seg->set_offset(seg_start_pos);
         }
 
@@ -775,6 +854,16 @@ class elfio
             return parent->sections_.end();
         }
 
+//------------------------------------------------------------------------------
+        std::vector<section*>::const_iterator begin() const {
+            return parent->sections_.cbegin();
+        }
+
+//------------------------------------------------------------------------------
+        std::vector<section*>::const_iterator end() const {
+            return parent->sections_.cend();
+        }
+
 //------------------------------------------------------------------------------
       private:
         elfio* parent;
@@ -820,6 +909,16 @@ class elfio
             return parent->segments_.end();
         }
 
+//------------------------------------------------------------------------------
+        std::vector<segment*>::const_iterator begin() const {
+            return parent->segments_.cbegin();
+        }
+
+//------------------------------------------------------------------------------
+        std::vector<segment*>::const_iterator end() const {
+            return parent->segments_.cend();
+        }
+
 //------------------------------------------------------------------------------
       private:
         elfio* parent;
diff --git a/hc2/external/elfio/elfio_dump.hpp b/hc2/external/elfio/elfio_dump.hpp
index 04948529603..d98c1ff1881 100644
--- a/hc2/external/elfio/elfio_dump.hpp
+++ b/hc2/external/elfio/elfio_dump.hpp
@@ -429,18 +429,22 @@ class dump
 //------------------------------------------------------------------------------
     static void
     header( std::ostream& out, const elfio& reader )
-    {
-        out << "ELF Header"     << std::endl                               << std::endl
-            << "  Class:      " << str_class( reader.get_class() )         << std::endl
-            << "  Encoding:   " << str_endian( reader.get_encoding() )     << std::endl
-            << "  ELFVersion: " << str_version( reader.get_elf_version() ) << std::endl
-            << "  Type:       " << str_type( reader.get_type() )           << std::endl
-            << "  Machine:    " << str_machine( reader.get_machine() )     << std::endl
-            << "  Version:    " << str_version( reader.get_version() )     << std::endl
-            << "  Entry:      " << "0x" << std::hex << reader.get_entry()  << std::endl
-            << "  Flags:      " << "0x" << std::hex << reader.get_flags()  << std::endl
-            << std::endl;
-    }
+     {
+	if (!reader.get_header_size()) 
+	 {
+	    return;
+	 }
+	out << "ELF Header"   << std::endl                               << std::endl
+	  << "  Class:      " << str_class( reader.get_class() )         << std::endl
+	  << "  Encoding:   " << str_endian( reader.get_encoding() )     << std::endl
+	  << "  ELFVersion: " << str_version( reader.get_elf_version() ) << std::endl
+	  << "  Type:       " << str_type( reader.get_type() )           << std::endl
+	  << "  Machine:    " << str_machine( reader.get_machine() )     << std::endl
+	  << "  Version:    " << str_version( reader.get_version() )     << std::endl
+	  << "  Entry:      " << "0x" << std::hex << reader.get_entry()  << std::endl
+	  << "  Flags:      " << "0x" << std::hex << reader.get_flags()  << std::endl
+	  << std::endl;
+     }
 
 //------------------------------------------------------------------------------
     static void
@@ -728,7 +732,7 @@ class dump
                 if ( dyn_no > 0 ) {
                     out << "Dynamic section (" << sec->get_name() << ")" << std::endl;
                     out << "[  Nr ] Tag              Name/Value" << std::endl;
-                    for ( int i = 0; i < dyn_no; ++i ) {
+                    for ( Elf_Xword i = 0; i < dyn_no; ++i ) {
                         Elf_Xword   tag   = 0;
                         Elf_Xword   value = 0;
                         std::string str;
diff --git a/hc2/external/elfio/elfio_dynamic.hpp b/hc2/external/elfio/elfio_dynamic.hpp
index 6f2d041e0fc..64f13b9ce7a 100644
--- a/hc2/external/elfio/elfio_dynamic.hpp
+++ b/hc2/external/elfio/elfio_dynamic.hpp
@@ -26,13 +26,14 @@ THE SOFTWARE.
 namespace ELFIO {
 
 //------------------------------------------------------------------------------
-class dynamic_section_accessor
+template< class S >
+class dynamic_section_accessor_template
 {
   public:
 //------------------------------------------------------------------------------
-    dynamic_section_accessor( const elfio& elf_file_, section* section_ ) :
-                              elf_file( elf_file_ ),
-                              dynamic_section( section_ )
+    dynamic_section_accessor_template( const elfio& elf_file_, S* section_ ) :
+                                       elf_file( elf_file_ ),
+                                       dynamic_section( section_ )
     {
     }
 
@@ -245,9 +246,12 @@ class dynamic_section_accessor
 //------------------------------------------------------------------------------
   private:
     const elfio& elf_file;
-    section*     dynamic_section;
+    S*           dynamic_section;
 };
 
+using dynamic_section_accessor = dynamic_section_accessor_template<section>;
+using const_dynamic_section_accessor = dynamic_section_accessor_template<const section>;
+
 } // namespace ELFIO
 
 #endif // ELFIO_DYNAMIC_HPP
diff --git a/hc2/external/elfio/elfio_header.hpp b/hc2/external/elfio/elfio_header.hpp
index d689a8899f7..e8713cd7894 100644
--- a/hc2/external/elfio/elfio_header.hpp
+++ b/hc2/external/elfio/elfio_header.hpp
@@ -38,11 +38,11 @@ class elf_header
     ELFIO_GET_ACCESS_DECL( unsigned char, class              );
     ELFIO_GET_ACCESS_DECL( unsigned char, elf_version        );
     ELFIO_GET_ACCESS_DECL( unsigned char, encoding           );
-    ELFIO_GET_ACCESS_DECL( Elf_Word,      version            );
     ELFIO_GET_ACCESS_DECL( Elf_Half,      header_size        );
     ELFIO_GET_ACCESS_DECL( Elf_Half,      section_entry_size );
     ELFIO_GET_ACCESS_DECL( Elf_Half,      segment_entry_size );
 
+    ELFIO_GET_SET_ACCESS_DECL( Elf_Word,      version         );
     ELFIO_GET_SET_ACCESS_DECL( unsigned char, os_abi          );
     ELFIO_GET_SET_ACCESS_DECL( unsigned char, abi_version     );
     ELFIO_GET_SET_ACCESS_DECL( Elf_Half,      type            );
@@ -86,8 +86,6 @@ template< class T > class elf_header_impl : public elf_header
         header.e_ident[EI_CLASS]   = elf_header_impl_types<T>::file_class;
         header.e_ident[EI_DATA]    = encoding;
         header.e_ident[EI_VERSION] = EV_CURRENT;
-        header.e_version           = EV_CURRENT;
-        header.e_version           = (*convertor)( header.e_version );
         header.e_ehsize            = ( sizeof( header ) );
         header.e_ehsize            = (*convertor)( header.e_ehsize );
         header.e_shstrndx          = (*convertor)( (Elf_Half)1 );
@@ -95,6 +93,8 @@ template< class T > class elf_header_impl : public elf_header
         header.e_shentsize         = sizeof( typename elf_header_impl_types<T>::Shdr_type );
         header.e_phentsize         = (*convertor)( header.e_phentsize );
         header.e_shentsize         = (*convertor)( header.e_shentsize );
+
+		set_version( EV_CURRENT );
     }
 
     bool
@@ -119,11 +119,11 @@ template< class T > class elf_header_impl : public elf_header
     ELFIO_GET_ACCESS( unsigned char, class,              header.e_ident[EI_CLASS] );
     ELFIO_GET_ACCESS( unsigned char, elf_version,        header.e_ident[EI_VERSION] );
     ELFIO_GET_ACCESS( unsigned char, encoding,           header.e_ident[EI_DATA] );
-    ELFIO_GET_ACCESS( Elf_Word,      version,            header.e_version );
     ELFIO_GET_ACCESS( Elf_Half,      header_size,        header.e_ehsize );
     ELFIO_GET_ACCESS( Elf_Half,      section_entry_size, header.e_shentsize );
     ELFIO_GET_ACCESS( Elf_Half,      segment_entry_size, header.e_phentsize );
 
+    ELFIO_GET_SET_ACCESS( Elf_Word,      version,         header.e_version);
     ELFIO_GET_SET_ACCESS( unsigned char, os_abi,          header.e_ident[EI_OSABI] );
     ELFIO_GET_SET_ACCESS( unsigned char, abi_version,     header.e_ident[EI_ABIVERSION] );
     ELFIO_GET_SET_ACCESS( Elf_Half,      type,            header.e_type );
diff --git a/hc2/external/elfio/elfio_note.hpp b/hc2/external/elfio/elfio_note.hpp
index 35c6fe344cc..8619c7385db 100644
--- a/hc2/external/elfio/elfio_note.hpp
+++ b/hc2/external/elfio/elfio_note.hpp
@@ -38,12 +38,13 @@ namespace ELFIO {
 //------------------------------------------------------------------------------
 
 //------------------------------------------------------------------------------
-class note_section_accessor
+template< class S >
+class note_section_accessor_template
 {
   public:
 //------------------------------------------------------------------------------
-    note_section_accessor( const elfio& elf_file_, section* section_ ) :
-                           elf_file( elf_file_ ), note_section( section_ )
+    note_section_accessor_template( const elfio& elf_file_, S* section_ ) :
+                                    elf_file( elf_file_ ), note_section( section_ )
     {
         process_section();
     }
@@ -71,10 +72,10 @@ class note_section_accessor
         int align = sizeof( Elf_Word );
 
         const endianess_convertor& convertor = elf_file.get_convertor();
-        type = convertor( *(Elf_Word*)( pData + 2*align ) );
-        Elf_Word namesz = convertor( *(Elf_Word*)( pData ) );
-        descSize = convertor( *(Elf_Word*)( pData + sizeof( namesz ) ) );
-        Elf_Word max_name_size = note_section->get_size() - note_start_positions[index];
+        type = convertor( *(const Elf_Word*)( pData + 2*align ) );
+        Elf_Word namesz = convertor( *(const Elf_Word*)( pData ) );
+        descSize = convertor( *(const Elf_Word*)( pData + sizeof( namesz ) ) );
+        Elf_Xword max_name_size = note_section->get_size() - note_start_positions[index];
         if ( namesz            > max_name_size ||
              namesz + descSize > max_name_size ) {
             return false;
@@ -144,9 +145,9 @@ class note_section_accessor
         while ( current + 3*align <= size ) {
             note_start_positions.push_back( current );
             Elf_Word namesz = convertor(
-                            *(Elf_Word*)( data + current ) );
+                            *(const Elf_Word*)( data + current ) );
             Elf_Word descsz = convertor(
-                            *(Elf_Word*)( data + current + sizeof( namesz ) ) );
+                            *(const Elf_Word*)( data + current + sizeof( namesz ) ) );
 
             current += 3*sizeof( Elf_Word ) +
                        ( ( namesz + align - 1 ) / align ) * align +
@@ -157,10 +158,13 @@ class note_section_accessor
 //------------------------------------------------------------------------------
   private:
     const elfio&           elf_file;
-    section*               note_section;
+    S*                     note_section;
     std::vector<Elf_Xword> note_start_positions;
 };
 
+using note_section_accessor = note_section_accessor_template<section>;
+using const_note_section_accessor = note_section_accessor_template<const section>;
+
 } // namespace ELFIO
 
 #endif // ELFIO_NOTE_HPP
diff --git a/hc2/external/elfio/elfio_relocation.hpp b/hc2/external/elfio/elfio_relocation.hpp
index d13d8b23c7f..238598e97ba 100644
--- a/hc2/external/elfio/elfio_relocation.hpp
+++ b/hc2/external/elfio/elfio_relocation.hpp
@@ -73,13 +73,14 @@ template<> struct get_sym_and_type< Elf64_Rela >
 
 
 //------------------------------------------------------------------------------
-class relocation_section_accessor
+template< class S >
+class relocation_section_accessor_template
 {
   public:
 //------------------------------------------------------------------------------
-    relocation_section_accessor( const elfio& elf_file_, section* section_ ) :
-                                 elf_file( elf_file_ ),
-                                 relocation_section( section_ )
+    relocation_section_accessor_template( const elfio& elf_file_, S* section_ ) :
+                                          elf_file( elf_file_ ),
+                                          relocation_section( section_ )
     {
     }
 
@@ -361,9 +362,12 @@ class relocation_section_accessor
 //------------------------------------------------------------------------------
   private:
     const elfio& elf_file;
-    section*     relocation_section;
+    S*           relocation_section;
 };
 
+using relocation_section_accessor = relocation_section_accessor_template<section>;
+using const_relocation_section_accessor = relocation_section_accessor_template<const section>;
+
 } // namespace ELFIO
 
 #endif // ELFIO_RELOCATION_HPP
diff --git a/hc2/external/elfio/elfio_section.hpp b/hc2/external/elfio/elfio_section.hpp
index b2c9b456b55..cb188c14d08 100644
--- a/hc2/external/elfio/elfio_section.hpp
+++ b/hc2/external/elfio/elfio_section.hpp
@@ -45,6 +45,17 @@ class section
     ELFIO_GET_SET_ACCESS_DECL( Elf64_Addr,  address            );
     ELFIO_GET_SET_ACCESS_DECL( Elf_Xword,   size               );
     ELFIO_GET_SET_ACCESS_DECL( Elf_Word,    name_string_offset );
+    ELFIO_GET_ACCESS_DECL    ( Elf64_Off,   offset             );
+    size_t stream_size;
+    size_t get_stream_size() const
+     {
+	return stream_size;
+     }
+
+    void set_stream_size(size_t value)
+     {
+	stream_size = value;
+     }
 
     virtual const char* get_data() const                                = 0;
     virtual void        set_data( const char* pData, Elf_Word size )    = 0;
@@ -53,7 +64,7 @@ class section
     virtual void        append_data( const std::string& data )          = 0;
 
   protected:
-    ELFIO_GET_SET_ACCESS_DECL( Elf64_Off, offset );
+    ELFIO_SET_ACCESS_DECL( Elf64_Off, offset );
     ELFIO_SET_ACCESS_DECL( Elf_Half,  index  );
     
     virtual void load( std::istream&  f,
@@ -223,23 +234,29 @@ class section_impl : public section
           std::streampos header_offset )
     {
         std::fill_n( reinterpret_cast<char*>( &header ), sizeof( header ), '\0' );
+
+	stream.seekg ( 0, stream.end );
+	set_stream_size ( stream.tellg() );
+
         stream.seekg( header_offset );
         stream.read( reinterpret_cast<char*>( &header ), sizeof( header ) );
 
+
         Elf_Xword size = get_size();
-        if ( 0 == data && SHT_NULL != get_type() && SHT_NOBITS != get_type() ) {
-            try {
-                data = new char[size];
-            } catch (const std::bad_alloc&) {
-                data      = 0;
-                data_size = 0;
-            }
-            if ( 0 != size ) {
-                stream.seekg( (*convertor)( header.sh_offset ) );
-                stream.read( data, size );
-                data_size = size;
-            }
-        }
+	if ( 0 == data && SHT_NULL != get_type() && SHT_NOBITS != get_type() && size < get_stream_size()) {
+	    try {
+		data = new char[size + 1];
+	    } catch (const std::bad_alloc&) {
+		data      = 0;
+		data_size = 0;
+	    }
+	    if ( 0 != size ) {
+		stream.seekg( (*convertor)( header.sh_offset ) );
+		stream.read( data, size );
+		data[size] = 0; //ensure data is ended with 0 to avoid oob read
+		data_size = size;
+	    }
+	}
     }
 
 //------------------------------------------------------------------------------
diff --git a/hc2/external/elfio/elfio_segment.hpp b/hc2/external/elfio/elfio_segment.hpp
index 35f17e939bc..02d752a90b0 100644
--- a/hc2/external/elfio/elfio_segment.hpp
+++ b/hc2/external/elfio/elfio_segment.hpp
@@ -92,6 +92,21 @@ class segment_impl : public segment
     ELFIO_GET_SET_ACCESS( Elf_Xword,  file_size,        ph.p_filesz );
     ELFIO_GET_SET_ACCESS( Elf_Xword,  memory_size,      ph.p_memsz  );
     ELFIO_GET_ACCESS( Elf64_Off, offset, ph.p_offset );
+    size_t stream_size;
+
+//------------------------------------------------------------------------------
+    size_t
+    get_stream_size() const
+    {
+       return stream_size;
+    }
+
+//------------------------------------------------------------------------------
+    void 
+    set_stream_size(size_t value)
+    {
+       stream_size = value;
+    }
 
 //------------------------------------------------------------------------------
     Elf_Half
@@ -176,6 +191,10 @@ class segment_impl : public segment
     load( std::istream&  stream,
           std::streampos header_offset )
     {
+
+	stream.seekg ( 0, stream.end );
+	set_stream_size ( stream.tellg() );
+
         stream.seekg( header_offset );
         stream.read( reinterpret_cast<char*>( &ph ), sizeof( ph ) );
         is_offset_set = true;
@@ -183,14 +202,19 @@ class segment_impl : public segment
         if ( PT_NULL != get_type() && 0 != get_file_size() ) {
             stream.seekg( (*convertor)( ph.p_offset ) );
             Elf_Xword size = get_file_size();
-            try {
-                data = new char[size];
-            } catch (const std::bad_alloc&) {
-                data = 0;
-            }
-            if ( 0 != data ) {
-                stream.read( data, size );
-            }
+	    if ( size > get_stream_size() ) {
+		data = 0;
+	    } else {
+		try {
+		    data = new char[size + 1];
+		} catch (const std::bad_alloc&) {
+		    data = 0;
+		}
+		if ( 0 != data ) {
+		    stream.read( data, size );
+		    data[size] = 0;
+		}
+	    }
         }
     }
 
diff --git a/hc2/external/elfio/elfio_strings.hpp b/hc2/external/elfio/elfio_strings.hpp
index df952a2145d..552f000294f 100644
--- a/hc2/external/elfio/elfio_strings.hpp
+++ b/hc2/external/elfio/elfio_strings.hpp
@@ -30,12 +30,13 @@ THE SOFTWARE.
 namespace ELFIO {
 
 //------------------------------------------------------------------------------
-class string_section_accessor
+template< class S >
+class string_section_accessor_template
 {
   public:
 //------------------------------------------------------------------------------
-    string_section_accessor( section* section_ ) :
-                             string_section( section_ )
+    string_section_accessor_template( S* section_ ) :
+                                      string_section( section_ )
     {
     }
 
@@ -88,9 +89,12 @@ class string_section_accessor
 
 //------------------------------------------------------------------------------
   private:
-    section* string_section;
+    S* string_section;
 };
 
+using string_section_accessor = string_section_accessor_template<section>;
+using const_string_section_accessor = string_section_accessor_template<const section>;
+
 } // namespace ELFIO
 
 #endif // ELFIO_STRINGS_HPP
diff --git a/hc2/external/elfio/elfio_symbols.hpp b/hc2/external/elfio/elfio_symbols.hpp
index 80e498d8d59..d18756a9af9 100644
--- a/hc2/external/elfio/elfio_symbols.hpp
+++ b/hc2/external/elfio/elfio_symbols.hpp
@@ -26,13 +26,14 @@ THE SOFTWARE.
 namespace ELFIO {
 
 //------------------------------------------------------------------------------
-class symbol_section_accessor
+template< class S >
+class symbol_section_accessor_template
 {
   public:
 //------------------------------------------------------------------------------
-    symbol_section_accessor( const elfio& elf_file_, section* symbol_section_ ) :
-                             elf_file( elf_file_ ),
-                             symbol_section( symbol_section_ )
+    symbol_section_accessor_template( const elfio& elf_file_, S* symbol_section_ ) :
+                                      elf_file( elf_file_ ),
+                                      symbol_section( symbol_section_ )
     {
         find_hash_section();
     }
@@ -87,17 +88,17 @@ class symbol_section_accessor
         bool ret = false;
 
         if ( 0 != get_hash_table_index() ) {
-            Elf_Word nbucket = *(Elf_Word*)hash_section->get_data();
-            Elf_Word nchain  = *(Elf_Word*)( hash_section->get_data() +
+            Elf_Word nbucket = *(const Elf_Word*)hash_section->get_data();
+            Elf_Word nchain  = *(const Elf_Word*)( hash_section->get_data() +
                                    sizeof( Elf_Word ) );
             Elf_Word val     = elf_hash( (const unsigned char*)name.c_str() );
 
-            Elf_Word y   = *(Elf_Word*)( hash_section->get_data() +
+            Elf_Word y   = *(const Elf_Word*)( hash_section->get_data() +
                                ( 2 + val % nbucket ) * sizeof( Elf_Word ) );
             std::string   str;
             get_symbol( y, str, value, size, bind, type, section_index, other );
             while ( str != name && STN_UNDEF != y && y < nchain ) {
-                y = *(Elf_Word*)( hash_section->get_data() +
+                y = *(const Elf_Word*)( hash_section->get_data() +
                         ( 2 + nbucket + y ) * sizeof( Elf_Word ) );
                 get_symbol( y, str, value, size, bind, type, section_index, other );
             }
@@ -268,11 +269,14 @@ class symbol_section_accessor
 //------------------------------------------------------------------------------
   private:
     const elfio&   elf_file;
-    section*       symbol_section;
+    S*             symbol_section;
     Elf_Half       hash_section_index;
     const section* hash_section;
 };
 
+using symbol_section_accessor = symbol_section_accessor_template<section>;
+using const_symbol_section_accessor = symbol_section_accessor_template<const section>;
+
 } // namespace ELFIO
 
 #endif // ELFIO_SYMBOLS_HPP
diff --git a/hc2/external/elfio/elfio_utils.hpp b/hc2/external/elfio/elfio_utils.hpp
index f8423bd1475..2baf5a77ccb 100644
--- a/hc2/external/elfio/elfio_utils.hpp
+++ b/hc2/external/elfio/elfio_utils.hpp
@@ -174,7 +174,7 @@ class endianess_convertor {
     get_host_encoding() const
     {
         static const int tmp = 1;
-        if ( 1 == *(char*)&tmp ) {
+        if ( 1 == *(const char*)&tmp ) {
             return ELFDATA2LSB;
         }
         else {
diff --git a/hc2/headers/types/program_state.hpp b/hc2/headers/types/program_state.hpp
index 6ea79b2c20d..818db1b1f3c 100644
--- a/hc2/headers/types/program_state.hpp
+++ b/hc2/headers/types/program_state.hpp
@@ -15,7 +15,7 @@
 
 #include <hsa/hsa.h>
 
-#include "../../external/elfio/elfio.hpp"
+#include <elfio/elfio.hpp>
 
 #include <link.h>
 
@@ -127,18 +127,20 @@ namespace hc2
         static
         int copy_kernel_sections_(dl_phdr_info* x, size_t, void* kernels)
         {
-            static constexpr const char kernel[] = ".kernel";
-
             auto out = static_cast<T*>(kernels);
 
             ELFIO::elfio tmp;
-            if (tmp.load(x->dlpi_name)) {
-                for (auto&& y : tmp.sections) {
-                    if (y->get_name() == kernel) {
-                        out->emplace_back(
-                            y->get_data(), y->get_data() + y->get_size());
-                    }
-                }
+
+            if (!tmp.load(x->dlpi_name)) return 0;
+
+            for (auto&& y : tmp.sections) {
+                static constexpr const char kernel[] = ".kernel";
+
+                if (y->get_name() != kernel) continue;
+
+                out->emplace_back(y->get_data(), y->get_data() + y->get_size());
+
+                return 0;
             }
 
             return 0;
diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt
index 933af298f33..ca8a71e19d8 100644
--- a/include/CMakeLists.txt
+++ b/include/CMakeLists.txt
@@ -1,42 +1,47 @@
+# Handle HC Legacy, for HIP compatibility (forever?)
 # Put all hcc headers into the hcc-headers target
 # .h and .hpp headers
-FILE(GLOB H_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.h*)
-# .inl headers
-FILE(GLOB INL_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.inl)
-# N4494 headers
-FILE(GLOB N4494 ${CMAKE_CURRENT_SOURCE_DIR}/coordinate
-                ${CMAKE_CURRENT_SOURCE_DIR}/array_view)
-
-
-set(HCC_HEADERS)
-#Obtain the names of each Header File
-foreach(InFName ${H_HEADERS} ${INL_HEADERS} ${N4494})
-  STRING(REGEX REPLACE ${CMAKE_CURRENT_SOURCE_DIR}/ "" OutFName ${InFName})
-  set(HCC_HEADERS ${HCC_HEADERS} "${OutFName}")
-endforeach(InFName)
-
+set(HCC_headers
+    array_view
+    coordinate
+    hc_am_internal.hpp
+    hc_am.hpp
+    hc_defines.h
+    hc_math.hpp
+    hc_printf.hpp
+    hc_rt_debug.h
+    hc.hpp
+    hcc_features.hpp
+    kalmar_aligned_alloc.h
+    kalmar_exception.h
+    kalmar_runtime.h)
 
 # Set location for output directory
 set(output_dir "${PROJECT_BINARY_DIR}/include")
 set(out_files)
-foreach( f ${HCC_HEADERS} )
-  set( src ${CMAKE_CURRENT_SOURCE_DIR}/${f} )
-  set( dst ${output_dir}/${f} )
-  add_custom_command(OUTPUT ${dst}
-    DEPENDS ${src}
-    COMMAND ${CMAKE_COMMAND} -E copy_if_different ${src} ${dst}
-    COMMENT "Copying HCC's ${f}...")
-  list(APPEND out_files ${dst})
-endforeach( f )
+foreach(f ${HCC_headers})
+    set(src ${CMAKE_CURRENT_SOURCE_DIR}/${f})
+    set(dst ${output_dir}/${f})
+    add_custom_command(
+        OUTPUT ${dst}
+        DEPENDS ${src}
+        COMMAND ${CMAKE_COMMAND} -E copy_if_different ${src} ${dst}
+        COMMENT "Copying HCC's ${f}...")
+    list(APPEND out_files ${dst})
+endforeach()
 
 # Create target for hcc-headers and set dependencies
 add_custom_target(hcc-headers ALL DEPENDS ${out_files})
 add_dependencies(world hcc-headers)
 
 # Install command for headers
-install(FILES ${HCC_HEADERS}
-  PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ
-  DESTINATION include)
+install(
+    FILES ${HCC_headers}
+    PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ
+    DESTINATION include)
+
+# Handle pSTL
+add_subdirectory(experimental)
 
-# PSTL headers
-ADD_SUBDIRECTORY(experimental)
+# Handle HC Next (post ROCm 2.0)
+add_subdirectory(hc)
\ No newline at end of file
diff --git a/include/coordinate b/include/coordinate
index 959e6d624e0..6b326682424 100644
--- a/include/coordinate
+++ b/include/coordinate
@@ -59,34 +59,34 @@ class __coordinate_leaf {
     ptrdiff_t __idx;
     int dummy;
 public:
-    explicit __coordinate_leaf(ptrdiff_t __t) restrict(amp,cpu) : __idx(__t) {}
+    explicit __coordinate_leaf(ptrdiff_t __t) [[cpu, hc]] : __idx(__t) {}
 
-    __coordinate_leaf& operator=(const ptrdiff_t __t) restrict(amp,cpu) {
+    __coordinate_leaf& operator=(const ptrdiff_t __t) [[cpu, hc]] {
         __idx = __t;
         return *this;
     }
-    __coordinate_leaf& operator+=(const ptrdiff_t __t) restrict(amp,cpu) {
+    __coordinate_leaf& operator+=(const ptrdiff_t __t) [[cpu, hc]] {
         __idx += __t;
         return *this;
     }
-    __coordinate_leaf& operator-=(const ptrdiff_t __t) restrict(amp,cpu) {
+    __coordinate_leaf& operator-=(const ptrdiff_t __t) [[cpu, hc]] {
         __idx -= __t;
         return *this;
     }
-    __coordinate_leaf& operator*=(const ptrdiff_t __t) restrict(amp,cpu) {
+    __coordinate_leaf& operator*=(const ptrdiff_t __t) [[cpu, hc]] {
         __idx *= __t;
         return *this;
     }
-    __coordinate_leaf& operator/=(const ptrdiff_t __t) restrict(amp,cpu) {
+    __coordinate_leaf& operator/=(const ptrdiff_t __t) [[cpu, hc]] {
         __idx /= __t;
         return *this;
     }
-    ptrdiff_t& get()       restrict(amp,cpu) { return __idx; }
-    const ptrdiff_t& get() const restrict(amp,cpu) { return __idx; }
+    ptrdiff_t& get()       [[cpu, hc]] { return __idx; }
+    const ptrdiff_t& get() const [[cpu, hc]] { return __idx; }
 };
 
 template <class ..._Tp>
-inline void __std_swallow(_Tp&&...) /*noexcept*/ restrict(amp,cpu) {}
+inline void __std_swallow(_Tp&&...) /*noexcept*/ [[cpu, hc]] {}
 
 inline const ptrdiff_t coordinate_mul()
 {
@@ -111,61 +111,61 @@ struct __coordinate_impl<__std_indices<N...>>
 {
 private:
     template<typename ..._Up>
-        explicit __coordinate_impl(_Up... __u) restrict(amp,cpu)
+        explicit __coordinate_impl(_Up... __u) [[cpu, hc]]
         : __coordinate_leaf<N>(__u)... {}
 
 public:
-    __coordinate_impl() restrict(amp,cpu)
+    __coordinate_impl() [[cpu, hc]]
         : __coordinate_leaf<N>(0)... {}
 
-    __coordinate_impl(initializer_list<ptrdiff_t> il) restrict(amp,cpu) :
+    __coordinate_impl(initializer_list<ptrdiff_t> il) [[cpu, hc]] :
         __coordinate_leaf<N>(*(il.begin() + N))... {}
 
-    __coordinate_impl(const __coordinate_impl& other) restrict(amp,cpu)
+    __coordinate_impl(const __coordinate_impl& other) [[cpu, hc]]
         : __coordinate_impl(static_cast<const __coordinate_leaf<N>&>(other).get()...) {}
 
-    __coordinate_impl(ptrdiff_t component) restrict(amp,cpu)
+    __coordinate_impl(ptrdiff_t component) [[cpu, hc]]
         : __coordinate_leaf<N>(component)... {}
 
-    const ptrdiff_t& operator[] (size_t c) const restrict(amp,cpu) {
+    const ptrdiff_t& operator[] (size_t c) const [[cpu, hc]] {
         return static_cast<const __coordinate_leaf<0>&>(*((const __coordinate_leaf<0> *)this + c)).get();
     }
-    ptrdiff_t& operator[] (size_t c) restrict(amp,cpu) {
+    ptrdiff_t& operator[] (size_t c) [[cpu, hc]] {
         return static_cast<__coordinate_leaf<0>&>(*((__coordinate_leaf<0> *)this + c)).get();
     }
-    __coordinate_impl& operator=(const __coordinate_impl& __t) restrict(amp,cpu) {
+    __coordinate_impl& operator=(const __coordinate_impl& __t) [[cpu, hc]] {
         __std_swallow(__coordinate_leaf<N>::operator=(static_cast<const __coordinate_leaf<N>&>(__t).get())...);
         return *this;
     }
-    __coordinate_impl& operator+=(const __coordinate_impl& __t) restrict(amp,cpu) {
+    __coordinate_impl& operator+=(const __coordinate_impl& __t) [[cpu, hc]] {
         __std_swallow(__coordinate_leaf<N>::operator+=(static_cast<const __coordinate_leaf<N>&>(__t).get())...);
         return *this;
     }
-    __coordinate_impl& operator-=(const __coordinate_impl& __t) restrict(amp,cpu) {
+    __coordinate_impl& operator-=(const __coordinate_impl& __t) [[cpu, hc]] {
         __std_swallow(__coordinate_leaf<N>::operator-=(static_cast<const __coordinate_leaf<N>&>(__t).get())...);
         return *this;
     }
-    __coordinate_impl& operator*=(const __coordinate_impl& __t) restrict(amp,cpu) {
+    __coordinate_impl& operator*=(const __coordinate_impl& __t) [[cpu, hc]] {
         __std_swallow(__coordinate_leaf<N>::operator*=(static_cast<const __coordinate_leaf<N>&>(__t).get())...);
         return *this;
     }
-    __coordinate_impl& operator/=(const __coordinate_impl& __t) restrict(amp,cpu) {
+    __coordinate_impl& operator/=(const __coordinate_impl& __t) [[cpu, hc]] {
         __std_swallow(__coordinate_leaf<N>::operator/=(static_cast<const __coordinate_leaf<N>&>(__t).get())...);
         return *this;
     }
-    __coordinate_impl& operator+=(const ptrdiff_t __t) restrict(amp,cpu) {
+    __coordinate_impl& operator+=(const ptrdiff_t __t) [[cpu, hc]] {
         __std_swallow(__coordinate_leaf<N>::operator+=(__t)...);
         return *this;
     }
-    __coordinate_impl& operator-=(const ptrdiff_t __t) restrict(amp,cpu) {
+    __coordinate_impl& operator-=(const ptrdiff_t __t) [[cpu, hc]] {
         __std_swallow(__coordinate_leaf<N>::operator-=(__t)...);
         return *this;
     }
-    __coordinate_impl& operator*=(const ptrdiff_t __t) restrict(amp,cpu) {
+    __coordinate_impl& operator*=(const ptrdiff_t __t) [[cpu, hc]] {
         __std_swallow(__coordinate_leaf<N>::operator*=(__t)...);
         return *this;
     }
-    __coordinate_impl& operator/=(const ptrdiff_t __t) restrict(amp,cpu) {
+    __coordinate_impl& operator/=(const ptrdiff_t __t) [[cpu, hc]] {
         __std_swallow(__coordinate_leaf<N>::operator/=(__t)...);
         return *this;
     }
@@ -186,14 +186,14 @@ public:
     }
 };
  
-extern "C" __attribute__((const)) uint32_t amp_get_global_id(unsigned int n) restrict(amp);
+extern "C" __attribute__((const)) uint32_t amp_get_global_id(unsigned int n) [[hc]];
 
 template<size_t N> class offset;
 
 template<size_t N, typename _Tp>
 struct offset_helper
 {
-    static inline void set(_Tp& now) restrict(amp,cpu) {
+    static inline void set(_Tp& now) [[cpu, hc]] {
         now[N - 1] = static_cast<size_t>(amp_get_global_id(_Tp::rank - N));
         offset_helper<N - 1, _Tp>::set(now);
     }
@@ -201,7 +201,7 @@ struct offset_helper
 template<typename _Tp>
 struct offset_helper<1, _Tp>
 {
-    static inline void set(_Tp& now) restrict(amp,cpu) {
+    static inline void set(_Tp& now) [[cpu, hc]] {
         now[0] = static_cast<size_t>(amp_get_global_id(_Tp::rank - 1));
     }
 };
@@ -216,75 +216,75 @@ public:
     using size_type           = size_t;
     using value_type          = ptrdiff_t;
 
-    offset() /*noexcept*/ restrict(amp,cpu) : base_() {}
+    offset() /*noexcept*/ [[cpu, hc]] : base_() {}
 
     template <size_t K = N, class = typename enable_if<K == 1>::type>
-    offset(value_type v) /*noexcept*/ restrict(amp,cpu) : base_(v) {}
+    offset(value_type v) /*noexcept*/ [[cpu, hc]] : base_(v) {}
 
-    offset(initializer_list<value_type> il) restrict(amp,cpu) : base_(il)
+    offset(initializer_list<value_type> il) [[cpu, hc]] : base_(il)
     { 
 #if __KALMAR_ACCELERATOR__ != 1
         assert(il.size() == N);
 #endif
     };
 
-    reference       operator[](size_type n) restrict(amp,cpu) {
+    reference       operator[](size_type n) [[cpu, hc]] {
 #if __KALMAR_ACCELERATOR__ != 1
         assert(n < N);
 #endif
         return base_[n];
     }
-    const_reference operator[](size_type n) const restrict(amp,cpu) {
+    const_reference operator[](size_type n) const [[cpu, hc]] {
 #if __KALMAR_ACCELERATOR__ != 1
         assert(n < N);
 #endif
         return base_[n];
     }
 
-    offset& operator+=(const offset& rhs) restrict(amp,cpu) {
+    offset& operator+=(const offset& rhs) [[cpu, hc]] {
         base_ += rhs.base_;
         return *this;
     }
-    offset& operator-=(const offset& rhs) restrict(amp,cpu) {
+    offset& operator-=(const offset& rhs) [[cpu, hc]] {
         base_ -= rhs.base_;
         return *this;
     }
 
     template <size_t K = N, class = typename enable_if<K == 1>::type>
-    offset& operator++() restrict(amp,cpu) {
+    offset& operator++() [[cpu, hc]] {
         base_ += 1;
         return *this;
     }
     template <size_t K = N, class = typename enable_if<K == 1>::type>
-    offset  operator++(int) restrict(amp,cpu) {
+    offset  operator++(int) [[cpu, hc]] {
         offset ret = *this;
         base_ += 1;
         return ret;
     };
     template <size_t K = N, class = typename enable_if<K == 1>::type>
-    offset& operator--() restrict(amp,cpu) {
+    offset& operator--() [[cpu, hc]] {
         base_ -= 1;
         return *this;
     }
     template <size_t K = N, class = typename enable_if<K == 1>::type>
-    offset  operator--(int) restrict(amp,cpu) {
+    offset  operator--(int) [[cpu, hc]] {
         offset ret = *this;
         base_ -= 1;
         return ret;
     }
 
-    offset  operator+() const /*noexcept*/ restrict(amp,cpu)  { return *this; }
-    offset  operator-() const restrict(amp,cpu) {
+    offset  operator+() const /*noexcept*/ [[cpu, hc]]  { return *this; }
+    offset  operator-() const [[cpu, hc]] {
         offset __r;
         __r -= *this;
         return __r;
     }
 
-    offset& operator*=(value_type v) restrict(amp,cpu) {
+    offset& operator*=(value_type v) [[cpu, hc]] {
         base_ *= v;
         return *this;
     }
-    offset& operator/=(value_type v) restrict(amp,cpu) {
+    offset& operator/=(value_type v) [[cpu, hc]] {
         base_ /= v;
         return *this;
     }
@@ -300,7 +300,7 @@ private:
 
 public:
     __attribute__((annotate("__cxxamp_opencl_index")))
-    void __cxxamp_opencl_index() restrict(amp,cpu)
+    void __cxxamp_opencl_index() [[cpu, hc]]
 #if __KALMAR_ACCELERATOR__ == 1
     {
       offset_helper<N, offset<N>>::set(*this);
@@ -394,7 +394,7 @@ class bounds_iterator : public std::iterator<std::random_access_iterator_tag,
     template <size_t K> friend class bounds;
     ptrdiff_t stride;
     bounds<N> bnd_;  // exposition only
-    explicit bounds_iterator(const bounds<N>& bnd_, ptrdiff_t stride_ = 0) restrict(amp,cpu)
+    explicit bounds_iterator(const bounds<N>& bnd_, ptrdiff_t stride_ = 0) [[cpu, hc]]
         : bnd_(bnd_), stride(stride_) {}
 public:
     using value_type        = offset<N>;
@@ -523,16 +523,16 @@ public:
     using size_type           = size_t;
     using value_type          = ptrdiff_t;
 
-    bounds() restrict(amp,cpu) : base_() {}
+    bounds() [[cpu, hc]] : base_() {}
 
     template <size_t K = N, class = typename enable_if<K == 1>::type>
-    bounds(value_type v) restrict(amp,cpu) : base_(v) {
+    bounds(value_type v) [[cpu, hc]] : base_(v) {
 #if __KALMAR_ACCELERATOR__ != 1
         assert(v >= 0 && v <= numeric_limits<ptrdiff_t>::max());
 #endif
     }
 
-    bounds(initializer_list<value_type> il) restrict(amp,cpu) : base_(il) {
+    bounds(initializer_list<value_type> il) [[cpu, hc]] : base_(il) {
 #if __KALMAR_ACCELERATOR__ != 1
         assert(il.size() == N);
 #endif
diff --git a/include/experimental/algorithm b/include/experimental/algorithm
index 408bcd8047f..eb7a8f35015 100644
--- a/include/experimental/algorithm
+++ b/include/experimental/algorithm
@@ -19,7 +19,7 @@
  */
 
 #pragma once
-#include "../hc.hpp"
+#include <hc/hc.hpp>
 
 #include "execution_policy"
 
diff --git a/include/experimental/impl/algorithm_impl.inl b/include/experimental/impl/algorithm_impl.inl
index b2f9a5867b1..a261653b647 100644
--- a/include/experimental/impl/algorithm_impl.inl
+++ b/include/experimental/impl/algorithm_impl.inl
@@ -46,12 +46,12 @@ void generate_impl(ForwardIterator first, ForwardIterator last,
   }
 
   // FIXME: [[hc]] will cause g() having ambient context,
-  //        use restrict(amp) temporarily
+  //        use [[hc]] temporarily
   using _Ty = typename std::iterator_traits<ForwardIterator>::value_type;
   auto first_ = utils::get_pointer(first);
   hc::array_view<_Ty> av(hc::extent<1>(N), first_);
   av.discard_data();
-  kernel_launch(N, [av, g](hc::index<1> idx) restrict(amp) {
+  kernel_launch(N, [av, g](hc::index<1> idx) [[hc]] {
     av(idx) = g();
   });
 }
diff --git a/include/grid_launch.h b/include/grid_launch.h
deleted file mode 100644
index f91d23341a3..00000000000
--- a/include/grid_launch.h
+++ /dev/null
@@ -1,69 +0,0 @@
-#pragma once
-
-#include <stdint.h>
-
-#include <hc_defines.h>
-
-#define GRID_LAUNCH_VERSION 20
-
-// Extern definitions
-namespace hc{
-class completion_future;
-class accelerator_view;
-}
-
-
-// 3 dim structure for groups and grids.
-typedef struct gl_dim3
-{
-  int x,y,z;
-  gl_dim3(uint32_t _x=1, uint32_t _y=1, uint32_t _z=1) : x(_x), y(_y), z(_z) {};
-} gl_dim3;
-
-typedef enum gl_barrier_bit {
-    barrier_bit_queue_default,
-    barrier_bit_none,
-    barrier_bit_wait,
-} gl_barrier_bit;
-
-
-// grid_launch_parm contains information used to launch the kernel.
-typedef struct grid_launch_parm
-{
-  //! Grid dimensions
-  gl_dim3      grid_dim;
-
-  //! Group dimensions
-  gl_dim3      group_dim;;
-
-  //! Amount of dynamic group memory to use with the kernel launch.
-  //! This memory is in addition to the amount used statically in the kernel.
-  unsigned int  dynamic_group_mem_bytes;;  
-
-  //! Control setting of barrier bit on per-packet basis:
-  //! See gl_barrier_bit description.  
-  //! Placeholder, is not used to control packet dispatch yet
-  enum gl_barrier_bit barrier_bit;
-
-  //! Value of packet fences to apply to launch.
-  //! The correspond to the value of bits 9:14 in the AQL packet,
-  //! see HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE and hsa_fence_scope_t.
-  //! Set to -1 for conservative defaults.
-  //! Placeholder, is not used to control packet dispatch yet
-  unsigned int  launch_fence;
-
-  //! Pointer to the accelerator_view where the kernel should execute.
-  //! If NULL, the default view on the default accelerator is used.
-  hc::accelerator_view  *av;
-
-  //! Pointe to the completion_future used to track the status of the command.
-  //! If NULL, the command does not write status.  In this case, 
-  //! synchronization can be enforced with queue-level waits or 
-  //! waiting on younger commands.
-  hc::completion_future *cf;
-
-  grid_launch_parm() = default;
-} grid_launch_parm;
-
-
-extern void init_grid_launch(grid_launch_parm *gl);
diff --git a/include/grid_launch.hpp b/include/grid_launch.hpp
deleted file mode 100644
index 04ce7e03664..00000000000
--- a/include/grid_launch.hpp
+++ /dev/null
@@ -1,50 +0,0 @@
-#pragma once
-
-#include "grid_launch.h"
-#include "hc.hpp"
-
-class grid_launch_parm_cxx : public grid_launch_parm
-{
-public:
-  grid_launch_parm_cxx() = default;
-
-  // customized serialization: don't need av and cf in kernel
-  __attribute__((annotate("serialize")))
-  void __cxxamp_serialize(Kalmar::Serialize& s) const {
-    s.Append(sizeof(int), &grid_dim.x);
-    s.Append(sizeof(int), &grid_dim.y);
-    s.Append(sizeof(int), &grid_dim.z);
-    s.Append(sizeof(int), &group_dim.x);
-    s.Append(sizeof(int), &group_dim.y);
-    s.Append(sizeof(int), &group_dim.z);
-  }
-
-  __attribute__((annotate("user_deserialize")))
-  grid_launch_parm_cxx(int grid_dim_x,  int grid_dim_y,  int grid_dim_z,
-                   int group_dim_x, int group_dim_y, int group_dim_z) {
-    grid_dim.x  = grid_dim_x;
-    grid_dim.y  = grid_dim_y;
-    grid_dim.z  = grid_dim_z;
-    group_dim.x = group_dim_x;
-    group_dim.y = group_dim_y;
-    group_dim.z = group_dim_z;
-  }
-};
-
-
-extern inline void grid_launch_init(grid_launch_parm *lp) {
-  lp->grid_dim.x = lp->grid_dim.y = lp->grid_dim.z = 1;
-
-  lp->group_dim.x = lp->group_dim.y = lp->group_dim.z = 1;
-
-  lp->dynamic_group_mem_bytes = 0;
-
-  lp->barrier_bit = barrier_bit_queue_default;
-  lp->launch_fence = -1;
-
-  // TODO - set to NULL?
-  static hc::accelerator_view av = hc::accelerator().get_default_view();
-  lp->av = &av;
-  lp->cf = NULL;
-}
-
diff --git a/include/hc.hpp b/include/hc.hpp
index 5817321b902..b456e633b41 100644
--- a/include/hc.hpp
+++ b/include/hc.hpp
@@ -12,17 +12,14 @@
 
 #pragma once
 
+#if !defined(__HIPCC__)
+  #warning "This header is only intended for HIP usage, and not for direct inclusion."
+#endif
+
 #include "hc_defines.h"
 #include "kalmar_exception.h"
-#include "kalmar_index.h"
 #include "kalmar_runtime.h"
-#include "kalmar_serialize.h"
-#include "kalmar_launch.h"
-#include "kalmar_buffer.h"
-#include "kalmar_math.h"
 
-#include "hsa_atomic.h"
-#include "kalmar_cpu_launch.h"
 #include "hcc_features.hpp"
 
 #ifndef __HC__
@@ -50,32 +47,10 @@ class AmPointerInfo;
 using namespace Kalmar::enums;
 using namespace Kalmar::CLAMP;
 
-
 // forward declaration
 class accelerator;
 class accelerator_view;
 class completion_future;
-template <int N> class extent;
-template <int N> class tiled_extent;
-template <typename T, int N> class array_view;
-template <typename T, int N> class array;
-
-
-
-// namespace alias
-// namespace hc::fast_math is an alias of namespace Kalmar::fast_math
-namespace fast_math = Kalmar::fast_math;
-
-// namespace hc::precise_math is an alias of namespace Kalmar::precise_math
-namespace precise_math = Kalmar::precise_math;
-
-// type alias
-
-/**
- * Represents a unique position in N-dimensional space.
- */
-template <int N>
-using index = Kalmar::index<N>;
 
 using runtime_exception = Kalmar::runtime_exception;
 using invalid_compute_domain = Kalmar::invalid_compute_domain;
@@ -661,64 +636,6 @@ class accelerator_view {
     std::shared_ptr<Kalmar::KalmarQueue> pQueue;
 
     friend class accelerator;
-    template <typename Q, int K> friend class array;
-    template <typename Q, int K> friend class array_view;
-  
-    template<typename Kernel> friend
-        void* Kalmar::mcw_cxxamp_get_kernel(const std::shared_ptr<Kalmar::KalmarQueue>&, const Kernel&);
-    template<typename Kernel, int dim_ext> friend
-        void Kalmar::mcw_cxxamp_execute_kernel_with_dynamic_group_memory(const std::shared_ptr<Kalmar::KalmarQueue>&, size_t *, size_t *, const Kernel&, void*, size_t);
-    template<typename Kernel, int dim_ext> friend
-        std::shared_ptr<Kalmar::KalmarAsyncOp> Kalmar::mcw_cxxamp_execute_kernel_with_dynamic_group_memory_async(const std::shared_ptr<Kalmar::KalmarQueue>&, size_t *, size_t *, const Kernel&, void*, size_t);
-    template<typename Kernel, int dim_ext> friend
-        void Kalmar::mcw_cxxamp_launch_kernel(const std::shared_ptr<Kalmar::KalmarQueue>&, size_t *, size_t *, const Kernel&);
-    template<typename Kernel, int dim_ext> friend
-        std::shared_ptr<Kalmar::KalmarAsyncOp> Kalmar::mcw_cxxamp_launch_kernel_async(const std::shared_ptr<Kalmar::KalmarQueue>&, size_t *, size_t *, const Kernel&);
-  
-#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
-    template <typename Kernel, int N> friend
-        completion_future launch_cpu_task_async(const std::shared_ptr<Kalmar::KalmarQueue>&, Kernel const&, extent<N> const&);
-#endif
-
-    // non-tiled parallel_for_each
-    // generic version
-    template <int N, typename Kernel> friend
-        completion_future parallel_for_each(const accelerator_view&, const extent<N>&, const Kernel&);
-  
-    // 1D specialization
-    template <typename Kernel> friend
-        completion_future parallel_for_each(const accelerator_view&, const extent<1>&, const Kernel&);
-  
-    // 2D specialization
-    template <typename Kernel> friend
-        completion_future parallel_for_each(const accelerator_view&, const extent<2>&, const Kernel&);
-  
-    // 3D specialization
-    template <typename Kernel> friend
-        completion_future parallel_for_each(const accelerator_view&, const extent<3>&, const Kernel&);
-  
-    // tiled parallel_for_each, 3D version
-    template <typename Kernel> friend
-        completion_future parallel_for_each(const accelerator_view&, const tiled_extent<3>&, const Kernel&);
-  
-    // tiled parallel_for_each, 2D version
-    template <typename Kernel> friend
-        completion_future parallel_for_each(const accelerator_view&, const tiled_extent<2>&, const Kernel&);
-  
-    // tiled parallel_for_each, 1D version
-    template <typename Kernel> friend
-        completion_future parallel_for_each(const accelerator_view&, const tiled_extent<1>&, const Kernel&);
-
-
-#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
-public:
-#endif
-    __attribute__((annotate("user_deserialize")))
-    accelerator_view() __CPU__ __HC__ {
-#if __KALMAR_ACCELERATOR__ != 1
-        throw runtime_exception("errorMsg_throw", 0);
-#endif
-    }
 };
 
 // ------------------------------------------------------------------------
@@ -1310,7 +1227,7 @@ class completion_future {
     //        void then(const functor& func) const;
     template<typename functor>
     void then(const functor & func) {
-#if __KALMAR_ACCELERATOR__ != 1
+#if __HCC_ACCELERATOR__ != 1
       // could only assign once
       if (__thread_then == nullptr) {
         // spawn a new thread to wait on the future and then execute the callback functor
@@ -1424,63 +1341,6 @@ class completion_future {
         : __amp_future(__future), __thread_then(nullptr), __asyncOp(nullptr) {}
 
     friend class Kalmar::HSAQueue;
-    
-    // non-tiled parallel_for_each
-    // generic version
-    template <int N, typename Kernel> friend
-        completion_future parallel_for_each(const accelerator_view&, const extent<N>&, const Kernel&);
-
-    // 1D specialization
-    template <typename Kernel> friend
-        completion_future parallel_for_each(const accelerator_view&, const extent<1>&, const Kernel&);
-
-    // 2D specialization
-    template <typename Kernel> friend
-        completion_future parallel_for_each(const accelerator_view&, const extent<2>&, const Kernel&);
-
-    // 3D specialization
-    template <typename Kernel> friend
-        completion_future parallel_for_each(const accelerator_view&, const extent<3>&, const Kernel&);
-
-    // tiled parallel_for_each, 3D version
-    template <typename Kernel> friend
-        completion_future parallel_for_each(const accelerator_view&, const tiled_extent<3>&, const Kernel&);
-
-    // tiled parallel_for_each, 2D version
-    template <typename Kernel> friend
-        completion_future parallel_for_each(const accelerator_view&, const tiled_extent<2>&, const Kernel&);
-
-    // tiled parallel_for_each, 1D version
-    template <typename Kernel> friend
-        completion_future parallel_for_each(const accelerator_view&, const tiled_extent<1>&, const Kernel&);
-
-    // copy_async
-    template <typename T, int N> friend
-        completion_future copy_async(const array_view<const T, N>& src, const array_view<T, N>& dest);
-    template <typename T, int N> friend
-        completion_future copy_async(const array<T, N>& src, array<T, N>& dest);
-    template <typename T, int N> friend
-        completion_future copy_async(const array<T, N>& src, const array_view<T, N>& dest);
-    template <typename T, int N> friend
-        completion_future copy_async(const array_view<T, N>& src, const array_view<T, N>& dest);
-    template <typename T, int N> friend
-        completion_future copy_async(const array_view<const T, N>& src, array<T, N>& dest);
-
-    template <typename InputIter, typename T, int N> friend
-        completion_future copy_async(InputIter srcBegin, InputIter srcEnd, array<T, N>& dest);
-    template <typename InputIter, typename T, int N> friend
-        completion_future copy_async(InputIter srcBegin, InputIter srcEnd, const array_view<T, N>& dest);
-    template <typename InputIter, typename T, int N> friend
-        completion_future copy_async(InputIter srcBegin, array<T, N>& dest);
-    template <typename InputIter, typename T, int N> friend
-        completion_future copy_async(InputIter srcBegin, const array_view<T, N>& dest);
-    template <typename OutputIter, typename T, int N> friend
-        completion_future copy_async(const array<T, N>& src, OutputIter destBegin);
-    template <typename OutputIter, typename T, int N> friend
-        completion_future copy_async(const array_view<T, N>& src, OutputIter destBegin);
-
-    // array_view
-    template <typename T, int N> friend class array_view;
 
     // accelerator_view
     friend class accelerator_view;
@@ -1609,1291 +1469,482 @@ accelerator_view::copy2d_async_ext(const void *src, void *dst, size_t width, siz
 };
 
 // ------------------------------------------------------------------------
-// extent
+// Intrinsic functions for HSAIL instructions
 // ------------------------------------------------------------------------
 
 /**
- * Represents a unique position in N-dimensional space.
+ * Fetch the size of a wavefront
  *
- * @tparam N The dimension to this extent applies. Special constructors are
- *           supplied for the cases where @f$N \in \{ 1,2,3 \}@f$, but N can
- *           be any integer greater than or equal to 1.
+ * @return The size of a wavefront.
  */
-template <int N>
-class extent {
-public:
-    /**
-     * A static member of extent<N> that contains the rank of this extent.
-     */
-    static const int rank = N;
-
-    /**
-     * The element type of extent<N>.
-     */
-    typedef int value_type;
-
-    /**
-     * Default constructor. The value at each dimension is initialized to zero.
-     * Thus, "extent<3> ix;" initializes the variable to the position (0,0,0).
-     */
-    extent() __CPU__ __HC__ : base_() {
-      static_assert(N > 0, "Dimensionality must be positive");
-    };
-
-    /**
-     * Copy constructor. Constructs a new extent<N> from the supplied argument.
-     *
-     * @param other An object of type extent<N> from which to initialize this
-     *              new extent.
-     */
-    extent(const extent& other) __CPU__ __HC__
-        : base_(other.base_) {}
-
-    /** @{ */
-    /**
-     * Constructs an extent<N> with the coordinate values provided by @f$e_{0..2}@f$.
-     * These are specialized constructors that are only valid when the rank of
-     * the extent @f$N \in \{1,2,3\}@f$. Invoking a specialized constructor
-     * whose argument @f$count \ne N@f$ will result in a compilation error.
-     *
-     * @param[in] e0 The component values of the extent vector.
-     */
-    explicit extent(int e0) __CPU__ __HC__
-        : base_(e0) {}
-
-    template <typename ..._Tp>
-        explicit extent(_Tp ... __t) __CPU__ __HC__
-        : base_(__t...) {
-      static_assert(sizeof...(__t) <= 3, "Can only supply at most 3 individual coordinates in the constructor");
-      static_assert(sizeof...(__t) == N, "rank should be consistency");
-    }
-
-    /** @} */
-
-    /**
-     * Constructs an extent<N> with the coordinate values provided the array of
-     * int component values. If the coordinate array length @f$\ne@f$ N, the
-     * behavior is undefined. If the array value is NULL or not a valid
-     * pointer, the behavior is undefined.
-     *
-     * @param[in] components An array of N int values.
-     */
-    explicit extent(const int components[]) __CPU__ __HC__
-        : base_(components) {}
-
-    /**
-     * Constructs an extent<N> with the coordinate values provided the array of
-     * int component values. If the coordinate array length @f$\ne@f$ N, the
-     * behavior is undefined. If the array value is NULL or not a valid
-     * pointer, the behavior is undefined.
-     *
-     * @param[in] components An array of N int values.
-     */
-    explicit extent(int components[]) __CPU__ __HC__
-        : base_(components) {}
-
-    /**
-     * Assigns the component values of "other" to this extent<N> object.
-     *
-     * @param[in] other An object of type extent<N> from which to copy into
-     *                  this extent.
-     * @return Returns *this.
-     */
-    extent& operator=(const extent& other) __CPU__ __HC__ {
-        base_.operator=(other.base_);
-        return *this;
-    }
-
-    /** @{ */
-    /**
-     * Returns the extent component value at position c.
-     *
-     * @param[in] c The dimension axis whose coordinate is to be accessed.
-     * @return A the component value at position c.
-     */
-    int operator[] (unsigned int c) const __CPU__ __HC__ {
-        return base_[c];
-    }
-    int& operator[] (unsigned int c) __CPU__ __HC__ {
-        return base_[c];
-    }
-
-    /** @} */
+#define __HSA_WAVEFRONT_SIZE__ (64)
+extern "C" unsigned int __wavesize() __HC__; 
 
-    /**
-     * Tests whether the index "idx" is properly contained within this extent
-     * (with an assumed origin of zero).
-     *
-     * @param[in] idx An object of type index<N>
-     * @return Returns true if the "idx" is contained within the space defined
-     *         by this extent (with an assumed origin of zero).
-     */
-    bool contains(const index<N>& idx) const __CPU__ __HC__ {
-        return Kalmar::amp_helper<N, index<N>, extent<N>>::contains(idx, *this);
-    }
 
-    /**
-     * This member function returns the total linear size of this extent<N> (in
-     * units of elements), which is computed as:
-     * extent[0] * extent[1] ... * extent[N-1]
-     */
-    unsigned int size() const __CPU__ __HC__ {
-        return Kalmar::index_helper<N, extent<N>>::count_size(*this);
-    }
+#if __hcc_backend__==HCC_BACKEND_AMDGPU
+extern "C" inline unsigned int __wavesize() __HC__ {
+  return __HSA_WAVEFRONT_SIZE__;
+}
+#endif
 
-    /** @{ */
-    /**
-     * Produces a tiled_extent object with the tile extents given by t0, t1,
-     * and t2.
-     *
-     * tile(t0, t1, t2) is only supported on extent<1>. It will produce a
-     * compile-time error if used on an extent where N @f$\ne@f$ 3.
-     * tile(t0, t1) is only supported on extent<2>. It will produce a
-     * compile-time error if used on an extent where N @f$\ne@f$ 2.
-     * tile(t0) is only supported on extent<1>. It will produce a
-     * compile-time error if used on an extent where N @f$\ne@f$ 1.
-     */
-    tiled_extent<1> tile(int t0) const;
-    tiled_extent<2> tile(int t0, int t1) const;
-    tiled_extent<3> tile(int t0, int t1, int t2) const;
+/**
+ * Count number of 1 bits in the input
+ *
+ * @param[in] input An unsinged 32-bit integer.
+ * @return Number of 1 bits in the input.
+ */
+extern "C" inline unsigned int __popcount_u32_b32(unsigned int input) __HC__ {
+  return __builtin_popcount(input);
+}
 
-    /** @} */
+/**
+ * Count number of 1 bits in the input
+ *
+ * @param[in] input An unsinged 64-bit integer.
+ * @return Number of 1 bits in the input.
+ */
+extern "C" inline unsigned int __popcount_u32_b64(unsigned long long int input) __HC__ {
+  return __builtin_popcountl(input);
+}
 
-    /** @{ */
-    /**
-     * Produces a tiled_extent object with the tile extents given by t0, t1,
-     * and t2, plus a certain amount of dynamic group segment.
-     */
-    tiled_extent<1> tile_with_dynamic(int t0, int dynamic_size) const;
-    tiled_extent<2> tile_with_dynamic(int t0, int t1, int dynamic_size) const;
-    tiled_extent<3> tile_with_dynamic(int t0, int t1, int t2, int dynamic_size) const;
+/** @{ */
+/**
+ * Extract a range of bits
+ *
+ * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/bit_string.htm">HSA PRM 5.7</a> for more detailed specification of these functions.
+ */
+extern "C" inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__ {
+  uint32_t offset = src1 & 31;
+  uint32_t width = src2 & 31;
+  return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
+}
 
-    /** @} */
+extern "C" inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) __HC__ {
+  uint64_t offset = src1 & 63;
+  uint64_t width = src2 & 63;
+  return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
+}
 
-    /** @{ */
-    /**
-     * Compares two objects of extent<N>.
-     *
-     * The expression
-     * leftExt @f$\oplus@f$ rightExt
-     * is true if leftExt[i] @f$\oplus@f$ rightExt[i] for every i from 0 to N-1.
-     *
-     * @param[in] other The right-hand extent<N> to be compared.
-     */
-    bool operator==(const extent& other) const __CPU__ __HC__ {
-        return Kalmar::index_helper<N, extent<N> >::equal(*this, other);
-    }
-    bool operator!=(const extent& other) const __CPU__ __HC__ {
-        return !(*this == other);
-    }
+extern "C" int __bitextract_s32(int src0, unsigned int src1, unsigned int src2) __HC__;
 
-    /** @} */
+extern "C" int64_t __bitextract_s64(int64_t src0, unsigned int src1, unsigned int src2) __HC__;
+/** @} */
 
-    /** @{ */
-    /**
-     * Adds (or subtracts) an object of type extent<N> from this extent to form
-     * a new extent. The result extent<N> is such that for a given operator @f$\oplus@f$,
-     * result[i] = this[i] @f$\oplus@f$ ext[i]
-     *
-     * @param[in] ext The right-hand extent<N> to be added or subtracted.
-     */
-    extent& operator+=(const extent& __r) __CPU__ __HC__ {
-        base_.operator+=(__r.base_);
-        return *this;
-    }
-    extent& operator-=(const extent& __r) __CPU__ __HC__ {
-        base_.operator-=(__r.base_);
-        return *this;
-    }
-    extent& operator*=(const extent& __r) __CPU__ __HC__ {
-        base_.operator*=(__r.base_);
-        return *this;
-    }
-    extent& operator/=(const extent& __r) __CPU__ __HC__ {
-        base_.operator/=(__r.base_);
-        return *this;
-    }
-    extent& operator%=(const extent& __r) __CPU__ __HC__ {
-        base_.operator%=(__r.base_);
-        return *this;
-    }
+/** @{ */
+/**
+ * Replace a range of bits
+ *
+ * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/bit_string.htm">HSA PRM 5.7</a> for more detailed specification of these functions.
+ */
+extern "C" inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) __HC__ {
+  uint32_t offset = src2 & 31;
+  uint32_t width = src3 & 31;
+  uint32_t mask = (1 << width) - 1;
+  return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
+}
 
-    /** @} */
+extern "C" inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) __HC__ {
+  uint64_t offset = src2 & 63;
+  uint64_t width = src3 & 63;
+  uint64_t mask = (1 << width) - 1;
+  return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
+}
 
-    /** @{ */
-    /**
-     * Adds (or subtracts) an object of type index<N> from this extent to form
-     * a new extent. The result extent<N> is such that for a given operator @f$\oplus@f$,
-     * result[i] = this[i] @f$\oplus@f$ idx[i]
-     *
-     * @param[in] idx The right-hand index<N> to be added or subtracted.
-     */
-    extent operator+(const index<N>& idx) __CPU__ __HC__ {
-        extent __r = *this;
-        __r += idx;
-        return __r;
-    }
-    extent operator-(const index<N>& idx) __CPU__ __HC__ {
-        extent __r = *this;
-        __r -= idx;
-        return __r;
-    }
-    extent& operator+=(const index<N>& idx) __CPU__ __HC__ {
-        base_.operator+=(idx.base_);
-        return *this;
-    }
-    extent& operator-=(const index<N>& idx) __CPU__ __HC__ {
-        base_.operator-=(idx.base_);
-        return *this;
-    }
+extern "C" int __bitinsert_s32(int src0, int src1, unsigned int src2, unsigned int src3) __HC__;
 
-    /** @} */
+extern "C" int64_t __bitinsert_s64(int64_t src0, int64_t src1, unsigned int src2, unsigned int src3) __HC__;
+/** @} */
 
-    /** @{ */
-    /**
-     * For a given operator @f$\oplus@f$, produces the same effect as
-     * (*this) = (*this) @f$\oplus@f$ value
-     *
-     * The return value is "*this".
-     *
-     * @param[in] value The right-hand int of the arithmetic operation.
-     */
-    extent& operator+=(int value) __CPU__ __HC__ {
-        base_.operator+=(value);
-        return *this;
-    }
-    extent& operator-=(int value) __CPU__ __HC__ {
-        base_.operator-=(value);
-        return *this;
-    }
-    extent& operator*=(int value) __CPU__ __HC__ {
-        base_.operator*=(value);
-        return *this;
-    }
-    extent& operator/=(int value) __CPU__ __HC__ {
-        base_.operator/=(value);
-        return *this;
-    }
-    extent& operator%=(int value) __CPU__ __HC__ {
-        base_.operator%=(value);
-        return *this;
-    }
+/** @{ */
+/**
+ * Create a bit mask that can be used with bitselect
+ *
+ * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/bit_string.htm">HSA PRM 5.7</a> for more detailed specification of these functions.
+ */
+extern "C" unsigned int __bitmask_b32(unsigned int src0, unsigned int src1) __HC__;
 
-    /** @} */
+extern "C" uint64_t __bitmask_b64(unsigned int src0, unsigned int src1) __HC__;
+/** @} */
 
-    /** @{ */
-    /**
-     * For a given operator @f$\oplus@f$, produces the same effect as
-     * (*this) = (*this) @f$\oplus@f$ 1
-     *
-     * For prefix increment and decrement, the return value is "*this".
-     * Otherwise a new extent<N> is returned.
-     */
-    extent& operator++() __CPU__ __HC__ {
-        base_.operator+=(1);
-        return *this;
-    }
-    extent operator++(int) __CPU__ __HC__ {
-        extent ret = *this;
-        base_.operator+=(1);
-        return ret;
-    }
-    extent& operator--() __CPU__ __HC__ {
-        base_.operator-=(1);
-        return *this;
-    }
-    extent operator--(int) __CPU__ __HC__ {
-        extent ret = *this;
-        base_.operator-=(1);
-        return ret;
-    }
+/** @{ */
+/**
+ * Reverse the bits
+ *
+ * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/bit_string.htm">HSA PRM 5.7</a> for more detailed specification of these functions.
+ */
 
-    /** @} */
+unsigned int __bitrev_b32(unsigned int src0) [[hc]] __asm("llvm.bitreverse.i32");
 
-private:
-    typedef Kalmar::index_impl<typename Kalmar::__make_indices<N>::type> base;
-    base base_;
-    template <int K, typename Q> friend struct Kalmar::index_helper;
-    template <int K, typename Q1, typename Q2> friend struct Kalmar::amp_helper;
-};
+uint64_t __bitrev_b64(uint64_t src0) [[hc]] __asm("llvm.bitreverse.i64");
 
-// ------------------------------------------------------------------------
-// global functions for extent
-// ------------------------------------------------------------------------
+/** @} */
 
 /** @{ */
 /**
- * Adds (or subtracts) two objects of extent<N> to form a new extent. The
- * result extent<N> is such that for a given operator @f$\oplus@f$,
- * result[i] = leftExt[i] @f$\oplus@f$ rightExt[i]
- * for every i from 0 to N-1.
+ * Do bit field selection
  *
- * @param[in] lhs The left-hand extent<N> to be compared.
- * @param[in] rhs The right-hand extent<N> to be compared.
+ * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/bit_string.htm">HSA PRM 5.7</a> for more detailed specification of these functions.
  */
-// FIXME: the signature is not entirely the same as defined in:
-//        C++AMP spec v1.2 #1253
-template <int N>
-extent<N> operator+(const extent<N>& lhs, const extent<N>& rhs) __CPU__ __HC__ {
-    extent<N> __r = lhs;
-    __r += rhs;
-    return __r;
-}
-template <int N>
-extent<N> operator-(const extent<N>& lhs, const extent<N>& rhs) __CPU__ __HC__ {
-    extent<N> __r = lhs;
-    __r -= rhs;
-    return __r;
+extern "C" inline unsigned int __bitselect_b32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__ {
+  return (src1 & src0) | (src2 & ~src0);
 }
 
+extern "C" inline uint64_t __bitselect_b64(uint64_t src0, uint64_t src1, uint64_t src2) __HC__ {
+  return (src1 & src0) | (src2 & ~src0);
+}
 /** @} */
 
-/** @{ */
 /**
- * Binary arithmetic operations that produce a new extent<N> that is the result
- * of performing the corresponding binary arithmetic operation on the elements
- * of the extent operands. The result extent<N> is such that for a given
- * operator @f$\oplus@f$,
- * result[i] = ext[i] @f$\oplus@f$ value
- * or
- * result[i] = value @f$\oplus@f$ ext[i]
- * for every i from 0 to N-1.
+ * Count leading zero bits in the input
  *
- * @param[in] ext The extent<N> operand
- * @param[in] value The integer operand
+ * @param[in] input An unsigned 32-bit integer.
+ * @return Number of 0 bits until a 1 bit is found, counting start from the
+ *         most significant bit. -1 if there is no 0 bit.
  */
-// FIXME: the signature is not entirely the same as defined in:
-//        C++AMP spec v1.2 #1259
-template <int N>
-extent<N> operator+(const extent<N>& ext, int value) __CPU__ __HC__ {
-    extent<N> __r = ext;
-    __r += value;
-    return __r;
-}
-template <int N>
-extent<N> operator+(int value, const extent<N>& ext) __CPU__ __HC__ {
-    extent<N> __r = ext;
-    __r += value;
-    return __r;
-}
-template <int N>
-extent<N> operator-(const extent<N>& ext, int value) __CPU__ __HC__ {
-    extent<N> __r = ext;
-    __r -= value;
-    return __r;
-}
-template <int N>
-extent<N> operator-(int value, const extent<N>& ext) __CPU__ __HC__ {
-    extent<N> __r(value);
-    __r -= ext;
-    return __r;
-}
-template <int N>
-extent<N> operator*(const extent<N>& ext, int value) __CPU__ __HC__ {
-    extent<N> __r = ext;
-    __r *= value;
-    return __r;
-}
-template <int N>
-extent<N> operator*(int value, const extent<N>& ext) __CPU__ __HC__ {
-    extent<N> __r = ext;
-    __r *= value;
-    return __r;
-}
-template <int N>
-extent<N> operator/(const extent<N>& ext, int value) __CPU__ __HC__ {
-    extent<N> __r = ext;
-    __r /= value;
-    return __r;
-}
-template <int N>
-extent<N> operator/(int value, const extent<N>& ext) __CPU__ __HC__ {
-    extent<N> __r(value);
-    __r /= ext;
-    return __r;
-}
-template <int N>
-extent<N> operator%(const extent<N>& ext, int value) __CPU__ __HC__ {
-    extent<N> __r = ext;
-    __r %= value;
-    return __r;
-}
-template <int N>
-extent<N> operator%(int value, const extent<N>& ext) __CPU__ __HC__ {
-    extent<N> __r(value);
-    __r %= ext;
-    return __r;
+extern "C" inline unsigned int __firstbit_u32_u32(unsigned int input) __HC__ {
+  return input == 0 ? -1 : __builtin_clz(input);
 }
 
-/** @} */
-
-// ------------------------------------------------------------------------
-// tiled_extent
-// ------------------------------------------------------------------------
 
 /**
- * Represents an extent subdivided into tiles.
- * Tile sizes can be specified at runtime.
+ * Count leading zero bits in the input
  *
- * @tparam N The dimension of the extent and the tile.
+ * @param[in] input An unsigned 64-bit integer.
+ * @return Number of 0 bits until a 1 bit is found, counting start from the
+ *         most significant bit. -1 if there is no 0 bit.
  */
-template <int N>
-class tiled_extent : public extent<N> {
-public:
-    static const int rank = N;
-  
-    /**
-     * Tile size for each dimension.
-     */
-    int tile_dim[N];
-  
-    /**
-     * Default constructor. The origin and extent is default-constructed and
-     * thus zero.
-     */
-    tiled_extent() __CPU__ __HC__ : extent<N>(), tile_dim{0} {}
-
-    /**
-     * Copy constructor. Constructs a new tiled_extent from the supplied
-     * argument "other".
-     *
-     * @param[in] other An object of type tiled_extent from which to initialize
-     *                  this new extent.
-     */
-    tiled_extent(const tiled_extent& other) __CPU__ __HC__ : extent<N>(other) {
-      for (int i = 0; i < N; ++i) {
-        tile_dim[i] = other.tile_dim[i];
-      }
-    }
-};
+extern "C" inline unsigned int __firstbit_u32_u64(unsigned long long int input) __HC__ {
+  return input == 0 ? -1 : __builtin_clzl(input);
+}
 
 /**
- * Represents an extent subdivided into tiles.
- * Tile sizes can be specified at runtime.
- * This class is 1D specialization of tiled_extent.
+ * Count leading zero bits in the input
+ *
+ * @param[in] input An signed 32-bit integer.
+ * @return Finds the first bit set in a positive integer starting from the
+ *         most significant bit, or finds the first bit clear in a negative
+ *         integer from the most significant bit.
+ *         If no bits in the input are set, then dest is set to -1.
  */
-template <>
-class tiled_extent<1> : public extent<1> {
-private:
-    /**
-     * Size of dynamic group segment.
-     */
-    unsigned int dynamic_group_segment_size;
-
-public:
-    static const int rank = 1;
-
-    /**
-     * Tile size for each dimension.
-     */
-    int tile_dim[1];
-
-    /**
-     * Default constructor. The origin and extent is default-constructed and
-     * thus zero.
-     */
-    tiled_extent() __CPU__ __HC__ : extent(0), dynamic_group_segment_size(0), tile_dim{0} {}
+extern "C" inline unsigned int __firstbit_u32_s32(int input) __HC__ {
+  if (input == 0) {
+    return -1;
+  }
 
-    /**
-     * Construct an tiled extent with the size of extent and the size of tile
-     * specified.
-     *
-     * @param[in] e0 Size of extent.
-     * @param[in] t0 Size of tile.
-     */
-    tiled_extent(int e0, int t0) __CPU__ __HC__ : extent(e0), dynamic_group_segment_size(0), tile_dim{t0} {}
+  return input > 0 ? __firstbit_u32_u32(input) : __firstbit_u32_u32(~input);
+}
 
-    /**
-     * Construct an tiled extent with the size of extent and the size of tile
-     * specified.
-     *
-     * @param[in] e0 Size of extent.
-     * @param[in] t0 Size of tile.
-     * @param[in] size Size of dynamic group segment.
-     */
-    tiled_extent(int e0, int t0, int size) __CPU__ __HC__ : extent(e0), dynamic_group_segment_size(size), tile_dim{t0} {}
 
-    /**
-     * Copy constructor. Constructs a new tiled_extent from the supplied
-     * argument "other".
-     *
-     * @param[in] other An object of type tiled_extent from which to initialize
-     *                  this new extent.
-     */
-    tiled_extent(const tiled_extent<1>& other) __CPU__ __HC__ : extent(other[0]), dynamic_group_segment_size(other.dynamic_group_segment_size), tile_dim{other.tile_dim[0]} {}
+/**
+ * Count leading zero bits in the input
+ *
+ * @param[in] input An signed 64-bit integer.
+ * @return Finds the first bit set in a positive integer starting from the
+ *         most significant bit, or finds the first bit clear in a negative
+ *         integer from the most significant bit.
+ *         If no bits in the input are set, then dest is set to -1.
+ */
+extern "C" inline unsigned int __firstbit_u32_s64(long long int input) __HC__ {
+  if (input == 0) {
+    return -1;
+  }
 
+  return input > 0 ? __firstbit_u32_u64(input) : __firstbit_u32_u64(~input);
+}
 
-    /**
-     * Constructs a tiled_extent<N> with the extent "ext".
-     *
-     * @param[in] ext The extent of this tiled_extent
-     * @param[in] t0 Size of tile.
-     */
-    tiled_extent(const extent<1>& ext, int t0) __CPU__ __HC__ : extent(ext), dynamic_group_segment_size(0), tile_dim{t0} {} 
+/** @{ */
+/**
+ * Find the first bit set to 1 in a number starting from the
+ * least significant bit
+ *
+ * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/bit_string.htm">HSA PRM 5.7</a> for more detailed specification of these functions.
+ */
+extern "C" inline unsigned int __lastbit_u32_u32(unsigned int input) __HC__ {
+  return input == 0 ? -1 : __builtin_ctz(input);
+}
 
-    /**
-     * Constructs a tiled_extent<N> with the extent "ext".
-     *
-     * @param[in] ext The extent of this tiled_extent
-     * @param[in] t0 Size of tile.
-     * @param[in] size Size of dynamic group segment
-     */
-    tiled_extent(const extent<1>& ext, int t0, int size) __CPU__ __HC__ : extent(ext), dynamic_group_segment_size(size), tile_dim{t0} {}
+extern "C" inline unsigned int __lastbit_u32_u64(unsigned long long int input) __HC__ {
+  return input == 0 ? -1 : __builtin_ctzl(input);
+}
 
-    /**
-     * Set the size of dynamic group segment. The function should be called
-     * in host code, prior to a kernel is dispatched.
-     *
-     * @param[in] size The amount of dynamic group segment needed.
-     */
-    void set_dynamic_group_segment_size(unsigned int size) __CPU__ {
-        dynamic_group_segment_size = size;
-    }
+extern "C" inline unsigned int __lastbit_u32_s32(int input) __HC__ {
+  return __lastbit_u32_u32(input);
+}
 
-    /**
-     * Return the size of dynamic group segment in bytes.
-     */
-    unsigned int get_dynamic_group_segment_size() const __CPU__ {
-        return dynamic_group_segment_size;
-    }
-};
+extern "C" inline unsigned int __lastbit_u32_s64(unsigned long long input) __HC__ {
+  return __lastbit_u32_u64(input);
+}
+/** @} */
 
+/** @{ */
 /**
- * Represents an extent subdivided into tiles.
- * Tile sizes can be specified at runtime.
- * This class is 2D specialization of tiled_extent.
+ * Copy and interleave the lower half of the elements from
+ * each source into the desitionation
+ *
+ * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/packed_data.htm">HSA PRM 5.9</a> for more detailed specification of these functions.
  */
-template <>
-class tiled_extent<2> : public extent<2> {
-private:
-    /**
-     * Size of dynamic group segment.
-     */
-    unsigned int dynamic_group_segment_size;
-
-public:
-    static const int rank = 2;
+extern "C" unsigned int __unpacklo_u8x4(unsigned int src0, unsigned int src1) __HC__;
 
-    /**
-     * Tile size for each dimension.
-     */
-    int tile_dim[2];
+extern "C" uint64_t __unpacklo_u8x8(uint64_t src0, uint64_t src1) __HC__;
 
-    /**
-     * Default constructor. The origin and extent is default-constructed and
-     * thus zero.
-     */
-    tiled_extent() __CPU__ __HC__ : extent(0, 0), dynamic_group_segment_size(0), tile_dim{0, 0} {}
+extern "C" unsigned int __unpacklo_u16x2(unsigned int src0, unsigned int src1) __HC__;
 
-    /**
-     * Construct an tiled extent with the size of extent and the size of tile
-     * specified.
-     *
-     * @param[in] e0 Size of extent in the 1st dimension.
-     * @param[in] e1 Size of extent in the 2nd dimension.
-     * @param[in] t0 Size of tile in the 1st dimension.
-     * @param[in] t1 Size of tile in the 2nd dimension.
-     */
-    tiled_extent(int e0, int e1, int t0, int t1) __CPU__ __HC__ : extent(e0, e1), dynamic_group_segment_size(0), tile_dim{t0, t1} {}
+extern "C" uint64_t __unpacklo_u16x4(uint64_t src0, uint64_t src1) __HC__;
 
-    /**
-     * Construct an tiled extent with the size of extent and the size of tile
-     * specified.
-     *
-     * @param[in] e0 Size of extent in the 1st dimension.
-     * @param[in] e1 Size of extent in the 2nd dimension.
-     * @param[in] t0 Size of tile in the 1st dimension.
-     * @param[in] t1 Size of tile in the 2nd dimension.
-     * @param[in] size Size of dynamic group segment.
-     */
-    tiled_extent(int e0, int e1, int t0, int t1, int size) __CPU__ __HC__ : extent(e0, e1), dynamic_group_segment_size(size), tile_dim{t0, t1} {}
+extern "C" uint64_t __unpacklo_u32x2(uint64_t src0, uint64_t src1) __HC__;
 
-    /**
-     * Copy constructor. Constructs a new tiled_extent from the supplied
-     * argument "other".
-     *
-     * @param[in] other An object of type tiled_extent from which to initialize
-     *                  this new extent.
-     */
-    tiled_extent(const tiled_extent<2>& other) __CPU__ __HC__ : extent(other[0], other[1]), dynamic_group_segment_size(other.dynamic_group_segment_size), tile_dim{other.tile_dim[0], other.tile_dim[1]} {}
+extern "C" int __unpacklo_s8x4(int src0, int src1) __HC__;
 
-    /**
-     * Constructs a tiled_extent<N> with the extent "ext".
-     *
-     * @param[in] ext The extent of this tiled_extent
-     * @param[in] t0 Size of tile in the 1st dimension.
-     * @param[in] t1 Size of tile in the 2nd dimension.
-     */
-    tiled_extent(const extent<2>& ext, int t0, int t1) __CPU__ __HC__ : extent(ext), dynamic_group_segment_size(0), tile_dim{t0, t1} {}
+extern "C" int64_t __unpacklo_s8x8(int64_t src0, int64_t src1) __HC__;
 
-    /**
-     * Constructs a tiled_extent<N> with the extent "ext".
-     *
-     * @param[in] ext The extent of this tiled_extent
-     * @param[in] t0 Size of tile in the 1st dimension.
-     * @param[in] t1 Size of tile in the 2nd dimension.
-     * @param[in] size Size of dynamic group segment.
-     */
-    tiled_extent(const extent<2>& ext, int t0, int t1, int size) __CPU__ __HC__ : extent(ext), dynamic_group_segment_size(size), tile_dim{t0, t1} {}
+extern "C" int __unpacklo_s16x2(int src0, int src1) __HC__;
 
-    /**
-     * Set the size of dynamic group segment. The function should be called
-     * in host code, prior to a kernel is dispatched.
-     *
-     * @param[in] size The amount of dynamic group segment needed.
-     */
-    void set_dynamic_group_segment_size(unsigned int size) __CPU__ {
-        dynamic_group_segment_size = size;
-    }
+extern "C" int64_t __unpacklo_s16x4(int64_t src0, int64_t src1) __HC__;
 
-    /**
-     * Return the size of dynamic group segment in bytes.
-     */
-    unsigned int get_dynamic_group_segment_size() const __CPU__ {
-        return dynamic_group_segment_size;
-    }
-};
+extern "C" int64_t __unpacklo_s32x2(int64_t src0, int64_t src1) __HC__;
+/** @} */
 
+/** @{ */
 /**
- * Represents an extent subdivided into tiles.
- * Tile sizes can be specified at runtime.
- * This class is 3D specialization of tiled_extent.
+ * Copy and interleave the upper half of the elements from
+ * each source into the desitionation
+ *
+ * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/packed_data.htm">HSA PRM 5.9</a> for more detailed specification of these functions.
  */
-template <>
-class tiled_extent<3> : public extent<3> {
-private:
-    /**
-     * Size of dynamic group segment.
-     */
-    unsigned int dynamic_group_segment_size;
+extern "C" unsigned int __unpackhi_u8x4(unsigned int src0, unsigned int src1) __HC__;
 
-public:
-    static const int rank = 3;
+extern "C" uint64_t __unpackhi_u8x8(uint64_t src0, uint64_t src1) __HC__;
 
-    /**
-     * Tile size for each dimension.
-     */
-    int tile_dim[3];
+extern "C" unsigned int __unpackhi_u16x2(unsigned int src0, unsigned int src1) __HC__;
 
-    /**
-     * Default constructor. The origin and extent is default-constructed and
-     * thus zero.
-     */
-    tiled_extent() __CPU__ __HC__ : extent(0, 0, 0), dynamic_group_segment_size(0), tile_dim{0, 0, 0} {}
+extern "C" uint64_t __unpackhi_u16x4(uint64_t src0, uint64_t src1) __HC__;
 
-    /**
-     * Construct an tiled extent with the size of extent and the size of tile
-     * specified.
-     *
-     * @param[in] e0 Size of extent in the 1st dimension.
-     * @param[in] e1 Size of extent in the 2nd dimension.
-     * @param[in] e2 Size of extent in the 3rd dimension.
-     * @param[in] t0 Size of tile in the 1st dimension.
-     * @param[in] t1 Size of tile in the 2nd dimension.
-     * @param[in] t2 Size of tile in the 3rd dimension.
-     */
-    tiled_extent(int e0, int e1, int e2, int t0, int t1, int t2) __CPU__ __HC__ : extent(e0, e1, e2), dynamic_group_segment_size(0), tile_dim{t0, t1, t2} {}
+extern "C" uint64_t __unpackhi_u32x2(uint64_t src0, uint64_t src1) __HC__;
 
-    /**
-     * Construct an tiled extent with the size of extent and the size of tile
-     * specified.
-     *
-     * @param[in] e0 Size of extent in the 1st dimension.
-     * @param[in] e1 Size of extent in the 2nd dimension.
-     * @param[in] e2 Size of extent in the 3rd dimension.
-     * @param[in] t0 Size of tile in the 1st dimension.
-     * @param[in] t1 Size of tile in the 2nd dimension.
-     * @param[in] t2 Size of tile in the 3rd dimension.
-     * @param[in] size Size of dynamic group segment.
-     */
-    tiled_extent(int e0, int e1, int e2, int t0, int t1, int t2, int size) __CPU__ __HC__ : extent(e0, e1, e2), dynamic_group_segment_size(size), tile_dim{t0, t1, t2} {}
+extern "C" int __unpackhi_s8x4(int src0, int src1) __HC__;
 
-    /**
-     * Copy constructor. Constructs a new tiled_extent from the supplied
-     * argument "other".
-     *
-     * @param[in] other An object of type tiled_extent from which to initialize
-     *                  this new extent.
-     */
-    tiled_extent(const tiled_extent<3>& other) __CPU__ __HC__ : extent(other[0], other[1], other[2]), dynamic_group_segment_size(other.dynamic_group_segment_size), tile_dim{other.tile_dim[0], other.tile_dim[1], other.tile_dim[2]} {}
+extern "C" int64_t __unpackhi_s8x8(int64_t src0, int64_t src1) __HC__;
 
-    /**
-     * Constructs a tiled_extent<N> with the extent "ext".
-     *
-     * @param[in] ext The extent of this tiled_extent
-     * @param[in] t0 Size of tile in the 1st dimension.
-     * @param[in] t1 Size of tile in the 2nd dimension.
-     * @param[in] t2 Size of tile in the 3rd dimension.
-     */
-    tiled_extent(const extent<3>& ext, int t0, int t1, int t2) __CPU__ __HC__ : extent(ext), dynamic_group_segment_size(0), tile_dim{t0, t1, t2} {}
+extern "C" int __unpackhi_s16x2(int src0, int src1) __HC__;
 
-    /**
-     * Constructs a tiled_extent<N> with the extent "ext".
-     *
-     * @param[in] ext The extent of this tiled_extent
-     * @param[in] t0 Size of tile in the 1st dimension.
-     * @param[in] t1 Size of tile in the 2nd dimension.
-     * @param[in] t2 Size of tile in the 3rd dimension.
-     * @param[in] size Size of dynamic group segment.
-     */
-    tiled_extent(const extent<3>& ext, int t0, int t1, int t2, int size) __CPU__ __HC__ : extent(ext), dynamic_group_segment_size(size), tile_dim{t0, t1, t2} {}
+extern "C" int64_t __unpackhi_s16x4(int64_t src0, int64_t src1) __HC__;
 
-    /**
-     * Set the size of dynamic group segment. The function should be called
-     * in host code, prior to a kernel is dispatched.
-     *
-     * @param[in] size The amount of dynamic group segment needed.
-     */
-    void set_dynamic_group_segment_size(unsigned int size) __CPU__ {
-        dynamic_group_segment_size = size;
-    }
+extern "C" int64_t __unpackhi_s32x2(int64_t src0, int64_t src1) __HC__;
+/** @} */
 
-    /**
-     * Return the size of dynamic group segment in bytes.
-     */
-    unsigned int get_dynamic_group_segment_size() const __CPU__ {
-        return dynamic_group_segment_size;
-    }
-};
+/** @{ */
+/**
+ * Assign the elements of the packed value in src0, replacing
+ * the element specified by src2 with the value from src1
+ *
+ * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/packed_data.htm">HSA PRM 5.9</a> for more detailed specification of these functions.
+ */
+extern "C" unsigned int __pack_u8x4_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;
 
-// ------------------------------------------------------------------------
-// implementation of extent<N>::tile()
-// ------------------------------------------------------------------------
+extern "C" uint64_t __pack_u8x8_u32(uint64_t src0, unsigned int src1, unsigned int src2) __HC__;
 
-template <int N>
-inline
-tiled_extent<1> extent<N>::tile(int t0) const __CPU__ __HC__ {
-  static_assert(N == 1, "One-dimensional tile() method only available on extent<1>");
-  return tiled_extent<1>(*this, t0);
-}
+extern "C" unsigned __pack_u16x2_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;
 
-template <int N>
-inline
-tiled_extent<2> extent<N>::tile(int t0, int t1) const __CPU__ __HC__ {
-  static_assert(N == 2, "Two-dimensional tile() method only available on extent<2>");
-  return tiled_extent<2>(*this, t0, t1);
-}
+extern "C" uint64_t __pack_u16x4_u32(uint64_t src0, unsigned int src1, unsigned int src2) __HC__;
 
-template <int N>
-inline
-tiled_extent<3> extent<N>::tile(int t0, int t1, int t2) const __CPU__ __HC__ {
-  static_assert(N == 3, "Three-dimensional tile() method only available on extent<3>");
-  return tiled_extent<3>(*this, t0, t1, t2);
-}
+extern "C" uint64_t __pack_u32x2_u32(uint64_t src0, unsigned int src1, unsigned int src2) __HC__;
 
-// ------------------------------------------------------------------------
-// implementation of extent<N>::tile_with_dynamic()
-// ------------------------------------------------------------------------
+extern "C" int __pack_s8x4_s32(int src0, int src1, unsigned int src2) __HC__;
 
-template <int N>
-inline
-tiled_extent<1> extent<N>::tile_with_dynamic(int t0, int dynamic_size) const __CPU__ __HC__ {
-  static_assert(N == 1, "One-dimensional tile() method only available on extent<1>");
-  return tiled_extent<1>(*this, t0, dynamic_size);
-}
+extern "C" int64_t __pack_s8x8_s32(int64_t src0, int src1, unsigned int src2) __HC__;
 
-template <int N>
-inline
-tiled_extent<2> extent<N>::tile_with_dynamic(int t0, int t1, int dynamic_size) const __CPU__ __HC__ {
-  static_assert(N == 2, "Two-dimensional tile() method only available on extent<2>");
-  return tiled_extent<2>(*this, t0, t1, dynamic_size);
-}
+extern "C" int __pack_s16x2_s32(int src0, int src1, unsigned int src2) __HC__;
 
-template <int N>
-inline
-tiled_extent<3> extent<N>::tile_with_dynamic(int t0, int t1, int t2, int dynamic_size) const __CPU__ __HC__ {
-  static_assert(N == 3, "Three-dimensional tile() method only available on extent<3>");
-  return tiled_extent<3>(*this, t0, t1, t2, dynamic_size);
-}
+extern "C" int64_t __pack_s16x4_s32(int64_t src0, int src1, unsigned int src2) __HC__;
 
-// ------------------------------------------------------------------------
-// Intrinsic functions for HSAIL instructions
-// ------------------------------------------------------------------------
+extern "C" int64_t __pack_s32x2_s32(int64_t src0, int src1, unsigned int src2) __HC__;
+
+extern "C" double __pack_f32x2_f32(double src0, float src1, unsigned int src2) __HC__;
+/** @} */
 
+/** @{ */
 /**
- * Fetch the size of a wavefront
+ * Assign the elements specified by src1 from the packed value in src0
  *
- * @return The size of a wavefront.
+ * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/packed_data.htm">HSA PRM 5.9</a> for more detailed specification of these functions.
  */
-#define __HSA_WAVEFRONT_SIZE__ (64)
-extern "C" unsigned int __wavesize() __HC__; 
+extern "C" unsigned int __unpack_u32_u8x4(unsigned int src0, unsigned int src1) __HC__;
 
+extern "C" unsigned int __unpack_u32_u8x8(uint64_t src0, unsigned int src1) __HC__;
 
-#if __hcc_backend__==HCC_BACKEND_AMDGPU
-extern "C" inline unsigned int __wavesize() __HC__ {
-  return __HSA_WAVEFRONT_SIZE__;
-}
-#endif
+extern "C" unsigned int __unpack_u32_u16x2(unsigned int src0, unsigned int src1) __HC__;
+
+extern "C" unsigned int __unpack_u32_u16x4(uint64_t src0, unsigned int src1) __HC__;
+
+extern "C" unsigned int __unpack_u32_u32x2(uint64_t src0, unsigned int src1) __HC__;
+
+extern "C" int __unpack_s32_s8x4(int src0, unsigned int src1) __HC__;
+
+extern "C" int __unpack_s32_s8x8(int64_t src0, unsigned int src1) __HC__;
+
+extern "C" int __unpack_s32_s16x2(int src0, unsigned int src1) __HC__;
+
+extern "C" int __unpack_s32_s16x4(int64_t src0, unsigned int src1) __HC__;
+
+extern "C" int __unpack_s32_s3x2(int64_t src0, unsigned int src1) __HC__;
+
+extern "C" float __unpack_f32_f32x2(double src0, unsigned int src1) __HC__;
+/** @} */
 
 /**
- * Count number of 1 bits in the input
+ * Align 32 bits within 64 bits of data on an arbitrary bit boundary
  *
- * @param[in] input An unsinged 32-bit integer.
- * @return Number of 1 bits in the input.
+ * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/multimedia.htm">HSA PRM 5.15</a> for more detailed specification.
  */
-extern "C" inline unsigned int __popcount_u32_b32(unsigned int input) __HC__ {
-  return __builtin_popcount(input);
-}
+extern "C" unsigned int __bitalign_b32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;
 
 /**
- * Count number of 1 bits in the input
+ * Align 32 bits within 64 bis of data on an arbitrary byte boundary
  *
- * @param[in] input An unsinged 64-bit integer.
- * @return Number of 1 bits in the input.
+ * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/multimedia.htm">HSA PRM 5.15</a> for more detailed specification.
  */
-extern "C" inline unsigned int __popcount_u32_b64(unsigned long long int input) __HC__ {
-  return __builtin_popcountl(input);
-}
+extern "C" unsigned int __bytealign_b32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;
 
-/** @{ */
 /**
- * Extract a range of bits
+ * Do linear interpolation and computes the unsigned 8-bit average of packed
+ * data
  *
- * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/bit_string.htm">HSA PRM 5.7</a> for more detailed specification of these functions.
+ * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/multimedia.htm">HSA PRM 5.15</a> for more detailed specification.
  */
-extern "C" inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__ {
-  uint32_t offset = src1 & 31;
-  uint32_t width = src2 & 31;
-  return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
-}
-
-extern "C" inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) __HC__ {
-  uint64_t offset = src1 & 63;
-  uint64_t width = src2 & 63;
-  return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
-}
-
-extern "C" int __bitextract_s32(int src0, unsigned int src1, unsigned int src2) __HC__;
-
-extern "C" int64_t __bitextract_s64(int64_t src0, unsigned int src1, unsigned int src2) __HC__;
-/** @} */
+extern "C" unsigned int __lerp_u8x4(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;
 
-/** @{ */
 /**
- * Replace a range of bits
+ * Takes four floating-point number, convers them to
+ * unsigned integer values, and packs them into a packed u8x4 value
  *
- * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/bit_string.htm">HSA PRM 5.7</a> for more detailed specification of these functions.
+ * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/multimedia.htm">HSA PRM 5.15</a> for more detailed specification.
  */
-extern "C" inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) __HC__ {
-  uint32_t offset = src2 & 31;
-  uint32_t width = src3 & 31;
-  uint32_t mask = (1 << width) - 1;
-  return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
-}
-
-extern "C" inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) __HC__ {
-  uint64_t offset = src2 & 63;
-  uint64_t width = src3 & 63;
-  uint64_t mask = (1 << width) - 1;
-  return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
-}
-
-extern "C" int __bitinsert_s32(int src0, int src1, unsigned int src2, unsigned int src3) __HC__;
-
-extern "C" int64_t __bitinsert_s64(int64_t src0, int64_t src1, unsigned int src2, unsigned int src3) __HC__;
-/** @} */
+extern "C" unsigned int __packcvt_u8x4_f32(float src0, float src1, float src2, float src3) __HC__;
 
-/** @{ */
 /**
- * Create a bit mask that can be used with bitselect
+ * Unpacks a single element from a packed u8x4 value and converts it to an f32.
  *
- * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/bit_string.htm">HSA PRM 5.7</a> for more detailed specification of these functions.
+ * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/multimedia.htm">HSA PRM 5.15</a> for more detailed specification.
  */
-extern "C" unsigned int __bitmask_b32(unsigned int src0, unsigned int src1) __HC__;
-
-extern "C" uint64_t __bitmask_b64(unsigned int src0, unsigned int src1) __HC__;
-/** @} */
+extern "C" float __unpackcvt_f32_u8x4(unsigned int src0, unsigned int src1) __HC__;
 
 /** @{ */
 /**
- * Reverse the bits
+ * Computes the sum of the absolute differences of src0 and
+ * src1 and then adds src2 to the result
  *
- * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/bit_string.htm">HSA PRM 5.7</a> for more detailed specification of these functions.
+ * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/multimedia.htm">HSA PRM 5.15</a> for more detailed specification.
  */
+extern "C" unsigned int __sad_u32_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;
 
-unsigned int __bitrev_b32(unsigned int src0) [[hc]] __asm("llvm.bitreverse.i32");
-
-uint64_t __bitrev_b64(uint64_t src0) [[hc]] __asm("llvm.bitreverse.i64");
+extern "C" unsigned int __sad_u32_u16x2(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;
 
+extern "C" unsigned int __sad_u32_u8x4(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;
 /** @} */
 
-/** @{ */
 /**
- * Do bit field selection
+ * This function is mostly the same as sad except the sum of absolute
+ * differences is added to the most significant 16 bits of the result
  *
- * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/bit_string.htm">HSA PRM 5.7</a> for more detailed specification of these functions.
+ * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/multimedia.htm">HSA PRM 5.15</a> for more detailed specification.
  */
-extern "C" inline unsigned int __bitselect_b32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__ {
-  return (src1 & src0) | (src2 & ~src0);
-}
-
-extern "C" inline uint64_t __bitselect_b64(uint64_t src0, uint64_t src1, uint64_t src2) __HC__ {
-  return (src1 & src0) | (src2 & ~src0);
-}
-/** @} */
+extern "C" unsigned int __sadhi_u16x2_u8x4(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;
 
 /**
- * Count leading zero bits in the input
- *
- * @param[in] input An unsigned 32-bit integer.
- * @return Number of 0 bits until a 1 bit is found, counting start from the
- *         most significant bit. -1 if there is no 0 bit.
+ * Get system timestamp
  */
-extern "C" inline unsigned int __firstbit_u32_u32(unsigned int input) __HC__ {
-  return input == 0 ? -1 : __builtin_clz(input);
-}
-
+extern "C" uint64_t __clock_u64() __HC__;
 
 /**
- * Count leading zero bits in the input
+ * Get hardware cycle count
  *
- * @param[in] input An unsigned 64-bit integer.
- * @return Number of 0 bits until a 1 bit is found, counting start from the
- *         most significant bit. -1 if there is no 0 bit.
+ * Notice the return value of this function is implementation defined.
  */
-extern "C" inline unsigned int __firstbit_u32_u64(unsigned long long int input) __HC__ {
-  return input == 0 ? -1 : __builtin_clzl(input);
-}
+extern "C" uint64_t __cycle_u64() __HC__;
 
 /**
- * Count leading zero bits in the input
+ * Get the count of the number of earlier (in flattened
+ * work-item order) active work-items within the same wavefront.
  *
- * @param[in] input An signed 32-bit integer.
- * @return Finds the first bit set in a positive integer starting from the
- *         most significant bit, or finds the first bit clear in a negative
- *         integer from the most significant bit.
- *         If no bits in the input are set, then dest is set to -1.
+ * @return The result will be in the range 0 to WAVESIZE - 1.
  */
-extern "C" inline unsigned int __firstbit_u32_s32(int input) __HC__ {
-  if (input == 0) {
-    return -1;
-  }
-
-  return input > 0 ? __firstbit_u32_u32(input) : __firstbit_u32_u32(~input);
-}
-
+extern "C" unsigned int __activelaneid_u32() __HC__;
 
 /**
- * Count leading zero bits in the input
+ * Return a bit mask shows which active work-items in the
+ * wavefront have a non-zero input. The affected bit position within the
+ * registers of dest corresponds to each work-item's lane ID.
  *
- * @param[in] input An signed 64-bit integer.
- * @return Finds the first bit set in a positive integer starting from the
- *         most significant bit, or finds the first bit clear in a negative
- *         integer from the most significant bit.
- *         If no bits in the input are set, then dest is set to -1.
+ * The HSAIL instruction would return 4 64-bit registers but the current
+ * implementation would only return the 1st one and ignore the other 3 as
+ * right now all HSA agents have wavefront of size 64.
+ *
+ * @param[in] input An unsigned 32-bit integer.
+ * @return The bitmask calculated.
  */
-extern "C" inline unsigned int __firstbit_u32_s64(long long int input) __HC__ {
-  if (input == 0) {
-    return -1;
-  }
-
-  return input > 0 ? __firstbit_u32_u64(input) : __firstbit_u32_u64(~input);
-}
+extern "C" uint64_t __activelanemask_v4_b64_b1(unsigned int input) __HC__;
 
-/** @{ */
 /**
- * Find the first bit set to 1 in a number starting from the
- * least significant bit
+ * Count the number of active work-items in the current
+ * wavefront that have a non-zero input.
  *
- * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/bit_string.htm">HSA PRM 5.7</a> for more detailed specification of these functions.
+ * @param[in] input An unsigned 32-bit integer.
+ * @return The number of active work-items in the current wavefront that have
+ *         a non-zero input.
  */
-extern "C" inline unsigned int __lastbit_u32_u32(unsigned int input) __HC__ {
-  return input == 0 ? -1 : __builtin_ctz(input);
+extern "C" inline unsigned int __activelanecount_u32_b1(unsigned int input) __HC__ {
+ return  __popcount_u32_b64(__activelanemask_v4_b64_b1(input));
 }
 
-extern "C" inline unsigned int __lastbit_u32_u64(unsigned long long int input) __HC__ {
-  return input == 0 ? -1 : __builtin_ctzl(input);
-}
+// ------------------------------------------------------------------------
+// Wavefront Vote Functions
+// ------------------------------------------------------------------------
 
-extern "C" inline unsigned int __lastbit_u32_s32(int input) __HC__ {
-  return __lastbit_u32_u32(input);
+/**
+ * Evaluate predicate for all active work-items in the
+ * wavefront and return non-zero if and only if predicate evaluates to non-zero
+ * for any of them.
+ */
+extern "C" bool __ockl_wfany_i32(int) __HC__;
+extern "C" inline int __any(int predicate) __HC__ {
+    return __ockl_wfany_i32(predicate);
 }
 
-extern "C" inline unsigned int __lastbit_u32_s64(unsigned long long input) __HC__ {
-  return __lastbit_u32_u64(input);
+/**
+ * Evaluate predicate for all active work-items in the
+ * wavefront and return non-zero if and only if predicate evaluates to non-zero
+ * for all of them.
+ */
+extern "C" bool __ockl_wfall_i32(int) __HC__;
+extern "C" inline int __all(int predicate) __HC__ {
+    return __ockl_wfall_i32(predicate);
 }
-/** @} */
 
-/** @{ */
 /**
- * Copy and interleave the lower half of the elements from
- * each source into the desitionation
- *
- * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/packed_data.htm">HSA PRM 5.9</a> for more detailed specification of these functions.
+ * Evaluate predicate for all active work-items in the
+ * wavefront and return an integer whose Nth bit is set if and only if
+ * predicate evaluates to non-zero for the Nth work-item of the wavefront and
+ * the Nth work-item is active.
  */
-extern "C" unsigned int __unpacklo_u8x4(unsigned int src0, unsigned int src1) __HC__;
-
-extern "C" uint64_t __unpacklo_u8x8(uint64_t src0, uint64_t src1) __HC__;
-
-extern "C" unsigned int __unpacklo_u16x2(unsigned int src0, unsigned int src1) __HC__;
 
-extern "C" uint64_t __unpacklo_u16x4(uint64_t src0, uint64_t src1) __HC__;
+// XXX from llvm/include/llvm/IR/InstrTypes.h
+#define ICMP_NE 33
+__attribute__((convergent))
+unsigned long long __llvm_amdgcn_icmp_i32(uint x, uint y, uint z) [[hc]] __asm("llvm.amdgcn.icmp.i32");
+extern "C" inline uint64_t __ballot(int predicate) __HC__ {
+    return __llvm_amdgcn_icmp_i32(predicate, 0, ICMP_NE);
+}
 
-extern "C" uint64_t __unpacklo_u32x2(uint64_t src0, uint64_t src1) __HC__;
+// ------------------------------------------------------------------------
+// Wavefront Shuffle Functions
+// ------------------------------------------------------------------------
 
-extern "C" int __unpacklo_s8x4(int src0, int src1) __HC__;
-
-extern "C" int64_t __unpacklo_s8x8(int64_t src0, int64_t src1) __HC__;
-
-extern "C" int __unpacklo_s16x2(int src0, int src1) __HC__;
-
-extern "C" int64_t __unpacklo_s16x4(int64_t src0, int64_t src1) __HC__;
-
-extern "C" int64_t __unpacklo_s32x2(int64_t src0, int64_t src1) __HC__;
-/** @} */
-
-/** @{ */
-/**
- * Copy and interleave the upper half of the elements from
- * each source into the desitionation
- *
- * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/packed_data.htm">HSA PRM 5.9</a> for more detailed specification of these functions.
- */
-extern "C" unsigned int __unpackhi_u8x4(unsigned int src0, unsigned int src1) __HC__;
-
-extern "C" uint64_t __unpackhi_u8x8(uint64_t src0, uint64_t src1) __HC__;
-
-extern "C" unsigned int __unpackhi_u16x2(unsigned int src0, unsigned int src1) __HC__;
-
-extern "C" uint64_t __unpackhi_u16x4(uint64_t src0, uint64_t src1) __HC__;
-
-extern "C" uint64_t __unpackhi_u32x2(uint64_t src0, uint64_t src1) __HC__;
-
-extern "C" int __unpackhi_s8x4(int src0, int src1) __HC__;
-
-extern "C" int64_t __unpackhi_s8x8(int64_t src0, int64_t src1) __HC__;
-
-extern "C" int __unpackhi_s16x2(int src0, int src1) __HC__;
-
-extern "C" int64_t __unpackhi_s16x4(int64_t src0, int64_t src1) __HC__;
-
-extern "C" int64_t __unpackhi_s32x2(int64_t src0, int64_t src1) __HC__;
-/** @} */
-
-/** @{ */
-/**
- * Assign the elements of the packed value in src0, replacing
- * the element specified by src2 with the value from src1
- *
- * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/packed_data.htm">HSA PRM 5.9</a> for more detailed specification of these functions.
- */
-extern "C" unsigned int __pack_u8x4_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;
-
-extern "C" uint64_t __pack_u8x8_u32(uint64_t src0, unsigned int src1, unsigned int src2) __HC__;
-
-extern "C" unsigned __pack_u16x2_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;
-
-extern "C" uint64_t __pack_u16x4_u32(uint64_t src0, unsigned int src1, unsigned int src2) __HC__;
-
-extern "C" uint64_t __pack_u32x2_u32(uint64_t src0, unsigned int src1, unsigned int src2) __HC__;
-
-extern "C" int __pack_s8x4_s32(int src0, int src1, unsigned int src2) __HC__;
-
-extern "C" int64_t __pack_s8x8_s32(int64_t src0, int src1, unsigned int src2) __HC__;
-
-extern "C" int __pack_s16x2_s32(int src0, int src1, unsigned int src2) __HC__;
-
-extern "C" int64_t __pack_s16x4_s32(int64_t src0, int src1, unsigned int src2) __HC__;
-
-extern "C" int64_t __pack_s32x2_s32(int64_t src0, int src1, unsigned int src2) __HC__;
-
-extern "C" double __pack_f32x2_f32(double src0, float src1, unsigned int src2) __HC__;
-/** @} */
-
-/** @{ */
-/**
- * Assign the elements specified by src1 from the packed value in src0
- *
- * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/packed_data.htm">HSA PRM 5.9</a> for more detailed specification of these functions.
- */
-extern "C" unsigned int __unpack_u32_u8x4(unsigned int src0, unsigned int src1) __HC__;
-
-extern "C" unsigned int __unpack_u32_u8x8(uint64_t src0, unsigned int src1) __HC__;
-
-extern "C" unsigned int __unpack_u32_u16x2(unsigned int src0, unsigned int src1) __HC__;
-
-extern "C" unsigned int __unpack_u32_u16x4(uint64_t src0, unsigned int src1) __HC__;
-
-extern "C" unsigned int __unpack_u32_u32x2(uint64_t src0, unsigned int src1) __HC__;
-
-extern "C" int __unpack_s32_s8x4(int src0, unsigned int src1) __HC__;
-
-extern "C" int __unpack_s32_s8x8(int64_t src0, unsigned int src1) __HC__;
-
-extern "C" int __unpack_s32_s16x2(int src0, unsigned int src1) __HC__;
-
-extern "C" int __unpack_s32_s16x4(int64_t src0, unsigned int src1) __HC__;
-
-extern "C" int __unpack_s32_s3x2(int64_t src0, unsigned int src1) __HC__;
-
-extern "C" float __unpack_f32_f32x2(double src0, unsigned int src1) __HC__;
-/** @} */
-
-/**
- * Align 32 bits within 64 bits of data on an arbitrary bit boundary
- *
- * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/multimedia.htm">HSA PRM 5.15</a> for more detailed specification.
- */
-extern "C" unsigned int __bitalign_b32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;
-
-/**
- * Align 32 bits within 64 bis of data on an arbitrary byte boundary
- *
- * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/multimedia.htm">HSA PRM 5.15</a> for more detailed specification.
- */
-extern "C" unsigned int __bytealign_b32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;
-
-/**
- * Do linear interpolation and computes the unsigned 8-bit average of packed
- * data
- *
- * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/multimedia.htm">HSA PRM 5.15</a> for more detailed specification.
- */
-extern "C" unsigned int __lerp_u8x4(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;
-
-/**
- * Takes four floating-point number, convers them to
- * unsigned integer values, and packs them into a packed u8x4 value
- *
- * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/multimedia.htm">HSA PRM 5.15</a> for more detailed specification.
- */
-extern "C" unsigned int __packcvt_u8x4_f32(float src0, float src1, float src2, float src3) __HC__;
-
-/**
- * Unpacks a single element from a packed u8x4 value and converts it to an f32.
- *
- * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/multimedia.htm">HSA PRM 5.15</a> for more detailed specification.
- */
-extern "C" float __unpackcvt_f32_u8x4(unsigned int src0, unsigned int src1) __HC__;
-
-/** @{ */
-/**
- * Computes the sum of the absolute differences of src0 and
- * src1 and then adds src2 to the result
- *
- * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/multimedia.htm">HSA PRM 5.15</a> for more detailed specification.
- */
-extern "C" unsigned int __sad_u32_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;
-
-extern "C" unsigned int __sad_u32_u16x2(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;
-
-extern "C" unsigned int __sad_u32_u8x4(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;
-/** @} */
-
-/**
- * This function is mostly the same as sad except the sum of absolute
- * differences is added to the most significant 16 bits of the result
- *
- * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/multimedia.htm">HSA PRM 5.15</a> for more detailed specification.
- */
-extern "C" unsigned int __sadhi_u16x2_u8x4(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;
-
-/**
- * Get system timestamp
- */
-extern "C" __attribute__((always_inline))
-std::uint64_t __ockl_memrealtime_u64(void);
-
-extern "C" inline __attribute((always_inline)) std::uint64_t __clock_u64() __HC__ {
-  return __ockl_memrealtime_u64();
-}
-
-
-/**
- * Get hardware cycle count
- *
- * Notice the return value of this function is implementation defined.
- */
-extern "C" __attribute__((always_inline))
-std::uint64_t __ockl_memtime_u64(void);
-
-extern "C" inline __attribute((always_inline)) std::uint64_t __cycle_u64() __HC__ {
-  return __ockl_memtime_u64();
-}
-
-/**
- * Get the count of the number of earlier (in flattened
- * work-item order) active work-items within the same wavefront.
- *
- * @return The result will be in the range 0 to WAVESIZE - 1.
- */
-extern "C" unsigned int __activelaneid_u32() __HC__;
-
-/**
- * Return a bit mask shows which active work-items in the
- * wavefront have a non-zero input. The affected bit position within the
- * registers of dest corresponds to each work-item's lane ID.
- *
- * The HSAIL instruction would return 4 64-bit registers but the current
- * implementation would only return the 1st one and ignore the other 3 as
- * right now all HSA agents have wavefront of size 64.
- *
- * @param[in] input An unsigned 32-bit integer.
- * @return The bitmask calculated.
- */
-extern "C" uint64_t __activelanemask_v4_b64_b1(unsigned int input) __HC__;
-
-/**
- * Count the number of active work-items in the current
- * wavefront that have a non-zero input.
- *
- * @param[in] input An unsigned 32-bit integer.
- * @return The number of active work-items in the current wavefront that have
- *         a non-zero input.
- */
-extern "C" inline unsigned int __activelanecount_u32_b1(unsigned int input) __HC__ {
- return  __popcount_u32_b64(__activelanemask_v4_b64_b1(input));
-}
-
-// ------------------------------------------------------------------------
-// Wavefront Vote Functions
-// ------------------------------------------------------------------------
-
-/**
- * Evaluate predicate for all active work-items in the
- * wavefront and return non-zero if and only if predicate evaluates to non-zero
- * for any of them.
- */
-extern "C" bool __ockl_wfany_i32(int) __HC__;
-extern "C" inline int __any(int predicate) __HC__ {
-    return __ockl_wfany_i32(predicate);
-}
-
-/**
- * Evaluate predicate for all active work-items in the
- * wavefront and return non-zero if and only if predicate evaluates to non-zero
- * for all of them.
- */
-extern "C" bool __ockl_wfall_i32(int) __HC__;
-extern "C" inline int __all(int predicate) __HC__ {
-    return __ockl_wfall_i32(predicate);
-}
-
-/**
- * Evaluate predicate for all active work-items in the
- * wavefront and return an integer whose Nth bit is set if and only if
- * predicate evaluates to non-zero for the Nth work-item of the wavefront and
- * the Nth work-item is active.
- */
-
-// XXX from llvm/include/llvm/IR/InstrTypes.h
-#define ICMP_NE 33
-__attribute__((convergent))
-unsigned long long __llvm_amdgcn_icmp_i32(uint x, uint y, uint z) [[hc]] __asm("llvm.amdgcn.icmp.i32");
-extern "C" inline uint64_t __ballot(int predicate) __HC__ {
-    return __llvm_amdgcn_icmp_i32(predicate, 0, ICMP_NE);
-}
-
-// ------------------------------------------------------------------------
-// Wavefront Shuffle Functions
-// ------------------------------------------------------------------------
-
-// utility union type
-union __u {
-    int i;
-    unsigned int u;
-    float f;
-};
+// utility union type
+union __u {
+    int i;
+    unsigned int u;
+    float f;
+};
 
 /** @{ */
 /**
@@ -3085,8 +2136,6 @@ inline int __shfl(int var, int srcLane, int width=__HSA_WAVEFRONT_SIZE__) __HC__
   return __amdgcn_ds_bpermute(index<<2, var);
 }
 
-#endif
-
 inline unsigned int __shfl(unsigned int var, int srcLane, int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
      __u tmp; tmp.u = var;
     tmp.i = __shfl(tmp.i, srcLane, width);
@@ -3100,6 +2149,8 @@ inline float __shfl(float var, int srcLane, int width=__HSA_WAVEFRONT_SIZE__) __
     return tmp.f;
 }
 
+#endif
+
 // FIXME: support half type
 /** @} */
 
@@ -3134,8 +2185,6 @@ inline int __shfl_up(int var, const unsigned int delta, const int width=__HSA_WA
   return __amdgcn_ds_bpermute(index<<2, var);
 }
 
-#endif
-
 inline unsigned int __shfl_up(unsigned int var, const unsigned int delta, const int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
     __u tmp; tmp.u = var;
     tmp.i = __shfl_up(tmp.i, delta, width);
@@ -3148,6 +2197,8 @@ inline float __shfl_up(float var, const unsigned int delta, const int width=__HS
     return tmp.f;
 }
 
+#endif
+
 // FIXME: support half type
 /** @} */
 
@@ -3183,8 +2234,6 @@ inline int __shfl_down(int var, const unsigned int delta, const int width=__HSA_
   return __amdgcn_ds_bpermute(index<<2, var);
 }
 
-#endif
-
 inline unsigned int __shfl_down(unsigned int var, const unsigned int delta, const int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
     __u tmp; tmp.u = var;
     tmp.i = __shfl_down(tmp.i, delta, width);
@@ -3197,6 +2246,7 @@ inline float __shfl_down(float var, const unsigned int delta, const int width=__
     return tmp.f;
 }
 
+#endif
 
 // FIXME: support half type
 /** @} */
@@ -3229,8 +2279,6 @@ inline int __shfl_xor(int var, int laneMask, int width=__HSA_WAVEFRONT_SIZE__) _
   return __amdgcn_ds_bpermute(index<<2, var);
 }
 
-#endif
-
 inline float __shfl_xor(float var, int laneMask, int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
     __u tmp; tmp.f = var;
     tmp.i = __shfl_xor(tmp.i, laneMask, width);
@@ -3246,6 +2294,8 @@ inline unsigned int __shfl_xor(unsigned int var, int laneMask, int width=__HSA_W
     return tmp.u;
 }
 
+#endif
+
 /**
  * Multiply two unsigned integers (x,y) but only the lower 24 bits will be used in the multiplication.
  *
@@ -3327,4616 +2377,4 @@ extern "C" void* get_group_segment_base_pointer() __HC__;
  * Fetch the address of the beginning of dynamic group segment.
  */
 extern "C" void* get_dynamic_group_segment_base_pointer() __HC__;
-
-// ------------------------------------------------------------------------
-// utility class for tiled_barrier
-// ------------------------------------------------------------------------
-
-#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
-template <typename Ker, typename Ti>
-void bar_wrapper(Ker *f, Ti *t)
-{
-    (*f)(*t);
-}
-
-struct barrier_t {
-    std::unique_ptr<ucontext_t[]> ctx;
-    int idx;
-    barrier_t (int a) :
-        ctx(new ucontext_t[a + 1]) {}
-    template <typename Ti, typename Ker>
-    void setctx(int x, char *stack, Ker& f, Ti* tidx, int S) {
-        getcontext(&ctx[x]);
-        ctx[x].uc_stack.ss_sp = stack;
-        ctx[x].uc_stack.ss_size = S;
-        ctx[x].uc_link = &ctx[x - 1];
-        makecontext(&ctx[x], (void (*)(void))bar_wrapper<Ker, Ti>, 2, &f, tidx);
-    }
-    void swap(int a, int b) {
-        swapcontext(&ctx[a], &ctx[b]);
-    }
-    void wait() __HC__ {
-        --idx;
-        swapcontext(&ctx[idx + 1], &ctx[idx]);
-    }
-};
-#endif
-
-
-// ------------------------------------------------------------------------
-// tiled_barrier
-// ------------------------------------------------------------------------
-
-/**
- * The tile_barrier class is a capability class that is only creatable by the
- * system, and passed to a tiled parallel_for_each function object as part of
- * the tiled_index parameter. It provides member functions, such as wait, whose
- * purpose is to synchronize execution of threads running within the thread
- * tile.
- */
-class tile_barrier {
-public:
-#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
-    using pb_t = std::shared_ptr<barrier_t>;
-    tile_barrier(pb_t pb) : pbar(pb) {}
-
-    /**
-     * Copy constructor. Constructs a new tile_barrier from the supplied
-     * argument "other".
-     *
-     * @param[in] other An object of type tile_barrier from which to initialize
-     *                  this.
-     */
-    tile_barrier(const tile_barrier& other) __CPU__ __HC__ : pbar(other.pbar) {}
-#else
-
-    /**
-     * Copy constructor. Constructs a new tile_barrier from the supplied
-     * argument "other".
-     *
-     * @param[in] other An object of type tile_barrier from which to initialize
-     *                  this.
-     */
-    tile_barrier(const tile_barrier& other) __CPU__ __HC__ {}
-#endif
-
-    /**
-     * Blocks execution of all threads in the thread tile until all threads in
-     * the tile have reached this call. Establishes a memory fence on all
-     * tile_static and global memory operations executed by the threads in the
-     * tile such that all memory operations issued prior to hitting the barrier
-     * are visible to all other threads after the barrier has completed and
-     * none of the memory operations occurring after the barrier are executed
-     * before hitting the barrier. This is identical to
-     * wait_with_all_memory_fence().
-     */
-    void wait() const __HC__ {
-#if __KALMAR_ACCELERATOR__ == 1
-        wait_with_all_memory_fence();
-#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
-        pbar->wait();
-#endif
-    }
-
-    /**
-     * Blocks execution of all threads in the thread tile until all threads in
-     * the tile have reached this call. Establishes a memory fence on all
-     * tile_static and global memory operations executed by the threads in the
-     * tile such that all memory operations issued prior to hitting the barrier
-     * are visible to all other threads after the barrier has completed and
-     * none of the memory operations occurring after the barrier are executed
-     * before hitting the barrier. This is identical to wait().
-     */
-    void wait_with_all_memory_fence() const __HC__ {
-#if __KALMAR_ACCELERATOR__ == 1
-        amp_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
-#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
-        pbar->wait();
-#endif
-    }
-
-    /**
-     * Blocks execution of all threads in the thread tile until all threads in
-     * the tile have reached this call. Establishes a memory fence on global
-     * memory operations (but not tile-static memory operations) executed by
-     * the threads in the tile such that all global memory operations issued
-     * prior to hitting the barrier are visible to all other threads after the
-     * barrier has completed and none of the global memory operations occurring
-     * after the barrier are executed before hitting the barrier.
-     */
-    void wait_with_global_memory_fence() const __HC__ {
-#if __KALMAR_ACCELERATOR__ == 1
-        amp_barrier(CLK_GLOBAL_MEM_FENCE);
-#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
-        pbar->wait();
-#endif
-    }
-
-    /**
-     * Blocks execution of all threads in the thread tile until all threads in
-     * the tile have reached this call. Establishes a memory fence on
-     * tile-static memory operations (but not global memory operations)
-     * executed by the threads in the tile such that all tile_static memory
-     * operations issued prior to hitting the barrier are visible to all other
-     * threads after the barrier has completed and none of the tile-static
-     * memory operations occurring after the barrier are executed before
-     * hitting the barrier.
-     */
-    void wait_with_tile_static_memory_fence() const __HC__ {
-#if __KALMAR_ACCELERATOR__ == 1
-        amp_barrier(CLK_LOCAL_MEM_FENCE);
-#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
-        pbar->wait();
-#endif
-    }
-
-private:
-#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
-    tile_barrier() __CPU__ __HC__ = default;
-    pb_t pbar;
-#else
-    tile_barrier() __HC__ {}
-#endif
-
-    template <int N> friend
-        class tiled_index;
-};
-
-// ------------------------------------------------------------------------
-// other memory fences
-// ------------------------------------------------------------------------
-
-/**
- * Establishes a thread-tile scoped memory fence for both global and
- * tile-static memory operations. This function does not imply a barrier and
- * is therefore permitted in divergent code.
- */
-// FIXME: this functions has not been implemented.
-void all_memory_fence(const tile_barrier&) __HC__;
-
-/**
- * Establishes a thread-tile scoped memory fence for global (but not
- * tile-static) memory operations. This function does not imply a barrier and
- * is therefore permitted in divergent code.
- */
-// FIXME: this functions has not been implemented.
-void global_memory_fence(const tile_barrier&) __HC__;
-
-/**
- * Establishes a thread-tile scoped memory fence for tile-static (but not
- * global) memory operations. This function does not imply a barrier and is
- * therefore permitted in divergent code.
- */
-// FIXME: this functions has not been implemented.
-void tile_static_memory_fence(const tile_barrier&) __HC__;
-
-// ------------------------------------------------------------------------
-// tiled_index
-// ------------------------------------------------------------------------
-
-/**
- * Represents a set of related indices subdivided into 1-, 2-, or 3-dimensional
- * tiles.
- *
- * @tparam N Tile dimension.
- */
-template <int N=3>
-class tiled_index {
-public:
-    /**
-     * A static member of tiled_index that contains the rank of this tiled
-     * extent, and is either 1, 2, or 3 depending on the specialization used.
-     */
-    static const int rank = 3;
-
-    /**
-     * Copy constructor. Constructs a new tiled_index from the supplied
-     * argument "other".
-     *
-     * @param[in] other An object of type tiled_index from which to initialize
-     *                  this.
-     */
-    tiled_index(const tiled_index& other) __CPU__ __HC__ : global(other.global), local(other.local), tile(other.tile), tile_origin(other.tile_origin), barrier(other.barrier), tile_dim(other.tile_dim) {}
-
-    /**
-     * An index of rank 1, 2, or 3 that represents the global index within an
-     * extent.
-     */
-    const index<3> global;
-
-    /**
-     * An index of rank 1, 2, or 3 that represents the relative index within
-     * the current tile of a tiled extent.
-     */
-    const index<3> local;
-
-    /**
-     * An index of rank 1, 2, or 3 that represents the coordinates of the
-     * current tile of a tiled extent.
-     */
-    const index<3> tile;
-
-    /**
-     * An index of rank 1, 2, or 3 that represents the global coordinates of
-     * the origin of the current tile within a tiled extent.
-     */
-    const index<3> tile_origin;
-
-    /**
-     * An object which represents a barrier within the current tile of threads.
-     */
-    const tile_barrier barrier;
-
-    /**
-     * An index of rank 1, 2, 3 that represents the size of the tile.
-     */
-    const index<3> tile_dim;
-
-    /**
-     * Implicit conversion operator that converts a tiled_index<N> into
-     * an index<N>. The implicit conversion converts to the .global index
-     * member.
-     */
-    operator const index<3>() const __CPU__ __HC__ {
-        return global;
-    }
-
-    tiled_index(const index<3>& g) __CPU__ __HC__ : global(g) {}
-
-private:
-#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
-    __attribute__((always_inline)) tiled_index(int a0, int a1, int a2, int b0, int b1, int b2, int c0, int c1, int c2, tile_barrier& pb, int D0, int D1, int D2) __CPU__ __HC__
-        : global(a2, a1, a0), local(b2, b1, b0), tile(c2, c1, c0), tile_origin(a2 - b2, a1 - b1, a0 - b0), barrier(pb), tile_dim(D0, D1, D2) {}
-#endif
-
-    __attribute__((annotate("__cxxamp_opencl_index")))
-#if __KALMAR_ACCELERATOR__ == 1
-    __attribute__((always_inline)) tiled_index() __HC__
-        : global(index<3>(amp_get_global_id(2), amp_get_global_id(1), amp_get_global_id(0))),
-          local(index<3>(amp_get_local_id(2), amp_get_local_id(1), amp_get_local_id(0))),
-          tile(index<3>(amp_get_group_id(2), amp_get_group_id(1), amp_get_group_id(0))),
-          tile_origin(index<3>(amp_get_global_id(2) - amp_get_local_id(2),
-                               amp_get_global_id(1) - amp_get_local_id(1),
-                               amp_get_global_id(0) - amp_get_local_id(0))),
-          tile_dim(index<3>(amp_get_local_size(2), amp_get_local_size(1), amp_get_local_size(0)))
-#elif __KALMAR__ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
-    __attribute__((always_inline)) tiled_index() __CPU__ __HC__
-#else
-    __attribute__((always_inline)) tiled_index() __HC__
-#endif // __KALMAR_ACCELERATOR__
-    {}
-
-    template<typename Kernel> friend
-        completion_future parallel_for_each(const accelerator_view&, const tiled_extent<N>&, const Kernel&);
-
-#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
-    template<typename K> friend
-        void partitioned_task_tile_3D(K const&, tiled_extent<3> const&, int);
-#endif
-};
-
-
-/**
- * Represents a set of related indices subdivided into 1-, 2-, or 3-dimensional
- * tiles.
- * This class is 1D specialization of tiled_index.
- */
-template<>
-class tiled_index<1> {
-public:
-    /**
-     * A static member of tiled_index that contains the rank of this tiled
-     * extent, and is either 1, 2, or 3 depending on the specialization used.
-     */
-    static const int rank = 1;
-
-    /**
-     * Copy constructor. Constructs a new tiled_index from the supplied
-     * argument "other".
-     *
-     * @param[in] other An object of type tiled_index from which to initialize
-     *                  this.
-     */
-    tiled_index(const tiled_index& other) __CPU__ __HC__ : global(other.global), local(other.local), tile(other.tile), tile_origin(other.tile_origin), barrier(other.barrier), tile_dim(other.tile_dim) {}
-
-    /**
-     * An index of rank 1, 2, or 3 that represents the global index within an
-     * extent.
-     */
-    const index<1> global;
-
-    /**
-     * An index of rank 1, 2, or 3 that represents the relative index within
-     * the current tile of a tiled extent.
-     */
-    const index<1> local;
-
-    /**
-     * An index of rank 1, 2, or 3 that represents the coordinates of the
-     * current tile of a tiled extent.
-     */
-    const index<1> tile;
-
-    /**
-     * An index of rank 1, 2, or 3 that represents the global coordinates of
-     * the origin of the current tile within a tiled extent.
-     */
-    const index<1> tile_origin;
-
-    /**
-     * An object which represents a barrier within the current tile of threads.
-     */
-    const tile_barrier barrier;
-
-    /**
-     * An index of rank 1, 2, 3 that represents the size of the tile.
-     */
-    const index<1> tile_dim;
-
-    /**
-     * Implicit conversion operator that converts a tiled_index<N> into
-     * an index<N>. The implicit conversion converts to the .global index
-     * member.
-     */
-    operator const index<1>() const __CPU__ __HC__ {
-        return global;
-    }
-
-    tiled_index(const index<1>& g) __CPU__ __HC__ : global(g) {}
-
-private:
-#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
-    __attribute__((always_inline)) tiled_index(int a, int b, int c, tile_barrier& pb, int D0) __CPU__ __HC__
-        : global(a), local(b), tile(c), tile_origin(a - b), barrier(pb), tile_dim(D0) {}
-#endif
-
-    __attribute__((annotate("__cxxamp_opencl_index")))
-#if __KALMAR_ACCELERATOR__ == 1
-    __attribute__((always_inline)) tiled_index() __HC__
-        : global(index<1>(amp_get_global_id(0))),
-          local(index<1>(amp_get_local_id(0))),
-          tile(index<1>(amp_get_group_id(0))),
-          tile_origin(index<1>(amp_get_global_id(0) - amp_get_local_id(0))),
-          tile_dim(index<1>(amp_get_local_size(0)))
-#elif __KALMAR__ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
-    __attribute__((always_inline)) tiled_index() __CPU__ __HC__
-#else
-    __attribute__((always_inline)) tiled_index() __HC__
-#endif // __KALMAR_ACCELERATOR__
-    {}
-
-    template<typename Kernel> friend
-        completion_future parallel_for_each(const accelerator_view&, const tiled_extent<1>&, const Kernel&);
-
-#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
-    template<typename K> friend
-        void partitioned_task_tile_1D(K const&, tiled_extent<1> const&, int);
-#endif
-};
-
-/**
- * Represents a set of related indices subdivided into 1-, 2-, or 3-dimensional
- * tiles.
- * This class is 2D specialization of tiled_index.
- */
-template<>
-class tiled_index<2> {
-public:
-    /**
-     * A static member of tiled_index that contains the rank of this tiled
-     * extent, and is either 1, 2, or 3 depending on the specialization used.
-     */
-    static const int rank = 2;
-
-    /**
-     * Copy constructor. Constructs a new tiled_index from the supplied
-     * argument "other".
-     *
-     * @param[in] other An object of type tiled_index from which to initialize
-     *                  this.
-     */
-    tiled_index(const tiled_index& other) __CPU__ __HC__ : global(other.global), local(other.local), tile(other.tile), tile_origin(other.tile_origin), barrier(other.barrier), tile_dim(other.tile_dim) {}
-
-    /**
-     * An index of rank 1, 2, or 3 that represents the global index within an
-     * extent.
-     */
-    const index<2> global;
-
-    /**
-     * An index of rank 1, 2, or 3 that represents the relative index within
-     * the current tile of a tiled extent.
-     */
-    const index<2> local;
-
-    /**
-     * An index of rank 1, 2, or 3 that represents the coordinates of the
-     * current tile of a tiled extent.
-     */
-    const index<2> tile;
-
-    /**
-     * An index of rank 1, 2, or 3 that represents the global coordinates of
-     * the origin of the current tile within a tiled extent.
-     */
-    const index<2> tile_origin;
-
-    /**
-     * An object which represents a barrier within the current tile of threads.
-     */
-    const tile_barrier barrier;
-
-    /**
-     * An index of rank 1, 2, 3 that represents the size of the tile.
-     */
-    const index<2> tile_dim;
-
-    /**
-     * Implicit conversion operator that converts a tiled_index<N> into
-     * an index<N>. The implicit conversion converts to the .global index
-     * member.
-     */
-    operator const index<2>() const __CPU__ __HC__ {
-      return global;
-    }
-
-    tiled_index(const index<2>& g) __CPU__ __HC__ : global(g) {}
-
-private:
-#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
-    __attribute__((always_inline)) tiled_index(int a0, int a1, int b0, int b1, int c0, int c1, tile_barrier& pb, int D0, int D1) __CPU__ __HC__
-        : global(a1, a0), local(b1, b0), tile(c1, c0), tile_origin(a1 - b1, a0 - b0), barrier(pb), tile_dim(D0, D1) {}
-#endif
-
-    __attribute__((annotate("__cxxamp_opencl_index")))
-#if __KALMAR_ACCELERATOR__ == 1
-    __attribute__((always_inline)) tiled_index() __HC__
-        : global(index<2>(amp_get_global_id(1), amp_get_global_id(0))),
-          local(index<2>(amp_get_local_id(1), amp_get_local_id(0))),
-          tile(index<2>(amp_get_group_id(1), amp_get_group_id(0))),
-          tile_origin(index<2>(amp_get_global_id(1) - amp_get_local_id(1),
-                               amp_get_global_id(0) - amp_get_local_id(0))),
-          tile_dim(index<2>(amp_get_local_size(1), amp_get_local_size(0)))
-#elif __KALMAR__ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
-    __attribute__((always_inline)) tiled_index() __CPU__ __HC__
-#else
-    __attribute__((always_inline)) tiled_index() __HC__
-#endif // __KALMAR_ACCELERATOR__
-    {}
-
-    template<typename Kernel> friend
-        completion_future parallel_for_each(const accelerator_view&, const tiled_extent<2>&, const Kernel&);
-
-#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
-    template<typename K> friend
-        void partitioned_task_tile_2D(K const&, tiled_extent<2> const&, int);
-#endif
-};
-
-#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
-#define SSIZE 1024 * 10
-template <int N, typename Kernel,  int K>
-struct cpu_helper
-{
-    static inline void call(const Kernel& k, index<K>& idx, const extent<K>& ext) __CPU__ __HC__ {
-        int i;
-        for (i = 0; i < ext[N]; ++i) {
-            idx[N] = i;
-            cpu_helper<N + 1, Kernel, K>::call(k, idx, ext);
-        }
-    }
-};
-template <typename Kernel, int K>
-struct cpu_helper<K, Kernel, K>
-{
-    static inline void call(const Kernel& k, const index<K>& idx, const extent<K>& ext) __CPU__ __HC__ {
-        (const_cast<Kernel&>(k))(idx);
-    }
-};
-
-template <typename Kernel, int N>
-void partitioned_task(const Kernel& ker, const extent<N>& ext, int part) {
-    index<N> idx;
-    int start = ext[0] * part / Kalmar::NTHREAD;
-    int end = ext[0] * (part + 1) / Kalmar::NTHREAD;
-    for (int i = start; i < end; i++) {
-        idx[0] = i;
-        cpu_helper<1, Kernel, N>::call(ker, idx, ext);
-    }
-}
-
-template <typename Kernel>
-void partitioned_task_tile_1D(Kernel const& f, tiled_extent<1> const& ext, int part) {
-    int D0 = ext.tile_dim[0];
-    int start = (ext[0] / D0) * part / Kalmar::NTHREAD;
-    int end = (ext[0] / D0) * (part + 1) / Kalmar::NTHREAD;
-    int stride = end - start;
-    if (stride == 0)
-        return;
-    char *stk = new char[D0 * SSIZE];
-    tiled_index<1> *tidx = new tiled_index<1>[D0];
-    tile_barrier::pb_t hc_bar = std::make_shared<barrier_t>(D0);
-    tile_barrier tbar(hc_bar);
-    for (int tx = start; tx < end; tx++) {
-        int id = 0;
-        char *sp = stk;
-        tiled_index<1> *tip = tidx;
-        for (int x = 0; x < D0; x++) {
-            new (tip) tiled_index<1>(tx * D0 + x, x, tx, tbar, D0);
-            hc_bar->setctx(++id, sp, f, tip, SSIZE);
-            sp += SSIZE;
-            ++tip;
-        }
-        hc_bar->idx = 0;
-        while (hc_bar->idx == 0) {
-            hc_bar->idx = id;
-            hc_bar->swap(0, id);
-        }
-    }
-    delete [] stk;
-    delete [] tidx;
-}
-
-template <typename Kernel>
-void partitioned_task_tile_2D(Kernel const& f, tiled_extent<2> const& ext, int part) {
-    int D0 = ext.tile_dim[0];
-    int D1 = ext.tile_dim[1];
-    int start = (ext[0] / D0) * part / Kalmar::NTHREAD;
-    int end = (ext[0] / D0) * (part + 1) / Kalmar::NTHREAD;
-    int stride = end - start;
-    if (stride == 0)
-        return;
-    char *stk = new char[D1 * D0 * SSIZE];
-    tiled_index<2> *tidx = new tiled_index<2>[D0 * D1];
-    tile_barrier::pb_t hc_bar = std::make_shared<barrier_t>(D0 * D1);
-    tile_barrier tbar(hc_bar);
-
-    for (int tx = 0; tx < ext[1] / D1; tx++)
-        for (int ty = start; ty < end; ty++) {
-            int id = 0;
-            char *sp = stk;
-            tiled_index<2> *tip = tidx;
-            for (int x = 0; x < D1; x++)
-                for (int y = 0; y < D0; y++) {
-                    new (tip) tiled_index<2>(D1 * tx + x, D0 * ty + y, x, y, tx, ty, tbar, D0, D1);
-                    hc_bar->setctx(++id, sp, f, tip, SSIZE);
-                    ++tip;
-                    sp += SSIZE;
-                }
-            hc_bar->idx = 0;
-            while (hc_bar->idx == 0) {
-                hc_bar->idx = id;
-                hc_bar->swap(0, id);
-            }
-        }
-    delete [] stk;
-    delete [] tidx;
-}
-
-template <typename Kernel>
-void partitioned_task_tile_3D(Kernel const& f, tiled_extent<3> const& ext, int part) {
-    int D0 = ext.tile_dim[0];
-    int D1 = ext.tile_dim[1];
-    int D2 = ext.tile_dim[2];
-    int start = (ext[0] / D0) * part / Kalmar::NTHREAD;
-    int end = (ext[0] / D0) * (part + 1) / Kalmar::NTHREAD;
-    int stride = end - start;
-    if (stride == 0)
-        return;
-    char *stk = new char[D2 * D1 * D0 * SSIZE];
-    tiled_index<3> *tidx = new tiled_index<3>[D0 * D1 * D2];
-    tile_barrier::pb_t hc_bar = std::make_shared<barrier_t>(D0 * D1 * D2);
-    tile_barrier tbar(hc_bar);
-
-    for (int i = 0; i < ext[2] / D2; i++)
-        for (int j = 0; j < ext[1] / D1; j++)
-            for(int k = start; k < end; k++) {
-                int id = 0;
-                char *sp = stk;
-                tiled_index<3> *tip = tidx;
-                for (int x = 0; x < D2; x++)
-                    for (int y = 0; y < D1; y++)
-                        for (int z = 0; z < D0; z++) {
-                            new (tip) tiled_index<3>(D2 * i + x,
-                                                              D1 * j + y,
-                                                              D0 * k + z,
-                                                              x, y, z, i, j, k, tbar, D0, D1, D2);
-                            hc_bar->setctx(++id, sp, f, tip, SSIZE);
-                            ++tip;
-                            sp += SSIZE;
-                        }
-                hc_bar->idx = 0;
-                while (hc_bar->idx == 0) {
-                    hc_bar->idx = id;
-                    hc_bar->swap(0, id);
-                }
-            }
-    delete [] stk;
-    delete [] tidx;
-}
-
-template <typename Kernel, int N>
-completion_future launch_cpu_task_async(const std::shared_ptr<Kalmar::KalmarQueue>& pQueue, Kernel const& f,
-                     extent<N> const& compute_domain)
-{
-    Kalmar::CPUKernelRAII<Kernel> obj(pQueue, f);
-    for (int i = 0; i < Kalmar::NTHREAD; ++i)
-        obj[i] = std::thread(partitioned_task<Kernel, N>, std::cref(f), std::cref(compute_domain), i);
-    // FIXME wrap the above operation into the completion_future object
-    return completion_future();
-}
-
-template <typename Kernel>
-completion_future launch_cpu_task_async(const std::shared_ptr<Kalmar::KalmarQueue>& pQueue, Kernel const& f,
-                     tiled_extent<1> const& compute_domain)
-{
-    Kalmar::CPUKernelRAII<Kernel> obj(pQueue, f);
-    for (int i = 0; i < Kalmar::NTHREAD; ++i)
-        obj[i] = std::thread(partitioned_task_tile_1D<Kernel>,
-                             std::cref(f), std::cref(compute_domain), i);
-    // FIXME wrap the above operation into the completion_future object
-    return completion_future();
-}
-
-template <typename Kernel>
-completion_future launch_cpu_task_async(const std::shared_ptr<Kalmar::KalmarQueue>& pQueue, Kernel const& f,
-                     tiled_extent<2> const& compute_domain)
-{
-    Kalmar::CPUKernelRAII<Kernel> obj(pQueue, f);
-    for (int i = 0; i < Kalmar::NTHREAD; ++i)
-        obj[i] = std::thread(partitioned_task_tile_2D<Kernel>,
-                             std::cref(f), std::cref(compute_domain), i);
-    // FIXME wrap the above operation into the completion_future object
-    return completion_future();
-}
-
-template <typename Kernel>
-completion_future launch_cpu_task_async(const std::shared_ptr<Kalmar::KalmarQueue>& pQueue, Kernel const& f,
-                     tiled_extent<3> const& compute_domain)
-{
-    Kalmar::CPUKernelRAII<Kernel> obj(pQueue, f);
-    for (int i = 0; i < Kalmar::NTHREAD; ++i)
-        obj[i] = std::thread(partitioned_task_tile_3D<Kernel>,
-                             std::cref(f), std::cref(compute_domain), i);
-    // FIXME wrap the above operation into the completion_future object
-    return completion_future();
-}
-
-#endif
-
-// ------------------------------------------------------------------------
-// utility helper classes for array_view
-// ------------------------------------------------------------------------
-
-template <typename T, int N>
-struct projection_helper
-{
-    // array_view<T,N>, where N>1
-    //    array_view<T,N-1> operator[](int i) const __CPU__ __HC__
-    static_assert(N > 1, "projection_helper is only supported on array_view with a rank of 2 or higher");
-    typedef array_view<T, N - 1> result_type;
-    static result_type project(array_view<T, N>& now, int stride) __CPU__ __HC__ {
-        int ext[N - 1], i, idx[N - 1], ext_o[N - 1];
-        for (i = N - 1; i > 0; --i) {
-            ext_o[i - 1] = now.extent[i];
-            ext[i - 1] = now.extent_base[i];
-            idx[i - 1] = now.index_base[i];
-        }
-        stride += now.index_base[0];
-        extent<N - 1> ext_now(ext_o);
-        extent<N - 1> ext_base(ext);
-        index<N - 1> idx_base(idx);
-        return result_type (now.cache, ext_now, ext_base, idx_base,
-                            now.offset + ext_base.size() * stride);
-    }
-    static result_type project(const array_view<T, N>& now, int stride) __CPU__ __HC__ {
-        int ext[N - 1], i, idx[N - 1], ext_o[N - 1];
-        for (i = N - 1; i > 0; --i) {
-            ext_o[i - 1] = now.extent[i];
-            ext[i - 1] = now.extent_base[i];
-            idx[i - 1] = now.index_base[i];
-        }
-        stride += now.index_base[0];
-        extent<N - 1> ext_now(ext_o);
-        extent<N - 1> ext_base(ext);
-        index<N - 1> idx_base(idx);
-        return result_type (now.cache, ext_now, ext_base, idx_base,
-                            now.offset + ext_base.size() * stride);
-    }
-};
-
-template <typename T>
-struct projection_helper<T, 1>
-{
-    // array_view<T,1>
-    //      T& operator[](int i) const __CPU__ __HC__;
-    typedef T& result_type;
-    static result_type project(array_view<T, 1>& now, int i) __CPU__ __HC__ {
-#if __KALMAR_ACCELERATOR__ != 1
-        now.cache.get_cpu_access(true);
-#endif
-        T *ptr = reinterpret_cast<T *>(now.cache.get() + i + now.offset + now.index_base[0]);
-        return *ptr;
-    }
-    static result_type project(const array_view<T, 1>& now, int i) __CPU__ __HC__ {
-#if __KALMAR_ACCELERATOR__ != 1
-        now.cache.get_cpu_access(true);
-#endif
-        T *ptr = reinterpret_cast<T *>(now.cache.get() + i + now.offset + now.index_base[0]);
-        return *ptr;
-    }
-};
-
-template <typename T, int N>
-struct projection_helper<const T, N>
-{
-    // array_view<T,N>, where N>1
-    //    array_view<const T,N-1> operator[](int i) const __CPU__ __HC__;
-    static_assert(N > 1, "projection_helper is only supported on array_view with a rank of 2 or higher");
-    typedef array_view<const T, N - 1> const_result_type;
-    static const_result_type project(array_view<const T, N>& now, int stride) __CPU__ __HC__ {
-        int ext[N - 1], i, idx[N - 1], ext_o[N - 1];
-        for (i = N - 1; i > 0; --i) {
-            ext_o[i - 1] = now.extent[i];
-            ext[i - 1] = now.extent_base[i];
-            idx[i - 1] = now.index_base[i];
-        }
-        stride += now.index_base[0];
-        extent<N - 1> ext_now(ext_o);
-        extent<N - 1> ext_base(ext);
-        index<N - 1> idx_base(idx);
-        auto ret = const_result_type (now.cache, ext_now, ext_base, idx_base,
-                                      now.offset + ext_base.size() * stride);
-        return ret;
-    }
-    static const_result_type project(const array_view<const T, N>& now, int stride) __CPU__ __HC__ {
-        int ext[N - 1], i, idx[N - 1], ext_o[N - 1];
-        for (i = N - 1; i > 0; --i) {
-            ext_o[i - 1] = now.extent[i];
-            ext[i - 1] = now.extent_base[i];
-            idx[i - 1] = now.index_base[i];
-        }
-        stride += now.index_base[0];
-        extent<N - 1> ext_now(ext_o);
-        extent<N - 1> ext_base(ext);
-        index<N - 1> idx_base(idx);
-        auto ret = const_result_type (now.cache, ext_now, ext_base, idx_base,
-                                      now.offset + ext_base.size() * stride);
-        return ret;
-    }
-};
-
-template <typename T>
-struct projection_helper<const T, 1>
-{
-    // array_view<const T,1>
-    //      const T& operator[](int i) const __CPU__ __HC__;
-    typedef const T& const_result_type;
-    static const_result_type project(array_view<const T, 1>& now, int i) __CPU__ __HC__ {
-#if __KALMAR_ACCELERATOR__ != 1
-        now.cache.get_cpu_access();
-#endif
-        const T *ptr = reinterpret_cast<const T *>(now.cache.get() + i + now.offset + now.index_base[0]);
-        return *ptr;
-    }
-    static const_result_type project(const array_view<const T, 1>& now, int i) __CPU__ __HC__ {
-#if __KALMAR_ACCELERATOR__ != 1
-        now.cache.get_cpu_access();
-#endif
-        const T *ptr = reinterpret_cast<const T *>(now.cache.get() + i + now.offset + now.index_base[0]);
-        return *ptr;
-    }
-};
-
-// ------------------------------------------------------------------------
-// utility helper classes for array_view
-// ------------------------------------------------------------------------
-
-template <typename T>
-struct __has_data
-{
-private:
-    struct two {char __lx; char __lxx;};
-    template <typename C> static char test(decltype(std::declval<C>().data()));
-    template <typename C> static two test(...);
-public:
-    static const bool value = sizeof(test<T>(0)) == 1;
-};
-
-template <typename T>
-struct __has_size
-{
-private:
-    struct two {char __lx; char __lxx;};
-    template <typename C> static char test(decltype(&C::size));
-    template <typename C> static two test(...);
-public:
-    static const bool value = sizeof(test<T>(0)) == 1;
-};
-
-template <typename T>
-struct __is_container
-{
-    using _T = typename std::remove_reference<T>::type;
-    static const bool value = __has_size<_T>::value && __has_data<_T>::value;
-};
-
-
-// ------------------------------------------------------------------------
-// utility helper classes for array
-// ------------------------------------------------------------------------
-
-template <typename T, int N>
-struct array_projection_helper
-{
-    // array<T,N>, where N>1
-    //     array_view<T,N-1> operator[](int i0) __CPU__ __HC__;
-    //     array_view<const T,N-1> operator[](int i0) const __CPU__ __HC__;
-    static_assert(N > 1, "projection_helper is only supported on array with a rank of 2 or higher");
-    typedef array_view<T, N - 1> result_type;
-    typedef array_view<const T, N - 1> const_result_type;
-    static result_type project(array<T, N>& now, int stride) __CPU__ __HC__ {
-#if __KALMAR_ACCELERATOR__ != 1
-        if( stride < 0)
-          throw runtime_exception("errorMsg_throw", 0);
-#endif
-        int comp[N - 1], i;
-        for (i = N - 1; i > 0; --i)
-            comp[i - 1] = now.extent[i];
-        extent<N - 1> ext(comp);
-        int offset = ext.size() * stride;
-#if __KALMAR_ACCELERATOR__ != 1
-        if( offset >= now.extent.size())
-          throw runtime_exception("errorMsg_throw", 0);
-#endif
-        return result_type(now.m_device, ext, ext, index<N - 1>(), offset);
-    }
-    static const_result_type project(const array<T, N>& now, int stride) __CPU__ __HC__ {
-        int comp[N - 1], i;
-        for (i = N - 1; i > 0; --i)
-            comp[i - 1] = now.extent[i];
-        extent<N - 1> ext(comp);
-        int offset = ext.size() * stride;
-        return const_result_type(now.m_device, ext, ext, index<N - 1>(), offset);
-    }
-};
-
-template <typename T>
-struct array_projection_helper<T, 1>
-{
-    // array<T,1>
-    //    T& operator[](int i0) __CPU__ __HC__;
-    //    const T& operator[](int i0) const __CPU__ __HC__;
-    typedef T& result_type;
-    typedef const T& const_result_type;
-    static result_type project(array<T, 1>& now, int i) __CPU__ __HC__ {
-#if __KALMAR_ACCELERATOR__ != 1
-        now.m_device.synchronize(true);
-#endif
-        T *ptr = reinterpret_cast<T *>(now.m_device.get() + i);
-        return *ptr;
-    }
-    static const_result_type project(const array<T, 1>& now, int i) __CPU__ __HC__ {
-#if __KALMAR_ACCELERATOR__ != 1
-        now.m_device.synchronize();
-#endif
-        const T *ptr = reinterpret_cast<const T *>(now.m_device.get() + i);
-        return *ptr;
-    }
-};
-
-template <int N>
-const extent<N>& check(const extent<N>& ext)
-{
-#if __KALMAR_ACCELERATOR__ != 1
-    for (int i = 0; i < N; i++)
-    {
-        if(ext[i] <=0)
-            throw runtime_exception("errorMsg_throw", 0);
-    }
-#endif
-    return ext;
-}
-
-// ------------------------------------------------------------------------
-// forward declarations of copy routines used by array / array_view
-// ------------------------------------------------------------------------
-
-template <typename T, int N>
-void copy(const array_view<const T, N>& src, const array_view<T, N>& dest);
-
-template <typename T, int N>
-void copy(const array_view<T, N>& src, const array_view<T, N>& dest);
-
-template <typename T, int N>
-void copy(const array<T, N>& src, const array_view<T, N>& dest);
-
-template <typename T, int N>
-void copy(const array<T, N>& src, array<T, N>& dest);
-
-template <typename T, int N>
-void copy(const array_view<const T, N>& src, array<T, N>& dest);
-
-template <typename T, int N>
-void copy(const array_view<T, N>& src, array<T, N>& dest);
-
-template <typename InputIter, typename T, int N>
-void copy(InputIter srcBegin, InputIter srcEnd, const array_view<T, N>& dest);
-
-template <typename InputIter, typename T, int N>
-void copy(InputIter srcBegin, InputIter srcEnd, array<T, N>& dest);
-
-template <typename InputIter, typename T, int N>
-void copy(InputIter srcBegin, const array_view<T, N>& dest);
-
-template <typename InputIter, typename T, int N>
-void copy(InputIter srcBegin, array<T, N>& dest);
-
-template <typename OutputIter, typename T, int N>
-void copy(const array_view<T, N> &src, OutputIter destBegin);
-
-template <typename OutputIter, typename T, int N>
-void copy(const array<T, N> &src, OutputIter destBegin);
-
-// ------------------------------------------------------------------------
-// array
-// ------------------------------------------------------------------------
-
-/**
- * Represents an N-dimensional region of memory (with type T) located on an
- * accelerator.
- *
- * @tparam T The element type of this array
- * @tparam N The dimensionality of the array, defaults to 1 if elided.
- */
-template <typename T, int N = 1>
-class array {
-    static_assert(!std::is_const<T>::value, "array<const T> is not supported");
-public:
-#if __KALMAR_ACCELERATOR__ == 1
-    typedef Kalmar::_data<T> acc_buffer_t;
-#else
-    typedef Kalmar::_data_host<T> acc_buffer_t;
-#endif
-
-    /**
-     * The rank of this array.
-     */
-    static const int rank = N;
-
-    /**
-     * The element type of this array.
-     */
-    typedef T value_type;
-
-    /**
-     * There is no default constructor for array<T,N>.
-     */
-    array() = delete;
- 
-    /**
-     * Copy constructor. Constructs a new array<T,N> from the supplied argument
-     * other. The new array is located on the same accelerator_view as the
-     * source array. A deep copy is performed.
-     *
-     * @param[in] other An object of type array<T,N> from which to initialize
-     *                  this new array.
-     */
-    array(const array& other)
-        : array(other.get_extent(), other.get_accelerator_view())
-    { copy(other, *this); }
-
-    /**
-     * Move constructor. Constructs a new array<T,N> by moving from the
-     * supplied argument other.
-     *
-     * @param[in] other An object of type array<T,N> from which to initialize
-     *                  this new array.
-     */
-    array(array&& other)
-        : m_device(other.m_device), extent(other.extent)
-    { other.m_device.reset(); }
-
-    /**
-     * Constructs a new array with the supplied extent, located on the default
-     * view of the default accelerator. If any components of the extent are
-     * non-positive, an exception will be thrown.
-     *
-     * @param[in] ext The extent in each dimension of this array.
-     */
-    explicit array(const extent<N>& ext)
-        : array(ext, accelerator(L"default").get_default_view()) {}
-
-    /** @{ */
-    /**
-     * Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]))".
-     *
-     * @param[in] e0,e1,e2 The component values that will form the extent of
-     *                     this array.
-     */
-    explicit array(int e0)
-        : array(hc::extent<N>(e0)) { static_assert(N == 1, "illegal"); }
-    explicit array(int e0, int e1)
-        : array(hc::extent<N>(e0, e1)) {}
-    explicit array(int e0, int e1, int e2)
-        : array(hc::extent<N>(e0, e1, e2)) {}
-
-    /** @} */
-
-    /** @{ */
-    /**
-     * Constructs a new array with the supplied extent, located on the default
-     * accelerator, initialized with the contents of a source container
-     * specified by a beginning and optional ending iterator. The source data
-     * is copied by value into this array as if by calling "copy()".
-     *
-     * If the number of available container elements is less than
-     * this->extent.size(), undefined behavior results.
-     *
-     * @param[in] ext The extent in each dimension of this array.
-     * @param[in] srcBegin A beginning iterator into the source container.
-     * @param[in] srcEnd An ending iterator into the source container.
-     */
-    template <typename InputIter>
-        array(const extent<N>& ext, InputIter srcBegin)
-            : array(ext, srcBegin, accelerator(L"default").get_default_view()) {}
-    template <typename InputIter>
-        array(const extent<N>& ext, InputIter srcBegin, InputIter srcEnd)
-            : array(ext, srcBegin, srcEnd, accelerator(L"default").get_default_view()) {}
-
-    /** @} */
-
-    /** @{ */
-    /**
-     * Equivalent to construction using
-     * "array(extent<N>(e0 [, e1 [, e2 ]]), src)".
-     *
-     * @param[in] e0,e1,e2 The component values that will form the extent of
-     *                     this array.
-     * @param[in] srcBegin A beginning iterator into the source container. 
-     * @param[in] srcEnd An ending iterator into the source container.
-     */
-    template <typename InputIter>
-        array(int e0, InputIter srcBegin)
-            : array(hc::extent<N>(e0), srcBegin) {}
-    template <typename InputIter>
-        array(int e0, InputIter srcBegin, InputIter srcEnd)
-            : array(hc::extent<N>(e0), srcBegin, srcEnd) {}
-    template <typename InputIter>
-        array(int e0, int e1, InputIter srcBegin)
-            : array(hc::extent<N>(e0, e1), srcBegin) {}
-    template <typename InputIter>
-        array(int e0, int e1, InputIter srcBegin, InputIter srcEnd)
-            : array(hc::extent<N>(e0, e1), srcBegin, srcEnd) {}
-    template <typename InputIter>
-        array(int e0, int e1, int e2, InputIter srcBegin)
-            : array(hc::extent<N>(e0, e1, e2), srcBegin) {}
-    template <typename InputIter>
-        array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd)
-            : array(hc::extent<N>(e0, e1, e2), srcBegin, srcEnd) {}
-
-    /** @} */
-
-    /**
-     * Constructs a new array, located on the default view of the default
-     * accelerator, initialized with the contents of the array_view "src". The
-     * extent of this array is taken from the extent of the source array_view.
-     * The "src" is copied by value into this array as if by calling
-     * "copy(src, *this)".
-     *
-     * @param[in] src An array_view object from which to copy the data into
-     *                this array (and also to determine the extent of this
-     *                array).
-     */
-    explicit array(const array_view<const T, N>& src)
-        : array(src.get_extent(), accelerator(L"default").get_default_view())
-    { copy(src, *this); }
-
-    /**
-     * Constructs a new array with the supplied extent, located on the
-     * accelerator bound to the accelerator_view "av".
-     *
-     * Users can optionally specify the type of CPU access desired for "this"
-     * array thus requesting creation of an array that is accessible both on
-     * the specified accelerator_view "av" as well as the CPU (with the
-     * specified CPU access_type). If a value other than access_type_auto or
-     * access_type_none is specified for the cpu_access_type parameter and the
-     * accelerator corresponding to the accelerator_view "av" does not support
-     * cpu_shared_memory, a runtime_exception is thrown. The cpu_access_type
-     * parameter has a default value of access_type_auto which leaves it up to
-     * the implementation to decide what type of allowed CPU access should the
-     * array be created with. The actual CPU access_type allowed for the
-     * created array can be queried using the get_cpu_access_type member
-     * method.
-     *
-     * @param[in] ext The extent in each dimension of this array.
-     * @param[in] av An accelerator_view object which specifies the location of
-     *               this array.
-     * @param[in] access_type The type of CPU access desired for this array.
-     */
-    array(const extent<N>& ext, accelerator_view av, access_type cpu_access_type = access_type_auto)
-#if __KALMAR_ACCELERATOR__ == 1
-        : m_device(ext.size()), extent(ext) {}
-#else
-        : m_device(av.pQueue, av.pQueue, check(ext).size(), cpu_access_type), extent(ext) {}
-#endif
-
-    /** @{ */
-    /**
-     * Constructs an array instance based on the given pointer on the device memory.
-     */
-    explicit array(int e0, void* accelerator_pointer)
-        : array(hc::extent<N>(e0), accelerator(L"default").get_default_view(), accelerator_pointer) {}
-    explicit array(int e0, int e1, void* accelerator_pointer)
-        : array(hc::extent<N>(e0, e1), accelerator(L"default").get_default_view(), accelerator_pointer) {}
-    explicit array(int e0, int e1, int e2, void* accelerator_pointer)
-        : array(hc::extent<N>(e0, e1, e2), accelerator(L"default").get_default_view(), accelerator_pointer) {}
-
-    explicit array(const extent<N>& ext, void* accelerator_pointer)
-        : array(ext, accelerator(L"default").get_default_view(), accelerator_pointer) {}
-    /** @} */
-
-    /**
-     * Constructs an array instance based on the given pointer on the device memory.
-     *
-     * @param[in] ext The extent in each dimension of this array.
-     * @param[in] av An accelerator_view object which specifies the location of
-     *               this array.
-     * @param[in] accelerator_pointer The pointer to the device memory.
-     * @param[in] access_type The type of CPU access desired for this array.
-     */
-    explicit array(const extent<N>& ext, accelerator_view av, void* accelerator_pointer, access_type cpu_access_type = access_type_auto)
-#if __KALMAR_ACCELERATOR__ == 1
-        : m_device(ext.size(), accelerator_pointer), extent(ext) {}
-#else
-        : m_device(av.pQueue, av.pQueue, check(ext).size(), accelerator_pointer, cpu_access_type), extent(ext) {}
-#endif
-
-    /** @{ */
-    /**
-     * Equivalent to construction using
-     * "array(extent<N>(e0 [, e1 [, e2 ]]), av, cpu_access_type)".   
-     *
-     * @param[in] e0,e1,e2 The component values that will form the extent of
-     *                     this array.
-     * @param[in] av An accelerator_view object which specifies the location of
-     *               this array.
-     * @param[in] access_type The type of CPU access desired for this array.
-     */
-    array(int e0, accelerator_view av, access_type cpu_access_type = access_type_auto)
-        : array(hc::extent<N>(e0), av, cpu_access_type) {}
-    array(int e0, int e1, accelerator_view av, access_type cpu_access_type = access_type_auto)
-        : array(hc::extent<N>(e0, e1), av, cpu_access_type) {}
-    array(int e0, int e1, int e2, accelerator_view av, access_type cpu_access_type = access_type_auto)
-        : array(hc::extent<N>(e0, e1, e2), av, cpu_access_type) {}
-
-    /** @} */
-
-    /**
-     * Constructs a new array with the supplied extent, located on the
-     * accelerator bound to the accelerator_view "av", initialized with the
-     * contents of the source container specified by a beginning and optional
-     * ending iterator. The data is copied by value into this array as if by
-     * calling "copy()".
-     *
-     * Users can optionally specify the type of CPU access desired for "this"
-     * array thus requesting creation of an array that is accessible both on
-     * the specified accelerator_view "av" as well as the CPU (with the
-     * specified CPU access_type). If a value other than access_type_auto or
-     * access_type_none is specified for the cpu_access_type parameter and the
-     * accelerator corresponding to the accelerator_view "av" does not support
-     * cpu_shared_memory, a runtime_exception is thrown. The cpu_access_type
-     * parameter has a default value of access_type_auto which leaves it upto
-     * the implementation to decide what type of allowed CPU access should the
-     * array be created with. The actual CPU access_type allowed for the
-     * created array can be queried using the get_cpu_access_type member
-     * method.
-     *
-     * @param[in] ext The extent in each dimension of this array.
-     * @param[in] srcBegin A beginning iterator into the source container.
-     * @param[in] srcEnd An ending iterator into the source container.
-     * @param[in] av An accelerator_view object which specifies the home
-     *               location of this array.
-     * @param[in] access_type The type of CPU access desired for this array.
-     */
-    template <typename InputIter>
-        array(const extent<N>& ext, InputIter srcBegin, accelerator_view av,
-              access_type cpu_access_type = access_type_auto)
-        : array(ext, av, cpu_access_type) { copy(srcBegin, *this); }
-    template <typename InputIter>
-        array(const extent<N>& ext, InputIter srcBegin, InputIter srcEnd,
-              accelerator_view av, access_type cpu_access_type = access_type_auto)
-        : array(ext, av, cpu_access_type) {
-            if (ext.size() < std::distance(srcBegin, srcEnd))
-                throw runtime_exception("errorMsg_throw", 0);
-            copy(srcBegin, srcEnd, *this);
-        }
-
-    /** @} */
-
-    /**
-     * Constructs a new array initialized with the contents of the array_view
-     * "src". The extent of this array is taken from the extent of the source
-     * array_view. The "src" is copied by value into this array as if by
-     * calling "copy(src, *this)". The new array is located on the accelerator
-     * bound to the accelerator_view "av".
-     *
-     * Users can optionally specify the type of CPU access desired for "this"
-     * array thus requesting creation of an array that is accessible both on
-     * the specified accelerator_view "av" as well as the CPU (with the 
-     * specified CPU access_type). If a value other than access_type_auto or
-     * access_type_none is specified for the cpu_access_type parameter and the
-     * accelerator corresponding to the accelerator_view “av” does not support
-     * cpu_shared_memory, a runtime_exception is thrown. The cpu_access_type
-     * parameter has a default value of access_type_auto which leaves it upto
-     * the implementation to decide what type of allowed CPU access should the
-     * array be created with. The actual CPU access_type allowed for the
-     * created array can be queried using the get_cpu_access_type member
-     * method.
-     *
-     * @param[in] src An array_view object from which to copy the data into
-     *                this array (and also to determine the extent of this array).
-     * @param[in] av An accelerator_view object which specifies the home
-     *               location of this array.
-     * @param[in] access_type The type of CPU access desired for this array.
-     */
-    array(const array_view<const T, N>& src, accelerator_view av, access_type cpu_access_type = access_type_auto)
-        : array(src.get_extent(), av, cpu_access_type) { copy(src, *this); }
-
-    /** @{ */
-    /**
-     * Equivalent to construction using
-     * "array(extent<N>(e0 [, e1 [, e2 ]]), srcBegin [, srcEnd], av, cpu_access_type)".
-     *
-     * @param[in] e0,e1,e2 The component values that will form the extent of
-     *                     this array.
-     * @param[in] srcBegin A beginning iterator into the source container.
-     * @param[in] srcEnd An ending iterator into the source container.
-     * @param[in] av An accelerator_view object which specifies the home
-     *               location of this array.
-     * @param[in] access_type The type of CPU access desired for this array.
-     */
-    template <typename InputIter>
-        array(int e0, InputIter srcBegin, accelerator_view av, access_type cpu_access_type = access_type_auto)
-            : array(extent<N>(e0), srcBegin, av, cpu_access_type) {}
-    template <typename InputIter>
-        array(int e0, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type = access_type_auto)
-            : array(extent<N>(e0), srcBegin, srcEnd, av, cpu_access_type) {}
-    template <typename InputIter>
-        array(int e0, int e1, InputIter srcBegin, accelerator_view av, access_type cpu_access_type = access_type_auto)
-            : array(hc::extent<N>(e0, e1), srcBegin, av, cpu_access_type) {}
-    template <typename InputIter>
-        array(int e0, int e1, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type = access_type_auto)
-            : array(hc::extent<N>(e0, e1), srcBegin, srcEnd, av, cpu_access_type) {}
-    template <typename InputIter>
-        array(int e0, int e1, int e2, InputIter srcBegin, accelerator_view av, access_type cpu_access_type = access_type_auto)
-            : array(hc::extent<N>(e0, e1, e2), srcBegin, av, cpu_access_type) {}
-    template <typename InputIter>
-        array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type = access_type_auto)
-            : array(hc::extent<N>(e0, e1, e2), srcBegin, srcEnd, av, cpu_access_type) {}
-
-    /** @} */
-
-    /**
-     * Constructs a staging array with the given extent, which acts as a
-     * staging area between accelerator views "av" and "associated_av". If "av"
-     * is a cpu accelerator view, this will construct a staging array which is
-     * optimized for data transfers between the CPU and "associated_av".
-     *
-     * @param[in] ext The extent in each dimension of this array.
-     * @param[in] av An accelerator_view object which specifies the home
-     *               location of this array.
-     * @param[in] associated_av An accelerator_view object which specifies a
-     *                          target device accelerator.
-     */
-    array(const extent<N>& ext, accelerator_view av, accelerator_view associated_av)
-#if __KALMAR_ACCELERATOR__ == 1
-        : m_device(ext.size()), extent(ext) {}
-#else
-        : m_device(av.pQueue, associated_av.pQueue, check(ext).size(), access_type_auto), extent(ext) {}
-#endif
-
-    /** @{ */
-    /**
-     * Equivalent to construction using 
-     * "array(extent<N>(e0 [, e1 [, e2 ]]), av, associated_av)".
-     *
-     * @param[in] e0,e1,e2 The component values that will form the extent of
-     *                     this array.
-     * @param[in] av An accelerator_view object which specifies the home
-     *               location of this array.
-     * @param[in] associated_av An accelerator_view object which specifies a
-     *                          target device accelerator.
-     */
-    array(int e0, accelerator_view av, accelerator_view associated_av)
-        : array(hc::extent<N>(e0), av, associated_av) {}
-    array(int e0, int e1, accelerator_view av, accelerator_view associated_av)
-        : array(hc::extent<N>(e0, e1), av, associated_av) {}
-    array(int e0, int e1, int e2, accelerator_view av, accelerator_view associated_av)
-        : array(hc::extent<N>(e0, e1, e2), av, associated_av) {}
-
-    /** @} */
-
-    /** @{ */
-    /**
-     * Constructs a staging array with the given extent, which acts as a
-     * staging area between accelerator_views "av" (which must be the CPU
-     * accelerator) and "associated_av". The staging array will be initialized
-     * with the data specified by "src" as if by calling "copy(src, *this)".
-     *
-     * @param[in] ext The extent in each dimension of this array.
-     * @param[in] srcBegin A beginning iterator into the source container.
-     * @param[in] srcEnd An ending iterator into the source container.
-     * @param[in] av An accelerator_view object which specifies the home
-     *               location of this array.
-     * @param[in] associated_av An accelerator_view object which specifies a
-     *                          target device accelerator.
-     */
-    template <typename InputIter>
-        array(const extent<N>& ext, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
-            : array(ext, av, associated_av) { copy(srcBegin, *this); }
-    template <typename InputIter>
-        array(const extent<N>& ext, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
-            : array(ext, av, associated_av) {
-            if (ext.size() < std::distance(srcBegin, srcEnd))
-                throw runtime_exception("errorMsg_throw", 0);
-            copy(srcBegin, srcEnd, *this);
-        }
-
-    /** @} */
-
-    /**
-     * Constructs a staging array initialized with the array_view given by
-     * "src", which acts as a staging area between accelerator_views "av"
-     * (which must be the CPU accelerator) and "associated_av". The extent of
-     * this array is taken from the extent of the source array_view. The
-     * staging array will be initialized from "src" as if by calling
-     * "copy(src, *this)".
-     *
-     * @param[in] src An array_view object from which to copy the data into
-     *                this array (and also to determine the extent of this
-     *                array).
-     * @param[in] av An accelerator_view object which specifies the home
-     *               location of this array.
-     * @param[in] associated_av An accelerator_view object which specifies a
-     *                          target device accelerator.
-     */
-    array(const array_view<const T, N>& src, accelerator_view av, accelerator_view associated_av)
-        : array(src.get_extent(), av, associated_av)
-    { copy(src, *this); }
-
-    /** @{ */
-    /**
-     * Equivalent to construction using
-     * "array(extent<N>(e0 [, e1 [, e2 ]]), src, av, associated_av)".
-     *
-     * @param[in] e0,e1,e2 The component values that will form the extent of
-     *                     this array.
-     * @param[in] srcBegin A beginning iterator into the source container.
-     * @param[in] srcEnd An ending iterator into the source container.
-     * @param[in] av An accelerator_view object which specifies the home
-     *               location of this array.
-     * @param[in] associated_av An accelerator_view object which specifies a
-     *                          target device accelerator.
-     */
-    template <typename InputIter>
-        array(int e0, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
-            : array(extent<N>(e0), srcBegin, av, associated_av) {}
-    template <typename InputIter>
-        array(int e0, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
-            : array(extent<N>(e0), srcBegin, srcEnd, av, associated_av) {}
-    template <typename InputIter>
-        array(int e0, int e1, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
-            : array(hc::extent<N>(e0, e1), srcBegin, av, associated_av) {}
-    template <typename InputIter>
-        array(int e0, int e1, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
-            : array(hc::extent<N>(e0, e1), srcBegin, srcEnd, av, associated_av) {}
-    template <typename InputIter>
-        array(int e0, int e1, int e2, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
-            : array(hc::extent<N>(e0, e1, e2), srcBegin, av, associated_av) {}
-    template <typename InputIter>
-        array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
-            : array(hc::extent<N>(e0, e1, e2), srcBegin, srcEnd, av, associated_av) {}
-
-    /** @} */
-
-    /**
-     * Access the extent that defines the shape of this array.
-     */
-    extent<N> get_extent() const __CPU__ __HC__ { return extent; }
-
-    /**
-     * This property returns the accelerator_view representing the location
-     * where this array has been allocated.
-     */
-    accelerator_view get_accelerator_view() const { return m_device.get_av(); }
-
-    /**
-     * This property returns the accelerator_view representing the preferred
-     * target where this array can be copied.
-     */
-    accelerator_view get_associated_accelerator_view() const { return m_device.get_stage(); }
-
-    /**
-     * This property returns the CPU "access_type" allowed for this array.
-     */
-    access_type get_cpu_access_type() const { return m_device.get_access(); }
-  
-    /**
-     * Assigns the contents of the array "other" to this array, using a deep
-     * copy.
-     *
-     * @param[in] other An object of type array<T,N> from which to copy into
-     *                  this array.
-     * @return Returns *this.
-     */
-    array& operator=(const array& other) {
-        if (this != &other) {
-            array arr(other);
-            *this = std::move(arr);
-        }
-        return *this;
-    }
-
-    /**
-     * Moves the contents of the array "other" to this array.
-     *
-     * @param[in] other An object of type array<T,N> from which to move into
-     *                  this array.
-     * @return Returns *this.
-     */
-    array& operator=(array&& other) {
-        if (this != &other) {
-            extent = other.extent;
-            m_device = other.m_device;
-            other.m_device.reset();
-        }
-        return *this;
-    }
-
-    /**
-     * Assigns the contents of the array_view "src", as if by calling
-     * "copy(src, *this)".
-     *
-     * @param[in] src An object of type array_view<T,N> from which to copy into
-     *                this array.
-     * @return Returns *this.
-     */
-    array& operator=(const array_view<T,N>& src) {
-        array arr(src);
-        *this = std::move(arr);
-        return *this;
-    }
-  
-    /**
-     * Copies the contents of this array to the array given by "dest", as
-     * if by calling "copy(*this, dest)".
-     *
-     * @param[out] dest An object of type array<T,N> to which to copy data
-     *                  from this array.
-     */
-    void copy_to(array& dest) const {
-#if __KALMAR_ACCELERATOR__ != 1
-        for(int i = 0 ; i < N ; i++)
-        {
-            if (dest.extent[i] < this->extent[i] )
-                throw runtime_exception("errorMsg_throw", 0);
-        }
-#endif
-        copy(*this, dest);
-    }
-
-    /**
-     * Copies the contents of this array to the array_view given by "dest", as
-     * if by calling "copy(*this, dest)".
-     *
-     * @param[out] dest An object of type array_view<T,N> to which to copy data
-     *                  from this array.
-     */
-    void copy_to(const array_view<T,N>& dest) const { copy(*this, dest); }
-
-    /**
-     * Returns a pointer to the raw data underlying this array.
-     *
-     * @return A (const) pointer to the first element in the linearized array.
-     */
-    T* data() const __CPU__ __HC__ {
-#if __KALMAR_ACCELERATOR__ != 1
-        if (!m_device.get())
-            return nullptr;
-        m_device.synchronize(true);
-#endif
-        return reinterpret_cast<T*>(m_device.get());
-    }
-
-    /**
-     * Returns a pointer to the device memory underlying this array.
-     *
-     * @return A (const) pointer to the first element in the array on the
-     *         device memory.
-     */
-    T* accelerator_pointer() const __CPU__ __HC__ {
-        return reinterpret_cast<T*>(m_device.get_device_pointer());
-    }
-
-    /**
-     * Implicitly converts an array to a std::vector, as if by
-     * "copy(*this, vector)".
-     *
-     * @return An object of type vector<T> which contains a copy of the data
-     *         contained on the array.
-     */
-    operator std::vector<T>() const {
-        std::vector<T> vec(extent.size());
-        copy(*this, vec.data());
-        return std::move(vec);
-    }
-
-    /** @{ */
-    /**
-     * Returns a reference to the element of this array that is at the location
-     * in N-dimensional space specified by "idx". Accessing array data on a
-     * location where it is not resident (e.g. from the CPU when it is resident
-     * on a GPU) results in an exception (in cpu context) or
-     * undefined behavior (in GPU context).
-     *
-     * @param[in] idx An object of type index<N> from that specifies the
-     *                location of the element.
-     */
-    T& operator[](const index<N>& idx) __CPU__ __HC__ {
-#ifndef __KALMAR_ACCELERATOR__
-        if (!m_device.get())
-            throw runtime_exception("The array is not accessible on CPU.", 0);
-        m_device.synchronize(true);
-#endif
-        T *ptr = reinterpret_cast<T*>(m_device.get());
-        return ptr[Kalmar::amp_helper<N, index<N>, hc::extent<N>>::flatten(idx, extent)];
-    }
-    T& operator()(const index<N>& idx) __CPU__ __HC__ {
-        return (*this)[idx];
-    }
-
-    /** @} */
-
-    /** @{ */
-    /**
-     * Returns a const reference to the element of this array that is at the
-     * location in N-dimensional space specified by "idx". Accessing array data
-     * on a location where it is not resident (e.g. from the CPU when it is
-     * resident on a GPU) results in an exception (in cpu context)
-     * or undefined behavior (in GPU context).
-     *
-     * @param[in] idx An object of type index<N> from that specifies the
-     *                location of the element.
-     */
-    const T& operator[](const index<N>& idx) const __CPU__ __HC__ {
-#if __KALMAR_ACCELERATOR__ != 1
-        if (!m_device.get())
-            throw runtime_exception("The array is not accessible on CPU.", 0);
-        m_device.synchronize();
-#endif
-        T *ptr = reinterpret_cast<T*>(m_device.get());
-        return ptr[Kalmar::amp_helper<N, index<N>, hc::extent<N>>::flatten(idx, extent)];
-    }
-    const T& operator()(const index<N>& idx) const __CPU__ __HC__ {
-        return (*this)[idx];
-    }
-
-    /** @} */
-
-    /** @{ */
-    /**
-     * Equivalent to
-     * "array<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]]))".
-     *
-     * @param[in] i0,i1,i2 The component values that will form the index into
-     *                     this array.
-     */
-    T& operator()(int i0, int i1) __CPU__ __HC__ {
-        return (*this)[index<2>(i0, i1)];
-    }
-    T& operator()(int i0, int i1, int i2) __CPU__ __HC__ {
-        return (*this)[index<3>(i0, i1, i2)];
-    }
-
-    /** @} */
-
-    /** @{ */
-    /**
-     * Equivalent to
-     * "array<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]])) const".
-     *
-     * @param[in] i0,i1,i2 The component values that will form the index into
-     *                     this array.
-     */
-    const T& operator()(int i0, int i1) const __CPU__ __HC__ {
-        return (*this)[index<2>(i0, i1)];
-    }
-    const T& operator()(int i0, int i1, int i2) const __CPU__ __HC__ {
-        return (*this)[index<3>(i0, i1, i2)];
-    }
-
-    /** @{ */
-    /**
-     * This overload is defined for array<T,N> where @f$N \ge 2@f$.
-     * This mode of indexing is equivalent to projecting on the
-     * most-significant dimension. It allows C-style indexing. For example:
-     *
-     * @code{.cpp}
-     * array<float,4> myArray(myExtents, …);
-     * myArray[index<4>(5,4,3,2)] = 7;
-     * assert(myArray[5][4][3][2] == 7);
-     * @endcode
-     *
-     * @param i0 An integer that is the index into the most-significant
-     *           dimension of this array.
-     * @return Returns an array_view whose dimension is one lower than that of
-     *         this array.
-     */
-    typename array_projection_helper<T, N>::result_type
-        operator[] (int i) __CPU__ __HC__ {
-#if __KALMAR_ACCELERATOR__ != 1
-            if (!m_device.get())
-                throw runtime_exception("The array is not accessible on CPU.", 0);
-            m_device.synchronize();
-#endif
-            return array_projection_helper<T, N>::project(*this, i);
-        }
-    typename array_projection_helper<T, N>::result_type
-        operator()(int i0) __CPU__ __HC__ {
-            return (*this)[i0];
-        }
-    typename array_projection_helper<T, N>::const_result_type
-        operator[] (int i) const __CPU__ __HC__ {
-#if __KALMAR_ACCELERATOR__ != 1
-            if (!m_device.get())
-                throw runtime_exception("The array is not accessible on CPU.", 0);
-            m_device.synchronize();
-#endif
-            return array_projection_helper<T, N>::project(*this, i);
-        }
-    typename array_projection_helper<T, N>::const_result_type
-        operator()(int i0) const __CPU__ __HC__ {
-            return (*this)[i0];
-        }
-
-    /** @} */
-
-    /** @{ */
-    /**
-     * Returns a subsection of the source array view at the origin specified by
-     * "idx" and with the extent specified by "ext".
-     *
-     * Example:
-     * @code{.cpp}
-     * array<float,2> a(extent<2>(200,100));
-     * array_view<float,2> v1(a); // v1.extent = <200,100>
-     * array_view<float,2> v2 = v1.section(index<2>(15,25), extent<2>(40,50));
-     * assert(v2(0,0) == v1(15,25));
-     * @endcode
-     *
-     * @param[in] origin Provides the offset/origin of the resulting section.
-     * @param[in] ext Provides the extent of the resulting section.
-     * @return Returns a subsection of the source array at specified origin,
-     *         and with the specified extent.
-     */
-    array_view<T, N> section(const index<N>& origin, const extent<N>& ext) __CPU__ __HC__ {
-#if __KALMAR_ACCELERATOR__ != 1
-        if ( !Kalmar::amp_helper<N, index<N>, hc::extent<N>>::contains(origin,  ext ,this->extent) )
-            throw runtime_exception("errorMsg_throw", 0);
-#endif
-        array_view<T, N> av(*this);
-        return av.section(origin, ext);
-    }
-    array_view<const T, N> section(const index<N>& origin, const extent<N>& ext) const __CPU__ __HC__ {
-        array_view<const T, N> av(*this);
-        return av.section(origin, ext);
-    }
-
-    /** @} */
-
-    /** @{ */
-    /**
-     * Equivalent to "section(idx, this->extent – idx)".
-     */
-    array_view<T, N> section(const index<N>& idx) __CPU__ __HC__ {
-#if __KALMAR_ACCELERATOR__ != 1
-        if ( !Kalmar::amp_helper<N, index<N>, hc::extent<N>>::contains(idx, this->extent ) )
-            throw runtime_exception("errorMsg_throw", 0);
-#endif
-        array_view<T, N> av(*this);
-        return av.section(idx);
-    }
-    array_view<const T, N> section(const index<N>& idx) const __CPU__ __HC__ {
-        array_view<const T, N> av(*this);
-        return av.section(idx);
-    }
-
-    /** @} */
-
-    /** @{ */
-    /**
-     * Equivalent to "section(index<N>(), ext)".
-     */
-    array_view<T,N> section(const extent<N>& ext) __CPU__ __HC__ {
-        array_view<T, N> av(*this);
-        return av.section(ext);
-    }
-    array_view<const T,N> section(const extent<N>& ext) const __CPU__ __HC__ {
-        array_view<const T, N> av(*this);
-        return av.section(ext);
-    }
-
-    /** @} */
-
-    /** @{ */
-    /**
-     * Equivalent to
-     * "array<T,N>::section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [, e2 ]])) const".
-     *
-     * @param[in] i0,i1,i2 The component values that will form the origin of
-     *                     the section
-     * @param[in] e0,e1,e2 The component values that will form the extent of
-     *                     the section
-     */
-    array_view<T, 1> section(int i0, int e0) __CPU__ __HC__ {
-        static_assert(N == 1, "Rank must be 1");
-        return section(index<1>(i0), hc::extent<1>(e0));
-    }
-    array_view<const T, 1> section(int i0, int e0) const __CPU__ __HC__ {
-        static_assert(N == 1, "Rank must be 1");
-        return section(index<1>(i0), hc::extent<1>(e0));
-    }
-    array_view<T, 2> section(int i0, int i1, int e0, int e1) const __CPU__ __HC__ {
-        static_assert(N == 2, "Rank must be 2");
-        return section(index<2>(i0, i1), hc::extent<2>(e0, e1));
-    }
-    array_view<T, 2> section(int i0, int i1, int e0, int e1) __CPU__ __HC__ {
-        static_assert(N == 2, "Rank must be 2");
-        return section(index<2>(i0, i1), hc::extent<2>(e0, e1));
-    }
-    array_view<T, 3> section(int i0, int i1, int i2, int e0, int e1, int e2) __CPU__ __HC__ {
-        static_assert(N == 3, "Rank must be 3");
-        return section(index<3>(i0, i1, i2), hc::extent<3>(e0, e1, e2));
-    }
-    array_view<const T, 3> section(int i0, int i1, int i2, int e0, int e1, int e2) const __CPU__ __HC__ {
-        static_assert(N == 3, "Rank must be 3");
-        return section(index<3>(i0, i1, i2), hc::extent<3>(e0, e1, e2));
-    }
-
-    /** @} */
-
-    /** @{ */
-    /**
-     * Sometimes it is desirable to view the data of an N-dimensional array as
-     * a linear array, possibly with a (unsafe) reinterpretation of the element
-     * type. This can be achieved through the reinterpret_as member function.
-     * Example:
-     *
-     * @code{.cpp}
-     * struct RGB { float r; float g; float b; };
-     * array<RGB,3> a = ...;
-     * array_view<float,1> v = a.reinterpret_as<float>();
-     * assert(v.extent == 3*a.extent);
-     * @endcode
-     *
-     * The size of the reinterpreted ElementType must evenly divide into the
-     * total size of this array.
-     *
-     * @return Returns an array_view from this array<T,N> with the element type
-     *         reinterpreted from T to ElementType, and the rank reduced from N
-     *         to 1.
-     */
-    template <typename ElementType>
-        array_view<ElementType, 1> reinterpret_as() __CPU__ __HC__ {
-#if __KALMAR_ACCELERATOR__ != 1
-            static_assert( ! (std::is_pointer<ElementType>::value ),"can't use pointer in the kernel");
-            static_assert( ! (std::is_same<ElementType,short>::value ),"can't use short in the kernel");
-            if( (extent.size() * sizeof(T)) % sizeof(ElementType))
-                throw runtime_exception("errorMsg_throw", 0);
-#endif
-            int size = extent.size() * sizeof(T) / sizeof(ElementType);
-            using buffer_type = typename array_view<ElementType, 1>::acc_buffer_t;
-            array_view<ElementType, 1> av(buffer_type(m_device), extent<1>(size), 0);
-            return av;
-        }
-    template <typename ElementType>
-        array_view<const ElementType, 1> reinterpret_as() const __CPU__ __HC__ {
-#if __KALMAR_ACCELERATOR__ != 1
-            static_assert( ! (std::is_pointer<ElementType>::value ),"can't use pointer in the kernel");
-            static_assert( ! (std::is_same<ElementType,short>::value ),"can't use short in the kernel");
-#endif
-            int size = extent.size() * sizeof(T) / sizeof(ElementType);
-            using buffer_type = typename array_view<ElementType, 1>::acc_buffer_t;
-            array_view<const ElementType, 1> av(buffer_type(m_device), extent<1>(size), 0);
-            return av;
-        }
-
-    /** @} */
-
-    /** @{ */
-    /**
-     * An array of higher rank can be reshaped into an array of lower rank, or
-     * vice versa, using the view_as member function. Example:
-     *
-     * @code{.cpp}
-     * array<float,1> a(100);
-     * array_view<float,2> av = a.view_as(extent<2>(2,50));
-     * @endcode
-     *
-     * @return Returns an array_view from this array<T,N> with the rank changed
-     *         to K from N.
-     */
-    template <int K> array_view<T, K>
-        view_as(const extent<K>& viewExtent) __CPU__ __HC__ {
-#if __KALMAR_ACCELERATOR__ != 1
-            if( viewExtent.size() > extent.size())
-                throw runtime_exception("errorMsg_throw", 0);
-#endif
-            array_view<T, K> av(m_device, viewExtent, 0);
-            return av;
-        }
-    template <int K> array_view<const T, K>
-        view_as(const extent<K>& viewExtent) const __CPU__ __HC__ {
-#if __KALMAR_ACCELERATOR__ != 1
-            if( viewExtent.size() > extent.size())
-                throw runtime_exception("errorMsg_throw", 0);
-#endif
-            const array_view<T, K> av(m_device, viewExtent, 0);
-            return av;
-        }
-
-    /** @} */
-
-    ~array() {}
-
-    // FIXME: functions below may be considered to move to private
-    const acc_buffer_t& internal() const __CPU__ __HC__ { return m_device; }
-    int get_offset() const __CPU__ __HC__ { return 0; }
-    index<N> get_index_base() const __CPU__ __HC__ { return index<N>(); }
-private:
-    template <typename K, int Q> friend struct projection_helper;
-    template <typename K, int Q> friend struct array_projection_helper;
-    acc_buffer_t m_device;
-    extent<N> extent;
-
-    template <typename Q, int K> friend
-        void copy(const array<Q, K>&, const array_view<Q, K>&);
-    template <typename Q, int K> friend
-        void copy(const array_view<const Q, K>&, array<Q, K>&);
-};
-
-// ------------------------------------------------------------------------
-// array_view
-// ------------------------------------------------------------------------
-
-/**
- * The array_view<T,N> type represents a possibly cached view into the data
- * held in an array<T,N>, or a section thereof. It also provides such views
- * over native CPU data. It exposes an indexing interface congruent to that of
- * array<T,N>.
- */
-template <typename T, int N = 1>
-class array_view
-{
-public:
-    typedef typename std::remove_const<T>::type nc_T;
-#if __KALMAR_ACCELERATOR__ == 1
-    typedef Kalmar::_data<T> acc_buffer_t;
-#else
-    typedef Kalmar::_data_host<T> acc_buffer_t;
-#endif
-
-    /**
-     * The rank of this array.
-     */
-    static const int rank = N;
-
-    /**
-     * The element type of this array.
-     */
-    typedef T value_type;
-
-    /**
-     * There is no default constructor for array_view<T,N>.
-     */
-    array_view() = delete;
-
-    /**
-     * Constructs an array_view which is bound to the data contained in the
-     * "src" array. The extent of the array_view is that of the src array, and
-     * the origin of the array view is at zero.
-     *
-     * @param[in] src An array which contains the data that this array_view is
-     *                bound to.
-     */
-    array_view(array<T, N>& src) __CPU__ __HC__
-        : cache(src.internal()), extent(src.get_extent()), extent_base(extent), index_base(), offset(0) {}
-
-    // FIXME: following interfaces were not implemented yet
-    // template <typename Container>
-    //     explicit array_view<T, 1>::array_view(Container& src);
-    // template <typename value_type, int Size>
-    //     explicit array_view<T, 1>::array_view(value_type (&src) [Size]) __CPU__ __HC__;
-
-    /**
-     * Constructs an array_view which is bound to the data contained in the
-     * "src" container. The extent of the array_view is that given by the
-     * "extent" argument, and the origin of the array view is at zero.
-     *
-     * @param[in] src A template argument that must resolve to a linear
-     *                container that supports .data() and .size() members (such
-     *                as std::vector or std::array)
-     * @param[in] extent The extent of this array_view.
-     */
-    template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
-        array_view(const extent<N>& extent, Container& src)
-            : array_view(extent, src.data())
-        { static_assert( std::is_same<decltype(src.data()), T*>::value, "container element type and array view element type must match"); }
-
-    /**
-     * Constructs an array_view which is bound to the data contained in the
-     * "src" container. The extent of the array_view is that given by the
-     * "extent" argument, and the origin of the array view is at zero.
-     *
-     * @param[in] src A pointer to the source data this array_view will bind
-     *                to. If the number of elements pointed to is less than the
-     *                size of extent, the behavior is undefined.
-     * @param[in] ext The extent of this array_view.
-     */
-    array_view(const extent<N>& ext, value_type* src) __CPU__ __HC__
-#if __KALMAR_ACCELERATOR__ == 1
-        : cache((T *)(src)), extent(ext), extent_base(ext), offset(0) {}
-#else
-        : cache(ext.size(), (T *)(src)), extent(ext), extent_base(ext), offset(0) {}
-#endif
-
-    /**
-     * Constructs an array_view which is not bound to a data source. The extent
-     * of the array_view is that given by the "extent" argument, and the origin
-     * of the array view is at zero. An array_view thus constructed represents
-     * uninitialized data and the underlying allocations are created lazily as
-     * the array_view is accessed on different locations (on an
-     * accelerator_view or on the CPU).
-     *
-     * @param[in] ext The extent of this array_view.
-     */
-    explicit array_view(const extent<N>& ext)
-        : cache(ext.size()), extent(ext), extent_base(ext), offset(0) {}
-
-    /**
-     * Equivalent to construction using
-     * "array_view(extent<N>(e0 [, e1 [, e2 ]]), src)".
-     *
-     * @param[in] e0,e1,e2 The component values that will form the extent of
-     *                     this array_view.
-     * @param[in] src A template argument that must resolve to a contiguousi
-     *                container that supports .data() and .size() members (such
-     *                as std::vector or std::array)
-     */
-    template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
-        array_view(int e0, Container& src)
-            : array_view(hc::extent<N>(e0), src) {}
-    template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
-        array_view(int e0, int e1, Container& src)
-            : array_view(hc::extent<N>(e0, e1), src) {}
-    template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
-        array_view(int e0, int e1, int e2, Container& src)
-            : array_view(hc::extent<N>(e0, e1, e2), src) {}
-
-    /**
-     * Equivalent to construction using
-     * "array_view(extent<N>(e0 [, e1 [, e2 ]]), src)".
-     *
-     * @param[in] e0,e1,e2 The component values that will form the extent of
-     *                     this array_view.
-     * @param[in] src A pointer to the source data this array_view will bind
-     *                to. If the number of elements pointed to is less than
-     *                the size of extent, the behavior is undefined.
-     */
-    array_view(int e0, value_type *src) __CPU__ __HC__
-        : array_view(hc::extent<N>(e0), src) {}
-    array_view(int e0, int e1, value_type *src) __CPU__ __HC__
-        : array_view(hc::extent<N>(e0, e1), src) {}
-    array_view(int e0, int e1, int e2, value_type *src) __CPU__ __HC__
-        : array_view(hc::extent<N>(e0, e1, e2), src) {}
-
-    /**
-     * Equivalent to construction using
-     * "array_view(extent<N>(e0 [, e1 [, e2 ]]))".
-     *
-     * @param[in] e0,e1,e2 The component values that will form the extent of
-     *                     this array_view.
-     */
-    explicit array_view(int e0) : array_view(hc::extent<N>(e0)) {}
-    explicit array_view(int e0, int e1)
-        : array_view(hc::extent<N>(e0, e1)) {}
-    explicit array_view(int e0, int e1, int e2)
-        : array_view(hc::extent<N>(e0, e1, e2)) {}
-
-    /**
-     * Copy constructor. Constructs an array_view from the supplied argument
-     * other. A shallow copy is performed.
-     *
-     * @param[in] other An object of type array_view<T,N> or
-     *                  array_view<const T,N> from which to initialize this
-     *                  new array_view.
-     */
-    array_view(const array_view& other) __CPU__ __HC__
-        : cache(other.cache), extent(other.extent), extent_base(other.extent_base), index_base(other.index_base), offset(other.offset) {}
-
-    /**
-     * Access the extent that defines the shape of this array_view.
-     */
-    extent<N> get_extent() const __CPU__ __HC__ { return extent; }
-
-    /**
-     * Access the accelerator_view where the data source of the array_view is
-     * located.
-     *
-     * When the data source of the array_view is native CPU memory, the method
-     * returns accelerator(accelerator::cpu_accelerator).default_view. When the
-     * data source underlying the array_view is an array, the method returns
-     * the accelerator_view where the source array is located.
-     */
-    accelerator_view get_source_accelerator_view() const { return cache.get_av(); }
-
-    /**
-     * Assigns the contents of the array_view "other" to this array_view, using
-     * a shallow copy. Both array_views will refer to the same data.
-     *
-     * @param[in] other An object of type array_view<T,N> from which to copy
-     *                  into this array.
-     * @return Returns *this.
-     */
-    array_view& operator=(const array_view& other) __CPU__ __HC__ {
-        if (this != &other) {
-            cache = other.cache;
-            extent = other.extent;
-            index_base = other.index_base;
-            extent_base = other.extent_base;
-            offset = other.offset;
-        }
-        return *this;
-    }
-
-    /**
-     * Copies the data referred to by this array_view to the array given by
-     * "dest", as if by calling "copy(*this, dest)"
-     *
-     * @param[in] dest An object of type array <T,N> to which to copy data from
-     *                 this array.
-     */
-    void copy_to(array<T,N>& dest) const {
-#if __KALMAR_ACCELERATOR__ != 1
-        for(int i= 0 ;i< N;i++)
-        {
-          if (dest.get_extent()[i] < this->extent[i])
-              throw runtime_exception("errorMsg_throw", 0);
-        }
-#endif
-        copy(*this, dest);
-    }
-
-    /**
-     * Copies the contents of this array_view to the array_view given by
-     * "dest", as if by calling "copy(*this, dest)"
-     *
-     * @param[in] dest An object of type array_view<T,N> to which to copy data
-     * from this array.
-     */
-    void copy_to(const array_view& dest) const { copy(*this, dest); }
-
-    /**
-     * Returns a pointer to the first data element underlying this array_view.
-     * This is only available on array_views of rank 1.
-     *
-     * When the data source of the array_view is native CPU memory, the pointer
-     * returned by data() is valid for the lifetime of the data source.
-     *
-     * When the data source underlying the array_view is an array, or the array
-     * view is created without a data source, the pointer returned by data() in
-     * CPU context is ephemeral and is invalidated when the original data
-     * source or any of its views are accessed on an accelerator_view through a
-     *  parallel_for_each or a copy operation.
-     *
-     * @return A pointer to the first element in the linearized array.
-     */
-    T* data() const __CPU__ __HC__ {
-
-#if __KALMAR_ACCELERATOR__ != 1
-        cache.get_cpu_access(true);
-#endif
-        static_assert(N == 1, "data() is only permissible on array views of rank 1");
-        return reinterpret_cast<T*>(cache.get() + offset + index_base[0]);
-    }
-
-    /**
-     * Returns a pointer to the device memory underlying this array_view.
-     *
-     * @return A (const) pointer to the first element in the array_view on the
-     *         device memory.
-     */
-    T* accelerator_pointer() const __CPU__ __HC__ {
-        return reinterpret_cast<T*>(cache.get_device_pointer() + offset + index_base[0]);
-    }
-
-    /**
-     * Calling this member function informs the array_view that its bound
-     * memory has been modified outside the array_view interface. This will
-     * render all cached information stale.
-     */
-    void refresh() const { cache.refresh(); }
-
-    /**
-     * Calling this member function synchronizes any modifications made to the
-     * data underlying "this" array_view to its source data container. For
-     * example, for an array_view on system memory, if the data underlying the
-     * view are modified on a remote accelerator_view through a
-     * parallel_for_each invocation, calling synchronize ensures that the
-     * modifications are synchronized to the source data and will be visible
-     * through the system memory pointer which the array_view was created over.
-     *
-     * For writable array_view objects, callers of this functional can
-     * optionally specify the type of access desired on the source data
-     * container through the "type" parameter. For example specifying a
-     * "access_type_read" (which is also the default value of the parameter)
-     * indicates that the data has been synchronized to its source location
-     * only for reading. On the other hand, specifying an access_type of
-     * "access_type_read_write" synchronizes the data to its source location
-     * both for reading and writing; i.e. any modifications to the source data
-     * directly through the source data container are legal after synchronizing
-     * the array_view with write access and before subsequently accessing the
-     * array_view on another remote location.
-     *
-     * It is advisable to be precise about the access_type specified in the
-     * synchronize call; i.e. if only write access it required, specifying
-     * access_type_write may yield better performance that calling synchronize
-     * with "access_type_read_write" since the later may require any
-     * modifications made to the data on remote locations to be synchronized to
-     * the source location, which is unnecessary if the contents are intended
-     * to be overwritten without reading.
-     *
-     * @param[in] type An argument of type "access_type" which specifies the
-     *                 type of access on the data source that the array_view is
-     *                 synchronized for.
-     */
-    // FIXME: type parameter is not implemented
-    void synchronize() const { cache.get_cpu_access(); }
-
-    /**
-     * An asynchronous version of synchronize, which returns a completion
-     * future object. When the future is ready, the synchronization operation
-     * is complete.
-     *
-     * @return An object of type completion_future that can be used to
-     *         determine the status of the asynchronous operation or can be
-     *         used to chain other operations to be executed after the
-     *         completion of the asynchronous operation.
-     */
-    // FIXME: type parameter is not implemented
-    completion_future synchronize_async() const {
-        std::future<void> fut = std::async([&]() mutable { synchronize(); });
-        return completion_future(fut.share());
-    }
-
-    /**
-     * Calling this member function synchronizes any modifications made to the
-     * data underlying "this" array_view to the specified accelerator_view
-     * "av". For example, for an array_view on system memory, if the data
-     * underlying the view is modified on the CPU, and synchronize_to is called
-     * on "this" array_view, then the array_view contents are cached on the
-     * specified accelerator_view location.
-     *
-     * For writable array_view objects, callers of this functional can
-     * optionally specify the type of access desired on the specified target
-     * accelerator_view "av", through the "type" parameter. For example
-     * specifying a "access_type_read" (which is also the default value of the
-     * parameter) indicates that the data has been synchronized to "av" only
-     * for reading. On the other hand, specifying an access_type of
-     * "access_type_read_write" synchronizes the data to "av" both for reading
-     * and writing; i.e. any modifications to the data on "av" are legal after
-     * synchronizing the array_view with write access and before subsequently
-     * accessing the array_view on a location other than "av".
-     *
-     * It is advisable to be precise about the access_type specified in the
-     * synchronize call; i.e. if only write access it required, specifying
-     * access_type_write may yield better performance that calling synchronize
-     * with "access_type_read_write" since the later may require any
-     * modifications made to the data on remote locations to be synchronized to
-     * "av", which is unnecessary if the contents are intended to be
-     * immediately overwritten without reading.
-     *
-     * @param[in] av The target accelerator_view that "this" array_view is
-     *               synchronized for access on.
-     * @param[in] type An argument of type "access_type" which specifies the
-     *                 type of access on the data source that the array_view is
-     *                 synchronized for.
-     */
-    // FIXME: type parameter is not implemented
-    void synchronize_to(const accelerator_view& av) const {
-#if __KALMAR_ACCELERATOR__ != 1
-        cache.sync_to(av.pQueue);
-#endif
-    }
-
-    /**
-     * An asynchronous version of synchronize_to, which returns a completion
-     * future object. When the future is ready, the synchronization operation
-     * is complete.
-     *
-     * @param[in] av The target accelerator_view that "this" array_view is
-     *               synchronized for access on.
-     * @param[in] type An argument of type "access_type" which specifies the
-     *                 type of access on the data source that the array_view is
-     *                 synchronized for.
-     * @return An object of type completion_future that can be used to
-     *         determine the status of the asynchronous operation or can be
-     *         used to chain other operations to be executed after the
-     *         completion of the asynchronous operation.
-     */
-    // FIXME: this method is not implemented yet
-    completion_future synchronize_to_async(const accelerator_view& av) const;
-
-    /**
-     * Indicates to the runtime that it may discard the current logical
-     * contents of this array_view. This is an optimization hint to the runtime
-     * used to avoid copying the current contents of the view to a target
-     * accelerator_view, and its use is recommended if the existing content is
-     * not needed.
-     */
-    void discard_data() const {
-#if __KALMAR_ACCELERATOR__ != 1
-        cache.discard();
-#endif
-    }
-
-    /** @{ */
-    /**
-     * Returns a reference to the element of this array_view that is at the
-     * location in N-dimensional space specified by "idx".
-     *
-     * @param[in] idx An object of type index<N> that specifies the location of
-     *                the element.
-     */
-    T& operator[] (const index<N>& idx) const __CPU__ __HC__ {
-#if __KALMAR_ACCELERATOR__ != 1
-        cache.get_cpu_access(true);
-#endif
-        T *ptr = reinterpret_cast<T*>(cache.get() + offset);
-        return ptr[Kalmar::amp_helper<N, index<N>, hc::extent<N>>::flatten(idx + index_base, extent_base)];
-    }
-
-    T& operator()(const index<N>& idx) const __CPU__ __HC__ {
-        return (*this)[idx];
-    }
-
-    /** @} */
-
-    /**
-     * Returns a reference to the element of this array_view that is at the
-     * location in N-dimensional space specified by "idx".
-     *
-     * Unlike the other indexing operators for accessing the array_view on the
-     * CPU, this method does not implicitly synchronize this array_view's
-     * contents to the CPU. After accessing the array_view on a remote location
-     * or performing a copy operation involving this array_view, users are
-     * responsible to explicitly synchronize the array_view to the CPU before
-     * calling this method. Failure to do so results in undefined behavior.
-     */
-    // FIXME: this method is not implemented
-    T& get_ref(const index<N>& idx) const __CPU__ __HC__;
-
-    /** @{ */
-    /**
-     * Equivalent to
-     * "array_view<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]]))".
-     *
-     * @param[in] i0,i1,i2 The component values that will form the index into
-     *                     this array.
-     */
-    T& operator() (int i0, int i1) const __CPU__ __HC__ {
-        static_assert(N == 2, "T& array_view::operator()(int,int) is only permissible on array_view<T, 2>");
-        return (*this)[index<2>(i0, i1)];
-    }
-    T& operator() (int i0, int i1, int i2) const __CPU__ __HC__ {
-        static_assert(N == 3, "T& array_view::operator()(int,int, int) is only permissible on array_view<T, 3>");
-        return (*this)[index<3>(i0, i1, i2)];
-    }
-
-    /** @} */
-
-    /** @{ */
-    /**
-     * This overload is defined for array_view<T,N> where @f$N \ge 2@f$.
-     *
-     * This mode of indexing is equivalent to projecting on the
-     * most-significant dimension. It allows C-style indexing. For example:
-     *
-     * @code{.cpp}
-     * array<float,4> myArray(myExtents, ...);
-     *
-     * myArray[index<4>(5,4,3,2)] = 7;
-     * assert(myArray[5][4][3][2] == 7);
-     * @endcode
-     *
-     * @param[in] i0 An integer that is the index into the most-significant
-     *               dimension of this array.
-     * @return Returns an array_view whose dimension is one lower than that of
-     *         this array_view.
-     */
-    typename projection_helper<T, N>::result_type
-        operator[] (int i) const __CPU__ __HC__ {
-            return projection_helper<T, N>::project(*this, i);
-        }
-    typename projection_helper<T, N>::result_type
-        operator() (int i0) const __CPU__ __HC__ { return (*this)[i0]; }
-
-    /** @} */
-
-    /**
-     * Returns a subsection of the source array view at the origin specified by
-     * "idx" and with the extent specified by "ext".
-     *
-     * Example:
-     *
-     * @code{.cpp}
-     * array<float,2> a(extent<2>(200,100));
-     * array_view<float,2> v1(a); // v1.extent = <200,100>
-     * array_view<float,2> v2 = v1.section(index<2>(15,25), extent<2>(40,50));
-     * assert(v2(0,0) == v1(15,25));
-     * @endcode
-     *
-     * @param[in] idx Provides the offset/origin of the resulting section.
-     * @param[in] ext Provides the extent of the resulting section.
-     * @return Returns a subsection of the source array at specified origin,
-     *         and with the specified extent.
-     */
-    array_view<T, N> section(const index<N>& idx,
-                             const extent<N>& ext) const __CPU__ __HC__ {
-#if __KALMAR_ACCELERATOR__ != 1
-        if ( !Kalmar::amp_helper<N, index<N>, hc::extent<N>>::contains(idx, ext,this->extent ) )
-            throw runtime_exception("errorMsg_throw", 0);
-#endif
-        array_view<T, N> av(cache, ext, extent_base, idx + index_base, offset);
-        return av;
-    }
-
-    /**
-     * Equivalent to "section(idx, this->extent – idx)".
-     */
-    array_view<T, N> section(const index<N>& idx) const __CPU__ __HC__ {
-        hc::extent<N> ext(extent);
-        Kalmar::amp_helper<N, index<N>, hc::extent<N>>::minus(idx, ext);
-        return section(idx, ext);
-    }
-
-    /**
-     * Equivalent to "section(index<N>(), ext)".
-     */
-    array_view<T, N> section(const extent<N>& ext) const __CPU__ __HC__ {
-        index<N> idx;
-        return section(idx, ext);
-    }
-
-    /** @{ */
-    /**
-     * Equivalent to 
-     * "section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [, e2 ]]))".
-     *
-     * @param[in] i0,i1,i2 The component values that will form the origin of
-     *                     the section
-     * @param[in] e0,e1,e2 The component values that will form the extent of
-     *                     the section
-     */
-    array_view<T, 1> section(int i0, int e0) const __CPU__ __HC__ {
-        static_assert(N == 1, "Rank must be 1");
-        return section(index<1>(i0), hc::extent<1>(e0));
-    }
-
-    array_view<T, 2> section(int i0, int i1, int e0, int e1) const __CPU__ __HC__ {
-        static_assert(N == 2, "Rank must be 2");
-        return section(index<2>(i0, i1), hc::extent<2>(e0, e1));
-    }
-
-    array_view<T, 3> section(int i0, int i1, int i2, int e0, int e1, int e2) const __CPU__ __HC__ {
-        static_assert(N == 3, "Rank must be 3");
-        return section(index<3>(i0, i1, i2), hc::extent<3>(e0, e1, e2));
-    }
-
-    /** @} */
-
-    /**
-     * This member function is similar to "array<T,N>::reinterpret_as",
-     * although it only supports array_views of rank 1 (only those guarantee
-     * that all elements are laid out contiguously).
-     *
-     * The size of the reinterpreted ElementType must evenly divide into the
-     * total size of this array_view.
-     *
-     * @return Returns an array_view from this array_view<T,1> with the element
-     *         type reinterpreted from T to ElementType.
-     */
-    template <typename ElementType>
-        array_view<ElementType, N> reinterpret_as() const __CPU__ __HC__ {
-            static_assert(N == 1, "reinterpret_as is only permissible on array views of rank 1");
-#if __KALMAR_ACCELERATOR__ != 1
-            static_assert( ! (std::is_pointer<ElementType>::value ),"can't use pointer in the kernel");
-            static_assert( ! (std::is_same<ElementType,short>::value ),"can't use short in the kernel");
-            if ( (extent.size() * sizeof(T)) % sizeof(ElementType))
-                throw runtime_exception("errorMsg_throw", 0);
-#endif
-            int size = extent.size() * sizeof(T) / sizeof(ElementType);
-            using buffer_type = typename array_view<ElementType, 1>::acc_buffer_t;
-            array_view<ElementType, 1> av(buffer_type(cache),
-                                          extent<1>(size),
-                                          (offset + index_base[0])* sizeof(T) / sizeof(ElementType));
-            return av;
-        }
-
-    /**
-     * This member function is similar to "array<T,N>::view_as", although it
-     * only supports array_views of rank 1 (only those guarantee that all
-     * elements are laid out contiguously).
-     *
-     * @return Returns an array_view from this array_view<T,1> with the rank
-     * changed to K from 1.
-     */
-    template <int K>
-        array_view<T, K> view_as(extent<K> viewExtent) const __CPU__ __HC__ {
-            static_assert(N == 1, "view_as is only permissible on array views of rank 1");
-#if __KALMAR_ACCELERATOR__ != 1
-            if ( viewExtent.size() > extent.size())
-                throw runtime_exception("errorMsg_throw", 0);
-#endif
-            array_view<T, K> av(cache, viewExtent, offset + index_base[0]);
-            return av;
-        }
-
-    ~array_view() __CPU__ __HC__ {}
-
-    // FIXME: the following functions could be considered to move to private
-    const acc_buffer_t& internal() const __CPU__ __HC__ { return cache; }
-
-    int get_offset() const __CPU__ __HC__ { return offset; }
-
-    index<N> get_index_base() const __CPU__ __HC__ { return index_base; }
-
-private:
-    template <typename K, int Q> friend struct projection_helper;
-    template <typename K, int Q> friend struct array_projection_helper;
-    template <typename Q, int K> friend class array;
-    template <typename Q, int K> friend class array_view;
-  
-    template<typename Q, int K> friend
-        bool is_flat(const array_view<Q, K>&) noexcept;
-    template <typename Q, int K> friend
-        void copy(const array<Q, K>&, const array_view<Q, K>&);
-    template <typename InputIter, typename Q, int K> friend
-        void copy(InputIter, InputIter, const array_view<Q, K>&);
-    template <typename Q, int K> friend
-        void copy(const array_view<const Q, K>&, array<Q, K>&);
-    template <typename OutputIter, typename Q, int K> friend
-        void copy(const array_view<Q, K>&, OutputIter);
-    template <typename Q, int K> friend
-        void copy(const array_view<const Q, K>& src, const array_view<Q, K>& dest);
-  
-    // used by view_as and reinterpret_as
-    array_view(const acc_buffer_t& cache, const hc::extent<N>& ext,
-               int offset) __CPU__ __HC__
-        : cache(cache), extent(ext), extent_base(ext), offset(offset) {}
-
-    // used by section and projection
-    array_view(const acc_buffer_t& cache, const hc::extent<N>& ext_now,
-               const hc::extent<N>& ext_b,
-               const index<N>& idx_b, int off) __CPU__ __HC__
-        : cache(cache), extent(ext_now), extent_base(ext_b), index_base(idx_b),
-        offset(off) {}
-  
-    acc_buffer_t cache;
-    hc::extent<N> extent;
-    hc::extent<N> extent_base;
-    index<N> index_base;
-    int offset;
-};
-
-// ------------------------------------------------------------------------
-// array_view (read-only)
-// ------------------------------------------------------------------------
-
-/**
- * The partial specialization array_view<const T,N> represents a view over
- * elements of type const T with rank N. The elements are readonly. At the
- * boundary of a call site (such as parallel_for_each), this form of array_view
- * need only be copied to the target accelerator if it isn't already there. It
- * will not be copied out.
- */
-template <typename T, int N>
-class array_view<const T, N>
-{
-public:
-    typedef typename std::remove_const<T>::type nc_T;
-
-#if __KALMAR_ACCELERATOR__ == 1
-  typedef Kalmar::_data<nc_T> acc_buffer_t;
-#else
-  typedef Kalmar::_data_host<const T> acc_buffer_t;
-#endif
-
-    /**
-     * The rank of this array.
-     */
-    static const int rank = N;
-
-    /**
-     * The element type of this array.
-     */
-    typedef const T value_type;
-
-    /**
-     * There is no default constructor for array_view<T,N>.
-     */
-    array_view() = delete;
-
-    /**
-     * Constructs an array_view which is bound to the data contained in the
-     * "src" array. The extent of the array_view is that of the src array, and
-     * the origin of the array view is at zero.
-     *
-     * @param[in] src An array which contains the data that this array_view is
-     *                bound to.
-     */
-    array_view(const array<T,N>& src) __CPU__ __HC__
-        : cache(src.internal()), extent(src.get_extent()), extent_base(extent), index_base(), offset(0) {}
-
-    // FIXME: following interfaces were not implemented yet
-    // template <typename Container>
-    //     explicit array_view<const T, 1>::array_view(const Container& src);
-    // template <typename value_type, int Size>
-    //     explicit array_view<const T, 1>::array_view(const value_type (&src) [Size]) __CPU__ __HC__;
-
-    /**
-     * Constructs an array_view which is bound to the data contained in the
-     * "src" container. The extent of the array_view is that given by the
-     * "extent" argument, and the origin of the array view is at zero.
-     *
-     * @param[in] src A template argument that must resolve to a linear
-     *                container that supports .data() and .size() members (such
-     *                as std::vector or std::array)
-     * @param[in] extent The extent of this array_view.
-     */
-    template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
-        array_view(const extent<N>& extent, const Container& src)
-            : array_view(extent, src.data())
-        { static_assert( std::is_same<typename std::remove_const<typename std::remove_reference<decltype(*src.data())>::type>::type, T>::value, "container element type and array view element type must match"); }
-
-    /**
-     * Constructs an array_view which is bound to the data contained in the
-     * "src" container. The extent of the array_view is that given by the
-     * "extent" argument, and the origin of the array view is at zero.
-     *
-     * @param[in] src A pointer to the source data this array_view will bind
-     *                to. If the number of elements pointed to is less than the
-     *                size of extent, the behavior is undefined.
-     * @param[in] ext The extent of this array_view.
-     */
-    array_view(const extent<N>& ext, const value_type* src) __CPU__ __HC__
-#if __KALMAR_ACCELERATOR__ == 1
-        : cache((nc_T*)(src)), extent(ext), extent_base(ext), offset(0) {}
-#else
-        : cache(ext.size(), src), extent(ext), extent_base(ext), offset(0) {}
-#endif
-
-    /**
-     * Equivalent to construction using
-     * "array_view(extent<N>(e0 [, e1 [, e2 ]]), src)".
-     *
-     * @param[in] e0,e1,e2 The component values that will form the extent of
-     *                     this array_view.
-     * @param[in] src A template argument that must resolve to a contiguousi
-     *                container that supports .data() and .size() members (such
-     *                as std::vector or std::array)
-     */
-    template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
-        array_view(int e0, Container& src) : array_view(hc::extent<1>(e0), src) {}
-    template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
-        array_view(int e0, int e1, Container& src)
-            : array_view(hc::extent<N>(e0, e1), src) {}
-    template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
-        array_view(int e0, int e1, int e2, Container& src)
-            : array_view(hc::extent<N>(e0, e1, e2), src) {}
-
-    /**
-     * Equivalent to construction using
-     * "array_view(extent<N>(e0 [, e1 [, e2 ]]), src)".
-     *
-     * @param[in] e0,e1,e2 The component values that will form the extent of
-     *                     this array_view.
-     * @param[in] src A pointer to the source data this array_view will bind
-     *                to. If the number of elements pointed to is less than
-     *                the size of extent, the behavior is undefined.
-     */
-    array_view(int e0, const value_type *src) __CPU__ __HC__
-        : array_view(hc::extent<1>(e0), src) {}
-    array_view(int e0, int e1, const value_type *src) __CPU__ __HC__
-        : array_view(hc::extent<2>(e0, e1), src) {}
-    array_view(int e0, int e1, int e2, const value_type *src) __CPU__ __HC__
-        : array_view(hc::extent<3>(e0, e1, e2), src) {}
-
-    /**
-     * Copy constructor. Constructs an array_view from the supplied argument
-     * other. A shallow copy is performed.
-     *
-     * @param[in] other An object of type array_view<T,N> or
-     *                  array_view<const T,N> from which to initialize this
-     *                  new array_view.
-     */
-    array_view(const array_view<nc_T, N>& other) __CPU__ __HC__
-        : cache(other.cache), extent(other.extent), extent_base(other.extent_base), index_base(other.index_base), offset(other.offset) {}
-
-    /**
-     * Copy constructor. Constructs an array_view from the supplied argument
-     * other. A shallow copy is performed.
-     *
-     * @param[in] other An object of type array_view<T,N> from which to
-     *                  initialize this new array_view.
-     */
-    array_view(const array_view& other) __CPU__ __HC__
-        : cache(other.cache), extent(other.extent), extent_base(other.extent_base), index_base(other.index_base), offset(other.offset) {}
-
-    /**
-     * Access the extent that defines the shape of this array_view.
-     */
-    extent<N> get_extent() const __CPU__ __HC__ { return extent; }
-
-    /**
-     * Access the accelerator_view where the data source of the array_view is
-     * located.
-     *
-     * When the data source of the array_view is native CPU memory, the method
-     * returns accelerator(accelerator::cpu_accelerator).default_view. When the
-     * data source underlying the array_view is an array, the method returns
-     * the accelerator_view where the source array is located.
-     */
-    accelerator_view get_source_accelerator_view() const { return cache.get_av(); }
-
-    /** @{ */
-    /**
-     * Assigns the contents of the array_view "other" to this array_view, using
-     * a shallow copy. Both array_views will refer to the same data.
-     *
-     * @param[in] other An object of type array_view<T,N> from which to copy
-     *                  into this array.
-     * @return Returns *this.
-     */
-    array_view& operator=(const array_view<T,N>& other) __CPU__ __HC__ {
-        cache = other.cache;
-        extent = other.extent;
-        index_base = other.index_base;
-        extent_base = other.extent_base;
-        offset = other.offset;
-        return *this;
-    }
-  
-    array_view& operator=(const array_view& other) __CPU__ __HC__ {
-        if (this != &other) {
-            cache = other.cache;
-            extent = other.extent;
-            index_base = other.index_base;
-            extent_base = other.extent_base;
-            offset = other.offset;
-        }
-        return *this;
-    }
-
-    /** @} */
-
-    /**
-     * Copies the data referred to by this array_view to the array given by
-     * "dest", as if by calling "copy(*this, dest)"
-     *
-     * @param[in] dest An object of type array <T,N> to which to copy data from
-     *                 this array.
-     */
-    void copy_to(array<T,N>& dest) const { copy(*this, dest); }
-
-    /**
-     * Copies the contents of this array_view to the array_view given by
-     * "dest", as if by calling "copy(*this, dest)"
-     *
-     * @param[in] dest An object of type array_view<T,N> to which to copy data
-     * from this array.
-     */
-    void copy_to(const array_view<T,N>& dest) const { copy(*this, dest); }
-
-    /**
-     * Returns a pointer to the first data element underlying this array_view.
-     * This is only available on array_views of rank 1.
-     *
-     * When the data source of the array_view is native CPU memory, the pointer
-     * returned by data() is valid for the lifetime of the data source.
-     *
-     * When the data source underlying the array_view is an array, or the array
-     * view is created without a data source, the pointer returned by data() in
-     * CPU context is ephemeral and is invalidated when the original data
-     * source or any of its views are accessed on an accelerator_view through a
-     *  parallel_for_each or a copy operation.
-     *
-     * @return A const pointer to the first element in the linearized array.
-     */
-    const T* data() const __CPU__ __HC__ {
-#if __KALMAR_ACCELERATOR__ != 1
-        cache.get_cpu_access();
-#endif
-        static_assert(N == 1, "data() is only permissible on array views of rank 1");
-        return reinterpret_cast<const T*>(cache.get() + offset + index_base[0]);
-    }
-
-    /**
-     * Returns a pointer to the device memory underlying this array_view.
-     *
-     * @return A (const) pointer to the first element in the array_view on the
-     *         device memory.
-     */
-    T* accelerator_pointer() const __CPU__ __HC__ {
-        return reinterpret_cast<const T*>(cache.get_device_pointer() + offset + index_base[0]);
-    }
-
-    /**
-     * Calling this member function informs the array_view that its bound
-     * memory has been modified outside the array_view interface. This will
-     * render all cached information stale.
-     */
-    void refresh() const { cache.refresh(); }
-
-    /**
-     * Calling this member function synchronizes any modifications made to the
-     * data underlying "this" array_view to its source data container. For
-     * example, for an array_view on system memory, if the data underlying the
-     * view are modified on a remote accelerator_view through a
-     * parallel_for_each invocation, calling synchronize ensures that the
-     * modifications are synchronized to the source data and will be visible
-     * through the system memory pointer which the array_view was created over.
-     *
-     * For writable array_view objects, callers of this functional can
-     * optionally specify the type of access desired on the source data
-     * container through the "type" parameter. For example specifying a
-     * "access_type_read" (which is also the default value of the parameter)
-     * indicates that the data has been synchronized to its source location
-     * only for reading. On the other hand, specifying an access_type of
-     * "access_type_read_write" synchronizes the data to its source location
-     * both for reading and writing; i.e. any modifications to the source data
-     * directly through the source data container are legal after synchronizing
-     * the array_view with write access and before subsequently accessing the
-     * array_view on another remote location.
-     *
-     * It is advisable to be precise about the access_type specified in the
-     * synchronize call; i.e. if only write access it required, specifying
-     * access_type_write may yield better performance that calling synchronize
-     * with "access_type_read_write" since the later may require any
-     * modifications made to the data on remote locations to be synchronized to
-     * the source location, which is unnecessary if the contents are intended
-     * to be overwritten without reading.
-     */
-    void synchronize() const { cache.get_cpu_access(); }
-
-    /**
-     * An asynchronous version of synchronize, which returns a completion
-     * future object. When the future is ready, the synchronization operation
-     * is complete.
-     *
-     * @return An object of type completion_future that can be used to
-     *         determine the status of the asynchronous operation or can be
-     *         used to chain other operations to be executed after the
-     *         completion of the asynchronous operation.
-     */
-    completion_future synchronize_async() const {
-        std::future<void> fut = std::async([&]() mutable { synchronize(); });
-        return completion_future(fut.share());
-    }
-
-    /**
-     * Calling this member function synchronizes any modifications made to the
-     * data underlying "this" array_view to the specified accelerator_view
-     * "av". For example, for an array_view on system memory, if the data
-     * underlying the view is modified on the CPU, and synchronize_to is called
-     * on "this" array_view, then the array_view contents are cached on the
-     * specified accelerator_view location.
-     *
-     * @param[in] av The target accelerator_view that "this" array_view is
-     *               synchronized for access on.
-     */
-    void synchronize_to(const accelerator_view& av) const {
-#if __KALMAR_ACCELERATOR__ != 1
-        cache.sync_to(av.pQueue);
-#endif
-    }
-
-    /**
-     * An asynchronous version of synchronize_to, which returns a completion
-     * future object. When the future is ready, the synchronization operation
-     * is complete.
-     *
-     * @param[in] av The target accelerator_view that "this" array_view is
-     *               synchronized for access on.
-     * @param[in] type An argument of type "access_type" which specifies the
-     *                 type of access on the data source that the array_view is
-     *                 synchronized for.
-     * @return An object of type completion_future that can be used to
-     *         determine the status of the asynchronous operation or can be
-     *         used to chain other operations to be executed after the
-     *         completion of the asynchronous operation.
-     */
-    // FIXME: this method is not implemented yet
-    completion_future synchronize_to_async(const accelerator_view& av) const;
-
-    /** @{ */
-    /**
-     * Returns a const reference to the element of this array_view that is at
-     * the location in N-dimensional space specified by "idx".
-     *
-     * @param[in] idx An object of type index<N> that specifies the location of
-     *                the element.
-     */
-    const T& operator[](const index<N>& idx) const __CPU__ __HC__ {
-#if __KALMAR_ACCELERATOR__ != 1
-        cache.get_cpu_access();
-#endif
-        const T *ptr = reinterpret_cast<const T*>(cache.get() + offset);
-        return ptr[Kalmar::amp_helper<N, index<N>, hc::extent<N>>::flatten(idx + index_base, extent_base)];
-    }
-    const T& operator()(const index<N>& idx) const __CPU__ __HC__ {
-        return (*this)[idx];
-    }
-
-    /** @} */
-
-    /**
-     * Returns a reference to the element of this array_view that is at the
-     * location in N-dimensional space specified by "idx".
-     *
-     * Unlike the other indexing operators for accessing the array_view on the
-     * CPU, this method does not implicitly synchronize this array_view's
-     * contents to the CPU. After accessing the array_view on a remote location
-     * or performing a copy operation involving this array_view, users are
-     * responsible to explicitly synchronize the array_view to the CPU before
-     * calling this method. Failure to do so results in undefined behavior.
-     */
-    // FIXME: this method is not implemented
-    const T& get_ref(const index<N>& idx) const __CPU__ __HC__;
-
-    /** @{ */
-    /**
-     * Equivalent to
-     * "array_view<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]]))".
-     *
-     * @param[in] i0,i1,i2 The component values that will form the index into
-     *                     this array.
-     */
-    const T& operator()(int i0) const __CPU__ __HC__ {
-        static_assert(N == 1, "const T& array_view::operator()(int) is only permissible on array_view<T, 1>");
-        return (*this)[index<1>(i0)];
-    }
-  
-    const T& operator()(int i0, int i1) const __CPU__ __HC__ {
-        static_assert(N == 2, "const T& array_view::operator()(int,int) is only permissible on array_view<T, 2>");
-        return (*this)[index<2>(i0, i1)];
-    }
-    const T& operator()(int i0, int i1, int i2) const __CPU__ __HC__ {
-        static_assert(N == 3, "const T& array_view::operator()(int,int, int) is only permissible on array_view<T, 3>");
-        return (*this)[index<3>(i0, i1, i2)];
-    }
-
-    /** @} */
-
-    /** @{ */
-    /**
-     * This overload is defined for array_view<T,N> where @f$N \ge 2@f$.
-     *
-     * This mode of indexing is equivalent to projecting on the
-     * most-significant dimension. It allows C-style indexing. For example:
-     *
-     * @code{.cpp}
-     * array<float,4> myArray(myExtents, ...);
-     *
-     * myArray[index<4>(5,4,3,2)] = 7;
-     * assert(myArray[5][4][3][2] == 7);
-     * @endcode
-     *
-     * @param[in] i0 An integer that is the index into the most-significant
-     *               dimension of this array.
-     * @return Returns an array_view whose dimension is one lower than that of
-     *         this array_view.
-     */
-    typename projection_helper<const T, N>::const_result_type
-        operator[] (int i) const __CPU__ __HC__ {
-        return projection_helper<const T, N>::project(*this, i);
-    }
-
-    // FIXME: typename projection_helper<const T, N>::const_result_type
-    //            operator() (int i0) const __CPU__ __HC__
-    // is not implemented
-
-    /** @} */
-
-    /**
-     * Returns a subsection of the source array view at the origin specified by
-     * "idx" and with the extent specified by "ext".
-     *
-     * Example:
-     *
-     * @code{.cpp}
-     * array<float,2> a(extent<2>(200,100));
-     * array_view<float,2> v1(a); // v1.extent = <200,100>
-     * array_view<float,2> v2 = v1.section(index<2>(15,25), extent<2>(40,50));
-     * assert(v2(0,0) == v1(15,25));
-     * @endcode
-     *
-     * @param[in] idx Provides the offset/origin of the resulting section.
-     * @param[in] ext Provides the extent of the resulting section.
-     * @return Returns a subsection of the source array at specified origin,
-     *         and with the specified extent.
-     */
-    array_view<const T, N> section(const index<N>& idx,
-                                   const extent<N>& ext) const __CPU__ __HC__ {
-        array_view<const T, N> av(cache, ext, extent_base, idx + index_base, offset);
-        return av;
-    }
-
-    /**
-     * Equivalent to "section(idx, this->extent – idx)".
-     */
-    array_view<const T, N> section(const index<N>& idx) const __CPU__ __HC__ {
-        hc::extent<N> ext(extent);
-        Kalmar::amp_helper<N, index<N>, hc::extent<N>>::minus(idx, ext);
-        return section(idx, ext);
-    }
-
-    /**
-     * Equivalent to "section(index<N>(), ext)".
-     */
-    array_view<const T, N> section(const extent<N>& ext) const __CPU__ __HC__ {
-        index<N> idx;
-        return section(idx, ext);
-    }
-
-    /** @{ */
-    /**
-     * Equivalent to 
-     * "section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [, e2 ]]))".
-     *
-     * @param[in] i0,i1,i2 The component values that will form the origin of
-     *                     the section
-     * @param[in] e0,e1,e2 The component values that will form the extent of
-     *                     the section
-     */
-    array_view<const T, 1> section(int i0, int e0) const __CPU__ __HC__ {
-        static_assert(N == 1, "Rank must be 1");
-        return section(index<1>(i0), hc::extent<1>(e0));
-    }
-
-    array_view<const T, 2> section(int i0, int i1, int e0, int e1) const __CPU__ __HC__ {
-        static_assert(N == 2, "Rank must be 2");
-        return section(index<2>(i0, i1), hc::extent<2>(e0, e1));
-    }
-
-    array_view<const T, 3> section(int i0, int i1, int i2, int e0, int e1, int e2) const __CPU__ __HC__ {
-        static_assert(N == 3, "Rank must be 3");
-        return section(index<3>(i0, i1, i2), hc::extent<3>(e0, e1, e2));
-    }
-
-    /** @} */
-
-    /**
-     * This member function is similar to "array<T,N>::reinterpret_as",
-     * although it only supports array_views of rank 1 (only those guarantee
-     * that all elements are laid out contiguously).
-     *
-     * The size of the reinterpreted ElementType must evenly divide into the
-     * total size of this array_view.
-     *
-     * @return Returns an array_view from this array_view<T,1> with the element
-     *         type reinterpreted from T to ElementType.
-     */
-    template <typename ElementType>
-        array_view<const ElementType, N> reinterpret_as() const __CPU__ __HC__ {
-            static_assert(N == 1, "reinterpret_as is only permissible on array views of rank 1");
-#if __KALMAR_ACCELERATOR__ != 1
-            static_assert( ! (std::is_pointer<ElementType>::value ),"can't use pointer in the kernel");
-            static_assert( ! (std::is_same<ElementType,short>::value ),"can't use short in the kernel");
-#endif
-            int size = extent.size() * sizeof(T) / sizeof(ElementType);
-            using buffer_type = typename array_view<ElementType, 1>::acc_buffer_t;
-            array_view<const ElementType, 1> av(buffer_type(cache),
-                                                extent<1>(size),
-                                                (offset + index_base[0])* sizeof(T) / sizeof(ElementType));
-            return av;
-        }
-
-    /**
-     * This member function is similar to "array<T,N>::view_as", although it
-     * only supports array_views of rank 1 (only those guarantee that all
-     * elements are laid out contiguously).
-     *
-     * @return Returns an array_view from this array_view<T,1> with the rank
-     * changed to K from 1.
-     */
-    template <int K>
-        array_view<const T, K> view_as(extent<K> viewExtent) const __CPU__ __HC__ {
-            static_assert(N == 1, "view_as is only permissible on array views of rank 1");
-#if __KALMAR_ACCELERATOR__ != 1
-            if ( viewExtent.size() > extent.size())
-                throw runtime_exception("errorMsg_throw", 0);
-#endif
-            array_view<const T, K> av(cache, viewExtent, offset + index_base[0]);
-            return av;
-        }
-
-    ~array_view() __CPU__ __HC__ {}
-
-    // FIXME: the following functions may be considered to move to private
-    const acc_buffer_t& internal() const __CPU__ __HC__ { return cache; }
-
-    int get_offset() const __CPU__ __HC__ { return offset; }
-
-    index<N> get_index_base() const __CPU__ __HC__ { return index_base; }
-
-private:
-    template <typename K, int Q> friend struct projection_helper;
-    template <typename K, int Q> friend struct array_projection_helper;
-    template <typename Q, int K> friend class array;
-    template <typename Q, int K> friend class array_view;
-  
-    template<typename Q, int K> friend
-        bool is_flat(const array_view<Q, K>&) noexcept;
-    template <typename Q, int K> friend
-        void copy(const array<Q, K>&, const array_view<Q, K>&);
-    template <typename InputIter, typename Q, int K>
-        void copy(InputIter, InputIter, const array_view<Q, K>&);
-    template <typename Q, int K> friend
-        void copy(const array_view<const Q, K>&, array<Q, K>&);
-    template <typename OutputIter, typename Q, int K> friend
-        void copy(const array_view<Q, K>&, OutputIter);
-    template <typename Q, int K> friend
-        void copy(const array_view<const Q, K>& src, const array_view<Q, K>& dest);
-  
-    // used by view_as and reinterpret_as
-    array_view(const acc_buffer_t& cache, const hc::extent<N>& ext,
-               int offset) __CPU__ __HC__
-        : cache(cache), extent(ext), extent_base(ext), offset(offset) {}
-  
-    // used by section and projection
-    array_view(const acc_buffer_t& cache, const hc::extent<N>& ext_now,
-               const extent<N>& ext_b,
-               const index<N>& idx_b, int off) __CPU__ __HC__
-        : cache(cache), extent(ext_now), extent_base(ext_b), index_base(idx_b),
-        offset(off) {}
-  
-    acc_buffer_t cache;
-    hc::extent<N> extent;
-    hc::extent<N> extent_base;
-    index<N> index_base;
-    int offset;
-};
-
-// ------------------------------------------------------------------------
-// utility functions for copy
-// ------------------------------------------------------------------------
-
-template<typename T, int N>
-static inline bool is_flat(const array_view<T, N>& av) noexcept {
-    return av.extent == av.extent_base && av.index_base == index<N>();
-}
-
-template<typename T>
-static inline bool is_flat(const array_view<T, 1>& av) noexcept { return true; }
-
-template <typename InputIter, typename T, int N, int dim>
-struct copy_input
-{
-    void operator()(InputIter& It, T* ptr, const extent<N>& ext,
-                    const extent<N>& base, const index<N>& idx)
-    {
-        size_t stride = 1;
-        for (int i = dim; i < N; i++)
-            stride *= base[i];
-        ptr += stride * idx[dim - 1];
-        for (int i = 0; i < ext[dim - 1]; i++) {
-            copy_input<InputIter, T, N, dim + 1>()(It, ptr, ext, base, idx);
-            ptr += stride;
-        }
-    }
-};
-
-template <typename InputIter, typename T, int N>
-struct copy_input<InputIter, T, N, N>
-{
-    void operator()(InputIter& It, T* ptr, const extent<N>& ext,
-                    const extent<N>& base, const index<N>& idx)
-    {
-        InputIter end = It;
-        std::advance(end, ext[N - 1]);
-        std::copy(It, end, ptr + idx[N - 1]);
-        It = end;
-    }
-};
-
-template <typename OutputIter, typename T, int N, int dim>
-struct copy_output
-{
-    void operator()(const T* ptr, OutputIter& It, const extent<N>& ext,
-                    const extent<N>& base, const index<N>& idx)
-    {
-        size_t stride = 1;
-        for (int i = dim; i < N; i++)
-            stride *= base[i];
-        ptr += stride * idx[dim - 1];
-        for (int i = 0; i < ext[dim - 1]; i++) {
-            copy_output<OutputIter, T, N, dim + 1>()(ptr, It, ext, base, idx);
-            ptr += stride;
-        }
-    }
-};
-
-template <typename OutputIter, typename T, int N>
-struct copy_output<OutputIter, T, N, N>
-{
-    void operator()(const T* ptr, OutputIter& It, const extent<N>& ext,
-                    const extent<N>& base, const index<N>& idx)
-    {
-        ptr += idx[N - 1];
-        It = std::copy(ptr, ptr + ext[N - 1], It);
-    }
-};
-
-template <typename T, int N, int dim>
-struct copy_bidir
-{
-    void operator()(const T* src, T* dst, const extent<N>& ext,
-                    const extent<N>& base1, const index<N>& idx1,
-                    const extent<N>& base2, const index<N>& idx2)
-    {
-        size_t stride1 = 1;
-        for (int i = dim; i < N; i++)
-            stride1 *= base1[i];
-        src += stride1 * idx1[dim - 1];
-
-        size_t stride2 = 1;
-        for (int i = dim; i < N; i++)
-            stride2 *= base2[i];
-        dst += stride2 * idx2[dim - 1];
-
-        for (int i = 0; i < ext[dim - 1]; i++) {
-            copy_bidir<T, N, dim + 1>()(src, dst, ext, base1, idx1, base2, idx2);
-            src += stride1;
-            dst += stride2;
-        }
-    }
-};
-
-template <typename T, int N>
-struct copy_bidir<T, N, N>
-{
-    void operator()(const T* src, T* dst, const extent<N>& ext,
-                    const extent<N>& base1, const index<N>& idx1,
-                    const extent<N>& base2, const index<N>& idx2)
-    {
-        src += idx1[N - 1];
-        dst += idx2[N - 1];
-        std::copy(src, src + ext[N - 1], dst);
-    }
-};
-
-template <typename Iter, typename T, int N>
-struct do_copy
-{
-    template<template <typename, int> class _amp_container>
-    void operator()(Iter srcBegin, Iter srcEnd, const _amp_container<T, N>& dest) {
-        size_t size = dest.get_extent().size();
-        size_t offset = dest.get_offset();
-        bool modify = true;
-
-        T* ptr = dest.internal().map_ptr(modify, size, offset);
-         std::copy(srcBegin, srcEnd, ptr);
-        dest.internal().unmap_ptr(ptr, modify, size, offset);
-    }
-    template<template <typename, int> class _amp_container>
-    void operator()(const _amp_container<T, N> &src, Iter destBegin) {
-        size_t size = src.get_extent().size();
-        size_t offset = src.get_offset();
-        bool modify = false;
-
-        const T* ptr = src.internal().map_ptr(modify, size, offset);
-        std::copy(ptr, ptr + src.get_extent().size(), destBegin);
-        src.internal().unmap_ptr(ptr, modify, size, offset);
-    }
-};
-
-template <typename Iter, typename T>
-struct do_copy<Iter, T, 1>
-{
-    template<template <typename, int> class _amp_container>
-    void operator()(Iter srcBegin, Iter srcEnd, const _amp_container<T, 1>& dest) {
-        size_t size = dest.get_extent().size();
-        size_t offset = dest.get_offset() + dest.get_index_base()[0];
-        bool modify = true;
-
-        T* ptr = dest.internal().map_ptr(modify, size, offset);
-         std::copy(srcBegin, srcEnd, ptr);
-        dest.internal().unmap_ptr(ptr, modify, size, offset);
-    }
-    template<template <typename, int> class _amp_container>
-    void operator()(const _amp_container<T, 1> &src, Iter destBegin) {
-        size_t size = src.get_extent().size();
-        size_t offset = src.get_offset() + src.get_index_base()[0];
-        bool modify = false;
-
-        const T* ptr = src.internal().map_ptr(modify, size, offset);
-        std::copy(ptr, ptr + src.get_extent().size(), destBegin);
-        src.internal().unmap_ptr(ptr, modify, size, offset);
-    }
-};
-
-template <typename T, int N>
-struct do_copy<T*, T, N>
-{
-    template<template <typename, int> class _amp_container>
-    void operator()(T* srcBegin, T* srcEnd, const _amp_container<T, N>& dest) {
-        dest.internal().write(srcBegin, std::distance(srcBegin, srcEnd), dest.get_offset(), true);
-    }
-    template<template <typename, int> class _amp_container>
-    void operator()(const _amp_container<T, N> &src, T* destBegin) {
-        src.internal().read(destBegin, src.get_extent().size(), src.get_offset());
-    }
-};
-
-template <typename T>
-struct do_copy<T*, T, 1>
-{
-    template<template <typename, int> class _amp_container>
-    void operator()(const T* srcBegin, const T* srcEnd, const _amp_container<T, 1>& dest) {
-        dest.internal().write(srcBegin, std::distance(srcBegin, srcEnd),
-                              dest.get_offset() + dest.get_index_base()[0], true);
-    }
-    template<template <typename, int> class _amp_container>
-    void operator()(const _amp_container<T, 1> &src, T* destBegin) {
-        src.internal().read(destBegin, src.get_extent().size(),
-                            src.get_offset() + src.get_index_base()[0]);
-    }
-};
-
-// ------------------------------------------------------------------------
-// copy
-// ------------------------------------------------------------------------
-
-/**
- * The contents of "src" are copied into "dest". The source and destination may
- * reside on different accelerators. If the extents of "src" and "dest" don't
- * match, a runtime exception is thrown.
- *
- * @param[in] src An object of type array<T,N> to be copied from.
- * @param[out] dest An object of type array<T,N> to be copied to.
- */
-template <typename T, int N>
-void copy(const array<T, N>& src, array<T, N>& dest) {
-    src.internal().copy(dest.internal(), 0, 0, 0);
-}
-
-/** @{ */
-/**
- * The contents of "src" are copied into "dest". If the extents of "src" and
- * "dest" don't match, a runtime exception is thrown.
- *
- * @param[in] src An object of type array<T,N> to be copied from.
- * @param[out] dest An object of type array_view<T,N> to be copied to.
- */
-template <typename T, int N>
-void copy(const array<T, N>& src, const array_view<T, N>& dest) {
-    if (is_flat(dest))
-        src.internal().copy(dest.internal(), src.get_offset(),
-                            dest.get_offset(), dest.get_extent().size());
-    else {
-        // FIXME: logic here deserve to be reviewed
-        size_t srcSize = src.extent.size();
-        size_t srcOffset = 0;
-        bool srcModify = false;
-        size_t destSize = dest.extent_base.size();
-        size_t destOffset = dest.offset;
-        bool destModify = true;
-
-        T* pSrc = src.internal().map_ptr(srcModify, srcSize, srcOffset);
-        T* p = pSrc;
-        T* pDst = dest.internal().map_ptr(destModify, destSize, destOffset);
-        copy_input<T*, T, N, 1>()(pSrc, pDst, dest.extent, dest.extent_base, dest.index_base);
-        dest.internal().unmap_ptr(pDst, destModify, destSize, destOffset);
-        src.internal().unmap_ptr(p, srcModify, srcSize, srcOffset);
-    }
-}
-
-template <typename T>
-void copy(const array<T, 1>& src, const array_view<T, 1>& dest) {
-    src.internal().copy(dest.internal(),
-                        src.get_offset() + src.get_index_base()[0],
-                        dest.get_offset() + dest.get_index_base()[0],
-                        dest.get_extent().size());
-}
-
-/** @} */
-
-/** @{ */
-/**
- * The contents of "src" are copied into "dest". If the extents of "src" and
- * "dest" don't match, a runtime exception is thrown.
- *
- * @param[in] src An object of type array_view<T,N> (or array_view<const T, N>)
- *                to be copied from.
- * @param[out] dest An object of type array<T,N> to be copied to.
- */
-template <typename T, int N>
-void copy(const array_view<const T, N>& src, array<T, N>& dest) {
-    if (is_flat(src)) {
-        src.internal().copy(dest.internal(), src.get_offset(),
-                            dest.get_offset(), dest.get_extent().size());
-    } else {
-        // FIXME: logic here deserve to be reviewed
-        size_t srcSize = src.extent_base.size();
-        size_t srcOffset = src.offset;
-        bool srcModify = false;
-        size_t destSize = dest.extent.size();
-        size_t destOffset = 0;
-        bool destModify = true;
-
-        T* pDst = dest.internal().map_ptr(destModify, destSize, destOffset);
-        T* p = pDst;
-        const T* pSrc = src.internal().map_ptr(srcModify, srcSize, srcOffset);
-        copy_output<T*, T, N, 1>()(pSrc, pDst, src.extent, src.extent_base, src.index_base);
-        src.internal().unmap_ptr(pSrc, srcModify, srcSize, srcOffset);
-        dest.internal().unmap_ptr(p, destModify, destSize, destOffset);
-    }
-}
-
-template <typename T, int N>
-void copy(const array_view<T, N>& src, array<T, N>& dest) {
-    const array_view<const T, N> buf(src);
-    copy(buf, dest);
-}
-
-template <typename T>
-void copy(const array_view<const T, 1>& src, array<T, 1>& dest) {
-    src.internal().copy(dest.internal(),
-                        src.get_offset() + src.get_index_base()[0],
-                        dest.get_offset() + dest.get_index_base()[0],
-                        dest.get_extent().size());
-}
-
-/** @} */
-
-/** @{ */
-/**
- * The contents of "src" are copied into "dest". If the extents of "src" and
- * "dest" don't match, a runtime exception is thrown.
- *
- * @param[in] src An object of type array_view<T,N> (or array_view<const T, N>)
- *                to be copied from.
- * @param[out] dest An object of type array_view<T,N> to be copied to.
- */
-template <typename T, int N>
-void copy(const array_view<const T, N>& src, const array_view<T, N>& dest) {
-    if (is_flat(src)) {
-        if (is_flat(dest))
-            src.internal().copy(dest.internal(), src.get_offset(),
-                                dest.get_offset(), dest.get_extent().size());
-        else {
-            // FIXME: logic here deserve to be reviewed
-            size_t srcSize = src.extent.size();
-            size_t srcOffset = 0;
-            bool srcModify = false;
-            size_t destSize = dest.extent_base.size();
-            size_t destOffset = dest.offset;
-            bool destModify = true;
-
-            const T* pSrc = src.internal().map_ptr(srcModify, srcSize, srcOffset);
-            const T* p = pSrc;
-            T* pDst = dest.internal().map_ptr(destModify, destSize, destOffset);
-            copy_input<const T*, T, N, 1>()(pSrc, pDst, dest.extent, dest.extent_base, dest.index_base);
-            dest.internal().unmap_ptr(pDst, destModify, destSize, destOffset);
-            src.internal().unmap_ptr(p, srcModify, srcSize, srcOffset);
-        }
-    } else {
-        if (is_flat(dest)) {
-            // FIXME: logic here deserve to be reviewed
-            size_t srcSize = src.extent_base.size();
-            size_t srcOffset = src.offset;
-            bool srcModify = false;
-            size_t destSize = dest.extent.size();
-            size_t destOffset = 0;
-            bool destModify = true;
-
-            T* pDst = dest.internal().map_ptr(destModify, destSize, destOffset);
-            T* p = pDst;
-            const T* pSrc = src.internal().map_ptr(srcModify, srcSize, srcOffset);
-            copy_output<T*, T, N, 1>()(pSrc, pDst, src.extent, src.extent_base, src.index_base);
-            dest.internal().unmap_ptr(p, destModify, destSize, destOffset);
-            src.internal().unmap_ptr(pSrc, srcModify, srcSize, srcOffset);
-        } else {
-            // FIXME: logic here deserve to be reviewed
-            size_t srcSize = src.extent_base.size();
-            size_t srcOffset = src.offset;
-            bool srcModify = false;
-            size_t destSize = dest.extent_base.size();
-            size_t destOffset = dest.offset;
-            bool destModify = true;
-
-            const T* pSrc = src.internal().map_ptr(srcModify, srcSize, srcOffset);
-            T* pDst = dest.internal().map_ptr(destModify, destSize, destOffset);
-            copy_bidir<T, N, 1>()(pSrc, pDst, src.extent, src.extent_base,
-                                  src.index_base, dest.extent_base, dest.index_base);
-            dest.internal().unmap_ptr(pDst, destModify, destSize, destOffset);
-            src.internal().unmap_ptr(pSrc, srcModify, srcSize, srcOffset);
-        }
-    }
-}
-
-template <typename T, int N>
-void copy(const array_view<T, N>& src, const array_view<T, N>& dest) {
-    const array_view<const T, N> buf(src);
-    copy(buf, dest);
-}
-
-template <typename T>
-void copy(const array_view<const T, 1>& src, const array_view<T, 1>& dest) {
-    src.internal().copy(dest.internal(),
-                        src.get_offset() + src.get_index_base()[0],
-                        dest.get_offset() + dest.get_index_base()[0],
-                        dest.get_extent().size());
-}
-
-/** @} */
-
-/** @{ */
-/**
- * The contents of a source container from the iterator range [srcBegin,srcEnd)
- * are copied into "dest". If the number of elements in the iterator range is
- * not equal to "dest.extent.size()", an exception is thrown.
- *
- * In the overloads which don't take an end-iterator it is assumed that the
- * source iterator is able to provide at least dest.extent.size() elements, but
- * no checking is performed (nor possible).
- *
- * @param[in] srcBegin An iterator to the first element of a source container.
- * @param[in] srcEnd An interator to the end of a source container.
- * @param[out] dest An object of type array<T,N> to be copied to.
- */
-template <typename InputIter, typename T, int N>
-void copy(InputIter srcBegin, InputIter srcEnd, array<T, N>& dest) {
-#if __KALMAR_ACCELERATOR__ != 1
-    if( ( std::distance(srcBegin,srcEnd) <=0 )||( std::distance(srcBegin,srcEnd) < dest.get_extent().size() ))
-      throw runtime_exception("errorMsg_throw ,copy between different types", 0);
-#endif
-    do_copy<InputIter, T, N>()(srcBegin, srcEnd, dest);
-}
-
-template <typename InputIter, typename T, int N>
-void copy(InputIter srcBegin, array<T, N>& dest) {
-    InputIter srcEnd = srcBegin;
-    std::advance(srcEnd, dest.get_extent().size());
-    copy(srcBegin, srcEnd, dest);
-}
-
-/** @} */
-
-/** @{ */
-/**
- * The contents of a source container from the iterator range [srcBegin,srcEnd)
- * are copied into "dest". If the number of elements in the iterator range is
- * not equal to "dest.extent.size()", an exception is thrown.
- *
- * In the overloads which don't take an end-iterator it is assumed that the
- * source iterator is able to provide at least dest.extent.size() elements, but
- * no checking is performed (nor possible).
- *
- * @param[in] srcBegin An iterator to the first element of a source container.
- * @param[in] srcEnd An interator to the end of a source container.
- * @param[out] dest An object of type array_view<T,N> to be copied to.
- */
-template <typename InputIter, typename T, int N>
-void copy(InputIter srcBegin, InputIter srcEnd, const array_view<T, N>& dest) {
-    if (is_flat(dest))
-        do_copy<InputIter, T, N>()(srcBegin, srcEnd, dest);
-    else {
-        size_t size = dest.extent_base.size();
-        size_t offset = dest.offset;
-        bool modify = true;
-
-        T* ptr = dest.internal().map_ptr(modify, size, offset);
-        copy_input<InputIter, T, N, 1>()(srcBegin, ptr, dest.extent, dest.extent_base, dest.index_base);
-        dest.internal().unmap_ptr(ptr, modify, size, offset);
-    }
-}
-
-template <typename InputIter, typename T, int N>
-void copy(InputIter srcBegin, const array_view<T, N>& dest) {
-    InputIter srcEnd = srcBegin;
-    std::advance(srcEnd, dest.get_extent().size());
-    copy(srcBegin, srcEnd, dest);
-}
-
-/** @} */
-
-/**
- * The contents of a source array are copied into "dest" starting with iterator
- * destBegin. If the number of elements in the range starting destBegin in the
- * destination container is smaller than "src.extent.size()", the behavior is
- * undefined.
- *
- * @param[in] src An object of type array<T,N> to be copied from.
- * @param[out] destBegin An output iterator addressing the position of the
- *                       first element in the destination container.
- */
-template <typename OutputIter, typename T, int N>
-void copy(const array<T, N> &src, OutputIter destBegin) {
-    do_copy<OutputIter, T, N>()(src, destBegin);
-}
-
-/**
- * The contents of a source array are copied into "dest" starting with iterator
- * destBegin. If the number of elements in the range starting destBegin in the
- * destination container is smaller than "src.extent.size()", the behavior is
- * undefined.
- *
- * @param[in] src An object of type array_view<T,N> to be copied from.
- * @param[out] destBegin An output iterator addressing the position of the
- *                       first element in the destination container.
- */
-template <typename OutputIter, typename T, int N>
-void copy(const array_view<T, N> &src, OutputIter destBegin) {
-    if (is_flat(src))
-        do_copy<OutputIter, T, N>()(src, destBegin);
-    else {
-        size_t size = src.extent_base.size();
-        size_t offset = src.offset;
-        bool modify = false;
-
-        T* ptr = src.internal().map_ptr(modify, size, offset);
-        copy_output<OutputIter, T, N, 1>()(ptr, destBegin, src.extent, src.extent_base, src.index_base);
-        src.internal().unmap_ptr(ptr, modify, size, offset);
-    }
-}
-
-// ------------------------------------------------------------------------
-// utility function for copy_async
-// ------------------------------------------------------------------------
-
-
-// ------------------------------------------------------------------------
-// copy_async
-// ------------------------------------------------------------------------
-
-/**
- * The contents of "src" are copied into "dest". The source and destination may
- * reside on different accelerators. If the extents of "src" and "dest" don't
- * match, a runtime exception is thrown.
- *
- * @param[in] src An object of type array<T,N> to be copied from.
- * @param[out] dest An object of type array<T,N> to be copied to.
- */
-template <typename T, int N>
-completion_future copy_async(const array<T, N>& src, array<T, N>& dest) {
-    std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
-    return completion_future(fut.share());
-}
-
-/**
- * The contents of "src" are copied into "dest". If the extents of "src" and
- * "dest" don't match, a runtime exception is thrown.
- *
- * @param[in] src An object of type array<T,N> to be copied from.
- * @param[out] dest An object of type array_view<T,N> to be copied to.
- */
-template <typename T, int N>
-completion_future copy_async(const array<T, N>& src, const array_view<T, N>& dest) {
-    std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
-    return completion_future(fut.share());
-}
-
-/** @{ */
-/**
- * The contents of "src" are copied into "dest". If the extents of "src" and
- * "dest" don't match, a runtime exception is thrown.
- *
- * @param[in] src An object of type array_view<T,N> (or array_view<const T, N>)
- *                to be copied from.
- * @param[out] dest An object of type array<T,N> to be copied to.
- */
-template <typename T, int N>
-completion_future copy_async(const array_view<const T, N>& src, array<T, N>& dest) {
-    std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
-    return completion_future(fut.share());
-}
-
-template <typename T, int N>
-completion_future copy_async(const array_view<T, N>& src, array<T, N>& dest) {
-    std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
-    return completion_future(fut.share());
-}
-
-/** @} */
-
-/** @{ */
-/**
- * The contents of "src" are copied into "dest". If the extents of "src" and
- * "dest" don't match, a runtime exception is thrown.
- *
- * @param[in] src An object of type array_view<T,N> (or array_view<const T, N>)
- *                to be copied from.
- * @param[out] dest An object of type array_view<T,N> to be copied to.
- */
-template <typename T, int N>
-completion_future copy_async(const array_view<const T, N>& src, const array_view<T, N>& dest) {
-    std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
-    return completion_future(fut.share());
-}
-
-template <typename T, int N>
-completion_future copy_async(const array_view<T, N>& src, const array_view<T, N>& dest) {
-    std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
-    return completion_future(fut.share());
-}
-
-/** @} */
-
-/** @{ */
-/**
- * The contents of a source container from the iterator range [srcBegin,srcEnd)
- * are copied into "dest". If the number of elements in the iterator range is
- * not equal to "dest.extent.size()", an exception is thrown.
- *
- * In the overloads which don't take an end-iterator it is assumed that the
- * source iterator is able to provide at least dest.extent.size() elements, but
- * no checking is performed (nor possible).
- *
- * @param[in] srcBegin An iterator to the first element of a source container.
- * @param[in] srcEnd An interator to the end of a source container.
- * @param[out] dest An object of type array<T,N> to be copied to.
- */
-template <typename InputIter, typename T, int N>
-completion_future copy_async(InputIter srcBegin, InputIter srcEnd, array<T, N>& dest) {
-    std::future<void> fut = std::async(std::launch::deferred, [&, srcBegin, srcEnd]() mutable { copy(srcBegin, srcEnd, dest); });
-    return completion_future(fut.share());
-}
-
-template <typename InputIter, typename T, int N>
-completion_future copy_async(InputIter srcBegin, array<T, N>& dest) {
-    std::future<void> fut = std::async(std::launch::deferred, [&, srcBegin]() mutable { copy(srcBegin, dest); });
-    return completion_future(fut.share());
-}
-
-/** @} */
-
-/** @{ */
-/**
- * The contents of a source container from the iterator range [srcBegin,srcEnd)
- * are copied into "dest". If the number of elements in the iterator range is
- * not equal to "dest.extent.size()", an exception is thrown.
- *
- * In the overloads which don't take an end-iterator it is assumed that the
- * source iterator is able to provide at least dest.extent.size() elements, but
- * no checking is performed (nor possible).
- *
- * @param[in] srcBegin An iterator to the first element of a source container.
- * @param[in] srcEnd An interator to the end of a source container.
- * @param[out] dest An object of type array_view<T,N> to be copied to.
- */
-template <typename InputIter, typename T, int N>
-completion_future copy_async(InputIter srcBegin, InputIter srcEnd, const array_view<T, N>& dest) {
-    std::future<void> fut = std::async(std::launch::deferred, [&, srcBegin, srcEnd]() mutable { copy(srcBegin, srcEnd, dest); });
-    return completion_future(fut.share());
-}
-
-template <typename InputIter, typename T, int N>
-completion_future copy_async(InputIter srcBegin, const array_view<T, N>& dest) {
-    std::future<void> fut = std::async(std::launch::deferred, [&, srcBegin]() mutable { copy(srcBegin, dest); });
-    return completion_future(fut.share());
-}
-
-/** @} */
-
-/**
- * The contents of a source array are copied into "dest" starting with iterator
- * destBegin. If the number of elements in the range starting destBegin in the
- * destination container is smaller than "src.extent.size()", the behavior is
- * undefined.
- *
- * @param[in] src An object of type array<T,N> to be copied from.
- * @param[out] destBegin An output iterator addressing the position of the
- *                       first element in the destination container.
- */
-template <typename OutputIter, typename T, int N>
-completion_future copy_async(const array<T, N>& src, OutputIter destBegin) {
-    std::future<void> fut = std::async(std::launch::deferred, [&, destBegin]() mutable { copy(src, destBegin); });
-    return completion_future(fut.share());
-}
-
-/**
- * The contents of a source array are copied into "dest" starting with iterator
- * destBegin. If the number of elements in the range starting destBegin in the
- * destination container is smaller than "src.extent.size()", the behavior is
- * undefined.
- *
- * @param[in] src An object of type array_view<T,N> to be copied from.
- * @param[out] destBegin An output iterator addressing the position of the
- *                       first element in the destination container.
- */
-template <typename OutputIter, typename T, int N>
-completion_future copy_async(const array_view<T, N>& src, OutputIter destBegin) {
-    std::future<void> fut = std::async(std::launch::deferred, [&, destBegin]() mutable { copy(src, destBegin); });
-    return completion_future(fut.share());
-}
-
-
-// FIXME: consider remove these functions
-template <typename T, int N>
-completion_future copy_async(const array<T, N>& src, const array<T, N>& dest) {
-    std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
-    return completion_future(fut.share());
-}
-
-template <typename T, int N>
-completion_future copy_async(const array_view<const T, N>& src, const array<T, N>& dest) {
-    std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
-    return completion_future(fut.share());
-}
-
-template <typename T, int N>
-completion_future copy_async(const array_view<T, N>& src, const array<T, N>& dest) {
-    std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
-    return completion_future(fut.share());
-}
-
-// ------------------------------------------------------------------------
-// atomic functions
-// ------------------------------------------------------------------------
-
-/** @{ */
-/**
- * Atomically read the value stored in dest , replace it with the value given
- * in val and return the old value to the caller. This function provides
- * overloads for int , unsigned int and float parameters.
- *
- * @param[out] dest A pointer to the location which needs to be atomically
- *                  modified. The location may reside within a
- *                  hc::array or hc::array_view or within a
- *                  tile_static variable.
- * @param[in] val The new value to be stored in the location pointed to be dest
- * @return These functions return the old value which was previously stored at
- *         dest, and that was atomically replaced. These functions always
- *         succeed.
- */
-#if __KALMAR_ACCELERATOR__ == 1
-extern "C" unsigned int atomic_exchange_unsigned(unsigned int *p, unsigned int val) __HC__;
-extern "C" int atomic_exchange_int(int *p, int val) __HC__;
-extern "C" float atomic_exchange_float(float *p, float val) __HC__;
-extern "C" uint64_t atomic_exchange_uint64(uint64_t *p, uint64_t val) __HC__;
-
-static inline unsigned int atomic_exchange(unsigned int * dest, unsigned int val) __CPU__ __HC__ {
-  return atomic_exchange_unsigned(dest, val);
-}
-static inline int atomic_exchange(int * dest, int val) __CPU__ __HC__ {
-  return atomic_exchange_int(dest, val);
-}
-static inline float atomic_exchange(float * dest, float val) __CPU__ __HC__ {
-  return atomic_exchange_float(dest, val);
-}
-static inline uint64_t atomic_exchange(uint64_t * dest, uint64_t val) __CPU__ __HC__ {
-  return atomic_exchange_uint64(dest, val);
-}
-#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
-unsigned int atomic_exchange_unsigned(unsigned int *p, unsigned int val);
-int atomic_exchange_int(int *p, int val);
-float atomic_exchange_float(float *p, float val);
-uint64_t atomic_exchange_uint64(uint64_t *p, uint64_t val);
-
-static inline unsigned int atomic_exchange(unsigned int *dest, unsigned int val) __CPU__ __HC__ {
-  return atomic_exchange_unsigned(dest, val);
-}
-static inline int atomic_exchange(int *dest, int val) __CPU__ __HC__ {
-  return atomic_exchange_int(dest, val);
-}
-static inline float atomic_exchange(float *dest, float val) __CPU__ __HC__ {
-  return atomic_exchange_float(dest, val);
-}
-static inline uint64_t atomic_exchange(uint64_t *dest, uint64_t val) __CPU__ __HC__ {
-  return atomic_exchange_uint64(dest, val);
-}
-#else
-extern unsigned int atomic_exchange(unsigned int *dest, unsigned int val) __CPU__ __HC__;
-extern int atomic_exchange(int *dest, int val) __CPU__ __HC__;
-extern float atomic_exchange(float *dest, float val) __CPU__ __HC__;
-extern uint64_t atomic_exchange(uint64_t *dest, uint64_t val) __CPU__ __HC__;
-#endif
-/** @} */
-
-/** @{ */
-/**
- * These functions attempt to perform these three steps atomically:
- * 1. Read the value stored in the location pointed to by dest
- * 2. Compare the value read in the previous step with the value contained in
- *    the location pointed by expected_val
- * 3. Carry the following operations depending on the result of the comparison
- *    of the previous step:
- *    a. If the values are identical, then the function tries to atomically
- *       change the value pointed by dest to the value in val. The function
- *       indicates by its return value whether this transformation has been
- *       successful or not.
- *    b. If the values are not identical, then the function stores the value
- *       read in step (1) into the location pointed to by expected_val, and
- *       returns false.
- *
- * @param[out] dest An pointer to the location which needs to be atomically
- *                  modified. The location may reside within a
- *                  concurrency::array or concurrency::array_view or within a
- *                  tile_static variable.
- * @param[out] expected_val A pointer to a local variable or function
- *                          parameter. Upon calling the function, the location
- *                          pointed by expected_val contains the value the
- *                          caller expects dest to contain. Upon return from
- *                          the function, expected_val will contain the most
- *                          recent value read from dest.
- * @param[in] val The new value to be stored in the location pointed to be dest
- * @return The return value indicates whether the function has been successful
- *         in atomically reading, comparing and modifying the contents of the
- *         memory location.
- */
-#if __KALMAR_ACCELERATOR__ == 1
-extern "C" unsigned int atomic_compare_exchange_unsigned(unsigned int *dest, unsigned int expected_val, unsigned int val) __HC__;
-extern "C" int atomic_compare_exchange_int(int *dest, int expected_val, int val) __HC__;
-extern "C" uint64_t atomic_compare_exchange_uint64(uint64_t *dest, uint64_t expected_val, uint64_t val) __HC__;
-
-static inline bool atomic_compare_exchange(unsigned int *dest, unsigned int *expected_val, unsigned int val) __CPU__ __HC__ {
-  *expected_val = atomic_compare_exchange_unsigned(dest, *expected_val, val);
-  return (*dest == val);
-}
-static inline bool atomic_compare_exchange(int *dest, int *expected_val, int val) __CPU__ __HC__ {
-  *expected_val = atomic_compare_exchange_int(dest, *expected_val, val);
-  return (*dest == val);
-}
-static inline bool atomic_compare_exchange(uint64_t *dest, uint64_t *expected_val, uint64_t val) __CPU__ __HC__ {
-  *expected_val = atomic_compare_exchange_uint64(dest, *expected_val, val);
-  return (*dest == val);
-}
-#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
-unsigned int atomic_compare_exchange_unsigned(unsigned int *dest, unsigned int expected_val, unsigned int val);
-int atomic_compare_exchange_int(int *dest, int expected_val, int val);
-uint64_t atomic_compare_exchange_uint64(uint64_t *dest, uint64_t expected_val, uint64_t val);
-
-static inline bool atomic_compare_exchange(unsigned int *dest, unsigned int *expected_val, unsigned int val) __CPU__ __HC__ {
-  *expected_val = atomic_compare_exchange_unsigned(dest, *expected_val, val);
-  return (*dest == val);
-}
-static inline bool atomic_compare_exchange(int *dest, int *expected_val, int val) __CPU__ __HC__ {
-  *expected_val = atomic_compare_exchange_int(dest, *expected_val, val);
-  return (*dest == val);
-}
-static inline bool atomic_compare_exchange(uint64_t *dest, uint64_t *expected_val, uint64_t val) __CPU__ __HC__ {
-  *expected_val = atomic_compare_exchange_uint64(dest, *expected_val, val);
-  return (*dest == val);
-}
-#else
-extern bool atomic_compare_exchange(unsigned int *dest, unsigned int *expected_val, unsigned int val) __CPU__ __HC__;
-extern bool atomic_compare_exchange(int *dest, int *expected_val, int val) __CPU__ __HC__;
-extern bool atomic_compare_exchange(uint64_t *dest, uint64_t *expected_val, uint64_t val) __CPU__ __HC__;
-#endif
-/** @} */
-
-/** @{ */
-/**
- * Atomically read the value stored in dest, apply the binary numerical
- * operation specific to the function with the read value and val serving as
- * input operands, and store the result back to the location pointed by dest.
- *
- * In terms of sequential semantics, the operation performed by any of the
- * above function is described by the following piece of pseudo-code:
- *
- * *dest = *dest @f$\otimes@f$ val;
- *
- * Where the operation denoted by @f$\otimes@f$ is one of: addition
- * (atomic_fetch_add), subtraction (atomic_fetch_sub), find maximum
- * (atomic_fetch_max), find minimum (atomic_fetch_min), bit-wise AND
- * (atomic_fetch_and), bit-wise OR (atomic_fetch_or), bit-wise XOR
- * (atomic_fetch_xor).
- *
- * @param[out] dest An pointer to the location which needs to be atomically
- *                  modified. The location may reside within a
- *                  concurrency::array or concurrency::array_view or within a
- *                  tile_static variable.
- * @param[in] val The second operand which participates in the calculation of
- *                the binary operation whose result is stored into the
- *                location pointed to be dest.
- * @return These functions return the old value which was previously stored at
- *         dest, and that was atomically replaced. These functions always
- *         succeed.
- */
-#if __KALMAR_ACCELERATOR__ == 1
-extern "C" unsigned int atomic_add_unsigned(unsigned int *p, unsigned int val) __HC__;
-extern "C" int atomic_add_int(int *p, int val) __HC__;
-extern "C" float atomic_add_float(float *p, float val) __HC__;
-extern "C" uint64_t atomic_add_uint64(uint64_t *p, uint64_t val) __HC__;
-
-static inline unsigned int atomic_fetch_add(unsigned int *x, unsigned int y) __CPU__ __HC__ {
-  return atomic_add_unsigned(x, y);
-}
-static inline int atomic_fetch_add(int *x, int y) __CPU__ __HC__ {
-  return atomic_add_int(x, y);
-}
-static inline float atomic_fetch_add(float *x, float y) __CPU__ __HC__ {
-  return atomic_add_float(x, y);
-}
-static inline uint64_t atomic_fetch_add(uint64_t *x, uint64_t y) __CPU__ __HC__ {
-  return atomic_add_uint64(x, y);
-}
-
-extern "C" unsigned int atomic_sub_unsigned(unsigned int *p, unsigned int val) __HC__;
-extern "C" int atomic_sub_int(int *p, int val) __HC__;
-extern "C" float atomic_sub_float(float *p, float val) __HC__;
-
-static inline unsigned int atomic_fetch_sub(unsigned int *x, unsigned int y) __CPU__ __HC__ {
-  return atomic_sub_unsigned(x, y);
-}
-static inline int atomic_fetch_sub(int *x, int y) __CPU__ __HC__ {
-  return atomic_sub_int(x, y);
-}
-static inline int atomic_fetch_sub(float *x, float y) __CPU__ __HC__ {
-  return atomic_sub_float(x, y);
-}
-
-extern "C" unsigned int atomic_and_unsigned(unsigned int *p, unsigned int val) __HC__;
-extern "C" int atomic_and_int(int *p, int val) __HC__;
-extern "C" uint64_t atomic_and_uint64(uint64_t *p, uint64_t val) __HC__;
-
-static inline unsigned int atomic_fetch_and(unsigned int *x, unsigned int y) __CPU__ __HC__ {
-  return atomic_and_unsigned(x, y);
-}
-static inline int atomic_fetch_and(int *x, int y) __CPU__ __HC__ {
-  return atomic_and_int(x, y);
-}
-static inline uint64_t atomic_fetch_and(uint64_t *x, uint64_t y) __CPU__ __HC__ {
-  return atomic_and_uint64(x, y);
-}
-
-extern "C" unsigned int atomic_or_unsigned(unsigned int *p, unsigned int val) __HC__;
-extern "C" int atomic_or_int(int *p, int val) __HC__;
-extern "C" uint64_t atomic_or_uint64(uint64_t *p, uint64_t val) __HC__;
-
-static inline unsigned int atomic_fetch_or(unsigned int *x, unsigned int y) __CPU__ __HC__ {
-  return atomic_or_unsigned(x, y);
-}
-static inline int atomic_fetch_or(int *x, int y) __CPU__ __HC__ {
-  return atomic_or_int(x, y);
-}
-static inline uint64_t atomic_fetch_or(uint64_t *x, uint64_t y) __CPU__ __HC__ {
-  return atomic_or_uint64(x, y);
-}
-
-extern "C" unsigned int atomic_xor_unsigned(unsigned int *p, unsigned int val) __HC__;
-extern "C" int atomic_xor_int(int *p, int val) __HC__;
-extern "C" uint64_t atomic_xor_uint64(uint64_t *p, uint64_t val) __HC__;
-
-static inline unsigned int atomic_fetch_xor(unsigned int *x, unsigned int y) __CPU__ __HC__ {
-  return atomic_xor_unsigned(x, y);
-}
-static inline int atomic_fetch_xor(int *x, int y) __CPU__ __HC__ {
-  return atomic_xor_int(x, y);
-}
-static inline uint64_t atomic_fetch_xor(uint64_t *x, uint64_t y) __CPU__ __HC__ {
-  return atomic_xor_uint64(x, y);
-}
-#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
-unsigned int atomic_add_unsigned(unsigned int *p, unsigned int val);
-int atomic_add_int(int *p, int val);
-float atomic_add_float(float *p, float val);
-uint64_t atomic_add_uint64(uint64_t *p, uint64_t val);
-
-static inline unsigned int atomic_fetch_add(unsigned int *x, unsigned int y) __CPU__ __HC__ {
-  return atomic_add_unsigned(x, y);
-}
-static inline int atomic_fetch_add(int *x, int y) __CPU__ __HC__ {
-  return atomic_add_int(x, y);
-}
-static inline float atomic_fetch_add(float *x, float y) __CPU__ __HC__ {
-  return atomic_add_float(x, y);
-}
-static inline uint64_t atomic_fetch_add(uint64_t *x, uint64_t y) __CPU__ __HC__ {
-  return atomic_add_uint64(x, y);
-}
-
-unsigned int atomic_sub_unsigned(unsigned int *p, unsigned int val);
-int atomic_sub_int(int *p, int val);
-float atomic_sub_float(float *p, float val);
-
-static inline unsigned int atomic_fetch_sub(unsigned int *x, unsigned int y) __CPU__ __HC__ {
-  return atomic_sub_unsigned(x, y);
-}
-static inline int atomic_fetch_sub(int *x, int y) __CPU__ __HC__ {
-  return atomic_sub_int(x, y);
-}
-static inline float atomic_fetch_sub(float *x, float y) __CPU__ __HC__ {
-  return atomic_sub_float(x, y);
-}
-
-unsigned int atomic_and_unsigned(unsigned int *p, unsigned int val);
-int atomic_and_int(int *p, int val);
-uint64_t atomic_and_uint64(uint64_t *p, uint64_t val);
-
-static inline unsigned int atomic_fetch_and(unsigned int *x, unsigned int y) __CPU__ __HC__ {
-  return atomic_and_unsigned(x, y);
-}
-static inline int atomic_fetch_and(int *x, int y) __CPU__ __HC__ {
-  return atomic_and_int(x, y);
-}
-static inline uint64_t atomic_fetch_and(uint64_t *x, uint64_t y) __CPU__ __HC__ {
-  return atomic_and_uint64(x, y);
-}
-
-unsigned int atomic_or_unsigned(unsigned int *p, unsigned int val);
-int atomic_or_int(int *p, int val);
-uint64_t atomic_or_uint64(uint64_t *p, uint64_t val);
-
-static inline unsigned int atomic_fetch_or(unsigned int *x, unsigned int y) __CPU__ __HC__ {
-  return atomic_or_unsigned(x, y);
-}
-static inline int atomic_fetch_or(int *x, int y) __CPU__ __HC__ {
-  return atomic_or_int(x, y);
-}
-static inline uint64_t atomic_fetch_or(uint64_t *x, uint64_t y) __CPU__ __HC__ {
-  return atomic_or_uint64(x, y);
-}
-
-unsigned int atomic_xor_unsigned(unsigned int *p, unsigned int val);
-int atomic_xor_int(int *p, int val);
-uint64_t atomic_xor_uint64(uint64_t *p, uint64_t val);
-
-static inline unsigned int atomic_fetch_xor(unsigned int *x, unsigned int y) __CPU__ __HC__ {
-  return atomic_xor_unsigned(x, y);
-}
-static inline int atomic_fetch_xor(int *x, int y) __CPU__ __HC__ {
-  return atomic_xor_int(x, y);
-}
-static inline uint64_t atomic_fetch_xor(uint64_t *x, uint64_t y) __CPU__ __HC__ {
-  return atomic_xor_uint64(x, y);
-}
-#else
-extern unsigned atomic_fetch_add(unsigned *x, unsigned y) __CPU__ __HC__;
-extern int atomic_fetch_add(int *x, int y) __CPU__ __HC__;
-extern float atomic_fetch_add(float *x, float y) __CPU__ __HC__;
-extern uint64_t atomic_fetch_add(uint64_t *x, uint64_t y) __CPU__ __HC__;
-
-extern unsigned atomic_fetch_sub(unsigned *x, unsigned y) __CPU__ __HC__;
-extern int atomic_fetch_sub(int *x, int y) __CPU__ __HC__;
-extern float atomic_fetch_sub(float *x, float y) __CPU__ __HC__;
-
-extern unsigned atomic_fetch_and(unsigned *x, unsigned y) __CPU__ __HC__;
-extern int atomic_fetch_and(int *x, int y) __CPU__ __HC__;
-extern uint64_t atomic_fetch_and(uint64_t *x, uint64_t y) __CPU__ __HC__;
-
-extern unsigned atomic_fetch_or(unsigned *x, unsigned y) __CPU__ __HC__;
-extern int atomic_fetch_or(int *x, int y) __CPU__ __HC__;
-extern uint64_t atomic_fetch_or(uint64_t *x, uint64_t y) __CPU__ __HC__;
-
-extern unsigned atomic_fetch_xor(unsigned *x, unsigned y) __CPU__ __HC__;
-extern int atomic_fetch_xor(int *x, int y) __CPU__ __HC__;
-extern uint64_t atomic_fetch_xor(uint64_t *x, uint64_t y) __CPU__ __HC__;
-#endif
-
-#if __KALMAR_ACCELERATOR__ == 1
-extern "C" unsigned int atomic_max_unsigned(unsigned int *p, unsigned int val) __HC__;
-extern "C" int atomic_max_int(int *p, int val) __HC__;
-extern "C" uint64_t atomic_max_uint64(uint64_t *p, uint64_t val) __HC__;
-
-static inline unsigned int atomic_fetch_max(unsigned int *x, unsigned int y) __HC__ {
-  return atomic_max_unsigned(x, y);
-}
-static inline int atomic_fetch_max(int *x, int y) __HC__ {
-  return atomic_max_int(x, y);
-}
-static inline uint64_t atomic_fetch_max(uint64_t *x, uint64_t y) __HC__ {
-  return atomic_max_uint64(x, y);
-}
-
-extern "C" unsigned int atomic_min_unsigned(unsigned int *p, unsigned int val) __HC__;
-extern "C" int atomic_min_int(int *p, int val) __HC__;
-extern "C" uint64_t atomic_min_uint64(uint64_t *p, uint64_t val) __HC__;
-
-static inline unsigned int atomic_fetch_min(unsigned int *x, unsigned int y) __HC__ {
-  return atomic_min_unsigned(x, y);
-}
-static inline int atomic_fetch_min(int *x, int y) __HC__ {
-  return atomic_min_int(x, y);
-}
-static inline uint64_t atomic_fetch_min(uint64_t *x, uint64_t y) __HC__ {
-  return atomic_min_uint64(x, y);
-}
-#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
-unsigned int atomic_max_unsigned(unsigned int *p, unsigned int val);
-int atomic_max_int(int *p, int val);
-uint64_t atomic_max_uint64(uint64_t *p, uint64_t val);
-
-static inline unsigned int atomic_fetch_max(unsigned int *x, unsigned int y) __HC__ {
-  return atomic_max_unsigned(x, y);
-}
-static inline int atomic_fetch_max(int *x, int y) __HC__ {
-  return atomic_max_int(x, y);
-}
-static inline uint64_t atomic_fetch_max(uint64_t *x, uint64_t y) __HC__ {
-  return atomic_max_uint64(x, y);
-}
-
-unsigned int atomic_min_unsigned(unsigned int *p, unsigned int val);
-int atomic_min_int(int *p, int val);
-uint64_t atomic_min_uint64(uint64_t *p, uint64_t val);
-
-static inline unsigned int atomic_fetch_min(unsigned int *x, unsigned int y) __HC__ {
-  return atomic_min_unsigned(x, y);
-}
-static inline int atomic_fetch_min(int *x, int y) __HC__ {
-  return atomic_min_int(x, y);
-}
-static inline uint64_t atomic_fetch_min(uint64_t *x, uint64_t y) __HC__ {
-  return atomic_min_uint64(x, y);
-}
-#else
-extern int atomic_fetch_max(int * dest, int val) __CPU__ __HC__;
-extern unsigned int atomic_fetch_max(unsigned int * dest, unsigned int val) __CPU__ __HC__;
-extern uint64_t atomic_fetch_max(uint64_t * dest, uint64_t val) __CPU__ __HC__;
-
-extern int atomic_fetch_min(int * dest, int val) __CPU__ __HC__;
-extern unsigned int atomic_fetch_min(unsigned int * dest, unsigned int val) __CPU__ __HC__;
-extern uint64_t atomic_fetch_min(uint64_t * dest, uint64_t val) __CPU__ __HC__;
-#endif
-
-/** @} */
-
-/** @{ */
-/**
- * Atomically increment or decrement the value stored at the location point to
- * by dest.
- *
- * @param[inout] dest An pointer to the location which needs to be atomically
- *                    modified. The location may reside within a
- *                    concurrency::array or concurrency::array_view or within a
- *                    tile_static variable.
- * @return These functions return the old value which was previously stored at
- *         dest, and that was atomically replaced. These functions always
- *         succeed.
- */
-#if __KALMAR_ACCELERATOR__ == 1
-extern "C" unsigned int atomic_inc_unsigned(unsigned int *p) __HC__;
-extern "C" int atomic_inc_int(int *p) __HC__;
-
-static inline unsigned int atomic_fetch_inc(unsigned int *x) __CPU__ __HC__ {
-  return atomic_inc_unsigned(x);
-}
-static inline int atomic_fetch_inc(int *x) __CPU__ __HC__ {
-  return atomic_inc_int(x);
-}
-
-extern "C" unsigned int atomic_dec_unsigned(unsigned int *p) __HC__;
-extern "C" int atomic_dec_int(int *p) __HC__;
-
-static inline unsigned int atomic_fetch_dec(unsigned int *x) __CPU__ __HC__ {
-  return atomic_dec_unsigned(x);
-}
-static inline int atomic_fetch_dec(int *x) __CPU__ __HC__ {
-  return atomic_dec_int(x);
-}
-#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
-unsigned int atomic_inc_unsigned(unsigned int *p);
-int atomic_inc_int(int *p);
-
-static inline unsigned int atomic_fetch_inc(unsigned int *x) __CPU__ __HC__ {
-  return atomic_inc_unsigned(x);
-}
-static inline int atomic_fetch_inc(int *x) __CPU__ __HC__ {
-  return atomic_inc_int(x);
-}
-
-unsigned int atomic_dec_unsigned(unsigned int *p);
-int atomic_dec_int(int *p);
-
-static inline unsigned int atomic_fetch_dec(unsigned int *x) __CPU__ __HC__ {
-  return atomic_dec_unsigned(x);
-}
-static inline int atomic_fetch_dec(int *x) __CPU__ __HC__ {
-  return atomic_dec_int(x);
-}
-#else
-extern int atomic_fetch_inc(int * _Dest) __CPU__ __HC__;
-extern unsigned int atomic_fetch_inc(unsigned int * _Dest) __CPU__ __HC__;
-
-extern int atomic_fetch_dec(int * _Dest) __CPU__ __HC__;
-extern unsigned int atomic_fetch_dec(unsigned int * _Dest) __CPU__ __HC__;
-#endif
-
-/** @} */
-
-/**
- * Atomically do the following operations:
- * - reads the 32-bit value (original) from address pointer in global or group segment
- * - computes ((original >= val) ? 0 : (original + 1))
- * - stores the result back to the address
- *
- * @return The original value retrieved from address pointer.
- * 
- * Please refer to <a href="http://www.hsafoundation.com/html/HSA_Library.htm#PRM/Topics/06_Memory/atomic.htm">atomic_wrapinc in HSA PRM 6.6</a> for more detailed specification of the function.
- */
-extern "C" unsigned int __atomic_wrapinc(unsigned int* address, unsigned int val) __HC__;
-
-/**
- * Atomically do the following operations:
- * - reads the 32-bit value (original) from address pointer in global or group segment
- * - computes ((original == 0) || (original > val)) ? val : (original - 1)
- * - stores the result back to the address
- *
- * @return The original value retrieved from address pointer.
- * 
- * Please refer to <a href="http://www.hsafoundation.com/html/HSA_Library.htm#PRM/Topics/06_Memory/atomic.htm">atomic_wrapdec in HSA PRM 6.6</a> for more detailed specification of the function.
- */
-extern "C" unsigned int __atomic_wrapdec(unsigned int* address, unsigned int val) __HC__;
-
-
-// ------------------------------------------------------------------------
-// parallel_for_each
-// ------------------------------------------------------------------------
-
-template <int N, typename Kernel>
-completion_future parallel_for_each(const accelerator_view&, const extent<N>&, const Kernel&);
-
-template <typename Kernel>
-completion_future parallel_for_each(const accelerator_view&, const tiled_extent<3>&, const Kernel&);
-
-template <typename Kernel>
-completion_future parallel_for_each(const accelerator_view&, const tiled_extent<2>&, const Kernel&);
-
-template <typename Kernel>
-completion_future parallel_for_each(const accelerator_view&, const tiled_extent<1>&, const Kernel&);
-
-template <int N, typename Kernel>
-completion_future parallel_for_each(const extent<N>& compute_domain, const Kernel& f) {
-    return parallel_for_each(accelerator::get_auto_selection_view(), compute_domain, f);
-}
-
-template <typename Kernel>
-completion_future parallel_for_each(const tiled_extent<3>& compute_domain, const Kernel& f) {
-    return parallel_for_each(accelerator::get_auto_selection_view(), compute_domain, f);
-}
-
-template <typename Kernel>
-completion_future parallel_for_each(const tiled_extent<2>& compute_domain, const Kernel& f) {
-    return parallel_for_each(accelerator::get_auto_selection_view(), compute_domain, f);
-}
-
-template <typename Kernel>
-completion_future parallel_for_each(const tiled_extent<1>& compute_domain, const Kernel& f) {
-    return parallel_for_each(accelerator::get_auto_selection_view(), compute_domain, f);
-}
-
-template <int N, typename Kernel, typename _Tp>
-struct pfe_helper
-{
-    static inline void call(Kernel& k, _Tp& idx) __CPU__ __HC__ {
-        int i;
-        for (i = 0; i < k.ext[N - 1]; ++i) {
-            idx[N - 1] = i;
-            pfe_helper<N - 1, Kernel, _Tp>::call(k, idx);
-        }
-    }
-};
-template <typename Kernel, typename _Tp>
-struct pfe_helper<0, Kernel, _Tp>
-{
-    static inline void call(Kernel& k, _Tp& idx) __CPU__ __HC__ {
-#if __KALMAR_ACCELERATOR__ == 1
-        k.k(idx);
-#endif
-    }
-};
-
-template <int N, typename Kernel>
-class pfe_wrapper
-{
-public:
-    explicit pfe_wrapper(const extent<N>& other, const Kernel& f) __CPU__ __HC__
-        : ext(other), k(f) {}
-    void operator() (index<N> idx) __CPU__ __HC__ {
-        pfe_helper<N - 3, pfe_wrapper<N, Kernel>, index<N>>::call(*this, idx);
-    }
-private:
-    const extent<N> ext;
-    const Kernel k;
-    template <int K, typename Ker, typename _Tp>
-        friend struct pfe_helper;
-};
-
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wreturn-type"
-#pragma clang diagnostic ignored "-Wunused-variable"
-//ND parallel_for_each, nontiled
-template <int N, typename Kernel>
-__attribute__((noinline,used)) completion_future parallel_for_each(
-    const accelerator_view& av,
-    const extent<N>& compute_domain, const Kernel& f) __CPU__ __HC__ {
-#if __KALMAR_ACCELERATOR__ != 1
-    for(int i = 0 ; i < N ; i++)
-    {
-      // silently return in case the any dimension of the extent is 0
-      if (compute_domain[i] == 0)
-        return completion_future();
-      if (compute_domain[i] < 0)
-        throw invalid_compute_domain("Extent is less than 0.");
-      if (static_cast<size_t>(compute_domain[i]) > 4294967295L)
-        throw invalid_compute_domain("Extent size too large.");
-    }
-    size_t ext[3] = {static_cast<size_t>(compute_domain[N - 1]),
-        static_cast<size_t>(compute_domain[N - 2]),
-        static_cast<size_t>(compute_domain[N - 3])};
-#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
-    if (is_cpu()) {
-        return launch_cpu_task_async(av.pQueue, f, compute_domain);
-    }
-#endif
-    if (av.get_accelerator().get_device_path() == L"cpu") {
-      throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL);
-    }
-    const pfe_wrapper<N, Kernel> _pf(compute_domain, f);
-    return completion_future(Kalmar::mcw_cxxamp_launch_kernel_async<pfe_wrapper<N, Kernel>, 3>(av.pQueue, ext, NULL, _pf));
-#else
-#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
-    int* foo1 = reinterpret_cast<int*>(&Kernel::__cxxamp_trampoline);
-#endif
-    auto bar = &pfe_wrapper<N, Kernel>::operator();
-    auto qq = &index<N>::__cxxamp_opencl_index;
-    int* foo = reinterpret_cast<int*>(&pfe_wrapper<N, Kernel>::__cxxamp_trampoline);
-#endif
-}
-#pragma clang diagnostic pop
-
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wreturn-type"
-#pragma clang diagnostic ignored "-Wunused-variable"
-//1D parallel_for_each, nontiled
-template <typename Kernel>
-__attribute__((noinline,used)) completion_future parallel_for_each(
-    const accelerator_view& av, const extent<1>& compute_domain, const Kernel& f) __CPU__ __HC__ {
-#if __KALMAR_ACCELERATOR__ != 1
-  // silently return in case the any dimension of the extent is 0
-  if (compute_domain[0] == 0)
-    return completion_future();
-  if (compute_domain[0] < 0) {
-    throw invalid_compute_domain("Extent is less than 0.");
-  }
-  if (static_cast<size_t>(compute_domain[0]) > 4294967295L)
-    throw invalid_compute_domain("Extent size too large.");
-#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
-    if (is_cpu()) {
-        return launch_cpu_task_async(av.pQueue, f, compute_domain);
-    }
-#endif
-  size_t ext = compute_domain[0];
-  if (av.get_accelerator().get_device_path() == L"cpu") {
-    throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL);
-  }
-  return completion_future(Kalmar::mcw_cxxamp_launch_kernel_async<Kernel, 1>(av.pQueue, &ext, NULL, f));
-#else //if __KALMAR_ACCELERATOR__ != 1
-  //to ensure functor has right operator() defined
-  //this triggers the trampoline code being emitted
-  auto foo = &Kernel::__cxxamp_trampoline;
-  auto bar = &Kernel::operator();
-#endif
-}
-#pragma clang diagnostic pop
-
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wreturn-type"
-#pragma clang diagnostic ignored "-Wunused-variable"
-//2D parallel_for_each, nontiled
-template <typename Kernel>
-__attribute__((noinline,used)) completion_future parallel_for_each(
-    const accelerator_view& av, const extent<2>& compute_domain, const Kernel& f) __CPU__ __HC__ {
-#if __KALMAR_ACCELERATOR__ != 1
-  // silently return in case the any dimension of the extent is 0
-  if (compute_domain[0] == 0 || compute_domain[1] == 0)
-    return completion_future();
-  if (compute_domain[0] < 0 || compute_domain[1] < 0) {
-    throw invalid_compute_domain("Extent is less than 0.");
-  }
-  if (static_cast<size_t>(compute_domain[0]) > 4294967295L)
-    throw invalid_compute_domain("Extent size too large.");
-  if (static_cast<size_t>(compute_domain[1]) > 4294967295L)
-    throw invalid_compute_domain("Extent size too large.");
-#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
-    if (is_cpu()) {
-        return launch_cpu_task_async(av.pQueue, f, compute_domain);
-    }
-#endif
-  size_t ext[2] = {static_cast<size_t>(compute_domain[1]),
-                   static_cast<size_t>(compute_domain[0])};
-  if (av.get_accelerator().get_device_path() == L"cpu") {
-    throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL);
-  }
-  return completion_future(Kalmar::mcw_cxxamp_launch_kernel_async<Kernel, 2>(av.pQueue, ext, NULL, f));
-#else //if __KALMAR_ACCELERATOR__ != 1
-  //to ensure functor has right operator() defined
-  //this triggers the trampoline code being emitted
-  auto foo = &Kernel::__cxxamp_trampoline;
-  auto bar = &Kernel::operator();
-#endif
-}
-#pragma clang diagnostic pop
-
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wreturn-type"
-#pragma clang diagnostic ignored "-Wunused-variable"
-//3D parallel_for_each, nontiled
-template <typename Kernel>
-__attribute__((noinline,used)) completion_future parallel_for_each(
-    const accelerator_view& av, const extent<3>& compute_domain, const Kernel& f) __CPU__ __HC__ {
-#if __KALMAR_ACCELERATOR__ != 1
-  // silently return in case the any dimension of the extent is 0
-  if (compute_domain[0] == 0 || compute_domain[1] == 0 || compute_domain[2] == 0)
-    return completion_future();
-  if (compute_domain[0] < 0 || compute_domain[1] < 0 || compute_domain[2] < 0) {
-    throw invalid_compute_domain("Extent is less than 0.");
-  }
-  if (static_cast<size_t>(compute_domain[0]) > 4294967295L)
-    throw invalid_compute_domain("Extent size too large.");
-  if (static_cast<size_t>(compute_domain[1]) > 4294967295L)
-    throw invalid_compute_domain("Extent size too large.");
-  if (static_cast<size_t>(compute_domain[2]) > 4294967295L)
-    throw invalid_compute_domain("Extent size too large.");
-#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
-    if (is_cpu()) {
-        return launch_cpu_task_async(av.pQueue, f, compute_domain);
-    }
-#endif
-  size_t ext[3] = {static_cast<size_t>(compute_domain[2]),
-                   static_cast<size_t>(compute_domain[1]),
-                   static_cast<size_t>(compute_domain[0])};
-  if (av.get_accelerator().get_device_path() == L"cpu") {
-    throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL);
-  }
-  return completion_future(Kalmar::mcw_cxxamp_launch_kernel_async<Kernel, 3>(av.pQueue, ext, NULL, f));
-#else //if __KALMAR_ACCELERATOR__ != 1
-  //to ensure functor has right operator() defined
-  //this triggers the trampoline code being emitted
-  auto foo = &Kernel::__cxxamp_trampoline;
-  auto bar = &Kernel::operator();
-#endif
-}
-#pragma clang diagnostic pop
-
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wreturn-type"
-#pragma clang diagnostic ignored "-Wunused-variable"
-//1D parallel_for_each, tiled
-template <typename Kernel>
-__attribute__((noinline,used)) completion_future parallel_for_each(
-    const accelerator_view& av, const tiled_extent<1>& compute_domain, const Kernel& f) __CPU__ __HC__ {
-#if __KALMAR_ACCELERATOR__ != 1
-  // silently return in case the any dimension of the extent is 0
-  if (compute_domain[0] == 0)
-    return completion_future();
-  if (compute_domain[0] < 0) {
-    throw invalid_compute_domain("Extent is less than 0.");
-  }
-  if (static_cast<size_t>(compute_domain[0]) > 4294967295L)
-    throw invalid_compute_domain("Extent size too large.");
-  size_t ext = compute_domain[0];
-  size_t tile = compute_domain.tile_dim[0];
-#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
-  if (is_cpu()) {
-      return launch_cpu_task_async(av.pQueue, f, compute_domain);
-  } else
-#endif
-  if (av.get_accelerator().get_device_path() == L"cpu") {
-    throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL);
-  }
-  void *kernel = Kalmar::mcw_cxxamp_get_kernel<Kernel>(av.pQueue, f);
-  return completion_future(Kalmar::mcw_cxxamp_execute_kernel_with_dynamic_group_memory_async<Kernel, 1>(av.pQueue, &ext, &tile, f, kernel, compute_domain.get_dynamic_group_segment_size()));
-#else //if __KALMAR_ACCELERATOR__ != 1
-  tiled_index<1> this_is_used_to_instantiate_the_right_index;
-  //to ensure functor has right operator() defined
-  //this triggers the trampoline code being emitted
-  auto foo = &Kernel::__cxxamp_trampoline;
-  auto bar = &Kernel::operator();
-#endif
-}
-#pragma clang diagnostic pop
-
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wreturn-type"
-#pragma clang diagnostic ignored "-Wunused-variable"
-//2D parallel_for_each, tiled
-template <typename Kernel>
-__attribute__((noinline,used)) completion_future parallel_for_each(
-    const accelerator_view& av, const tiled_extent<2>& compute_domain, const Kernel& f) __CPU__ __HC__ {
-#if __KALMAR_ACCELERATOR__ != 1
-  // silently return in case the any dimension of the extent is 0
-  if (compute_domain[0] == 0 || compute_domain[1] == 0)
-    return completion_future();
-  if (compute_domain[0] < 0 || compute_domain[1] < 0) {
-    throw invalid_compute_domain("Extent is less than 0.");
-  }
-  if (static_cast<size_t>(compute_domain[0]) > 4294967295L)
-    throw invalid_compute_domain("Extent size too large.");
-  if (static_cast<size_t>(compute_domain[1]) > 4294967295L)
-    throw invalid_compute_domain("Extent size too large.");
-  size_t ext[2] = { static_cast<size_t>(compute_domain[1]),
-                    static_cast<size_t>(compute_domain[0])};
-  size_t tile[2] = { static_cast<size_t>(compute_domain.tile_dim[1]),
-                     static_cast<size_t>(compute_domain.tile_dim[0]) };
-#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
-  if (is_cpu()) {
-      return launch_cpu_task_async(av.pQueue, f, compute_domain);
-  } else
-#endif
-  if (av.get_accelerator().get_device_path() == L"cpu") {
-    throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL);
-  }
-  void *kernel = Kalmar::mcw_cxxamp_get_kernel<Kernel>(av.pQueue, f);
-  return completion_future(Kalmar::mcw_cxxamp_execute_kernel_with_dynamic_group_memory_async<Kernel, 2>(av.pQueue, ext, tile, f, kernel, compute_domain.get_dynamic_group_segment_size()));
-#else //if __KALMAR_ACCELERATOR__ != 1
-  tiled_index<2> this_is_used_to_instantiate_the_right_index;
-  //to ensure functor has right operator() defined
-  //this triggers the trampoline code being emitted
-  auto foo = &Kernel::__cxxamp_trampoline;
-  auto bar = &Kernel::operator();
-#endif
-}
-#pragma clang diagnostic pop
-
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wreturn-type"
-#pragma clang diagnostic ignored "-Wunused-variable"
-//3D parallel_for_each, tiled
-template <typename Kernel>
-__attribute__((noinline,used)) completion_future parallel_for_each(
-    const accelerator_view& av, const tiled_extent<3>& compute_domain, const Kernel& f) __CPU__ __HC__ {
-#if __KALMAR_ACCELERATOR__ != 1
-  // silently return in case the any dimension of the extent is 0
-  if (compute_domain[0] == 0 || compute_domain[1] == 0 || compute_domain[2] == 0)
-    return completion_future();
-  if (compute_domain[0] < 0 || compute_domain[1] < 0 || compute_domain[2] < 0) {
-    throw invalid_compute_domain("Extent is less than 0.");
-  }
-  if (static_cast<size_t>(compute_domain[0]) > 4294967295L)
-    throw invalid_compute_domain("Extent size too large.");
-  if (static_cast<size_t>(compute_domain[1]) > 4294967295L)
-    throw invalid_compute_domain("Extent size too large.");
-  if (static_cast<size_t>(compute_domain[2]) > 4294967295L)
-    throw invalid_compute_domain("Extent size too large.");
-  size_t ext[3] = { static_cast<size_t>(compute_domain[2]),
-                    static_cast<size_t>(compute_domain[1]),
-                    static_cast<size_t>(compute_domain[0])};
-  size_t tile[3] = { static_cast<size_t>(compute_domain.tile_dim[2]),
-                     static_cast<size_t>(compute_domain.tile_dim[1]),
-                     static_cast<size_t>(compute_domain.tile_dim[0]) };
-#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
-  if (is_cpu()) {
-      return launch_cpu_task_async(av.pQueue, f, compute_domain);
-  } else
-#endif
-  if (av.get_accelerator().get_device_path() == L"cpu") {
-    throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL);
-  }
-  void *kernel = Kalmar::mcw_cxxamp_get_kernel<Kernel>(av.pQueue, f);
-  return completion_future(Kalmar::mcw_cxxamp_execute_kernel_with_dynamic_group_memory_async<Kernel, 3>(av.pQueue, ext, tile, f, kernel, compute_domain.get_dynamic_group_segment_size()));
-#else //if __KALMAR_ACCELERATOR__ != 1
-  tiled_index<3> this_is_used_to_instantiate_the_right_index;
-  //to ensure functor has right operator() defined
-  //this triggers the trampoline code being emitted
-  auto foo = &Kernel::__cxxamp_trampoline;
-  auto bar = &Kernel::operator();
-#endif
-}
-#pragma clang diagnostic pop
-
-} // namespace hc
+} // namespace hc
\ No newline at end of file
diff --git a/include/hc/CMakeLists.txt b/include/hc/CMakeLists.txt
new file mode 100644
index 00000000000..0480a47fe2e
--- /dev/null
+++ b/include/hc/CMakeLists.txt
@@ -0,0 +1,48 @@
+set(
+    HC_headers
+        hc_agent_pool.hpp
+        hc_aligned_alloc.hpp
+        hc_am.hpp
+        hc_atomics.hpp
+        hc_callable_attributes.hpp
+        hc_completion_future.hpp
+        hc_defines.hpp
+        hc_exception.hpp
+        hc_index.hpp
+        hc_kernel_emitter.hpp
+        hc_launch.hpp
+        hc_math.hpp
+        hc_norm_unorm.hpp
+        hc_printf.hpp
+        hc_queue_pool.hpp
+        hc_rt_debug.hpp
+        hc_runtime.hpp
+        hc_short_vector.hpp
+        hc_signal_pool.hpp
+        hc.hpp)
+
+# Set location for output directory
+set(output_dir "${PROJECT_BINARY_DIR}/include/hc")
+set(out_files)
+foreach(f ${HC_headers})
+    set(src ${CMAKE_CURRENT_SOURCE_DIR}/${f})
+    set(dst ${output_dir}/${f})
+    add_custom_command(
+        OUTPUT ${dst}
+        DEPENDS ${src}
+        COMMAND ${CMAKE_COMMAND} -E copy_if_different ${src} ${dst}
+        COMMENT "Copying HCC's ${f}...")
+    list(APPEND out_files ${dst})
+endforeach()
+
+# Create target for hcc-headers and set dependencies
+add_custom_target(hc-headers ALL DEPENDS ${out_files})
+add_dependencies(world hc-headers)
+
+# Install command for headers
+install(
+    FILES ${HC_headers}
+    PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ
+    DESTINATION include/hc)
+
+add_subdirectory(implementation)
\ No newline at end of file
diff --git a/include/hc/hc.hpp b/include/hc/hc.hpp
new file mode 100644
index 00000000000..9733e6c3af9
--- /dev/null
+++ b/include/hc/hc.hpp
@@ -0,0 +1,6993 @@
+//===----------------------------------------------------------------------===//
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+/**
+ * @file hc.hpp
+ * Heterogeneous C++ (HC) API.
+ */
+
+#pragma once
+
+#include <hc/hc_agent_pool.hpp>
+#include <hc/hc_atomics.hpp>
+#include <hc/hc_callable_attributes.hpp>
+#include <hc/hc_completion_future.hpp>
+#include <hc/hc_defines.hpp>
+#include <hc/hc_exception.hpp>
+#include <hc/hc_index.hpp>
+#include <hc/hc_launch.hpp>
+#include <hc/hc_math.hpp>
+#include <hc/hc_queue_pool.hpp>
+#include <hc/hc_runtime.hpp>
+#include <hc/implementation/hc_n_way_set_associative_cache.hpp>
+
+#include <hsa/hsa.h>
+#include <hsa/hsa_ext_amd.h>
+
+#include <array>
+#include <atomic>
+#include <climits>
+#include <cstddef>
+#include <cstdint>
+#include <forward_list>
+#include <future>
+#include <memory>
+#include <mutex>
+#include <tuple>
+#include <type_traits>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+
+/**
+ * @namespace hc
+ * Heterogeneous  C++ (HC) namespace
+ */
+
+namespace hc
+{
+    using namespace atomics;
+    using namespace detail::enums;
+
+    // forward declaration
+    class accelerator;
+    class accelerator_view;
+    class completion_future;
+    template <int> class extent;
+    template <int> class tiled_extent;
+    template <typename, int> class array_view;
+    template <typename, int> class array;
+
+    // namespace alias
+    // namespace hc::fast_math is an alias of namespace detail::fast_math
+    namespace fast_math = detail::fast_math;
+
+    // namespace hc::precise_math is an alias of namespace detail::precise_math
+    namespace precise_math = detail::precise_math;
+
+    // type alias
+
+    /**
+     * Represents a unique position in N-dimensional space.
+     */
+    template <int N>
+    using index = detail::index<N>;
+
+    using runtime_exception = detail::runtime_exception;
+    using invalid_compute_domain = detail::invalid_compute_domain;
+    using accelerator_view_removed = detail::accelerator_view_removed;
+
+    // ------------------------------------------------------------------------
+    // global functions
+    // ------------------------------------------------------------------------
+
+    /**
+     * Get the current tick count for the GPU platform.
+     *
+     * @return An implementation-defined tick count
+     */
+    inline
+    std::uint64_t get_system_ticks()
+    {   // TODO: unify the HSA error checking into a single function.
+        std::uint64_t r{};
+        detail::throwing_hsa_result_check(
+            hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &r),
+            __FILE__, __func__, __LINE__);
+
+        return r;
+    }
+
+    /**
+     * Get the frequency of ticks per second for the underlying asynchronous
+     * operation.
+     *
+     * @return An implementation-defined frequency in Hz in case the instance is
+     *         created by a kernel dispatch or a barrier packet. 0 otherwise.
+     */
+    inline
+    std::uint64_t get_tick_frequency()
+    {
+        std::uint64_t r{};
+        detail::throwing_hsa_result_check(
+            hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &r),
+            __FILE__, __func__, __LINE__);
+
+        return r;
+    }
+
+    // ------------------------------------------------------------------------
+    // accelerator_view
+    // ------------------------------------------------------------------------
+
+    /**
+     * Represents a logical (isolated) accelerator view of a compute
+     * accelerator. An object of this type can be obtained by calling the
+     * default_view property or create_view member functions on an accelerator
+     * object.
+     */
+    class accelerator_view {
+        using ConcurrentTaskList_ =
+            std::pair<std::mutex, std::forward_list<completion_future>>;
+
+        // IMPLEMENTATION - DATA
+        //mutable std::forward_list<completion_future> pending_tasks_; // TODO: spec fault.
+        mutable completion_future pending_tasks_;
+        accelerator const* accelerator_;
+        hsa_queue_t* queue_;
+        queuing_mode qmode_;
+        bool is_default_;
+
+        // FRIENDS
+        friend class accelerator;
+        template<typename, int> friend class array;
+        template<typename, int> friend class array_view;
+
+        template<typename Domain, typename Kernel>
+        friend
+        void detail::launch_kernel(
+            const accelerator_view&,
+            const Domain&,
+            const Kernel&);
+        template<typename Domain, typename Kernel>
+        friend
+        std::shared_future<void> detail::launch_kernel_async(
+            const accelerator_view&,
+            const Domain&,
+            const Kernel&);
+
+        template<typename Kernel, int n>
+        friend
+        completion_future parallel_for_each(
+            const accelerator_view&, const extent<n>&, const Kernel&);
+
+        template<typename Kernel, int n>
+        friend
+        completion_future parallel_for_each(
+            const accelerator_view&, const tiled_extent<n>&, const Kernel&);
+
+        // IMPLEMENTATION - CREATORS
+        accelerator_view(
+            const accelerator& accelerator,
+            hsa_queue_t* queue,
+            queuing_mode qmode = queuing_mode_automatic,
+            bool is_default = false)
+            :
+            accelerator_{&accelerator},
+            queue_{queue},
+            qmode_{qmode},
+            is_default_{is_default}
+        {}
+
+        // IMPLEMENTATION - MANIPULATORS
+        void add_pending_task_(completion_future task) const
+        {
+            //pending_tasks_.push_front(task);
+            auto& prior = is_default_ ?
+                pending_tasks_for_default_av_().second : pending_tasks_;
+
+            std::unique_lock<std::mutex> lck{
+                pending_tasks_for_default_av_().first, std::defer_lock};
+
+            if (is_default_) lck.lock();
+
+            prior = std::async(std::launch::deferred, [](completion_future t, completion_future prev) {
+                if (prev.valid()) prev.wait();
+                if (t.valid()) t.wait();
+            }, std::move(task), std::move(prior)).share();
+        }
+
+        void wait_for_all_pending_tasks_()
+        {
+            auto& prior = is_default_ ?
+                pending_tasks_for_default_av_().second : pending_tasks_;
+
+            if (prior.valid()) prior.wait();
+        }
+
+        // IMPLEMENTATION - ACCESSORS
+        std::pair<std::mutex, completion_future>& pending_tasks_for_default_av_() const;
+    public:
+        accelerator_view() = delete;
+        /**
+         * Copy-constructs an accelerator_view object. This function does a
+         * shallow copy with the newly created accelerator_view object pointing
+         * to the same underlying view as the "other" parameter.
+         *
+         * @param[in] other The accelerator_view object to be copied.
+         */
+        accelerator_view(const accelerator_view&) = default;
+        accelerator_view(accelerator_view&&) = default;
+
+        ~accelerator_view();
+
+        /**
+         * Assigns an accelerator_view object to "this" accelerator_view object
+         * and returns a reference to "this" object. This function does a
+         * shallow assignment with the newly created accelerator_view object
+         * pointing to the same underlying view as the passed accelerator_view
+         * parameter.
+         *
+         * @param[in] other The accelerator_view object to be assigned from.
+         * @return A reference to "this" accelerator_view object.
+         */
+        accelerator_view& operator=(const accelerator_view&) = default;
+        accelerator_view& operator=(accelerator_view&&) = default;
+
+        /**
+         * Returns the queuing mode that this accelerator_view was created with.
+         * See "Queuing Mode".
+         *
+         * @return The queuing mode.
+         */
+        queuing_mode get_queuing_mode() const noexcept
+        {
+            return qmode_;
+        }
+
+        /**
+         * Returns a boolean value indicating whether the accelerator view when
+         * passed to a parallel_for_each would result in automatic selection of
+         * an appropriate execution target by the runtime. In other words, this
+         * is the accelerator view that will be automatically selected if
+         * parallel_for_each is invoked without explicitly specifying an
+         * accelerator view.
+         *
+         * @return A boolean value indicating if the accelerator_view is the
+         *         auto selection accelerator_view.
+         */
+        bool get_is_auto_selection() const noexcept;
+
+        /**
+         * Returns a 32-bit unsigned integer representing the version number of
+         * this accelerator view. The format of the integer is major.minor,
+         * where the major version number is in the high-order 16 bits, and the
+         * minor version number is in the low-order bits.
+         *
+         * The version of the accelerator view is usually the same as that of
+         * the parent accelerator.
+         */
+        unsigned int get_version() const;
+
+        /**
+         * Returns the accelerator that this accelerator_view has been created
+         * on.
+         */
+        accelerator get_accelerator() const;
+
+        /**
+         * Returns a boolean value indicating whether the accelerator_view
+         * supports debugging through extensive error reporting.
+         *
+         * The is_debug property of the accelerator view is usually same as that
+         * of the parent accelerator.
+         */
+        bool get_is_debug() const noexcept
+        {   // FIXME: dummy implementation now
+            return false;
+        }
+
+        /**
+         * Performs a blocking wait for completion of all commands submitted to
+         * the accelerator view prior to calling wait().
+         *
+         * @param waitMode[in] An optional parameter to specify the wait mode.
+         *                     By default it would be hcWaitModeBlocked.
+         *                     hcWaitModeActive would be used to reduce latency
+         *                     with the expense of using one CPU core for active
+         *                     waiting.
+         */
+        void wait()
+        {
+            wait_for_all_pending_tasks_();
+
+            //detail::getContext()->flushPrintfBuffer();
+        }
+
+        /**
+         * Sends the queued up commands in the accelerator_view to the device
+         * for execution.
+         *
+         * An accelerator_view internally maintains a buffer of commands such as
+         * data transfers between the host memory and device buffers, and kernel
+         * invocations (parallel_for_each calls). This member function sends the
+         * commands to the device for processing. Normally, these commands
+         * to the GPU automatically whenever the runtime determines that they
+         * need to be, such as when the command buffer is full or when waiting
+         * for transfer of data from the device buffers to host memory. The
+         * flush member function will send the commands manually to the device.
+         *
+         * Calling this member function incurs an overhead and must be used with
+         * discretion. A typical use of this member function would be when the
+         * CPU waits for an arbitrary amount of time and would like to force the
+         * execution of queued device commands in the meantime. It can also be
+         * used to ensure that resources on the accelerator are reclaimed after
+         * all references to them have been removed.
+         *
+         * Because flush operates asynchronously, it can return either before or
+         * after the device finishes executing the buffered commands, the
+         * commands will eventually always complete.
+         *
+         * If the queuing_mode is queuing_mode_immediate, this function has no
+         * effect.
+         *
+         * @return None
+         */
+        void flush()
+        {   // TODO: for now we always submit immediately, so flush is a NOP.
+            return;
+        }
+
+        /**
+         * This command inserts a marker event into the accelerator_view's
+         * command queue. This marker is returned as a completion_future object.
+         * When all commands that were submitted prior to the marker event
+         * creation have completed, the future is ready.
+         *
+         * Regardless of the accelerator_view's execute_order
+         * (execute_any_order, execute_in_order), the marker always ensures
+         * older commands complete before the returned completion_future is
+         * marked ready. Thus, markers provide a mechanism to enforce order
+         * between commands in an execute_any_order accelerator_view.
+         *
+         * fence_scope controls the scope of the acquire and release fences
+         * applied after the marker executes.  Options are:
+         *   - no_scope : No fence operation is performed.
+         *   - accelerator_scope: Memory is acquired from and released to the
+         *     accelerator scope where the marker executes.
+         *   - system_scope: Memory is acquired from and released to system
+         *     scope (all accelerators including CPUs)
+         *
+         * @return A future which can be waited on, and will block until the
+         *         current batch of commands has completed.
+         */
+        completion_future create_marker(memory_scope = system_scope) const
+        {
+            auto& prior = is_default_ ?
+                pending_tasks_for_default_av_().second : pending_tasks_;
+
+            std::unique_lock<std::mutex> lck{
+                pending_tasks_for_default_av_().first, std::defer_lock};
+
+            if (is_default_) lck.lock();
+
+            completion_future tmp{std::async(std::launch::deferred,
+                [](completion_future prev, std::shared_future<void> barrier) {
+                if (prev.valid()) prev.wait();
+                barrier.wait();
+            }, std::move(prior), detail::insert_barrier(*this).first).share()};
+
+            return prior = std::move(tmp);
+        }
+
+        /**
+         * This command inserts a marker event into the accelerator_view's
+         * command queue with a prior dependent asynchronous event.
+         *
+         * This marker is returned as a completion_future object. When its
+         * dependent event and all commands submitted prior to the marker event
+         * creation have been completed, the future is ready.
+         *
+         * Regardless of the accelerator_view's execute_order
+         * (execute_any_order, execute_in_order), the marker always ensures
+         * older commands complete before the returned completion_future is
+         * marked ready. Thus, markers provide a mechanism to enforce order
+         * between commands in an execute_any_order accelerator_view.
+         *
+         * fence_scope controls the scope of the acquire and release fences
+         * applied after the marker executes.  Options are:
+         *   - no_scope : No fence operation is performed.
+         *   - accelerator_scope: Memory is acquired from and released to the
+         *     accelerator scope where the marker executes.
+         *   - system_scope: Memory is acquired from and released to system
+         *     scope (all accelerators including CPUs)
+         *
+         * dependent_futures may be recorded in another queue or another
+         * accelerator.  If in another accelerator, the runtime performs
+         * cross-accelerator synchronisation.
+         *
+         * @return A future which can be waited on, and will block until the
+         *         current batch of commands, plus the dependent event have
+         *         been completed.
+         */
+        completion_future create_blocking_marker(
+            completion_future& dependent_future,
+            memory_scope = system_scope) const
+        {
+            auto& prior = is_default_ ?
+                pending_tasks_for_default_av_().second : pending_tasks_;
+
+            completion_future tmp{std::async(std::launch::deferred, [=](
+                std::shared_future<void> prev,
+                std::shared_future<void> barrier) {
+                dependent_future.wait();
+                prev.wait();
+                barrier.wait();
+            }, std::move(prior), detail::insert_barrier(*this).first).share()};
+
+            std::unique_lock<std::mutex> lck{
+                pending_tasks_for_default_av_().first, std::defer_lock};
+
+            if (is_default_) lck.lock();
+
+            return prior = std::move(tmp);
+        }
+
+        /**
+         * This command inserts a marker event into the accelerator_view's
+         * command queue with arbitrary number of dependent asynchronous events.
+         *
+         * This marker is returned as a completion_future object. When its
+         * dependent events and all commands submitted prior to the marker event
+         * creation have been completed, the completion_future is ready.
+         *
+         * Regardless of the accelerator_view's execute_order
+         * (execute_any_order, execute_in_order), the marker always ensures
+         * older commands complete before the returned completion_future is
+         * marked ready. Thus, markers provide a mechanism to enforce order
+         * between commands in an execute_any_order accelerator_view.
+         *
+         * @return A future which can be waited on, and will block until the
+         *         current batch of commands, plus the dependent event have
+         *         been completed.
+         */
+        // TODO: constrain to take completion_future only.
+        template<typename InputIterator>
+        completion_future create_blocking_marker(
+            InputIterator first,
+            InputIterator last,
+            memory_scope = system_scope) const
+        {   // TODO: optimise by nesting the hsa_signal_t inside the
+            //       completion_future and then building AND AQL packets.
+            return pending_tasks_;
+            // std::vector<completion_future> tmp{first, last};
+            // completion_future fut{std::async(
+            //     [](std::vector<completion_future> futs) {
+            //         for (auto&& x : futs) if (x.valid()) x.wait();
+            //     }, std::move(tmp)).share()};
+
+            // auto& pending = is_default_ ?
+            //     pending_tasks_for_default_av_().second : pending_tasks_;
+
+            // std::unique_lock<std::mutex> lck{
+            //     pending_tasks_for_default_av_().first, std::defer_lock};
+
+            // if (is_default_) lck.lock();
+
+            // pending.push_front(std::move(fut));
+
+            // return pending.front();
+        }
+
+        /**
+         * This command inserts a marker event into the accelerator_view's
+         * command queue with arbitrary number of dependent asynchronous events.
+         *
+         * This marker is returned as a completion_future object. When its
+         * dependent events and all commands submitted prior to the marker event
+         * creation have been completed, the completion_future is ready.
+         *
+         * Regardless of the accelerator_view's execute_order
+         * (execute_any_order, execute_in_order), the marker always ensures
+         * older commands complete before the returned completion_future is
+         * marked ready. Thus, markers provide a mechanism to enforce order
+         * between commands in an execute_any_order accelerator_view.
+         *
+         * fence_scope controls the scope of the acquire and release fences
+         * applied after the marker executes.  Options are:
+         *   - no_scope : No fence operation is performed.
+         *   - accelerator_scope: Memory is acquired from and released to the
+         *     accelerator scope where the marker executes.
+         *   - system_scope: Memory is acquired from and released to system
+         *     scope (all accelerators including CPUs)
+         *
+         * @return A future which can be waited on, and will block until the
+         *         current batch of commands, plus the dependent event have
+         *         been completed.
+         */
+        completion_future create_blocking_marker(
+            std::initializer_list<completion_future> dependent_future_list,
+            memory_scope = system_scope) const
+        {
+            return create_blocking_marker(
+                dependent_future_list.begin(), dependent_future_list.end());
+        }
+
+        /**
+         * Copies size_bytes bytes from src to dst.
+         * Src and dst must not overlap.
+         * Note the src is the first parameter and dst is second, following C++
+         * convention. The copy command will execute after any commands already
+         * inserted into the accelerator_view finish. This is a synchronous copy
+         * command, and the copy operation complete before this call returns.
+         */
+        void copy(const void* src, void* dst, std::size_t size_bytes)
+        {
+            wait_for_all_pending_tasks_();
+
+            detail::throwing_hsa_result_check(
+                hsa_memory_copy(dst, src, size_bytes),
+                __FILE__, __func__, __LINE__);
+        }
+
+        /**
+         * Copies size_bytes bytes from src to dst.
+         * Src and dst must not overlap.
+         * Note the src is the first parameter and dst is second, following C++
+         * convention. This is an asynchronous copy command, and this call may
+         * return before the copy operation completes. If the source or dest is
+         * host memory, the memory must be pinned or a runtime exception will be
+         * thrown. Pinned memory can be created with am_alloc with
+         * flag=amHostPinned flag.
+         *
+         * The copy command will be implicitly ordered with respect to commands
+         * previously enqueued to this accelerator_view:
+         * - If the accelerator_view execute_order is execute_in_order
+         *   (the default), then the copy will execute after all previously sent
+         *   commands finish execution.
+         * - If the accelerator_view execute_order is execute_any_order, then
+         *   the
+         *   copy will start after all previously send commands start but can
+         *   execute in any order.
+         */
+        completion_future copy_async(
+            const void* src, void* dst, std::size_t size_bytes)
+        {
+            wait_for_all_pending_tasks_();
+
+            return completion_future{std::async([=]() {
+                detail::throwing_hsa_result_check(
+                    hsa_memory_copy(dst, src, size_bytes),
+                    __FILE__, __func__, __LINE__);
+            }).share()};
+        }
+
+        /**
+         * Compares "this" accelerator_view with the passed accelerator_view
+         * object to determine if they represent the same underlying object.
+         *
+         * @param[in] other The accelerator_view object to be compared against.
+         * @return A boolean value indicating whether the passed
+         *         accelerator_view object is same as "this" accelerator_view.
+         */
+        bool operator==(const accelerator_view& other) const noexcept
+        {
+            return queue_ == other.queue_;
+        }
+
+        /**
+         * Compares "this" accelerator_view with the passed accelerator_view
+         * object to determine if they represent different underlying objects.
+         *
+         * @param[in] other The accelerator_view object to be compared against.
+         * @return A boolean value indicating whether the passed
+         *         accelerator_view object is different from "this"
+         *         accelerator_view.
+         */
+        bool operator!=(const accelerator_view& other) const noexcept
+        {
+            return !(*this == other);
+        }
+
+        /**
+         * Returns an opaque handle which points to the underlying HSA queue.
+         *
+         * @return An opaque handle of the underlying HSA queue, if the
+         *         accelerator view is based on HSA.  NULL if otherwise.
+         */
+        void* get_hsa_queue() const
+        {
+            return queue_;
+        }
+
+        /**
+         * Set a CU affinity to specific command queues.
+         * The setting is permanent until the queue is destroyed or CU affinity
+         * is set again. This setting is "atomic", it won't affect the dispatch
+         * in flight.
+         *
+         * @param cu_mask a bool vector to indicate what CUs you want to use.
+         *                True represents using the cu. The first 32 elements
+         *                represents the first 32 CUs, and so on. If its size is
+         *                greater than physical CU number, the extra elements
+         *                are ignored. It is user's responsibility to make sure
+         *                the input is meaningful.
+         *
+         * @return true if operations succeeds or false if not.
+         */
+        bool set_cu_mask(const std::vector<bool>& cu_mask);
+    };
+
+    // ------------------------------------------------------------------------
+    // accelerator
+    // ------------------------------------------------------------------------
+
+    /**
+     * Represents a physical accelerated computing device. An object of
+     * this type can be created by enumerating the available devices, or
+     * getting the default device.
+     */
+    class accelerator {
+        // DATA - STATICS
+        static
+        std::once_flag& maybe_set_default_()
+        {
+            static std::once_flag r{};
+
+            return r;
+        }
+
+        // DATA
+        hsa_agent_t agent_{};
+
+        friend class accelerator_view;
+
+        // IMPLEMENTATION - CREATORS
+        explicit
+        accelerator(hsa_agent_t agent) : agent_{agent}
+        {
+            if (detail::Agent_pool::pool().count(agent) != 0) return;
+
+            throw std::logic_error{
+                "Tried to create accelerator from unknown HSA agent."};
+        }
+    public:
+        static
+        constexpr
+        const wchar_t* cpu_accelerator()
+        {
+            return L"cpu";
+        }
+        static
+        constexpr
+        const wchar_t* default_accelerator()
+        {
+            return L"default";
+        }
+
+        /**
+         * Constructs a new accelerator object that represents the default
+         * accelerator. This is equivalent to calling the constructor
+         * @code{.cpp}
+         * accelerator(accelerator::default_accelerator)
+         * @endcode
+         *
+         * The actual accelerator chosen as the default can be affected by
+         * calling accelerator::set_default().
+         */
+        accelerator() : accelerator{default_accelerator()} {}
+
+        /**
+         * Constructs a new accelerator object that represents the physical
+         * device named by the "path" argument. If the path represents an
+         * unknown or unsupported device, an exception will be thrown.
+         *
+         * The path can be one of the following:
+         * 1. accelerator::default_accelerator (or L"default"), which represents
+         *    the path of the fastest accelerator available, as chosen by the
+         *    runtime.
+         * 2. accelerator::cpu_accelerator (or L"cpu"), which represents the
+         *    CPU. Note that parallel_for_each shall not be invoked over this
+         *    accelerator.
+         * 3. A valid device path that uniquely identifies a hardware
+         *    accelerator available on the host system.
+         *
+         * @param[in] path The device path of this accelerator.
+         */
+        explicit
+        accelerator(const std::wstring& path)
+            : accelerator{
+                (path == default_accelerator()) ?
+                    detail::Agent_pool::default_agent() :
+                        ((path == cpu_accelerator()) ?
+                            detail::Agent_pool::cpu_agent() :
+                            hsa_agent_t{std::stoull(path)})}
+        {}
+
+        /**
+         * Copy constructs an accelerator object. This function does a shallow
+         * copy with the newly created accelerator object pointing to the same
+         * underlying device as the passed accelerator parameter.
+         *
+         * @param[in] other The accelerator object to be copied.
+         */
+        accelerator(const accelerator&) = default;
+        accelerator(accelerator&&) = default;
+
+        /**
+         * Returns a std::vector of accelerator objects (in no specific
+         * order) representing all accelerators that are available, including
+         * reference accelerators if available.
+         *
+         * @return A vector of accelerators.
+         */
+        static
+        std::vector<accelerator> get_all()
+        {
+            static std::vector<accelerator> r;
+            static std::once_flag f;
+
+            std::call_once(f, []() {
+                for(auto&& agent : detail::Agent_pool::pool()) {
+                    r.push_back(accelerator{agent.first});
+                }
+            });
+
+            return r;
+        }
+
+        /**
+         * Sets the default accelerator to the device path identified by the
+         * "path" argument. See the constructor
+         * accelerator(const std::wstring& path) for a description of the
+         * allowable path strings.
+         *
+         * This establishes a process-wide default accelerator and influences
+         * all subsequent operations that might use a default accelerator.
+         *
+         * @param[in] path The device path of the default accelerator.
+         * @return A Boolean flag indicating whether the default was set. If the
+         *         default has already been set for this process, this value
+         *         will be false, and the function will have no effect.
+         */
+        static
+        bool set_default(const std::wstring& path)
+        {
+            bool r{false};
+            std::call_once(maybe_set_default_(), [&]() {
+                r = true;
+
+                if (path == default_accelerator()) return;
+                if (path == cpu_accelerator()) {
+                    detail::Agent_pool::default_agent() =
+                        detail::Agent_pool::cpu_agent();
+
+                    return;
+                }
+
+                const hsa_agent_t tmp{std::stoull(path)};
+                if (detail::Agent_pool::pool().count(tmp) != 0) {
+                    detail::Agent_pool::default_agent() = tmp;
+
+                    return;
+                }
+
+                throw std::logic_error{
+                    "Tried to set unknown HSA agent as default."};
+            });
+
+            return r;
+        }
+
+        /**
+         * Returns an accelerator_view which when passed as the first argument
+         * to a parallel_for_each call causes the runtime to automatically
+         * select the target accelerator_view for executing the
+         * parallel_for_each kernel. In other words, a parallel_for_each
+         * invocation with the accelerator_view returned by
+         * get_auto_selection_view() is the same as a parallel_for_each
+         * invocation without an accelerator_view argument.
+         *
+         * For all other purposes, the accelerator_view returned by
+         * get_auto_selection_view() behaves the same as the default
+         * accelerator_view of the default accelerator
+         * (aka accelerator().get_default_view()).
+         *
+         * @return An accelerator_view than can be used to indicate auto
+         *         selection of the target for a parallel_for_each execution.
+         */
+        static
+        accelerator_view get_auto_selection_view()
+        {
+            set_default(default_accelerator());
+
+            static accelerator acc{default_accelerator()};
+
+            return acc.get_default_view();
+        }
+
+        /**
+         * Assigns an accelerator object to "this" accelerator object and
+         * returns a reference to "this" object. This function does a shallow
+         * assignment with the newly created accelerator object pointing to the
+         * same underlying device as the passed accelerator parameter.
+         *
+         * @param other The accelerator object to be assigned from.
+         * @return A reference to "this" accelerator object.
+         */
+        accelerator& operator=(const accelerator&) = default;
+        accelerator& operator=(accelerator&&) = default;
+
+        /**
+         * Returns the default accelerator_view associated with the accelerator.
+         * The queuing_mode of the default accelerator_view is
+         * queuing_mode_automatic.
+         *
+         * @return The default accelerator_view object associated with the
+         * accelerator.
+         */
+        accelerator_view get_default_view() const
+        {
+            return accelerator_view{
+                *this,
+                detail::Queue_pool::default_queue(agent_),
+                queuing_mode_automatic,
+                true};
+        }
+
+        /**
+         * Creates and returns a new accelerator view on the accelerator with
+         * the supplied queuing mode.
+         *
+         * @param[in] qmode The queuing mode of the accelerator_view to be
+         *                  created. See "Queuing Mode". The default value would
+         *                  be queueing_mode_automatic if not specified.
+         */
+        accelerator_view create_view(
+            execute_order = execute_in_order,
+            queuing_mode mode = queuing_mode_automatic)
+        {
+            return accelerator_view{
+                *this, detail::Queue_pool::defined_queue(agent_), mode};
+        }
+
+        /**
+         * Compares "this" accelerator with the passed accelerator object to
+         * determine if they represent the same underlying device.
+         *
+         * @param[in] other The accelerator object to be compared against.
+         * @return A boolean value indicating whether the passed accelerator
+         *         object is same as "this" accelerator.
+         */
+        bool operator==(const accelerator& other) const
+        {
+            return agent_.handle == other.agent_.handle;
+        }
+
+        /**
+         * Compares "this" accelerator with the passed accelerator object to
+         * determine if they represent different devices.
+         *
+         * @param[in] other The accelerator object to be compared against.
+         * @return A boolean value indicating whether the passed accelerator
+         *         object is different from "this" accelerator.
+         */
+        bool operator!=(const accelerator& other) const
+        {
+            return !(*this == other);
+        }
+
+        /**
+         * Sets the default_cpu_access_type for this accelerator.
+         *
+         * The default_cpu_access_type is used for arrays created on this
+         * accelerator or for implicit array_view memory allocations accessed on
+         * this accelerator.
+         *
+         * This method only succeeds if the default_cpu_access_type for the
+         * accelerator has not already been overriden by a previous call to this
+         * method and the runtime selected default_cpu_access_type for this
+         * accelerator has not yet been used for allocating an array or for an
+         * implicit array_view memory allocation on this accelerator.
+         *
+         * @param[in] default_cpu_access_type The default cpu access_type to be
+         *                                    used for array/array_view memory
+         *                                    allocations on this accelerator.
+         * @return A boolean value indicating if the default cpu access_type for
+         *         the accelerator was successfully set.
+         */
+        bool set_default_cpu_access_type(access_type type)
+        {
+            static std::unordered_map<hsa_agent_t, std::once_flag> done;
+
+            bool set{false};
+            std::call_once(done[agent_], [&](){
+                set = true;
+
+                detail::Agent_pool::pool()[agent_].default_cpu_access = type;
+            });
+
+            return set;
+        }
+
+        /**
+         * Returns a system-wide unique device instance path that matches the
+         * "Device Instance Path" property for the device in Device Manager, or
+         * one of the predefined path constants cpu_accelerator.
+         */
+        std::wstring get_device_path() const
+        {
+            return std::to_wstring(agent_.handle);
+        }
+
+        /**
+         * Returns a short textual description of the accelerator device.
+         */
+        std::wstring get_description() const
+        {
+            return detail::Agent_pool::pool()[agent_].name;
+        }
+
+        /**
+         * Returns a 32-bit unsigned integer representing the version number of
+         * this accelerator. The format of the integer is major.minor, where the
+         * major version number is in the high-order 16 bits, and the minor
+         * version number is in the low-order bits.
+         */
+        unsigned int get_version() const
+        {
+            return detail::Agent_pool::pool()[agent_].version;
+        }
+
+        /**
+         * This property indicates that the accelerator may be shared by (and
+         * thus have interference from) the operating system or other system
+         * software components for rendering purposes. A C++ AMP implementation
+         * may set this property to false should such interference not be
+         * applicable for a particular accelerator.
+         */
+        bool get_has_display() const
+        {   // FIXME: dummy implementation now
+            return false;
+        }
+
+        /**
+         * Returns the amount of dedicated memory (in KB) on an accelerator
+         * device. There is no guarantee that this amount of memory is actually
+         * available to use.
+         */
+        size_t get_dedicated_memory() const
+        {
+            return detail::Agent_pool::pool()[agent_].dedicated_memory;
+        }
+
+        /**
+         * Returns a Boolean value indicating whether this accelerator supports
+         * double-precision (double) computations. When this returns true,
+         * supports_limited_double_precision also returns true.
+         */
+        bool get_supports_double_precision() const
+        {   // This is true for all targets we support at the moment.
+            return true;
+        }
+
+        /**
+         * Returns a boolean value indicating whether the accelerator has
+         * limited double precision support (excludes double division,
+         * precise_math functions, int to double, double to int conversions) for
+         * a parallel_for_each kernel.
+         */
+        bool get_supports_limited_double_precision() const
+        {   // This is true for all targets we support at the moment.
+            return true;
+        }
+
+        /**
+         * Returns a boolean value indicating whether the accelerator supports
+         * debugging.
+         */
+        bool get_is_debug() const
+        {   // FIXME: dummy implementation now
+            return false;
+        }
+
+        /**
+         * Returns a boolean value indicating whether the accelerator is
+         * emulated. This is true, for example, with the reference, and CPU
+         * accelerators.
+         */
+        bool get_is_emulated() const
+        {
+            return detail::Agent_pool::pool()[agent_].is_cpu;
+        }
+
+        /**
+         * Returns a boolean value indicating whether the accelerator supports
+         * memory accessible both by the accelerator and the CPU.
+         */
+        bool get_supports_cpu_shared_memory() const
+        {
+            return detail::Agent_pool::pool()[agent_].has_cpu_shared_memory;
+        }
+
+        /**
+         * Get the default cpu access_type for buffers created on this
+         * accelerator
+         */
+        access_type get_default_cpu_access_type() const
+        {
+            return detail::Agent_pool::pool()[agent_].default_cpu_access;
+        }
+
+
+        /**
+         * Returns the maximum size of tile static area available on this
+         * accelerator.
+         */
+        std::size_t get_max_tile_static_size() const
+        {
+            return detail::Agent_pool::pool()[agent_].max_tile_static_size;
+        }
+
+        /**
+         * Returns an opaque handle which points to the AM region on the HSA
+         * agent. This region can be used to allocate accelerator memory which
+         * is accessible from the specified accelerator.
+         *
+         * @return An opaque handle of the region, if the accelerator is based
+         *         on HSA.  NULL otherwise.
+         */
+        void* get_hsa_am_region() const
+        {
+            auto& acg = detail::Agent_pool::pool()[agent_]
+                .agent_allocated_coarse_grained_region;
+            if (acg.handle) return &acg;
+
+            return nullptr;
+        }
+
+        /**
+         * Returns an opaque handle which points to the AM system region on the
+         * HSA agent. This region can be used to allocate system memory which is
+         * accessible from the specified accelerator.
+         *
+         * @return An opaque handle of the region, if the accelerator is based
+         *         on HSA.  NULL otherwise.
+         */
+        void* get_hsa_am_system_region() const
+        {
+            return &detail::Agent_pool::pool()[agent_]
+                .system_coarse_grained_region;
+        }
+
+        /**
+         * Returns an opaque handle which points to the AM system region on the
+         * HSA agent. This region can be used to allocate finegrained system
+         * memory which is accessible from the specified accelerator.
+         *
+         * @return An opaque handle of the region, if the accelerator is based
+         *         on HSA.  NULL otherwise.
+         */
+        void* get_hsa_am_finegrained_system_region() const
+        {
+            return &detail::Agent_pool::pool()[agent_].fine_grained_region;
+        }
+
+        /**
+         * Returns an opaque handle which points to the Kernarg region on the
+         * HSA agent.
+         *
+         * @return An opaque handle of the region, if the accelerator is based
+         *         on HSA.  NULL otherwise.
+         */
+        void* get_hsa_kernarg_region() const
+        {   // TODO: fix
+            return nullptr;
+        }
+
+        /**
+         * Returns if the accelerator is based on HSA.
+         */
+        bool is_hsa_accelerator() const
+        {
+            return true;
+        }
+
+        /**
+         * Returns the profile the accelerator.
+         * - accelerator_profile_none in case the accelerator is not based on
+         *   HSA.
+         * - accelerator_profile_base in case the accelerator implements the HSA
+         *   Base Profile.
+         * - accelerator_profile_full in case the accelerator implements the HSA
+         *   Full Profile.
+         */
+        accelerator_profile get_profile() const
+        {
+            return detail::Agent_pool::pool()[agent_].profile;
+        }
+
+        /**
+         * Returns an opaque handle which points to the underlying HSA agent.
+         *
+         * @return An opaque handle of the underlying HSA agent, if the
+         *         accelerator is based on HSA. NULL otherwise.
+         */
+        void* get_hsa_agent() const
+        {   // TODO: redo, should return the handle directly.
+            return const_cast<hsa_agent_t*>(&agent_);
+        }
+
+        /**
+         * Check if @p other is peer of this accelerator.
+         *
+         * @return true if other can access this accelerator's device memory
+         * pool or false if not. The accelerator is not its own peer.
+         */
+        bool get_is_peer(const accelerator& other) const
+        {
+            if (*this == other) return false;
+            if (!get_hsa_am_region()) return false;
+
+            hsa_amd_memory_pool_access_t r{};
+            detail::throwing_hsa_result_check(
+                hsa_amd_agent_memory_pool_get_info(
+                    *static_cast<hsa_agent_t*>(other.get_hsa_agent()),
+                    *static_cast<hsa_amd_memory_pool_t*>(get_hsa_am_region()),
+                    HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS,
+                    &r),
+                __FILE__, __func__, __LINE__);
+
+            return r != HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED;
+        }
+
+        /**
+         * Return a std::vector of this accelerator's peers. peer is other
+         * accelerator which can access this accelerator's device memory using
+         * map_to_peer family of APIs.
+         */
+        std::vector<accelerator> get_peers() const
+        {   // TODO: remove / optimise.
+            std::vector<accelerator> peers;
+
+            static const auto accs = get_all();
+            for (auto&& acc : accs) if (get_is_peer(acc)) peers.push_back(acc);
+
+            return peers;
+        }
+
+        /**
+         * Return the compute unit count of the accelerator.
+         */
+        unsigned int get_cu_count() const
+        {
+            return detail::Agent_pool::pool()[agent_].compute_unit_count;
+        }
+
+        /**
+         * Return the unique integer sequence-number for the accelerator.
+         * Sequence-numbers are assigned in monotonically increasing order
+         * starting with 0.
+         */
+        int get_seqnum() const
+        {
+            return INT_MAX;
+        }
+
+
+        /**
+         * Return true if the accelerator's memory can be mapped into the CPU's
+         * address space, and the CPU is allowed to access the memory directly
+         * with CPU memory operations. Typically this is enabled with
+         * "large BAR" or "resizeable BAR" address mapping.
+         */
+        bool has_cpu_accessible_am() const
+        {   // TODO: fix.
+            return detail::Agent_pool::pool()[agent_]
+                .has_cpu_accessible_agent_allocated_coarse_grained;
+        }
+    };
+
+
+    inline
+    accelerator accelerator_view::get_accelerator() const
+    {
+        if (accelerator_) return *accelerator_;
+
+        throw std::logic_error{
+            "Tried to query accelerator from empty accelerator_view."};
+    }
+
+    // ------------------------------------------------------------------------
+    // member function implementations
+    // ------------------------------------------------------------------------
+    inline
+    std::pair<std::mutex, completion_future>& accelerator_view::
+        pending_tasks_for_default_av_() const
+    {
+        if (!accelerator_) {
+            throw std::logic_error{"Invariants of class accelerator broken."};
+        }
+
+        using ConcurrentFuture_ = std::pair<std::mutex, completion_future>;
+
+        static const auto cnt = detail::Agent_pool::pool().size();
+        static const auto del = [](ConcurrentFuture_* ptr) {
+            if (!ptr) return;
+
+            std::lock_guard<std::mutex> lck{ptr->first};
+
+            if (ptr->second.valid()) ptr->second.wait();
+
+            delete [] ptr;
+        };
+        static std::unique_ptr<ConcurrentFuture_[], decltype(del)> r{
+            new ConcurrentFuture_[cnt], del};
+
+        const auto idx = std::distance(
+            detail::Agent_pool::pool().begin(),
+            detail::Agent_pool::pool().find(
+                *static_cast<hsa_agent_t*>(accelerator_->get_hsa_agent())));
+
+        return r[idx];
+    }
+
+    inline
+    accelerator_view::~accelerator_view()
+    {
+        if (!is_default_) wait_for_all_pending_tasks_();
+    }
+
+    inline
+    bool accelerator_view::set_cu_mask(const std::vector<bool>& cu_mask)
+    {
+        const auto agent =
+            *static_cast<hsa_agent_t*>(accelerator_->get_hsa_agent());
+        const auto cnt = detail::Agent_pool::pool()[agent].compute_unit_count;
+
+        if (cnt == 0) return false;
+
+        static const auto round_up_to_next_multiple_of_32 = [](std::size_t x) {
+            x = x + 32 - 1;
+            return x - x % 32;
+        };
+
+        std::vector<std::uint32_t> mask{cu_mask.cbegin(), cu_mask.cend()};
+        mask.resize(round_up_to_next_multiple_of_32(cnt));
+
+        detail::throwing_hsa_result_check(
+            hsa_amd_queue_cu_set_mask(queue_, mask.size(), mask.data()),
+            __FILE__, __func__, __LINE__);
+
+        return true; // Unclear how this failing could be anything but an error.
+    }
+
+    // ------------------------------------------------------------------------
+    // extent
+    // ------------------------------------------------------------------------
+
+    /**
+     * Represents a unique position in N-dimensional space.
+     *
+     * @tparam N The dimension to this extent applies. Special constructors are
+     *           supplied for the cases where @f$N \in \{ 1,2,3 \}@f$, but N can
+     *           be any integer greater than or equal to 1.
+     */
+    template<int N>
+    class extent {
+        static_assert(N > 0, "Dimensionality must be positive");
+
+        using base =
+            detail::index_impl<typename detail::__make_indices<N>::type>;
+        base base_;
+
+        template<int, typename> friend struct detail::index_helper;
+        template<int, typename, typename> friend struct detail::amp_helper;
+    public:
+        /**
+         * A static member of extent<N> that contains the rank of this extent.
+         */
+        static constexpr int rank = N;
+
+        /**
+         * The element type of extent<N>.
+         */
+        typedef int value_type;
+
+        /**
+         * Default constructor. The value at each dimension is initialized to
+         * zero. Thus, "extent<3> ix;" initializes the variable to the position
+         * (0,0,0).
+         */
+        extent() [[cpu, hc]] = default;
+
+        /**
+         * Copy constructor. Constructs a new extent<N> from the supplied
+         * argument.
+         *
+         * @param other An object of type extent<N> from which to initialize
+         *              this new extent.
+         */
+        extent(const extent&) [[cpu, hc]] = default;
+
+        /** @{ */
+        /**
+         * Constructs an extent<N> with the coordinate values provided by
+         * @f$e_{0..2}@f$. These are specialized constructors that are only
+         * valid when the rank of the extent @f$N \in \{1,2,3\}@f$. Invoking a
+         * specialized constructor whose argument @f$count \ne N@f$ will result
+         * in a compilation error.
+         *
+         * @param[in] e0 The component values of the extent vector.
+         */
+        template<
+            typename... Ts,
+            typename std::enable_if<sizeof...(Ts) == N>::type* = nullptr>
+        explicit
+        extent(Ts... i_n) [[cpu, hc]] : base_{i_n...}
+        {
+            static_assert(
+                sizeof...(Ts) <= 3,
+                "Can only supply at most 3 individual coordinates in the "
+                "constructor.");
+        }
+
+        /** @} */
+
+        /**
+         * Constructs an extent<N> with the coordinate values provided the array
+         * of int component values. If the coordinate array length @f$\ne@f$ N,
+         * the behavior is undefined. If the array value is NULL or not a valid
+         * pointer, the behavior is undefined.
+         *
+         * @param[in] components An array of N int values.
+         */
+        explicit
+        extent(const int components[]) [[cpu, hc]] : base_{components} {}
+
+        /**
+         * Constructs an extent<N> with the coordinate values provided the array
+         * of int component values. If the coordinate array length @f$\ne@f$ N,
+         * the behavior is undefined. If the array value is NULL or not a valid
+         * pointer, the behavior is undefined.
+         *
+         * @param[in] components An array of N int values.
+         */
+        explicit
+        extent(int components[]) [[cpu, hc]] : base_{components} {}
+
+        /**
+         * Assigns the component values of "other" to this extent<N> object.
+         *
+         * @param[in] other An object of type extent<N> from which to copy into
+         *                  this extent.
+         * @return Returns *this.
+         */
+        extent& operator=(const extent&) [[cpu, hc]] = default;
+
+        /** @{ */
+        /**
+         * Returns the extent component value at position c.
+         *
+         * @param[in] c The dimension axis whose coordinate is to be accessed.
+         * @return A the component value at position c.
+         */
+        int operator[](unsigned int c) const [[cpu, hc]]
+        {
+            return base_[c];
+        }
+        int& operator[](unsigned int c) [[cpu, hc]]
+        {
+            return base_[c];
+        }
+
+        /** @} */
+
+        /**
+         * Tests whether the index "idx" is properly contained within this
+         * extent (with an assumed origin of zero).
+         *
+         * @param[in] idx An object of type index<N>
+         * @return Returns true if the "idx" is contained within the space
+         *         defined by this extent (with an assumed origin of zero).
+         */
+        bool contains(const index<N>& idx) const noexcept [[cpu, hc]]
+        {
+            return detail::amp_helper<N, index<N>, extent<N>>::contains(
+                idx, *this);
+        }
+
+        /**
+         * This member function returns the total linear size of this extent<N>
+         * (in units of elements), which is computed as:
+         * extent[0] * extent[1] ... * extent[N-1]
+         */
+        unsigned int size() const noexcept [[cpu, hc]]
+        {
+            return detail::index_helper<N, extent<N>>::count_size(*this);
+        }
+
+        /** @{ */
+        /**
+         * Produces a tiled_extent object with the tile extents given by t0, t1,
+         * and t2.
+         *
+         * tile(t0, t1, t2) is only supported on extent<1>. It will produce a
+         * compile-time error if used on an extent where N @f$\ne@f$ 3.
+         * tile(t0, t1) is only supported on extent<2>. It will produce a
+         * compile-time error if used on an extent where N @f$\ne@f$ 2.
+         * tile(t0) is only supported on extent<1>. It will produce a
+         * compile-time error if used on an extent where N @f$\ne@f$ 1.
+         */
+        tiled_extent<1> tile(int t0) const [[cpu, hc]];
+        tiled_extent<2> tile(int t0, int t1) const [[cpu, hc]];
+        tiled_extent<3> tile(int t0, int t1, int t2) const [[cpu, hc]];
+
+        /** @} */
+
+        /** @{ */
+        /**
+         * Produces a tiled_extent object with the tile extents given by t0, t1,
+         * and t2, plus a certain amount of dynamic group segment.
+         */
+        tiled_extent<1> tile_with_dynamic(
+            int t0, unsigned int dynamic_size) const;
+        tiled_extent<2> tile_with_dynamic(
+            int t0, int t1, unsigned int dynamic_size) const;
+        tiled_extent<3> tile_with_dynamic(
+            int t0, int t1, int t2, unsigned int dynamic_size) const;
+
+        /** @} */
+
+        /** @{ */
+        /**
+         * Compares two objects of extent<N>.
+         *
+         * The expression
+         * leftExt @f$\oplus@f$ rightExt
+         * is true if leftExt[i] @f$\oplus@f$ rightExt[i] for every i from 0 to
+         * N-1.
+         *
+         * @param[in] other The right-hand extent<N> to be compared.
+         */
+        bool operator==(const extent& other) const [[cpu, hc]]
+        {
+            return detail::index_helper<N, extent<N> >::equal(*this, other);
+        }
+        bool operator!=(const extent& other) const [[cpu, hc]]
+        {
+            return !(*this == other);
+        }
+
+        /** @} */
+
+        /** @{ */
+        /**
+         * Adds (or subtracts) an object of type extent<N> from this extent to
+         * form a new extent. The result extent<N> is such that for a given
+         * operator @f$\oplus@f$,
+         * result[i] = this[i] @f$\oplus@f$ ext[i]
+         *
+         * @param[in] ext The right-hand extent<N> to be added or subtracted.
+         */
+        extent& operator+=(const extent& __r) [[cpu, hc]]
+        {
+            base_.operator+=(__r.base_);
+            return *this;
+        }
+        extent& operator-=(const extent& __r) [[cpu, hc]]
+        {
+            base_.operator-=(__r.base_);
+            return *this;
+        }
+        extent& operator*=(const extent& __r) [[cpu, hc]]
+        {
+            base_.operator*=(__r.base_);
+            return *this;
+        }
+        extent& operator/=(const extent& __r) [[cpu, hc]]
+        {
+            base_.operator/=(__r.base_);
+            return *this;
+        }
+        extent& operator%=(const extent& __r) [[cpu, hc]]
+        {
+            base_.operator%=(__r.base_);
+            return *this;
+        }
+
+        /** @} */
+
+        /** @{ */
+        /**
+         * Adds (or subtracts) an object of type index<N> from this extent to
+         * form a new extent. The result extent<N> is such that for a given
+         * operator @f$\oplus@f$,
+         * result[i] = this[i] @f$\oplus@f$ idx[i]
+         *
+         * @param[in] idx The right-hand index<N> to be added or subtracted.
+         */
+        extent operator+(const index<N>& idx) const [[cpu, hc]]
+        {
+            extent __r = *this;
+            __r += idx;
+            return __r;
+        }
+        extent operator-(const index<N>& idx) const [[cpu, hc]]
+        {
+            extent __r = *this;
+            __r -= idx;
+            return __r;
+        }
+        extent& operator+=(const index<N>& idx) [[cpu, hc]]
+        {
+            base_.operator+=(idx.base_);
+            return *this;
+        }
+        extent& operator-=(const index<N>& idx) [[cpu, hc]]
+        {
+            base_.operator-=(idx.base_);
+            return *this;
+        }
+
+        /** @} */
+
+        /** @{ */
+        /**
+         * For a given operator @f$\oplus@f$, produces the same effect as
+         * (*this) = (*this) @f$\oplus@f$ value
+         *
+         * The return value is "*this".
+         *
+         * @param[in] value The right-hand int of the arithmetic operation.
+         */
+        extent& operator+=(int value) [[cpu, hc]]
+        {
+            base_.operator+=(value);
+            return *this;
+        }
+        extent& operator-=(int value) [[cpu, hc]]
+        {
+            base_.operator-=(value);
+            return *this;
+        }
+        extent& operator*=(int value) [[cpu, hc]]
+        {
+            base_.operator*=(value);
+            return *this;
+        }
+        extent& operator/=(int value) [[cpu, hc]]
+        {
+            base_.operator/=(value);
+            return *this;
+        }
+        extent& operator%=(int value) [[cpu, hc]]
+        {
+            base_.operator%=(value);
+            return *this;
+        }
+
+        /** @} */
+
+        /** @{ */
+        /**
+         * For a given operator @f$\oplus@f$, produces the same effect as
+         * (*this) = (*this) @f$\oplus@f$ 1
+         *
+         * For prefix increment and decrement, the return value is "*this".
+         * Otherwise a new extent<N> is returned.
+         */
+        extent& operator++() [[cpu, hc]]
+        {
+            base_.operator+=(1);
+            return *this;
+        }
+        extent operator++(int) [[cpu, hc]]
+        {
+            extent ret = *this;
+            base_.operator+=(1);
+            return ret;
+        }
+        extent& operator--() [[cpu, hc]]
+        {
+            base_.operator-=(1);
+            return *this;
+        }
+        extent operator--(int) [[cpu, hc]]
+        {
+            extent ret = *this;
+            base_.operator-=(1);
+            return ret;
+        }
+
+        /** @} */
+    };
+
+    // ------------------------------------------------------------------------
+    // global functions for extent
+    // ------------------------------------------------------------------------
+
+    /** @{ */
+    /**
+     * Adds (or subtracts) two objects of extent<N> to form a new extent. The
+     * result extent<N> is such that for a given operator @f$\oplus@f$,
+     * result[i] = leftExt[i] @f$\oplus@f$ rightExt[i]
+     * for every i from 0 to N-1.
+     *
+     * @param[in] lhs The left-hand extent<N> to be compared.
+     * @param[in] rhs The right-hand extent<N> to be compared.
+     */
+    // FIXME: the signature is not entirely the same as defined in:
+    //        C++AMP spec v1.2 #1253
+    template<int N>
+    inline
+    extent<N> operator+(const extent<N>& lhs, const extent<N>& rhs) [[cpu, hc]]
+    {
+        extent<N> __r = lhs;
+        __r += rhs;
+        return __r;
+    }
+    template<int N>
+    inline
+    extent<N> operator-(const extent<N>& lhs, const extent<N>& rhs) [[cpu, hc]]
+    {
+        extent<N> __r = lhs;
+        __r -= rhs;
+        return __r;
+    }
+
+    /** @} */
+
+    /** @{ */
+    /**
+     * Binary arithmetic operations that produce a new extent<N> that is the
+     * result of performing the corresponding binary arithmetic operation on the
+     * elements of the extent operands. The result extent<N> is such that for a
+     * given operator @f$\oplus@f$,
+     * result[i] = ext[i] @f$\oplus@f$ value
+     * or
+     * result[i] = value @f$\oplus@f$ ext[i]
+     * for every i from 0 to N-1.
+     *
+     * @param[in] ext The extent<N> operand
+     * @param[in] value The integer operand
+     */
+    // FIXME: the signature is not entirely the same as defined in:
+    //        C++AMP spec v1.2 #1259
+    template<int N>
+    inline
+    extent<N> operator+(const extent<N>& ext, int value) [[cpu, hc]]
+    {
+        extent<N> __r = ext;
+        __r += value;
+        return __r;
+    }
+    template<int N>
+    inline
+    extent<N> operator+(int value, const extent<N>& ext) [[cpu, hc]]
+    {
+        extent<N> __r = ext;
+        __r += value;
+        return __r;
+    }
+    template<int N>
+    inline
+    extent<N> operator-(const extent<N>& ext, int value) [[cpu, hc]]
+    {
+        extent<N> __r = ext;
+        __r -= value;
+        return __r;
+    }
+    template<int N>
+    inline
+    extent<N> operator-(int value, const extent<N>& ext) [[cpu, hc]]
+    {
+        extent<N> __r(value);
+        __r -= ext;
+        return __r;
+    }
+    template<int N>
+    inline
+    extent<N> operator*(const extent<N>& ext, int value) [[cpu, hc]]
+    {
+        extent<N> __r = ext;
+        __r *= value;
+        return __r;
+    }
+    template<int N>
+    inline
+    extent<N> operator*(int value, const extent<N>& ext) [[cpu, hc]]
+    {
+        extent<N> __r = ext;
+        __r *= value;
+        return __r;
+    }
+    template<int N>
+    inline
+    extent<N> operator/(const extent<N>& ext, int value) [[cpu, hc]]
+    {
+        extent<N> __r = ext;
+        __r /= value;
+        return __r;
+    }
+    template<int N>
+    inline
+    extent<N> operator/(int value, const extent<N>& ext) [[cpu, hc]]
+    {
+        extent<N> __r(value);
+        __r /= ext;
+        return __r;
+    }
+    template<int N>
+    inline
+    extent<N> operator%(const extent<N>& ext, int value) [[cpu, hc]]
+    {
+        extent<N> __r = ext;
+        __r %= value;
+        return __r;
+    }
+    template<int N>
+    inline
+    extent<N> operator%(int value, const extent<N>& ext) [[cpu, hc]]
+    {
+        extent<N> __r(value);
+        __r %= ext;
+        return __r;
+    }
+
+    /** @} */
+
+    // ------------------------------------------------------------------------
+    // tiled_extent
+    // ------------------------------------------------------------------------
+
+    /**
+     * Represents an extent subdivided into tiles.
+     * Tile sizes can be specified at runtime.
+     *
+     * @tparam N The dimension of the extent and the tile.
+     */
+    template<int n>
+    class tiled_extent : public extent<n> {
+        std::uint32_t dynamic_group_segment_size_{};
+    public:
+        static constexpr int rank{n};
+
+        /**
+         * Tile size for each dimension.
+         */
+        const int tile_dim[n]{};
+
+        // CREATORS
+        /**
+         * Default constructor. The origin and extent is default-constructed and
+         * thus zero.
+         */
+        tiled_extent() [[cpu, hc]] = default;
+
+        /**
+         * Copy constructor. Constructs a new tiled_extent from the supplied
+         * argument "other".
+         *
+         * @param[in] other An object of type tiled_extent from which to
+         *                  initialize this new extent.
+         */
+        tiled_extent(const tiled_extent&) [[cpu, hc]] = default;
+        tiled_extent(tiled_extent&&) [[cpu, hc]] = default;
+
+        /**
+         * Construct an tiled extent with the size of extent and the size of
+         * tile specified.
+         *
+         * @param[in] e# Size of extent in the #th dimension.
+         * @param[in] t# Size of tile in the #th dimension.
+         */
+        template<int m = n, typename std::enable_if<m == 1>::type* = nullptr>
+        tiled_extent(int e0, int t0) [[cpu, hc]] : tiled_extent{e0, t0, 0u}
+        {}
+
+        template<int m = n, typename std::enable_if<m == 2>::type* = nullptr>
+        tiled_extent(int e0, int e1, int t0, int t1) [[cpu, hc]]
+            : tiled_extent{e0, e1, t0, t1, 0u}
+        {}
+
+        template<int m = n, typename std::enable_if<m == 3>::type* = nullptr>
+        tiled_extent(int e0, int e1, int e2, int t0, int t1, int t2) [[cpu, hc]]
+            : tiled_extent{e0, e1, e2, t0, t1, t2, 0u}
+        {}
+
+        /**
+         * Construct an tiled extent with the size of extent and the size of
+         * tile specified.
+         *
+         * @param[in] e# Size of extent in the #th dimension.
+         * @param[in] t# Size of tile in the #th dimension.
+         * @param[in] size Size of dynamic group segment.
+         */
+        template<int m = n, typename std::enable_if<m == 1>::type* = nullptr>
+        tiled_extent(int e0, int t0, std::uint32_t size) [[cpu, hc]]
+            : tiled_extent{hc::extent<n>{e0}, t0, size}
+        {}
+
+        template<int m = n, typename std::enable_if<m == 2>::type* = nullptr>
+        tiled_extent(
+            int e0, int e1, int t0, int t1, std::uint32_t size) [[cpu, hc]]
+            : tiled_extent{hc::extent<n>{e0, e1}, t0, t1, size}
+        {}
+
+        template<int m = n, typename std::enable_if<m == 3>::type* = nullptr>
+        tiled_extent(
+            int e0,
+            int e1,
+            int e2,
+            int t0,
+            int t1,
+            int t2,
+            std::uint32_t size) [[cpu, hc]]
+            : tiled_extent{hc::extent<n>{e0, e1, e2}, t0, t1, t2, size}
+        {}
+
+        /**
+         * Constructs a tiled_extent<N> with the extent "ext".
+         *
+         * @param[in] ext The extent of this tiled_extent
+         * @param[in] ts... Size of tile in dimensions....
+         */
+        template<   // TODO: tighten constraint.
+            typename... Ts,
+            typename std::enable_if<sizeof...(Ts) == n>::type* = nullptr>
+        tiled_extent(const extent<n>& ext, Ts... ts) [[cpu, hc]]
+            : tiled_extent{ext, ts..., 0u}
+        {}
+
+        /**
+         * Constructs a tiled_extent<N> with the extent "ext".
+         *
+         * @param[in] ext The extent of this tiled_extent
+         * @param[in] t# Size of tile in the #th dimension.
+         * @param[in] size Size of dynamic group segment
+         */
+        template<int m = n, typename std::enable_if<m == 1>::type* = nullptr>
+        tiled_extent(
+            const hc::extent<n>& ext, int t0, std::uint32_t size) [[cpu, hc]]
+            : extent<n>{ext}, dynamic_group_segment_size_{size}, tile_dim{t0}
+        {}
+
+        template<int m = n, typename std::enable_if<m == 2>::type* = nullptr>
+        tiled_extent(
+            const hc::extent<n>& ext,
+            int t0,
+            int t1,
+            std::uint32_t size) [[cpu, hc]]
+            :
+            extent<n>{ext}, dynamic_group_segment_size_{size}, tile_dim{t0, t1}
+        {}
+
+        template<int m = n, typename std::enable_if<m == 3>::type* = nullptr>
+        tiled_extent(
+            const hc::extent<n>& ext,
+            int t0,
+            int t1,
+            int t2,
+            std::uint32_t size) [[cpu, hc]]
+            :
+            extent<n>{ext},
+            dynamic_group_segment_size_{size},
+            tile_dim{t0, t1, t2}
+        {}
+
+        // MANIPULATORS
+        void set_dynamic_group_segment_size(std::uint32_t size) noexcept [[cpu]]
+        {
+            dynamic_group_segment_size_ = size;
+        }
+
+        // ACCESSORS
+        /**
+         * Return the size of dynamic group segment in bytes.
+         */
+        std::uint32_t get_dynamic_group_segment_size() const noexcept [[cpu]]
+        {
+            return dynamic_group_segment_size_;
+        }
+
+        tiled_extent pad() const noexcept [[cpu, hc]]
+        {
+            static const auto round_up_to_next_multiple = [](int x, int y) {
+                x = x + y - 1;
+                return x - x % y;
+            };
+
+            tiled_extent tmp{*this};
+            for (auto i = 0; i != n; ++i) {
+                tmp[i] = round_up_to_next_multiple(tmp[i], tile_dim[i]);
+            }
+
+            return tmp;
+        }
+
+        tiled_extent truncate() const noexcept [[cpu, hc]]
+        {
+            static const auto round_down_to_previous_multiple =
+                [](int x, int y) { return x - x % y; };
+
+            tiled_extent tmp{*this};
+            for (auto i = 0; i != n; ++i) {
+                tmp[i] = round_down_to_previous_multiple(tmp[i], tile_dim[i]);
+            }
+
+            return tmp;
+        }
+    };
+
+    // ------------------------------------------------------------------------
+    // implementation of extent<N>::tile()
+    // ------------------------------------------------------------------------
+
+    template <int N>
+    inline
+    tiled_extent<1> extent<N>::tile(int t0) const [[cpu, hc]]
+    {
+        static_assert(
+            N == 1,
+            "One-dimensional tile() method only available on extent<1>");
+        return tiled_extent<1>{*this, t0};
+    }
+
+    template <int N>
+    inline
+    tiled_extent<2> extent<N>::tile(int t0, int t1) const [[cpu, hc]]
+    {
+        static_assert(
+            N == 2,
+            "Two-dimensional tile() method only available on extent<2>");
+        return tiled_extent<2>{*this, t0, t1};
+    }
+
+    template <int N>
+    inline
+    tiled_extent<3> extent<N>::tile(int t0, int t1, int t2) const [[cpu, hc]]
+    {
+        static_assert(
+            N == 3,
+            "Three-dimensional tile() method only available on extent<3>");
+        return tiled_extent<3>{*this, t0, t1, t2};
+    }
+
+    // ------------------------------------------------------------------------
+    // implementation of extent<N>::tile_with_dynamic()
+    // ------------------------------------------------------------------------
+
+    template <int N>
+    inline
+    tiled_extent<1> extent<N>::tile_with_dynamic(
+        int t0, unsigned int dynamic_size) const [[cpu, hc]]
+    {
+        static_assert(
+            N == 1,
+            "One-dimensional tile() method only available on extent<1>");
+        return tiled_extent<1>{*this, t0, dynamic_size};
+    }
+
+    template <int N>
+    inline
+    tiled_extent<2> extent<N>::tile_with_dynamic(
+        int t0, int t1, unsigned int dynamic_size) const [[cpu, hc]]
+    {
+        static_assert(
+            N == 2,
+            "Two-dimensional tile() method only available on extent<2>");
+        return tiled_extent<2>{*this, t0, t1, dynamic_size};
+    }
+
+    template <int N>
+    inline
+    tiled_extent<3> extent<N>::tile_with_dynamic(
+        int t0, int t1, int t2, unsigned int dynamic_size) const [[cpu, hc]]
+    {
+        static_assert(
+            N == 3,
+            "Three-dimensional tile() method only available on extent<3>");
+        return tiled_extent<3>{*this, t0, t1, t2, dynamic_size};
+    }
+
+    // ------------------------------------------------------------------------
+    // Intrinsic functions for HSAIL instructions
+    // ------------------------------------------------------------------------
+
+    /**
+     * Fetch the size of a wavefront
+     *
+     * @return The size of a wavefront.
+     */
+    static constexpr auto __HSA_WAVEFRONT_SIZE__ = 64;
+
+    extern "C"
+    constexpr
+    unsigned int __wavesize() [[hc]];
+    #if __hcc_backend__ == HCC_BACKEND_AMDGPU
+        extern "C"
+        constexpr
+        inline
+        unsigned int __wavesize() [[hc]]
+        {
+            return __HSA_WAVEFRONT_SIZE__;
+        }
+    #endif
+
+    /**
+     * Count number of 1 bits in the input
+     *
+     * @param[in] input An unsigned 32-bit integer.
+     * @return Number of 1 bits in the input.
+     */
+    extern "C"
+    inline
+    unsigned int __popcount_u32_b32(unsigned int input) [[hc]]
+    {
+        return __builtin_popcount(input);
+    }
+
+    /**
+     * Count number of 1 bits in the input
+     *
+     * @param[in] input An unsigned 64-bit integer.
+     * @return Number of 1 bits in the input.
+     */
+    extern "C"
+    inline
+    unsigned int __popcount_u32_b64(unsigned long long int input) [[hc]]
+    {
+        return __builtin_popcountl(input);
+    }
+
+    /** @{ */
+    /**
+     * Extract a range of bits
+     *
+     * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/bit_string.htm">HSA PRM 5.7</a>
+     * for more detailed specification of these functions.
+     */
+    extern "C"
+    inline
+    unsigned int __bitextract_u32(
+        unsigned int src0, unsigned int src1, unsigned int src2) [[hc]]
+    {
+        uint32_t offset = src1 & 31;
+        uint32_t width = src2 & 31;
+        return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
+    }
+
+    extern "C"
+    inline
+    std::uint64_t __bitextract_u64(
+        std::uint64_t src0, unsigned int src1, unsigned int src2) [[hc]]
+    {
+        uint64_t offset = src1 & 63;
+        uint64_t width = src2 & 63;
+        return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
+    }
+
+    extern "C"
+    int __bitextract_s32(int src0, unsigned int src1, unsigned int src2) [[hc]];
+
+    extern "C"
+    std::int64_t __bitextract_s64(
+        std::int64_t src0, unsigned int src1, unsigned int src2) [[hc]];
+    /** @} */
+
+    /** @{ */
+    /**
+     * Replace a range of bits
+     *
+     * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/bit_string.htm">HSA PRM 5.7</a>
+     * for more detailed specification of these functions.
+     */
+    extern "C"
+    inline
+    unsigned int __bitinsert_u32(
+        unsigned int src0,
+        unsigned int src1,
+        unsigned int src2,
+        unsigned int src3) [[hc]]
+    {
+        uint32_t offset = src2 & 31;
+        uint32_t width = src3 & 31;
+        uint32_t mask = (1 << width) - 1;
+        return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
+    }
+
+    extern "C"
+    inline
+    std::uint64_t __bitinsert_u64(
+        std::uint64_t src0,
+        std::uint64_t src1,
+        unsigned int src2,
+        unsigned int src3) [[hc]]
+    {
+        uint64_t offset = src2 & 63;
+        uint64_t width = src3 & 63;
+        uint64_t mask = (1 << width) - 1;
+        return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
+    }
+
+    extern "C"
+    int __bitinsert_s32(
+        int src0, int src1, unsigned int src2, unsigned int src3) [[hc]];
+
+    extern "C"
+    std::int64_t __bitinsert_s64(
+        std::int64_t src0,
+        std::int64_t src1,
+        unsigned int src2,
+        unsigned int src3) [[hc]];
+    /** @} */
+
+    /** @{ */
+    /**
+     * Create a bit mask that can be used with bitselect
+     *
+     * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/bit_string.htm">HSA PRM 5.7</a>
+     * for more detailed specification of these functions.
+     */
+    extern "C"
+    unsigned int __bitmask_b32(unsigned int src0, unsigned int src1) [[hc]];
+
+    extern "C"
+    std::uint64_t __bitmask_b64(unsigned int src0, unsigned int src1) [[hc]];
+    /** @} */
+
+    /** @{ */
+    /**
+     * Reverse the bits
+     *
+     * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/bit_string.htm">HSA PRM 5.7</a>
+     * for more detailed specification of these functions.
+     */
+
+    unsigned int __bitrev_b32(
+        unsigned int src0) [[hc]] __asm("llvm.bitreverse.i32");
+
+    std::uint64_t __bitrev_b64(
+        std::uint64_t src0) [[hc]] __asm("llvm.bitreverse.i64");
+
+    /** @} */
+
+    /** @{ */
+    /**
+     * Do bit field selection
+     *
+     * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/bit_string.htm">HSA PRM 5.7</a>
+     * for more detailed specification of these functions.
+     */
+    extern "C"
+    inline
+    unsigned int __bitselect_b32(
+        unsigned int src0, unsigned int src1, unsigned int src2) [[hc]]
+    {
+        return (src1 & src0) | (src2 & ~src0);
+    }
+
+    extern "C"
+    inline
+    std::uint64_t __bitselect_b64(
+        std::uint64_t src0, std::uint64_t src1, std::uint64_t src2) [[hc]]
+    {
+        return (src1 & src0) | (src2 & ~src0);
+    }
+    /** @} */
+
+    /**
+     * Count leading zero bits in the input
+     *
+     * @param[in] input An unsigned 32-bit integer.
+     * @return Number of 0 bits until a 1 bit is found, counting start from the
+     *         most significant bit. -1 if there is no 0 bit.
+     */
+    extern "C"
+    inline
+    unsigned int __firstbit_u32_u32(unsigned int input) [[hc]]
+    {
+        return input == 0 ? -1 : __builtin_clz(input);
+    }
+
+    /**
+     * Count leading zero bits in the input
+     *
+     * @param[in] input An unsigned 64-bit integer.
+     * @return Number of 0 bits until a 1 bit is found, counting start from the
+     *         most significant bit. -1 if there is no 0 bit.
+     */
+    extern "C"
+    inline
+    unsigned int __firstbit_u32_u64(unsigned long long int input) [[hc]]
+    {
+        return input == 0 ? -1 : __builtin_clzl(input);
+    }
+
+    /**
+     * Count leading zero bits in the input
+     *
+     * @param[in] input An signed 32-bit integer.
+     * @return Finds the first bit set in a positive integer starting from the
+     *         most significant bit, or finds the first bit clear in a negative
+     *         integer from the most significant bit.
+     *         If no bits in the input are set, then dest is set to -1.
+     */
+    extern "C"
+    inline
+    unsigned int __firstbit_u32_s32(int input) [[hc]]
+    {
+        if (input == 0) {
+            return -1;
+        }
+
+        return input > 0 ?
+            __firstbit_u32_u32(input) : __firstbit_u32_u32(~input);
+    }
+
+
+    /**
+     * Count leading zero bits in the input
+     *
+     * @param[in] input An signed 64-bit integer.
+     * @return Finds the first bit set in a positive integer starting from the
+     *         most significant bit, or finds the first bit clear in a negative
+     *         integer from the most significant bit.
+     *         If no bits in the input are set, then dest is set to -1.
+     */
+    extern "C"
+    inline
+    unsigned int __firstbit_u32_s64(long long int input) [[hc]]
+    {
+        if (input == 0) {
+            return -1;
+        }
+
+        return input > 0 ?
+            __firstbit_u32_u64(input) : __firstbit_u32_u64(~input);
+    }
+
+    /** @{ */
+    /**
+     * Find the first bit set to 1 in a number starting from the least
+     * significant bit
+     *
+     * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/bit_string.htm">HSA PRM 5.7</a>
+     * for more detailed specification of these functions.
+     */
+    extern "C"
+    inline
+    unsigned int __lastbit_u32_u32(unsigned int input) [[hc]]
+    {
+        return input == 0 ? -1 : __builtin_ctz(input);
+    }
+
+    extern "C"
+    inline
+    unsigned int __lastbit_u32_u64(unsigned long long int input) [[hc]]
+    {
+        return input == 0 ? -1 : __builtin_ctzl(input);
+    }
+
+    extern "C"
+    inline
+    unsigned int __lastbit_u32_s32(int input) [[hc]]
+    {
+        return __lastbit_u32_u32(input);
+    }
+
+    extern "C"
+    inline unsigned int __lastbit_u32_s64(unsigned long long input) [[hc]]
+    {
+        return __lastbit_u32_u64(input);
+    }
+    /** @} */
+
+    /** @{ */
+    /**
+     * Copy and interleave the lower half of the elements from
+     * each source into the destination
+     *
+     * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/packed_data.htm">HSA PRM 5.9</a>
+     * for more detailed specification of these functions.
+     */
+    extern "C"
+    unsigned int __unpacklo_u8x4(unsigned int src0, unsigned int src1) [[hc]];
+
+    extern "C"
+    std::uint64_t __unpacklo_u8x8(
+        std::uint64_t src0, std::uint64_t src1) [[hc]];
+
+    extern "C"
+    unsigned int __unpacklo_u16x2(unsigned int src0, unsigned int src1) [[hc]];
+
+    extern "C"
+    std::uint64_t __unpacklo_u16x4(
+        std::uint64_t src0, std::uint64_t src1) [[hc]];
+
+    extern "C"
+    std::uint64_t __unpacklo_u32x2(
+        std::uint64_t src0, std::uint64_t src1) [[hc]];
+
+    extern "C"
+    int __unpacklo_s8x4(int src0, int src1) [[hc]];
+
+    extern "C"
+    std::int64_t __unpacklo_s8x8(std::int64_t src0, std::int64_t src1) [[hc]];
+
+    extern "C"
+    int __unpacklo_s16x2(int src0, int src1) [[hc]];
+
+    extern "C"
+    std::int64_t __unpacklo_s16x4(std::int64_t src0, std::int64_t src1) [[hc]];
+
+    extern "C"
+    std::int64_t __unpacklo_s32x2(std::int64_t src0, std::int64_t src1) [[hc]];
+    /** @} */
+
+    /** @{ */
+    /**
+     * Copy and interleave the upper half of the elements from
+     * each source into the destination
+     *
+     * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/packed_data.htm">HSA PRM 5.9</a>
+     * for more detailed specification of these functions.
+     */
+    extern "C"
+    unsigned int __unpackhi_u8x4(unsigned int src0, unsigned int src1) [[hc]];
+
+    extern "C"
+    std::uint64_t __unpackhi_u8x8(
+        std::uint64_t src0, std::uint64_t src1) [[hc]];
+
+    extern "C"
+    unsigned int __unpackhi_u16x2(unsigned int src0, unsigned int src1) [[hc]];
+
+    extern "C"
+    std::uint64_t __unpackhi_u16x4(
+        std::uint64_t src0, std::uint64_t src1) [[hc]];
+
+    extern "C"
+    std::uint64_t __unpackhi_u32x2(
+        std::uint64_t src0, std::uint64_t src1) [[hc]];
+
+    extern "C"
+    int __unpackhi_s8x4(int src0, int src1) [[hc]];
+
+    extern "C"
+    std::int64_t __unpackhi_s8x8(std::int64_t src0, std::int64_t src1) [[hc]];
+
+    extern "C"
+    int __unpackhi_s16x2(int src0, int src1) [[hc]];
+
+    extern "C"
+    std::int64_t __unpackhi_s16x4(std::int64_t src0, std::int64_t src1) [[hc]];
+
+    extern "C"
+    std::int64_t __unpackhi_s32x2(std::int64_t src0, std::int64_t src1) [[hc]];
+    /** @} */
+
+    /** @{ */
+    /**
+     * Assign the elements of the packed value in src0, replacing
+     * the element specified by src2 with the value from src1
+     *
+     * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/packed_data.htm">HSA PRM 5.9</a>
+     * for more detailed specification of these functions.
+     */
+    extern "C"
+    unsigned int __pack_u8x4_u32(
+        unsigned int src0, unsigned int src1, unsigned int src2) [[hc]];
+
+    extern "C"
+    std::uint64_t __pack_u8x8_u32(
+        std::uint64_t src0, unsigned int src1, unsigned int src2) [[hc]];
+
+    extern "C"
+    unsigned __pack_u16x2_u32(
+        unsigned int src0, unsigned int src1, unsigned int src2) [[hc]];
+
+    extern "C"
+    std::uint64_t __pack_u16x4_u32(
+        std::uint64_t src0, unsigned int src1, unsigned int src2) [[hc]];
+
+    extern "C"
+    std::uint64_t __pack_u32x2_u32(
+        std::uint64_t src0, unsigned int src1, unsigned int src2) [[hc]];
+
+    extern "C"
+    int __pack_s8x4_s32(int src0, int src1, unsigned int src2) [[hc]];
+
+    extern "C"
+    std::int64_t __pack_s8x8_s32(
+        std::int64_t src0, int src1, unsigned int src2) [[hc]];
+
+    extern "C"
+    int __pack_s16x2_s32(int src0, int src1, unsigned int src2) [[hc]];
+
+    extern "C"
+    std::int64_t __pack_s16x4_s32(
+        std::int64_t src0, int src1, unsigned int src2) [[hc]];
+
+    extern "C"
+    std::int64_t __pack_s32x2_s32(
+        std::int64_t src0, int src1, unsigned int src2) [[hc]];
+
+    extern "C"
+    double __pack_f32x2_f32(double src0, float src1, unsigned int src2) [[hc]];
+    /** @} */
+
+    /** @{ */
+    /**
+     * Assign the elements specified by src1 from the packed value in src0
+     *
+     * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/packed_data.htm">HSA PRM 5.9</a>
+     * for more detailed specification of these functions.
+     */
+    extern "C"
+    unsigned int __unpack_u32_u8x4(unsigned int src0, unsigned int src1) [[hc]];
+
+    extern "C"
+    unsigned int __unpack_u32_u8x8(uint64_t src0, unsigned int src1) [[hc]];
+
+    extern "C"
+    unsigned int __unpack_u32_u16x2(
+        unsigned int src0, unsigned int src1) [[hc]];
+
+    extern "C"
+    unsigned int __unpack_u32_u16x4(
+        std::uint64_t src0, unsigned int src1) [[hc]];
+
+    extern "C"
+    unsigned int __unpack_u32_u32x2(
+        std::uint64_t src0, unsigned int src1) [[hc]];
+
+    extern "C"
+    int __unpack_s32_s8x4(int src0, unsigned int src1) [[hc]];
+
+    extern "C"
+    int __unpack_s32_s8x8(std::int64_t src0, unsigned int src1) [[hc]];
+
+    extern "C"
+    int __unpack_s32_s16x2(int src0, unsigned int src1) [[hc]];
+
+    extern "C"
+    int __unpack_s32_s16x4(std::int64_t src0, unsigned int src1) [[hc]];
+
+    extern "C"
+    int __unpack_s32_s3x2(std::int64_t src0, unsigned int src1) [[hc]];
+
+    extern "C"
+    float __unpack_f32_f32x2(double src0, unsigned int src1) [[hc]];
+    /** @} */
+
+    /**
+     * Align 32 bits within 64 bits of data on an arbitrary bit boundary
+     *
+     * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/multimedia.htm">HSA PRM 5.15</a>
+     * for more detailed specification.
+     */
+    extern "C"
+    unsigned int __bitalign_b32(
+        unsigned int src0, unsigned int src1, unsigned int src2) [[hc]];
+
+    /**
+     * Align 32 bits within 64 bis of data on an arbitrary byte boundary
+     *
+     * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/multimedia.htm">HSA PRM 5.15</a>
+     * for more detailed specification.
+     */
+    extern "C"
+    unsigned int __bytealign_b32(
+        unsigned int src0, unsigned int src1, unsigned int src2) [[hc]];
+
+    /**
+     * Do linear interpolation and computes the unsigned 8-bit average of packed
+     * data
+     *
+     * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/multimedia.htm">HSA PRM 5.15</a>
+     * for more detailed specification.
+     */
+    extern "C"
+    unsigned int __lerp_u8x4(
+        unsigned int src0, unsigned int src1, unsigned int src2) [[hc]];
+
+    /**
+     * Takes four floating-point number, convers them to unsigned integer
+     * values, and packs them into a packed u8x4 value
+     *
+     * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/multimedia.htm">HSA PRM 5.15</a>
+     * for more detailed specification.
+     */
+    extern "C"
+    unsigned int __packcvt_u8x4_f32(
+        float src0, float src1, float src2, float src3) [[hc]];
+
+    /**
+     * Unpacks a single element from a packed u8x4 value and converts it to an
+     * f32.
+     *
+     * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/multimedia.htm">HSA PRM 5.15</a>
+     * for more detailed specification.
+     */
+    extern "C"
+    float __unpackcvt_f32_u8x4(unsigned int src0, unsigned int src1) [[hc]];
+
+    /** @{ */
+    /**
+     * Computes the sum of the absolute differences of src0 and src1 and then
+     * adds src2 to the result
+     *
+     * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/multimedia.htm">HSA PRM 5.15</a> 
+     * for more detailed specification.
+     */
+    extern "C"
+    unsigned int __sad_u32_u32(
+        unsigned int src0, unsigned int src1, unsigned int src2) [[hc]];
+
+    extern "C"
+    unsigned int __sad_u32_u16x2(
+        unsigned int src0, unsigned int src1, unsigned int src2) [[hc]];
+
+    extern "C"
+    unsigned int __sad_u32_u8x4(
+        unsigned int src0, unsigned int src1, unsigned int src2) [[hc]];
+    /** @} */
+
+    /**
+     * This function is mostly the same as sad except the sum of absolute
+     * differences is added to the most significant 16 bits of the result
+     *
+     * Please refer to <a href="http://www.hsafoundation.com/html/Content/PRM/Topics/05_Arithmetic/multimedia.htm">HSA PRM 5.15</a>
+     * for more detailed specification.
+     */
+    extern "C"
+    unsigned int __sadhi_u16x2_u8x4(
+        unsigned int src0, unsigned int src1, unsigned int src2) [[hc]];
+
+    /**
+     * Get system timestamp
+     */
+    extern "C"
+    std::uint64_t __clock_u64() [[hc]];
+
+    /**
+     * Get hardware cycle count
+     *
+     * Notice the return value of this function is implementation defined.
+     */
+    extern "C"
+    std::uint64_t __cycle_u64() [[hc]];
+
+    /**
+     * Get the count of the number of earlier (in flattened
+     * work-item order) active work-items within the same wavefront.
+     *
+     * @return The result will be in the range 0 to WAVESIZE - 1.
+     */
+    extern "C"
+    unsigned int __activelaneid_u32() [[hc]];
+
+    /**
+     * Return a bit mask shows which active work-items in the
+     * wavefront have a non-zero input. The affected bit position within the
+     * registers of dest corresponds to each work-item's lane ID.
+     *
+     * The HSAIL instruction would return 4 64-bit registers but the current
+     * implementation would only return the 1st one and ignore the other 3 as
+     * right now all HSA agents have wavefront of size 64.
+     *
+     * @param[in] input An unsigned 32-bit integer.
+     * @return The bitmask calculated.
+     */
+    extern "C"
+    std::uint64_t __activelanemask_v4_b64_b1(unsigned int input) [[hc]];
+
+    /**
+     * Count the number of active work-items in the current
+     * wavefront that have a non-zero input.
+     *
+     * @param[in] input An unsigned 32-bit integer.
+     * @return The number of active work-items in the current wavefront that
+     *         have a non-zero input.
+     */
+    extern "C"
+    inline
+    unsigned int __activelanecount_u32_b1(unsigned int input) [[hc]]
+    {
+        return  __popcount_u32_b64(__activelanemask_v4_b64_b1(input));
+    }
+
+    // ------------------------------------------------------------------------
+    // Wavefront Vote Functions
+    // ------------------------------------------------------------------------
+
+    /**
+     * Evaluate predicate for all active work-items in the wavefront and return
+     * non-zero if and only if predicate evaluates to non-zero for any of them.
+     */
+    extern "C"
+    bool __ockl_wfany_i32(int) [[hc]];
+    extern "C"
+    inline
+    int __any(int predicate) [[hc]]
+    {
+        return __ockl_wfany_i32(predicate);
+    }
+
+    /**
+     * Evaluate predicate for all active work-items in the wavefront and return
+     * non-zero if and only if predicate evaluates to non-zero for all of them.
+     */
+    extern "C"
+    bool __ockl_wfall_i32(int) [[hc]];
+    extern "C"
+    inline
+    int __all(int predicate) [[hc]]
+    {
+        return __ockl_wfall_i32(predicate);
+    }
+
+    /**
+     * Evaluate predicate for all active work-items in the wavefront and return
+     * an integer whose Nth bit is set if and only if predicate evaluates to
+     * non-zero for the Nth work-item of the wavefront and the Nth work-item is
+     * active.
+     */
+
+    // XXX from llvm/include/llvm/IR/InstrTypes.h
+    __attribute__((convergent))
+    unsigned long long __llvm_amdgcn_icmp_i32(
+        unsigned int x,
+        unsigned int y,
+        unsigned int z) [[hc]] __asm("llvm.amdgcn.icmp.i32");
+    extern "C"
+    inline
+    std::uint64_t __ballot(int predicate) [[hc]]
+    {
+        static constexpr unsigned int ICMP_NE = 33;
+        return __llvm_amdgcn_icmp_i32(predicate, 0, ICMP_NE);
+    }
+
+    // ------------------------------------------------------------------------
+    // Wavefront Shuffle Functions
+    // ------------------------------------------------------------------------
+
+    // utility union type
+    union __u {
+        int i;
+        unsigned int u;
+        float f;
+    };
+
+    /** @{ */
+    /**
+     * Direct copy from indexed active work-item within a wavefront.
+     *
+     * Work-items may only read data from another work-item which is active in
+     * the current wavefront. If the target work-item is inactive, the retrieved
+     * value is fixed as 0.
+     *
+     * The function returns the value of var held by the work-item whose ID is
+     * given by srcLane. If width is less than __HSA_WAVEFRONT_SIZE__ then each
+     * subsection of the wavefront behaves as a separate entity with a starting
+     * logical work-item ID of 0. If srcLane is outside the range [0:width-1],
+     * the value returned corresponds to the value of var held by:
+     * srcLane modulo width (i.e. within the same subsection).
+     *
+     * The optional width parameter must have a value which is a power of 2;
+     * results are undefined if it is not a power of 2, or is number greater
+     * than __HSA_WAVEFRONT_SIZE__.
+     */
+
+    #if __hcc_backend__ == HCC_BACKEND_AMDGPU
+        /*
+        * FIXME: We need to add __builtin_amdgcn_mbcnt_{lo,hi} to clang and call
+        * them here instead.
+        */
+
+        int __amdgcn_mbcnt_lo(
+            int mask, int src) [[hc]] __asm("llvm.amdgcn.mbcnt.lo");
+        int __amdgcn_mbcnt_hi(
+            int mask, int src) [[hc]] __asm("llvm.amdgcn.mbcnt.hi");
+
+        inline
+        int __lane_id(void) [[hc]]
+        {
+            int lo = __amdgcn_mbcnt_lo(-1, 0);
+            return __amdgcn_mbcnt_hi(-1, lo);
+        }
+    #endif
+
+    #if __hcc_backend__ == HCC_BACKEND_AMDGPU
+        /**
+         * ds_bpermute intrinsic
+         * FIXME: We need to add __builtin_amdgcn_ds_bpermute to clang and call
+         * it here instead.
+         */
+        int __amdgcn_ds_bpermute(
+            int index, int src) [[hc]] __asm("llvm.amdgcn.ds.bpermute");
+        inline
+        unsigned int __amdgcn_ds_bpermute(int index, unsigned int src) [[hc]]
+        {
+            __u tmp; tmp.u = src;
+            tmp.i = __amdgcn_ds_bpermute(index, tmp.i);
+            return tmp.u;
+        }
+        inline
+        float __amdgcn_ds_bpermute(int index, float src) [[hc]]
+        {
+            __u tmp; tmp.f = src;
+            tmp.i = __amdgcn_ds_bpermute(index, tmp.i);
+            return tmp.f;
+        }
+
+        /**
+         * ds_permute intrinsic
+         */
+        extern "C"
+        int __amdgcn_ds_permute(int index, int src) [[hc]];
+        inline
+        unsigned int __amdgcn_ds_permute(int index, unsigned int src) [[hc]]
+        {
+            __u tmp; tmp.u = src;
+            tmp.i = __amdgcn_ds_permute(index, tmp.i);
+            return tmp.u;
+        }
+        inline
+        float __amdgcn_ds_permute(int index, float src) [[hc]]
+        {
+            __u tmp; tmp.f = src;
+            tmp.i = __amdgcn_ds_permute(index, tmp.i);
+            return tmp.f;
+        }
+
+        /**
+         * ds_swizzle intrinsic
+         */
+        extern "C"
+        int __amdgcn_ds_swizzle(int src, int pattern) [[hc]];
+        inline
+        unsigned int __amdgcn_ds_swizzle(unsigned int src, int pattern) [[hc]]
+        {
+            __u tmp; tmp.u = src;
+            tmp.i = __amdgcn_ds_swizzle(tmp.i, pattern);
+            return tmp.u;
+        }
+        inline
+        float __amdgcn_ds_swizzle(float src, int pattern) [[hc]]
+        {
+            __u tmp; tmp.f = src;
+            tmp.i = __amdgcn_ds_swizzle(tmp.i, pattern);
+            return tmp.f;
+        }
+
+        /**
+         * move DPP intrinsic
+         */
+        extern "C"
+        int __amdgcn_move_dpp(
+            int src,
+            int dpp_ctrl,
+            int row_mask,
+            int bank_mask,
+            bool bound_ctrl) [[hc]];
+
+        /**
+         * Shift the value of src to the right by one thread within a wavefront.
+         *
+         * @param[in] src variable being shifted
+         * @param[in] bound_ctrl When set to true, a zero will be shifted into
+         *                       thread 0; otherwise, the original value will be
+         *                       returned for thread 0
+         * @return value of src being shifted into from the neighboring lane
+         *
+         */
+        extern "C"
+        int __amdgcn_wave_sr1(int src, bool bound_ctrl) [[hc]];
+        inline
+        unsigned int __amdgcn_wave_sr1(unsigned int src, bool bound_ctrl) [[hc]]
+        {
+            __u tmp; tmp.u = src;
+            tmp.i = __amdgcn_wave_sr1(tmp.i, bound_ctrl);
+            return tmp.u;
+        }
+        inline
+        float __amdgcn_wave_sr1(float src, bool bound_ctrl) [[hc]]
+        {
+            __u tmp; tmp.f = src;
+            tmp.i = __amdgcn_wave_sr1(tmp.i, bound_ctrl);
+            return tmp.f;
+        }
+
+        /**
+         * Shift the value of src to the left by one thread within a wavefront.
+         *
+         * @param[in] src variable being shifted
+         * @param[in] bound_ctrl When set to true, a zero will be shifted into
+         *                       thread 63; otherwise, the original value will
+         *                       be returned for thread 63
+         * @return value of src being shifted into from the neighboring lane
+         *
+         */
+        extern "C"
+        int __amdgcn_wave_sl1(int src, bool bound_ctrl) [[hc]];
+        inline
+        unsigned int __amdgcn_wave_sl1(unsigned int src, bool bound_ctrl) [[hc]]
+        {
+            __u tmp; tmp.u = src;
+            tmp.i = __amdgcn_wave_sl1(tmp.i, bound_ctrl);
+            return tmp.u;
+        }
+        inline
+        float __amdgcn_wave_sl1(float src, bool bound_ctrl) [[hc]]
+        {
+            __u tmp; tmp.f = src;
+            tmp.i = __amdgcn_wave_sl1(tmp.i, bound_ctrl);
+            return tmp.f;
+        }
+
+        /**
+         * Rotate the value of src to the right by one thread within a
+         * wavefront.
+         *
+         * @param[in] src variable being rotated
+         * @return value of src being rotated into from the neighboring lane
+         *
+         */
+        extern "C"
+        int __amdgcn_wave_rr1(int src) [[hc]];
+        inline
+        unsigned int __amdgcn_wave_rr1(unsigned int src) [[hc]]
+        {
+            __u tmp; tmp.u = src;
+            tmp.i = __amdgcn_wave_rr1(tmp.i);
+            return tmp.u;
+        }
+        inline
+        float __amdgcn_wave_rr1(float src) [[hc]]
+        {
+            __u tmp; tmp.f = src;
+            tmp.i = __amdgcn_wave_rr1(tmp.i);
+            return tmp.f;
+        }
+
+        /**
+         * Rotate the value of src to the left by one thread within a wavefront.
+         *
+         * @param[in] src variable being rotated
+         * @return value of src being rotated into from the neighboring lane
+         *
+         */
+        extern "C"
+        int __amdgcn_wave_rl1(int src) [[hc]];
+        inline
+        unsigned int __amdgcn_wave_rl1(unsigned int src) [[hc]]
+        {
+            __u tmp; tmp.u = src;
+            tmp.i = __amdgcn_wave_rl1(tmp.i);
+            return tmp.u;
+        }
+        inline
+        float __amdgcn_wave_rl1(float src) [[hc]]
+        {
+            __u tmp; tmp.f = src;
+            tmp.i = __amdgcn_wave_rl1(tmp.i);
+            return tmp.f;
+        }
+    #endif
+
+    /* definition to expand macro then apply to pragma message
+    #define VALUE_TO_STRING(x) #x
+    #define VALUE(x) VALUE_TO_STRING(x)
+    #define VAR_NAME_VALUE(var) #var "="  VALUE(var)
+    #pragma message(VAR_NAME_VALUE(__hcc_backend__))
+    */
+
+    #if __hcc_backend__ == HCC_BACKEND_AMDGPU
+        inline
+        int __shfl(
+            int var, int srcLane, int width = __HSA_WAVEFRONT_SIZE__) [[hc]]
+        {
+            int self = __lane_id();
+            int index = srcLane + (self & ~(width-1));
+            return __amdgcn_ds_bpermute(index<<2, var);
+        }
+
+        inline
+        unsigned int __shfl(
+            unsigned int var,
+            int srcLane,
+            int width = __HSA_WAVEFRONT_SIZE__) [[hc]]
+        {
+            __u tmp; tmp.u = var;
+            tmp.i = __shfl(tmp.i, srcLane, width);
+            return tmp.u;
+        }
+
+
+        inline
+        float __shfl(
+            float var, int srcLane, int width = __HSA_WAVEFRONT_SIZE__) [[hc]]
+        {
+            __u tmp; tmp.f = var;
+            tmp.i = __shfl(tmp.i, srcLane, width);
+            return tmp.f;
+        }
+    #endif
+
+    // FIXME: support half type
+    /** @} */
+
+    /** @{ */
+    /**
+     * Copy from an active work-item with lower ID relative to caller within a
+     * wavefront.
+     *
+     * Work-items may only read data from another work-item which is active in
+     * the current wavefront. If the target work-item is inactive, the retrieved
+     * value is fixed as 0.
+     *
+     * The function calculates a source work-item ID by subtracting delta from
+     * the caller's work-item ID within the wavefront. The value of var held by
+     * the resulting lane ID is returned: in effect, var is shifted up the
+     * wavefront by delta work-items. If width is less than
+     * __HSA_WAVEFRONT_SIZE__ then each subsection of the wavefront behaves as a
+     * separate entity with a starting logical work-item ID of 0. The source
+     * work-item index will not wrap around the value of width, so effectively
+     * the lower delta work-items will be unchanged.
+     *
+     * The optional width parameter must have a value which is a power of 2;
+     * results are undefined if it is not a power of 2, or is number greater
+     * than __HSA_WAVEFRONT_SIZE__.
+     */
+
+    #if __hcc_backend__ == HCC_BACKEND_AMDGPU
+        inline
+        int __shfl_up(
+            int var,
+            unsigned int delta,
+            int width = __HSA_WAVEFRONT_SIZE__) [[hc]]
+        {
+            int self = __lane_id();
+            int index = self - delta;
+            index = (index < (self & ~(width-1)))?self:index;
+            return __amdgcn_ds_bpermute(index<<2, var);
+        }
+
+        inline
+        unsigned int __shfl_up(
+            unsigned int var,
+            unsigned int delta,
+            int width = __HSA_WAVEFRONT_SIZE__) [[hc]]
+        {
+            __u tmp; tmp.u = var;
+            tmp.i = __shfl_up(tmp.i, delta, width);
+            return tmp.u;
+        }
+
+        inline
+        float __shfl_up(
+            float var,
+            unsigned int delta,
+            int width = __HSA_WAVEFRONT_SIZE__) [[hc]]
+        {
+            __u tmp; tmp.f = var;
+            tmp.i = __shfl_up(tmp.i, delta, width);
+            return tmp.f;
+        }
+    #endif
+
+    // FIXME: support half type
+    /** @} */
+
+    /** @{ */
+    /**
+     * Copy from an active work-item with higher ID relative to
+     * caller within a wavefront.
+     *
+     * Work-items may only read data from another work-item which is active in
+     * the current wavefront. If the target work-item is inactive, the retrieved
+     * value is fixed as 0.
+     *
+     * The function calculates a source work-item ID by adding delta from the
+     * caller's work-item ID within the wavefront. The value of var held by the
+     * resulting lane ID is returned: this has the effect of shifting var up the
+     * wavefront by delta work-items. If width is less than
+     * __HSA_WAVEFRONT_SIZE__ then each subsection of the wavefront behaves as a
+     * separate entity with a starting logical work-item ID of 0. The ID number
+     * of the source work-item index will not wrap around the value of width, so
+     * the upper delta work-items will remain unchanged.
+     *
+     * The optional width parameter must have a value which is a power of 2;
+     * results are undefined if it is not a power of 2, or is number greater
+     * than __HSA_WAVEFRONT_SIZE__.
+     */
+
+    #if __hcc_backend__ == HCC_BACKEND_AMDGPU
+        inline
+        int __shfl_down(
+            int var,
+            unsigned int delta,
+            int width = __HSA_WAVEFRONT_SIZE__) [[hc]]
+        {
+            int self = __lane_id();
+            int index = self + delta;
+            index = (int)((self&(width-1))+delta) >= width?self:index;
+            return __amdgcn_ds_bpermute(index<<2, var);
+        }
+
+        inline
+        unsigned int __shfl_down(
+            unsigned int var,
+            unsigned int delta,
+            int width = __HSA_WAVEFRONT_SIZE__) [[hc]]
+        {
+            __u tmp; tmp.u = var;
+            tmp.i = __shfl_down(tmp.i, delta, width);
+            return tmp.u;
+        }
+
+        inline
+        float __shfl_down(
+            float var,
+            unsigned int delta,
+            int width = __HSA_WAVEFRONT_SIZE__) [[hc]]
+        {
+            __u tmp; tmp.f = var;
+            tmp.i = __shfl_down(tmp.i, delta, width);
+            return tmp.f;
+        }
+    #endif
+
+    // FIXME: support half type
+    /** @} */
+
+    /** @{ */
+    /**
+     * Copy from an active work-item based on bitwise XOR of caller work-item ID
+     * within a wavefront.
+     *
+     * Work-items may only read data from another work-item which is active in
+     * the current wavefront. If the target work-item is inactive, the retrieved
+     * value is fixed as 0.
+     *
+     * THe function calculates a source work-item ID by performing a bitwise XOR
+     * of the caller's work-item ID with laneMask: the value of var held by the
+     * resulting work-item ID is returned.
+     *
+     * The optional width parameter must have a value which is a power of 2;
+     * results are undefined if it is not a power of 2, or is number greater
+     * than __HSA_WAVEFRONT_SIZE__.
+     */
+
+    #if __hcc_backend__ == HCC_BACKEND_AMDGPU
+        inline
+        int __shfl_xor(
+            int var, int laneMask, int width = __HSA_WAVEFRONT_SIZE__) [[hc]]
+        {
+            int self = __lane_id();
+            int index = self^laneMask;
+            index = index >= ((self+width)&~(width-1))?self:index;
+            return __amdgcn_ds_bpermute(index<<2, var);
+        }
+
+        inline
+        float __shfl_xor(
+            float var, int laneMask, int width = __HSA_WAVEFRONT_SIZE__) [[hc]]
+        {
+            __u tmp; tmp.f = var;
+            tmp.i = __shfl_xor(tmp.i, laneMask, width);
+            return tmp.f;
+        }
+
+        // FIXME: support half type
+        /** @} */
+
+        inline
+        unsigned int __shfl_xor(
+            unsigned int var,
+            int laneMask,
+            int width = __HSA_WAVEFRONT_SIZE__) [[hc]]
+        {
+            __u tmp; tmp.u = var;
+            tmp.i = __shfl_xor(tmp.i, laneMask, width);
+            return tmp.u;
+        }
+    #endif
+
+    /**
+     * Multiply two unsigned integers (x,y) but only the lower 24 bits will be
+     * used in the multiplication.
+     *
+     * @param[in] x 24-bit unsigned integer multiplier
+     * @param[in] y 24-bit unsigned integer multiplicand
+     * @return 32-bit unsigned integer product
+     */
+    inline
+    unsigned int __mul24(unsigned int x, unsigned int y) [[hc]]
+    {
+        return (x & 0x00FFFFFF) * (y & 0x00FFFFFF);
+    }
+
+    /**
+     * Multiply two integers (x,y) but only the lower 24 bits will be used in
+     * the multiplication.
+     *
+     * @param[in] x 24-bit integer multiplier
+     * @param[in] y 24-bit integer multiplicand
+     * @return 32-bit integer product
+     */
+    inline
+    int __mul24(int x, int y) [[hc]]
+    {
+        return  ((x << 8) >> 8) * ((y << 8) >> 8);
+    }
+
+    /**
+     * Multiply two unsigned integers (x,y) but only the lower 24 bits will be
+     * used in the multiplication and then add the product to a 32-bit unsigned
+     * integer
+     *
+     * @param[in] x 24-bit unsigned integer multiplier
+     * @param[in] y 24-bit unsigned integer multiplicand
+     * @param[in] z 32-bit unsigned integer to be added to the product
+     * @return 32-bit unsigned integer result of mad24
+     */
+    inline
+    unsigned int __mad24(unsigned int x, unsigned int y, unsigned int z) [[hc]]
+    {
+        return __mul24(x,y) + z;
+    }
+
+    /**
+     * Multiply two integers (x,y) but only the lower 24 bits will be used in
+     * the multiplication and then add the product to a 32-bit integer
+     *
+     * @param[in] x 24-bit integer multiplier
+     * @param[in] y 24-bit integer multiplicand
+     * @param[in] z 32-bit integer to be added to the product
+     * @return 32-bit integer result of mad24
+     */
+    inline
+    int __mad24(int x, int y, int z) [[hc]]
+    {
+        return __mul24(x,y) + z;
+    }
+
+    inline
+    void abort() [[hc]]
+    {
+        __builtin_trap();
+    }
+
+    // ------------------------------------------------------------------------
+    // group segment
+    // ------------------------------------------------------------------------
+
+    /**
+     * Fetch the size of group segment. This includes both static group segment
+     * and dynamic group segment.
+     *
+     * @return The size of group segment used by the kernel in bytes. The value
+     *         includes both static group segment and dynamic group segment.
+     */
+    extern "C" unsigned int get_group_segment_size() [[hc]];
+
+    /**
+     * Fetch the size of static group segment
+     *
+     * @return The size of static group segment used by the kernel in bytes.
+     */
+    extern "C" unsigned int get_static_group_segment_size() [[hc]];
+
+    /**
+     * Fetch the address of the beginning of group segment.
+     */
+    extern "C" void* get_group_segment_base_pointer() [[hc]];
+
+    /**
+     * Fetch the address of the beginning of dynamic group segment.
+     */
+    extern "C" void* get_dynamic_group_segment_base_pointer() [[hc]];
+
+    // ------------------------------------------------------------------------
+    // tiled_barrier
+    // ------------------------------------------------------------------------
+
+    /**
+     * The tile_barrier class is a capability class that is only creatable by
+     * the system, and passed to a tiled parallel_for_each function object as
+     * part of the tiled_index parameter. It provides member functions, such as
+     * wait, whose purpose is to synchronize execution of threads running within
+     * the thread tile.
+     */
+    class tile_barrier {
+    public:
+        /**
+         * Copy constructor. Constructs a new tile_barrier from the supplied
+         * argument "other".
+         *
+         * @param[in] other An object of type tile_barrier from which to
+         *                  initialize this.
+         */
+        tile_barrier(const tile_barrier&) [[cpu, hc]] = default;
+
+        /**
+         * Blocks execution of all threads in the thread tile until all threads
+         * in the tile have reached this call. Establishes a memory fence on all
+         * tile_static and global memory operations executed by the threads in
+         * the tile such that all memory operations issued prior to hitting the
+         * barrier are visible to all other threads after the barrier has
+         * completed and none of the memory operations occurring after the
+         * barrier are executed before hitting the barrier. This is identical to
+         * wait_with_all_memory_fence().
+         */
+        void wait() const noexcept [[hc]]
+        {
+            wait_with_all_memory_fence();
+        }
+
+        /**
+         * Blocks execution of all threads in the thread tile until all threads in
+         * the tile have reached this call. Establishes a memory fence on all
+         * tile_static and global memory operations executed by the threads in the
+         * tile such that all memory operations issued prior to hitting the barrier
+         * are visible to all other threads after the barrier has completed and
+         * none of the memory operations occurring after the barrier are executed
+         * before hitting the barrier. This is identical to wait().
+         */
+        void wait_with_all_memory_fence() const noexcept [[hc]]
+        {
+            hc_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+        }
+
+        /**
+         * Blocks execution of all threads in the thread tile until all threads
+         * in the tile have reached this call. Establishes a memory fence on
+         * global memory operations (but not tile-static memory operations)
+         * executed by the threads in the tile such that all global memory
+         * operations issued prior to hitting the barrier are visible to all
+         * other threads after the barrier has completed and none of the global
+         * memory operations occurring after the barrier are executed before
+         * hitting the barrier.
+         */
+        void wait_with_global_memory_fence() const noexcept [[hc]]
+        {
+            hc_barrier(CLK_GLOBAL_MEM_FENCE);
+        }
+
+        /**
+         * Blocks execution of all threads in the thread tile until all threads
+         * in the tile have reached this call. Establishes a memory fence on
+         * tile-static memory operations (but not global memory operations)
+         * executed by the threads in the tile such that all tile_static memory
+         * operations issued prior to hitting the barrier are visible to all
+         * other threads after the barrier has completed and none of the
+         * tile-static memory operations occurring after the barrier are
+         * executed before hitting the barrier.
+         */
+        void wait_with_tile_static_memory_fence() const [[hc]] {
+            hc_barrier(CLK_LOCAL_MEM_FENCE);
+        }
+
+    private:
+        tile_barrier() [[hc]] = default;
+
+        template <int N> friend
+            class tiled_index;
+    };
+
+    // ------------------------------------------------------------------------
+    // other memory fences
+    // ------------------------------------------------------------------------
+
+    /**
+     * Establishes a thread-tile scoped memory fence for both global and
+     * tile-static memory operations. This function does not imply a barrier and
+     * is therefore permitted in divergent code.
+     */
+    // FIXME: this functions has not been implemented.
+    void all_memory_fence(const tile_barrier&) [[hc]];
+
+    /**
+     * Establishes a thread-tile scoped memory fence for global (but not
+     * tile-static) memory operations. This function does not imply a barrier
+     * and is therefore permitted in divergent code.
+     */
+    // FIXME: this functions has not been implemented.
+    void global_memory_fence(const tile_barrier&) [[hc]];
+
+    /**
+     * Establishes a thread-tile scoped memory fence for tile-static (but not
+     * global) memory operations. This function does not imply a barrier and is
+     * therefore permitted in divergent code.
+     */
+    // FIXME: this functions has not been implemented.
+    void tile_static_memory_fence(const tile_barrier&) [[hc]];
+
+    // ------------------------------------------------------------------------
+    // tiled_index
+    // ------------------------------------------------------------------------
+
+    /**
+     * Represents a set of related indices subdivided into 1-, 2-, or
+     * 3-dimensional tiles.
+     *
+     * @tparam n Tile dimension.
+     */
+    template<int n>
+    class tiled_index {
+        friend struct detail::Indexer;
+
+        template<typename Kernel>
+        friend
+        completion_future parallel_for_each(
+            const accelerator_view&, const tiled_extent<n>&, const Kernel&);
+
+        // TODO: convert to using the hc_ flavoured functions.
+        template<int m = n, typename std::enable_if<m == 1>::type* = nullptr>
+        tiled_index() [[hc]]
+            : global{hc_get_workitem_absolute_id(0)},
+            local{hc_get_workitem_id(0)},
+            tile{hc_get_group_id(0)},
+            tile_origin{global[0] - local[0]},
+            tile_dim{hc_get_group_size(0)}
+        {}
+        template<int m = n, typename std::enable_if<m == 2>::type* = nullptr>
+        tiled_index() [[hc]]
+            : global{
+                hc_get_workitem_absolute_id(1), hc_get_workitem_absolute_id(0)},
+            local{hc_get_workitem_id(1), hc_get_workitem_id(0)},
+            tile{hc_get_group_id(1), hc_get_group_id(0)},
+            tile_origin{global[0] - local[0], global[1] - local[1]},
+            tile_dim{hc_get_group_size(1), hc_get_group_size(0)}
+        {}
+
+        template<int m = n, typename std::enable_if<m == 3>::type* = nullptr>
+        tiled_index() [[hc]]
+            :
+            global{
+                hc_get_workitem_absolute_id(2),
+                hc_get_workitem_absolute_id(1),
+                hc_get_workitem_absolute_id(0)},
+            local{
+                hc_get_workitem_id(2),
+                hc_get_workitem_id(1),
+                hc_get_workitem_id(0)},
+            tile{hc_get_group_id(2), hc_get_group_id(1), hc_get_group_id(0)},
+            tile_origin{
+                global[0] - local[0],
+                global[1] - local[1],
+                global[2] - local[2]},
+            tile_dim{
+                hc_get_group_size(2),
+                hc_get_group_size(1),
+                hc_get_group_size(0)}
+        {}
+    public:
+        /**
+         * A static member of tiled_index that contains the rank of this tiled
+         * extent, and is either 1, 2, or 3 depending on the specialization
+         * used.
+         */
+        static constexpr int rank{n};
+
+        tiled_index(const index<n>& g) [[cpu, hc]] : tiled_index{}
+        {
+            const_cast<index<n>&>(global) = g; // TODO: remove yucky cast.
+        }
+
+        /**
+         * Copy constructor. Constructs a new tiled_index from the supplied
+         * argument "other".
+         *
+         * @param[in] other An object of type tiled_index from which to
+         *                  initialize this.
+         */
+        tiled_index(const tiled_index&) [[cpu, hc]] = default;
+        tiled_index(tiled_index&&) [[cpu, hc]] = default;
+
+        /**
+         * An index of rank 1, 2, or 3 that represents the global index within
+         * an extent.
+         */
+        const index<n> global;
+
+        /**
+         * An index of rank 1, 2, or 3 that represents the relative index within
+         * the current tile of a tiled extent.
+         */
+        const index<n> local;
+
+        /**
+         * An index of rank 1, 2, or 3 that represents the coordinates of the
+         * current tile of a tiled extent.
+         */
+        const index<n> tile;
+
+        /**
+         * An index of rank 1, 2, or 3 that represents the global coordinates of
+         * the origin of the current tile within a tiled extent.
+         */
+        const index<n> tile_origin;
+
+        /**
+         * An object which represents a barrier within the current tile of
+         * threads.
+         */
+        const tile_barrier barrier;
+
+        /**
+         * An index of rank 1, 2, 3 that represents the size of the tile.
+         */
+        const index<n> tile_dim;
+
+        /**
+         * Implicit conversion operator that converts a tiled_index<N> into
+         * an index<N>. The implicit conversion converts to the .global index
+         * member.
+         */
+        operator index<n>() const [[cpu, hc]]
+        {
+            return global;
+        }
+    };
+
+    // ------------------------------------------------------------------------
+    // utility helper classes for array_view
+    // ------------------------------------------------------------------------
+
+    template<typename T>
+    struct __has_data {
+    private:
+        struct two {char __lx; char __lxx;};
+        template<typename C>
+        static
+        char test(decltype(std::declval<C>().data()));
+        template<typename C>
+        static two test(...);
+    public:
+        static constexpr bool value = sizeof(test<T>(0)) == 1;
+    };
+
+    template<typename T>
+    struct __has_size {
+    private:
+        struct two {char __lx; char __lxx;};
+        template <typename C> static char test(decltype(&C::size));
+        template <typename C> static two test(...);
+    public:
+        static constexpr bool value = sizeof(test<T>(0)) == 1;
+    };
+
+    template<typename T>
+    struct __is_container {
+        using _T = typename std::remove_reference<T>::type;
+        static constexpr bool value =
+            __has_size<_T>::value && __has_data<_T>::value;
+    };
+
+
+    // ------------------------------------------------------------------------
+    // forward declarations of copy routines used by array / array_view
+    // ------------------------------------------------------------------------
+
+    template<typename T, int N>
+    void copy(const array_view<const T, N>& src, const array_view<T, N>& dest);
+
+    template<typename T, int N>
+    void copy(const array_view<T, N>& src, const array_view<T, N>& dest);
+
+    template<typename T, int N>
+    void copy(const array<T, N>& src, const array_view<T, N>& dest);
+
+    template<typename T, int N>
+    void copy(const array<T, N>& src, array<T, N>& dest);
+
+    template<typename T, int N>
+    void copy(const array_view<const T, N>& src, array<T, N>& dest);
+
+    template<typename T, int N>
+    void copy(const array_view<T, N>& src, array<T, N>& dest);
+
+    template<typename InputIter, typename T, int N>
+    void copy(
+        InputIter srcBegin, InputIter srcEnd, const array_view<T, N>& dest);
+
+    template<typename InputIter, typename T, int N>
+    void copy(InputIter srcBegin, InputIter srcEnd, array<T, N>& dest);
+
+    template<typename InputIter, typename T, int N>
+    void copy(InputIter srcBegin, const array_view<T, N>& dest);
+
+    template<typename InputIter, typename T, int N>
+    void copy(InputIter srcBegin, array<T, N>& dest);
+
+    template<typename OutputIter, typename T, int N>
+    void copy(const array_view<T, N> &src, OutputIter destBegin);
+
+    template<typename OutputIter, typename T, int N>
+    void copy(const array<T, N> &src, OutputIter destBegin);
+
+    // ------------------------------------------------------------------------
+    // array
+    // ------------------------------------------------------------------------
+
+    /**
+     * Represents an N-dimensional region of memory (with type T) located on an
+     * accelerator.
+     *
+     * @tparam T The element type of this array
+     * @tparam N The dimensionality of the array, defaults to 1 if elided.
+     */
+    struct array_base {
+        struct Deleter {
+            template<typename T>
+            void operator()(T* ptr) const noexcept
+            {
+                if (!ptr) return;
+                if (hsa_memory_free(ptr) == HSA_STATUS_SUCCESS) return;
+
+                std::cerr << "Failed to deallocate array memory; HC runtime may"
+                    << " be in an inconsistent state." << std::endl;
+            }
+        };
+        using GuardedWriterConcurrentList_ = std::pair<
+            std::atomic_flag,
+            std::pair<std::mutex, std::forward_list<std::shared_future<void>>>>;
+
+        static constexpr std::size_t max_array_cnt_{65536u};
+
+        static
+        std::vector<std::size_t>& captured_()
+        {
+            thread_local static std::vector<std::size_t> r{};
+
+            return r;
+        }
+
+        static
+        std::vector<GuardedWriterConcurrentList_>& writers_()
+        {   // TODO: this is a placeholder, and most dubious.
+            static std::vector<GuardedWriterConcurrentList_> r{max_array_cnt_};
+
+            return r;
+        }
+
+        static
+        detail::N_way_set_associative_cache<void*>& locked_ptr_cache_() noexcept
+        {
+            static detail::N_way_set_associative_cache<void*> r;
+
+            return r;
+        }
+
+        static
+        std::size_t writers_for_()
+        {
+            auto it = writers_().begin();
+            do {
+                if (it->first.test_and_set()) continue;
+
+                return std::distance(writers_().begin(), it);
+            } while (++it != writers_().end());
+
+            throw std::runtime_error{"Failed to associate writers for array."};
+        }
+    };
+
+    template <typename T, int N = 1>
+    class array : private array_base {
+        static_assert(!std::is_const<T>{}, "array<const T> is not supported");
+        static_assert(
+            std::is_trivially_copyable<T>{},
+            "Only trivially copyable types are supported.");
+        static_assert(
+            std::is_trivially_destructible<T>{},
+            "Only trivially destructible types are supported.");
+
+        accelerator_view owner_;
+        accelerator_view associate_;
+        extent<N> extent_;
+        access_type cpu_access_;
+        std::unique_ptr<T[], Deleter> data_;
+        std::size_t writers_for_this_{max_array_cnt_};
+
+        template<typename U, int M>
+        friend
+        void copy(const array<U, M>&, array<U, M>&);
+        template<typename U, int M>
+        friend
+        void copy(const array<U, M>&, const array_view<U, M>&);
+        template<typename O, typename U, int M>
+        friend
+        void copy(const array<U, M>&, O);
+        template<typename U, int M>
+        friend
+        void copy(const array<U, M>&, const array_view<U, M>&);
+        template<typename U, int M>
+        friend
+        void copy(const array_view<const U, M>&, array<U, M>&);
+
+        void add_to_captured_() const
+        {
+            captured_().push_back(writers_for_this_);
+        }
+
+        T* allocate_()
+        {
+            hsa_region_t* r{nullptr};
+            switch (cpu_access_) {
+            case access_type_none: case access_type_auto:
+                r = static_cast<hsa_region_t*>(
+                    owner_.get_accelerator().get_hsa_am_system_region());
+                break;
+            default:
+                r = static_cast<hsa_region_t*>(
+                    owner_.get_accelerator().get_hsa_am_system_region());
+            }
+
+            if (!r) {
+                r = static_cast<hsa_region_t*>(
+                    owner_.get_accelerator().get_hsa_am_system_region());
+            }
+
+            void* tmp{};
+            auto s = hsa_memory_allocate(*r, extent_.size() * sizeof(T), &tmp);
+
+            if (s != HSA_STATUS_SUCCESS) {
+                throw std::runtime_error{"Failed to allocate array storage."};
+            }
+
+            return static_cast<T*>(tmp);
+        }
+
+        void lock_this_()
+        {
+            void* tmp{};
+            auto s = hsa_amd_memory_lock(
+                this,
+                sizeof(*this),
+                static_cast<hsa_agent_t*>(
+                    owner_.get_accelerator().get_hsa_agent()),
+                1,
+                reinterpret_cast<void**>(&tmp));
+
+            if (s != HSA_STATUS_SUCCESS) {
+                throw std::runtime_error{"Failed to lock array address."};
+            }
+
+            while (!locked_ptr_cache_().insert(this, tmp).second);
+        }
+
+        array* const this_() const [[hc]]
+        {
+            const auto it = locked_ptr_cache_().find(this);
+
+            if (it == locked_ptr_cache_().end()) return nullptr;
+
+            return static_cast<array* const>(*it);
+        }
+
+        void wait_for_all_pending_writers_() const
+        {
+            decltype(writers_()[writers_for_this_].second.second) tmp;
+            {
+                std::lock_guard<std::mutex> lck{
+                    writers_()[writers_for_this_].second.first};
+
+                tmp = std::move(writers_()[writers_for_this_].second.second);
+            }
+            for (auto&& x : tmp) if (x.valid()) x.wait();
+        }
+    public:
+        /**
+         * The rank of this array.
+         */
+        static constexpr int rank = N;
+
+        /**
+         * The element type of this array.
+         */
+        using value_type = T;
+
+        /**
+         * There is no default constructor for array<T,N>.
+         */
+        array() = delete;
+
+        /**
+         * Copy constructor. Constructs a new array<T,N> from the supplied
+         * argument other. The new array is located on the same accelerator_view
+         * as the source array. A deep copy is performed.
+         *
+         * @param[in] other An object of type array<T,N> from which to
+         *                  initialize this new array.
+         */
+        array(const array& other)
+            : array{other.extent_, other.owner_, other.associate_}
+        {   // TODO: if both arrays resolve to the same slot this will deadlock.
+            copy(other, *this);
+        }
+
+        /**
+         * Move constructor. Constructs a new array<T,N> by moving from the
+         * supplied argument other.
+         *
+         * @param[in] other An object of type array<T,N> from which to
+         *                  initialize this new array.
+         */
+        array(array&& other)
+            :
+            owner_{std::move(other.owner_)},
+            associate_{std::move(other.associate_)},
+            extent_{std::move(other.extent_)},
+            cpu_access_{other.cpu_access_},
+            data_{std::move(other.data_)},
+            writers_for_this_{other.writers_for_this_}
+        {
+            lock_this_();
+            other.writers_for_this_ = max_array_cnt_;
+        }
+
+        /**
+         * Constructs a new array with the supplied extent, located on the
+         * default view of the default accelerator. If any components of the
+         * extent are non-positive, an exception will be thrown.
+         *
+         * @param[in] ext The extent in each dimension of this array.
+         */
+        explicit
+        array(const hc::extent<N>& ext)
+            : array{ext, accelerator::get_auto_selection_view()}
+        {}
+
+        /** @{ */
+        /**
+         * Equivalent to construction using
+         * "array(extent<N>(e0 [, e1 [, e2 ]]))".
+         *
+         * @param[in] e0,e1,e2 The component values that will form the extent of
+         *                     this array.
+         */
+        explicit
+        array(int e0) : array{hc::extent<N>{e0}}
+        {
+            static_assert(N == 1, "illegal");
+        }
+        explicit
+        array(int e0, int e1) : array{hc::extent<N>{e0, e1}}
+        {
+            static_assert(N == 2, "illegal");
+        }
+        explicit
+        array(int e0, int e1, int e2) : array{hc::extent<N>{e0, e1, e2}}
+        {
+            static_assert(N == 3, "illegal");
+        }
+
+        /** @} */
+
+        /** @{ */
+        /**
+         * Constructs a new array with the supplied extent, located on the
+         * default accelerator, initialized with the contents of a source
+         * container specified by a beginning and optional ending iterator. The
+         * source data is copied by value into this array as if by calling
+         * "copy()".
+         *
+         * If the number of available container elements is less than
+         * this->extent.size(), undefined behavior results.
+         *
+         * @param[in] ext The extent in each dimension of this array.
+         * @param[in] srcBegin A beginning iterator into the source container.
+         * @param[in] srcEnd An ending iterator into the source container.
+         */
+        template<typename InputIter>
+        array(const hc::extent<N>& ext, InputIter srcBegin)
+            : array{ext, srcBegin, accelerator::get_auto_selection_view()}
+        {}
+        template<typename InputIter>
+        array(const hc::extent<N>& ext, InputIter srcBegin, InputIter srcEnd)
+            :
+            array{ext, srcBegin, srcEnd, accelerator::get_auto_selection_view()}
+        {}
+
+        /** @} */
+
+        /** @{ */
+        /**
+         * Equivalent to construction using
+         * "array(extent<N>(e0 [, e1 [, e2 ]]), src)".
+         *
+         * @param[in] e0,e1,e2 The component values that will form the extent of
+         *                     this array.
+         * @param[in] srcBegin A beginning iterator into the source container.
+         * @param[in] srcEnd An ending iterator into the source container.
+         */
+        template<typename InputIter>
+        array(int e0, InputIter srcBegin)
+            : array{hc::extent<N>{e0}, srcBegin}
+        {}
+        template<typename InputIter>
+        array(int e0, InputIter srcBegin, InputIter srcEnd)
+            : array{hc::extent<N>{e0}, srcBegin, srcEnd}
+        {}
+        template<typename InputIter>
+        array(int e0, int e1, InputIter srcBegin)
+            : array{hc::extent<N>{e0, e1}, srcBegin}
+        {}
+        template<typename InputIter>
+        array(int e0, int e1, InputIter srcBegin, InputIter srcEnd)
+            : array{hc::extent<N>{e0, e1}, srcBegin, srcEnd}
+        {}
+        template<typename InputIter>
+        array(int e0, int e1, int e2, InputIter srcBegin)
+            : array{hc::extent<N>{e0, e1, e2}, srcBegin}
+        {}
+        template<typename InputIter>
+        array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd)
+            : array{hc::extent<N>{e0, e1, e2}, srcBegin, srcEnd}
+        {}
+
+        /** @} */
+
+        /**
+         * Constructs a new array, located on the default view of the default
+         * accelerator, initialized with the contents of the array_view "src".
+         * The extent of this array is taken from the extent of the source
+         * array_view. The "src" is copied by value into this array as if by
+         * calling "copy(src, *this)".
+         *
+         * @param[in] src An array_view object from which to copy the data into
+         *                this array (and also to determine the extent of this
+         *                array).
+         */
+        explicit
+        array(const array_view<const T, N>& src)
+            : array{src.get_extent(), accelerator::get_auto_selection_view()}
+        {
+            copy(src, *this);
+        }
+
+        /**
+         * Constructs a new array with the supplied extent, located on the
+         * accelerator bound to the accelerator_view "av".
+         *
+         * Users can optionally specify the type of CPU access desired for
+         * "this" array thus requesting creation of an array that is accessible
+         * both on the specified accelerator_view "av" as well as the CPU (with
+         * the specified CPU access_type). If a value other than
+         * access_type_auto or access_type_none is specified for the
+         * cpu_access_type parameter and the accelerator corresponding to the
+         * accelerator_view "av" does not support cpu_shared_memory, a
+         * runtime_exception is thrown. The cpu_access_type parameter has a
+         * default value of access_type_auto which leaves it up to the
+         * implementation to decide what type of allowed CPU access should the
+         * array be created with. The actual CPU access_type allowed for the
+         * created array can be queried using the get_cpu_access_type member
+         * method.
+         *
+         * @param[in] ext The extent in each dimension of this array.
+         * @param[in] av An accelerator_view object which specifies the location
+         *               of this array.
+         * @param[in] access_type The type of CPU access desired for this array.
+         */
+        array(
+            const hc::extent<N>& ext,
+            accelerator_view av,
+            access_type cpu_access_type = access_type_auto)
+        try :
+            owner_{std::move(av)},
+            associate_{owner_},
+            extent_{ext},
+            cpu_access_{cpu_access_type},
+            data_{allocate_(), Deleter{}},
+            writers_for_this_{writers_for_()}
+        {
+            lock_this_();
+        }
+        catch (const std::exception& ex) {
+            if (ext.size() != 0) throw ex;
+
+            throw std::domain_error{"Tried to construct zero-sized array."};
+        }
+
+        /** @{ */
+        /**
+         * Constructs an array instance based on the given pointer on the device
+         * memory.
+         */
+        array(int e0, void* accelerator_pointer)
+            :
+            array{
+                hc::extent<N>{e0},
+                static_cast<T*>(accelerator_pointer),
+                accelerator::get_auto_selection_view(),
+                access_type_none}
+        {}
+        array(int e0, int e1, void* accelerator_pointer)
+            :
+            array{
+                hc::extent<N>{e0, e1},
+                static_cast<T*>(accelerator_pointer),
+                accelerator::get_auto_selection_view(),
+                access_type_none}
+        {}
+        array(int e0, int e1, int e2, void* accelerator_pointer)
+            :
+            array{
+                hc::extent<N>{e0, e1, e2},
+                static_cast<T*>(accelerator_pointer),
+                accelerator::get_auto_selection_view(),
+                access_type_none}
+        {}
+
+        array(const hc::extent<N>& ext, void* accelerator_pointer)
+            :
+            array{
+                ext,
+                static_cast<T*>(accelerator_pointer),
+                accelerator::get_auto_selection_view(),
+                access_type_none}
+        {}
+        /** @} */
+
+        /**
+         * Constructs an array instance based on the given pointer on the device
+         * memory.
+         *
+         * @param[in] ext The extent in each dimension of this array.
+         * @param[in] av An accelerator_view object which specifies the location
+         *               of this array.
+         * @param[in] accelerator_pointer The pointer to the device memory.
+         * @param[in] access_type The type of CPU access desired for this array.
+         */
+        array(
+            const extent<N>& ext,
+            accelerator_view av,
+            void* accelerator_pointer,
+            access_type cpu_access_type = access_type_none)
+            :
+            array{
+                ext,
+                static_cast<T*>(accelerator_pointer),
+                std::move(av),
+                cpu_access_type}
+        {
+            // TODO: handle access types other than none.
+        }
+
+        /** @{ */
+        /**
+         * Equivalent to construction using
+         * "array(extent<N>(e0 [, e1 [, e2 ]]), av, cpu_access_type)".
+         *
+         * @param[in] e0,e1,e2 The component values that will form the extent of
+         *                     this array.
+         * @param[in] av An accelerator_view object which specifies the location
+         *               of this array.
+         * @param[in] access_type The type of CPU access desired for this array.
+         */
+        array(
+            int e0,
+            accelerator_view av,
+            access_type cpu_access_type = access_type_auto)
+            : array{hc::extent<N>{e0}, std::move(av), cpu_access_type}
+        {}
+        array(
+            int e0,
+            int e1,
+            accelerator_view av,
+            access_type cpu_access_type = access_type_auto)
+            : array{hc::extent<N>{e0, e1}, std::move(av), cpu_access_type}
+        {}
+        array(
+            int e0,
+            int e1,
+            int e2,
+            accelerator_view av,
+            access_type cpu_access_type = access_type_auto)
+            : array{hc::extent<N>{e0, e1, e2}, std::move(av), cpu_access_type}
+        {}
+
+        /** @} */
+
+        /**
+         * Constructs a new array with the supplied extent, located on the
+         * accelerator bound to the accelerator_view "av", initialized with the
+         * contents of the source container specified by a beginning and
+         * optional ending iterator. The data is copied by value into this array
+         * as if by calling "copy()".
+         *
+         * Users can optionally specify the type of CPU access desired for
+         * "this" array thus requesting creation of an array that is accessible
+         * both on the specified accelerator_view "av" as well as the CPU (with
+         * the specified CPU access_type). If a value other than
+         * access_type_auto or access_type_none is specified for the
+         * cpu_access_type parameter and the accelerator corresponding to the
+         * accelerator_view "av" does not support cpu_shared_memory, a
+         * runtime_exception is thrown. The cpu_access_type parameter has a
+         * default value of access_type_auto which leaves it up to the
+         * implementation to decide what type of allowed CPU access should the
+         * array be created with. The actual CPU access_type allowed for the
+         * created array can be queried using the get_cpu_access_type member
+         * method.
+         *
+         * @param[in] ext The extent in each dimension of this array.
+         * @param[in] srcBegin A beginning iterator into the source container.
+         * @param[in] srcEnd An ending iterator into the source container.
+         * @param[in] av An accelerator_view object which specifies the home
+         *               location of this array.
+         * @param[in] access_type The type of CPU access desired for this array.
+         */
+        template<typename InputIter>
+        array(
+            const hc::extent<N>& ext,
+            InputIter srcBegin,
+            accelerator_view av,
+            access_type cpu_access_type = access_type_auto)
+            : array{ext, std::move(av), cpu_access_type}
+        {
+            copy(srcBegin, *this);
+        }
+        template<typename InputIter>
+        array(
+            const hc::extent<N>& ext,
+            InputIter srcBegin,
+            InputIter srcEnd,
+            accelerator_view av,
+            access_type cpu_access_type = access_type_auto)
+            : array{ext, std::move(av), cpu_access_type}
+        {
+            copy(srcBegin, srcEnd, *this);
+        }
+
+        /** @} */
+
+        /**
+         * Constructs a new array initialized with the contents of the
+         * array_view "src". The extent of this array is taken from the extent
+         * of the source array_view. The "src" is copied by value into this
+         * array as if by calling "copy(src, *this)". The new array is located
+         * on the accelerator bound to the accelerator_view "av".
+         *
+         * Users can optionally specify the type of CPU access desired for
+         * "this" array thus requesting creation of an array that is accessible
+         * both on the specified accelerator_view "av" as well as the CPU (with
+         * the specified CPU access_type). If a value other than
+         * access_type_auto or access_type_none is specified for the
+         * cpu_access_type parameter and the accelerator corresponding to the
+         * accelerator_view “av” does not support cpu_shared_memory, a
+         * runtime_exception is thrown. The cpu_access_type parameter has a
+         * default value of access_type_auto which leaves it up to the
+         * implementation to decide what type of allowed CPU access should the
+         * array be created with. The actual CPU access_type allowed for the
+         * created array can be queried using the get_cpu_access_type member
+         * method.
+         *
+         * @param[in] src An array_view object from which to copy the data into
+         *                this array (and also to determine the extent of this
+         *                array).
+         * @param[in] av An accelerator_view object which specifies the home
+         *               location of this array.
+         * @param[in] access_type The type of CPU access desired for this array.
+         */
+        array(
+            const array_view<const T, N>& src,
+            accelerator_view av,
+            access_type cpu_access_type = access_type_auto)
+            : array{src.get_extent(), std::move(av), cpu_access_type}
+        {
+            copy(src, *this);
+        }
+
+        /** @{ */
+        /**
+         * Equivalent to construction using
+         * "array(
+         *     extent<N>(e0 [, e1 [, e2 ]]),
+         *     srcBegin [, srcEnd],
+         *     av,
+         *     cpu_access_type)".
+         *
+         * @param[in] e0,e1,e2 The component values that will form the extent of
+         *                     this array.
+         * @param[in] srcBegin A beginning iterator into the source container.
+         * @param[in] srcEnd An ending iterator into the source container.
+         * @param[in] av An accelerator_view object which specifies the home
+         *               location of this array.
+         * @param[in] access_type The type of CPU access desired for this array.
+         */
+        template<typename InputIter>
+        array(
+            int e0,
+            InputIter srcBegin,
+            accelerator_view av,
+            access_type cpu_access_type = access_type_auto)
+            : array{hc::extent<N>{e0}, srcBegin, std::move(av), cpu_access_type}
+        {}
+        template<typename InputIter>
+        array(
+            int e0,
+            InputIter srcBegin,
+            InputIter srcEnd,
+            accelerator_view av,
+            access_type cpu_access_type = access_type_auto)
+            :
+            array{
+                hc::extent<N>{e0},
+                srcBegin,
+                srcEnd,
+                std::move(av),
+                cpu_access_type}
+        {}
+        template<typename InputIter>
+        array(
+            int e0,
+            int e1,
+            InputIter srcBegin,
+            accelerator_view av,
+            access_type cpu_access_type = access_type_auto)
+            :
+            array{
+                hc::extent<N>{e0, e1}, srcBegin, std::move(av), cpu_access_type}
+        {}
+        template<typename InputIter>
+        array(
+            int e0,
+            int e1,
+            InputIter srcBegin,
+            InputIter srcEnd,
+            accelerator_view av,
+            access_type cpu_access_type = access_type_auto)
+            :
+            array{
+                hc::extent<N>{e0, e1},
+                srcBegin,
+                srcEnd,
+                std::move(av),
+                cpu_access_type}
+        {}
+        template<typename InputIter>
+        array(
+            int e0,
+            int e1,
+            int e2,
+            InputIter srcBegin,
+            accelerator_view av,
+            access_type cpu_access_type = access_type_auto)
+            :
+            array{
+                hc::extent<N>{e0, e1, e2},
+                srcBegin,
+                std::move(av),
+                cpu_access_type}
+        {}
+        template<typename InputIter>
+        array(
+            int e0,
+            int e1,
+            int e2,
+            InputIter srcBegin,
+            InputIter srcEnd,
+            accelerator_view av,
+            access_type cpu_access_type = access_type_auto)
+            :
+            array{
+                hc::extent<N>{e0, e1, e2},
+                srcBegin,
+                srcEnd,
+                std::move(av),
+                cpu_access_type}
+        {}
+
+        /** @} */
+
+        /**
+         * Constructs a staging array with the given extent, which acts as a
+         * staging area between accelerator views "av" and "associated_av". If
+         * "av" is a cpu accelerator view, this will construct a staging array
+         * which is optimized for data transfers between the CPU and
+         * "associated_av".
+         *
+         * @param[in] ext The extent in each dimension of this array.
+         * @param[in] av An accelerator_view object which specifies the home
+         *               location of this array.
+         * @param[in] associated_av An accelerator_view object which specifies a
+         *                          target device accelerator.
+         */
+        array(
+            const hc::extent<N>& ext,
+            accelerator_view av,
+            accelerator_view associated_av)
+        try :
+            owner_{std::move(av)},
+            associate_{std::move(associated_av)},
+            extent_{ext},
+            cpu_access_{access_type_auto},
+            data_{allocate_(), Deleter{}},
+            writers_for_this_{writers_for_()}
+        {
+            lock_this_();
+        }
+        catch (const std::exception& ex) {
+            if (ext.size() != 0) throw ex;
+
+            throw std::domain_error{"Tried to construct zero-sized array."};
+        }
+
+        /** @{ */
+        /**
+         * Equivalent to construction using
+         * "array(extent<N>(e0 [, e1 [, e2 ]]), av, associated_av)".
+         *
+         * @param[in] e0,e1,e2 The component values that will form the extent of
+         *                     this array.
+         * @param[in] av An accelerator_view object which specifies the home
+         *               location of this array.
+         * @param[in] associated_av An accelerator_view object which specifies a
+         *                          target device accelerator.
+         */
+        array(int e0, accelerator_view av, accelerator_view associated_av)
+            : array{hc::extent<N>{e0}, std::move(av), associated_av}
+        {}
+        array(
+            int e0, int e1, accelerator_view av, accelerator_view associated_av)
+            : array{hc::extent<N>{e0, e1}, std::move(av), associated_av}
+        {}
+        array(
+            int e0,
+            int e1,
+            int e2,
+            accelerator_view av,
+            accelerator_view associated_av)
+            : array{hc::extent<N>{e0, e1, e2}, std::move(av), associated_av}
+        {}
+
+        /** @} */
+
+        /** @{ */
+        /**
+         * Constructs a staging array with the given extent, which acts as a
+         * staging area between accelerator_views "av" (which must be the CPU
+         * accelerator) and "associated_av". The staging array will be
+         * initialized with the data specified by "src" as if by calling
+         * "copy(src, *this)".
+         *
+         * @param[in] ext The extent in each dimension of this array.
+         * @param[in] srcBegin A beginning iterator into the source container.
+         * @param[in] srcEnd An ending iterator into the source container.
+         * @param[in] av An accelerator_view object which specifies the home
+         *               location of this array.
+         * @param[in] associated_av An accelerator_view object which specifies a
+         *                          target device accelerator.
+         */
+        template<typename InputIter>
+        array(
+            const hc::extent<N>& ext,
+            InputIter srcBegin,
+            accelerator_view av,
+            accelerator_view associated_av)
+            : array{ext, std::move(av), std::move(associated_av)}
+        {
+            copy(srcBegin, *this);
+        }
+        template<typename InputIter>
+        array(
+            const hc::extent<N>& ext,
+            InputIter srcBegin,
+            InputIter srcEnd,
+            accelerator_view av,
+            accelerator_view associated_av)
+            : array{ext, std::move(av), associated_av}
+        {
+            copy(srcBegin, srcEnd, *this);
+        }
+
+        /** @} */
+
+        /**
+         * Constructs a staging array initialized with the array_view given by
+         * "src", which acts as a staging area between accelerator_views "av"
+         * (which must be the CPU accelerator) and "associated_av". The extent
+         * of this array is taken from the extent of the source array_view. The
+         * staging array will be initialized from "src" as if by calling
+         * "copy(src, *this)".
+         *
+         * @param[in] src An array_view object from which to copy the data into
+         *                this array (and also to determine the extent of this
+         *                array).
+         * @param[in] av An accelerator_view object which specifies the home
+         *               location of this array.
+         * @param[in] associated_av An accelerator_view object which specifies a
+         *                          target device accelerator.
+         */
+        array(
+            const array_view<const T, N>& src,
+            accelerator_view av,
+            accelerator_view associated_av)
+            : array{src.get_extent(), std::move(av), associated_av}
+        {
+            copy(src, *this);
+        }
+
+        /** @{ */
+        /**
+         * Equivalent to construction using
+         * "array(extent<N>(e0 [, e1 [, e2 ]]), src, av, associated_av)".
+         *
+         * @param[in] e0,e1,e2 The component values that will form the extent of
+         *                     this array.
+         * @param[in] srcBegin A beginning iterator into the source container.
+         * @param[in] srcEnd An ending iterator into the source container.
+         * @param[in] av An accelerator_view object which specifies the home
+         *               location of this array.
+         * @param[in] associated_av An accelerator_view object which specifies a
+         *                          target device accelerator.
+         */
+        template<typename InputIter>
+        array(
+            int e0,
+            InputIter srcBegin,
+            accelerator_view av,
+            accelerator_view associated_av)
+            : array{hc::extent<N>{e0}, srcBegin, std::move(av), associated_av}
+        {}
+        template<typename InputIter>
+        array(
+            int e0,
+            InputIter srcBegin,
+            InputIter srcEnd,
+            accelerator_view av,
+            accelerator_view associated_av)
+            :
+            array{
+                hc::extent<N>{e0},
+                srcBegin,
+                srcEnd,
+                std::move(av),
+                associated_av}
+        {}
+        template<typename InputIter>
+        array(
+            int e0,
+            int e1,
+            InputIter srcBegin,
+            accelerator_view av,
+            accelerator_view associated_av)
+            :
+            array{hc::extent<N>{e0, e1}, srcBegin, std::move(av), associated_av}
+        {}
+        template<typename InputIter>
+        array(
+            int e0,
+            int e1,
+            InputIter srcBegin,
+            InputIter srcEnd,
+            accelerator_view av,
+            accelerator_view associated_av)
+            :
+            array{
+                hc::extent<N>{e0, e1},
+                srcBegin,
+                srcEnd,
+                std::move(av),
+                associated_av}
+        {}
+        template<typename InputIter>
+        array(
+            int e0,
+            int e1,
+            int e2,
+            InputIter srcBegin,
+            accelerator_view av,
+            accelerator_view associated_av)
+            :
+            array{
+                hc::extent<N>{e0, e1, e2},
+                srcBegin,
+                std::move(av),
+                associated_av}
+        {}
+        template<typename InputIter>
+        array(
+            int e0,
+            int e1,
+            int e2,
+            InputIter srcBegin,
+            InputIter srcEnd,
+            accelerator_view av,
+            accelerator_view associated_av)
+            :
+            array{
+                hc::extent<N>{e0, e1, e2},
+                srcBegin,
+                srcEnd,
+                std::move(av),
+                associated_av}
+        {}
+
+        /** @} */
+
+        /**
+         * Access the extent that defines the shape of this array.
+         */
+        hc::extent<N> get_extent() const [[cpu, hc]]
+        {
+            return extent_;
+        }
+
+        /**
+         * This property returns the accelerator_view representing the location
+         * where this array has been allocated.
+         */
+        accelerator_view get_accelerator_view() const
+        {
+            return owner_;
+        }
+
+        /**
+         * This property returns the accelerator_view representing the preferred
+         * target where this array can be copied.
+         */
+        accelerator_view get_associated_accelerator_view() const
+        {
+            return associate_;
+        }
+
+        /**
+         * This property returns the CPU "access_type" allowed for this array.
+         */
+        access_type get_cpu_access_type() const
+        {
+            return cpu_access_;
+        }
+
+        /**
+         * Assigns the contents of the array "other" to this array, using a deep
+         * copy.
+         *
+         * @param[in] other An object of type array<T,N> from which to copy into
+         *                  this array.
+         * @return Returns *this.
+         */
+        array& operator=(const array& other) {
+            if (this != &other) {
+                array arr(other);
+                *this = std::move(arr);
+            }
+            return *this;
+        }
+
+        /**
+         * Moves the contents of the array "other" to this array.
+         *
+         * @param[in] other An object of type array<T,N> from which to move into
+         *                  this array.
+         * @return Returns *this.
+         */
+        array& operator=(array&& other)
+        {   // TODO: fix infinite recursion, this is temporary bad, explosive juju.
+            array tmp{std::move(other)};
+            std::swap(*this, tmp);
+
+            return *this;
+        }
+
+        /**
+         * Assigns the contents of the array_view "src", as if by calling
+         * "copy(src, *this)".
+         *
+         * @param[in] src An object of type array_view<T,N> from which to copy
+         *                into this array.
+         * @return Returns *this.
+         */
+        array& operator=(const array_view<const T,N>& src)
+        {
+            using std::swap;
+
+            array tmp{src};
+            swap(*this, tmp);
+
+            return *this;
+        }
+
+        /**
+         * Copies the contents of this array to the array given by "dest", as
+         * if by calling "copy(*this, dest)".
+         *
+         * @param[out] dest An object of type array<T,N> to which to copy data
+         *                  from this array.
+         */
+        void copy_to(array& dest) const
+        {
+            copy(*this, dest);
+        }
+
+        /**
+         * Copies the contents of this array to the array_view given by "dest",
+         * as if by calling "copy(*this, dest)".
+         *
+         * @param[out] dest An object of type array_view<T,N> to which to copy
+         *                  data from this array.
+         */
+        void copy_to(const array_view<T,N>& dest) const
+        {
+            copy(*this, dest);
+        }
+
+        /**
+         * Returns a pointer to the raw data underlying this array.
+         *
+         * @return A pointer to the (const) first element in the linearised
+         *         array.
+         */
+        T* data() const [[cpu, hc]]
+        {
+            return data_.get();
+        }
+
+        /**
+         * Returns a pointer to the device memory underlying this array.
+         *
+         * @return A (const) pointer to the first element in the array on the
+         *         device memory.
+         */
+        T* accelerator_pointer() const [[cpu, hc]]
+        {   // TODO: this is dumb, array is an owning owned container i.e. data_
+            //       IS an accelerator pointer; it is NOT array_view, and this
+            //       function should be removed.
+            return data_.get();
+        }
+
+        /**
+         * Implicitly converts an array to a std::vector, as if by
+         * "copy(*this, vector)".
+         *
+         * @return An object of type vector<T> which contains a copy of the data
+         *         contained on the array.
+         */
+        operator std::vector<T>() const {
+            std::vector<T> vec(extent_.size());
+            hc::copy(*this, vec.data());
+            return vec;
+        }
+
+        /** @{ */
+        /**
+         * Returns a reference to the element of this array that is at the
+         * location in N-dimensional space specified by "idx". Accessing array
+         * data on a location where it is not resident (e.g. from the CPU when
+         * it is resident on a GPU) results in an exception (in CPU context) or
+         * undefined behavior (in GPU context).
+         *
+         * @param[in] idx An object of type index<N> from that specifies the
+         *                location of the element.
+         */
+        T& operator[](const index<N>& idx) [[cpu]]
+        {   // TODO: simplify, this is a placeholder.
+            static const accelerator cpu{L"cpu"};
+
+            switch (cpu_access_) {
+            case access_type_none:
+                throw
+                    runtime_exception{"The array is not accessible on CPU.", 0};
+            case access_type_auto:
+                if (owner_.get_accelerator() == cpu) break;
+                throw
+                    runtime_exception{"The array is not accessible on CPU.", 0};
+            default:
+                break;
+            }
+
+            return data_[detail::amp_helper<
+                N, index<N>, hc::extent<N>>::flatten(idx, extent_)];
+        }
+        T& operator[](const index<N>& idx) [[hc]]
+        {
+            return this_()->data_[detail::amp_helper<
+                N, index<N>, hc::extent<N>>::flatten(idx, this_()->extent_)];
+        }
+        template<int m = N, typename std::enable_if<m == 1>::type* = nullptr>
+        T& operator[](int i0) [[cpu, hc]]
+        {
+            return operator[](index<1>{i0});
+        }
+        T& operator()(const index<N>& idx) [[cpu, hc]]
+        {
+            return (*this)[idx];
+        }
+
+        /** @} */
+
+        /** @{ */
+        /**
+         * Returns a const reference to the element of this array that is at the
+         * location in N-dimensional space specified by "idx". Accessing array
+         * data on a location where it is not resident (e.g. from the CPU when
+         * it is resident on a GPU) results in an exception (in cpu context) or
+         * undefined behavior (in GPU context).
+         *
+         * @param[in] idx An object of type index<N> from that specifies the
+         *                location of the element.
+         */
+        const T& operator[](const index<N>& idx) const [[cpu, hc]]
+        {   // TODO: semi-ghastly, even though Scott Meyers approves of it.
+            return (*const_cast<array* const>(this))[idx];
+        }
+        template<int m = N, typename std::enable_if<m == 1>::type* = nullptr>
+        const T& operator[](int i0) const [[cpu, hc]]
+        {
+            return operator[](index<m>{i0});
+        }
+        const T& operator()(const index<N>& idx) const [[cpu, hc]]
+        {
+            return operator[](idx);
+        }
+
+        /** @} */
+
+        /** @{ */
+        /**
+         * Equivalent to
+         * "array<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]]))".
+         *
+         * @param[in] i0,i1,i2 The component values that will form the index
+         *                     into this array.
+         */
+        template<int m = N, typename std::enable_if<m == 1>::type* = nullptr>
+        T& operator()(int i0) [[cpu, hc]]
+        {
+            return operator[](index<1>{i0});
+        }
+        template<int m = N, typename std::enable_if<m == 2>::type* = nullptr>
+        T& operator()(int i0, int i1) [[cpu, hc]]
+        {
+            return operator[](index<2>{i0, i1});
+        }
+        template<int m = N, typename std::enable_if<m == 3>::type* = nullptr>
+        T& operator()(int i0, int i1, int i2) [[cpu, hc]]
+        {
+            return operator[](index<3>{i0, i1, i2});
+        }
+
+        /** @} */
+
+        /** @{ */
+        /**
+         * Equivalent to
+         * "array<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]])) const".
+         *
+         * @param[in] i0,i1,i2 The component values that will form the index
+         *                     into this array.
+         */
+        template<int m = N, typename std::enable_if<m == 1>::type* = nullptr>
+        const T& operator()(int i0) const [[cpu, hc]]
+        {
+            return (*const_cast<array* const>(this))(i0);
+        }
+        template<int m = N, typename std::enable_if<m == 2>::type* = nullptr>
+        const T& operator()(int i0, int i1) const [[cpu, hc]]
+        {
+            return (*const_cast<array* const>(this))(i0, i1);
+        }
+        template<int m = N, typename std::enable_if<m == 3>::type* = nullptr>
+        const T& operator()(int i0, int i1, int i2) const [[cpu, hc]]
+        {
+            return (*const_cast<array* const>(this))(i0, i1, i2);
+        }
+
+        /** @{ */
+        /**
+         * This overload is defined for array<T,N> where @f$N \ge 2@f$.
+         * This mode of indexing is equivalent to projecting on the
+         * most-significant dimension. It allows C-style indexing. For example:
+         *
+         * @code{.cpp}
+         * array<float,4> myArray(myExtents, …);
+         * myArray[index<4>(5,4,3,2)] = 7;
+         * assert(myArray[5][4][3][2] == 7);
+         * @endcode
+         *
+         * @param i0 An integer that is the index into the most-significant
+         *           dimension of this array.
+         * @return Returns an array_view whose dimension is one lower than that
+         *         of this array.
+         */
+        template<int m = N, typename std::enable_if<(m > 1)>::type* = nullptr>
+        array_view<T, m - 1> operator[](int i0) [[cpu, hc]]
+        {
+            hc::extent<m - 1> tmp;
+            for (auto i = 1; i != m; ++i) tmp[i - 1] = extent_[i];
+
+            return array_view<T, m - 1>{tmp, data() + i0 * tmp.size()};
+        }
+
+        template<int m = N, typename std::enable_if<(m > 1)>::type* = nullptr>
+        array_view<const T, m - 1> operator[](int i0) const [[cpu, hc]]
+        {
+            hc::extent<m - 1> tmp;
+            for (auto i = 1; i != m; ++i) tmp[i - 1] = extent_[i];
+
+            return array_view<const T, m - 1>{tmp, data() + i0 * tmp.size()};
+        }
+
+        template<int m = N, typename std::enable_if<(m > 1)>::type* = nullptr>
+        array_view<T, m - 1> operator()(int i0) [[cpu, hc]]
+        {
+            return (*this)[i0];
+        }
+
+        template<int m = N, typename std::enable_if<(m > 1)>::type* = nullptr>
+        array_view<const T, m - 1> operator()(int i0) const [[cpu, hc]]
+        {
+            return (*this)[i0];
+        }
+
+        /** @} */
+
+        /** @{ */
+        /**
+         * Returns a subsection of the source array view at the origin specified
+         * by "idx" and with the extent specified by "ext".
+         *
+         * Example:
+         * @code{.cpp}
+         * array<float,2> a(extent<2>(200,100));
+         * array_view<float,2> v1(a); // v1.extent = <200,100>
+         * array_view<float,2> v2 =
+         *     v1.section(index<2>(15,25), extent<2>(40,50));
+         * assert(v2(0,0) == v1(15,25));
+         * @endcode
+         *
+         * @param[in] origin Provides the offset/origin of the resulting
+         *            section.
+         * @param[in] ext Provides the extent of the resulting section.
+         * @return Returns a subsection of the source array at specified origin,
+         *         and with the specified extent.
+         */
+        array_view<T, N> section(
+            const index<N>& origin, const hc::extent<N>& ext) [[cpu]]
+        {
+            if (extent_.size() < (ext + origin).size()) {
+                throw runtime_exception{"errorMsg_throw", 0};
+            }
+
+            return array_view<T, N>{*this}.section(origin, ext);
+        }
+        array_view<T, N> section(
+            const index<N>& origin, const hc::extent<N>& ext) [[hc]]
+        {
+            return array_view<T, N>{*this}.section(origin, ext);
+        }
+
+        array_view<const T, N> section(
+            const index<N>& origin, const hc::extent<N>& ext) const [[cpu]]
+        {
+            if (extent_.size() < (ext + origin).size()) {
+                throw runtime_exception{"errorMsg_throw", 0};
+            }
+
+            return array_view<const T, N>{*this}.section(origin, ext);
+        }
+        array_view<const T, N> section(
+            const index<N>& origin, const hc::extent<N>& ext) const [[hc]]
+        {
+            return array_view<const T, N>{*this}.section(origin, ext);
+        }
+
+        /** @} */
+
+        /** @{ */
+        /**
+         * Equivalent to "section(idx, this->extent – idx)".
+         */
+        array_view<T, N> section(const index<N>& idx) [[cpu]]
+        {
+            if (!extent_.contains(idx)) {
+                throw runtime_exception{"errorMsg_throw", 0};
+            }
+
+            return array_view<T, N>{*this}.section(idx);
+        }
+        array_view<T, N> section(const index<N>& idx) [[hc]]
+        {
+            return array_view<T, N>{*this}.section(idx);
+        }
+
+        array_view<const T, N> section(const index<N>& idx) const [[cpu]]
+        {
+            if (!extent_.contains(idx)) {
+                throw runtime_exception{"errorMsg_throw", 0};
+            }
+
+            return array_view<const T, N>{*this}.section(idx);
+        }
+        array_view<const T, N> section(const index<N>& idx) const [[hc]]
+        {
+            return array_view<const T, N>{*this}.section(idx);
+        }
+
+        /** @} */
+
+        /** @{ */
+        /**
+         * Equivalent to "section(index<N>(), ext)".
+         */
+        array_view<T, N> section(const hc::extent<N>& ext) [[cpu, hc]]
+        {
+            return array_view<T, N>{*this}.section(ext);
+        }
+        array_view<const T, N> section(
+            const hc::extent<N>& ext) const [[cpu, hc]]
+        {
+            return array_view<const T, N>{*this}.section(ext);
+        }
+
+        /** @} */
+
+        /** @{ */
+        /**
+         * Equivalent to
+         * "array<T,N>::section(
+         *      index<N>{i0 [, i1 [, i2 ]]},
+         *      extent<N>{e0 [, e1 [, e2 ]]}) const".
+         *
+         * @param[in] i0,i1,i2 The component values that will form the origin of
+         *                     the section
+         * @param[in] e0,e1,e2 The component values that will form the extent of
+         *                     the section
+         */
+        array_view<T, 1> section(int i0, int e0) [[cpu, hc]]
+        {
+            static_assert(N == 1, "Rank must be 1.");
+
+            return section(index<1>{i0}, hc::extent<1>{e0});
+        }
+        array_view<T, 2> section(int i0, int i1, int e0, int e1) [[cpu, hc]]
+        {
+            static_assert(N == 2, "Rank must be 2.");
+
+            return section(index<2>{i0, i1}, hc::extent<2>{e0, e1});
+        }
+        array_view<T, 3> section(
+            int i0, int i1, int i2, int e0, int e1, int e2) [[cpu, hc]]
+        {
+            static_assert(N == 3, "Rank must be 3.");
+
+            return section(index<3>{i0, i1, i2}, hc::extent<3>{e0, e1, e2});
+        }
+
+        array_view<const T, 1> section(int i0, int e0) const [[cpu, hc]]
+        {
+            static_assert(N == 1, "Rank must be 1.");
+
+            return section(index<1>{i0}, hc::extent<1>{e0});
+        }
+        array_view<const T, 2> section(
+            int i0, int i1, int e0, int e1) const [[cpu, hc]]
+        {
+            static_assert(N == 2, "Rank must be 2.");
+
+            return section(index<2>{i0, i1}, hc::extent<2>{e0, e1});
+        }
+        array_view<const T, 3> section(
+            int i0, int i1, int i2, int e0, int e1, int e2) const [[cpu, hc]]
+        {
+            static_assert(N == 3, "Rank must be 3.");
+
+            return section(index<3>{i0, i1, i2}, hc::extent<3>{e0, e1, e2});
+        }
+
+        /** @} */
+
+        /** @{ */
+        /**
+         * Sometimes it is desirable to view the data of an N-dimensional array
+         * as a linear array, possibly with a (unsafe) reinterpretation of the
+         * element type. This can be achieved through the reinterpret_as member
+         * function. Example:
+         *
+         * @code{.cpp}
+         * struct RGB { float r; float g; float b; };
+         * array<RGB,3> a = ...;
+         * array_view<float,1> v = a.reinterpret_as<float>();
+         * assert(v.extent == 3*a.extent);
+         * @endcode
+         *
+         * The size of the reinterpreted ElementType must evenly divide into the
+         * total size of this array.
+         *
+         * @return Returns an array_view from this array<T,N> with the element
+         *         type reinterpreted from T to ElementType, and the rank
+         *         reduced from N to 1.
+         */
+        template<typename U>
+        array_view<U, 1> reinterpret_as() [[cpu]]
+        {
+            int size{extent_.size() / sizeof(U) * sizeof(T)};
+
+            if (size * sizeof(U) != extent_.size() * sizeof(T)) {
+                throw runtime_exception{"errorMsg_throw", 0};
+            }
+
+            return array_view<U, 1>{extent<1>{size}, data()};
+        }
+        template<typename U>
+        array_view<U, 1> reinterpret_as() [[hc]]
+        {
+            int size{extent_.size() / sizeof(U) * sizeof(T)};
+
+            return array_view<U, 1>{extent<1>{size}, data()};
+        }
+
+        template<typename U>
+        array_view<const U, 1> reinterpret_as() const [[cpu]]
+        {
+            int size{extent_.size() / sizeof(U) * sizeof(T)};
+
+            if (size * sizeof(U) != extent_.size() * sizeof(T)) {
+                throw runtime_exception{"errorMsg_throw", 0};
+            }
+
+            return array_view<const U, 1>{extent<1>{size}, data()};
+        }
+        template<typename U>
+        array_view<const U, 1> reinterpret_as() const [[hc]]
+        {
+            int size{extent_.size() / sizeof(U) * sizeof(T)};
+
+            return array_view<const U, 1>{extent<1>{size}, data()};
+        }
+
+        /** @} */
+
+        /** @{ */
+        /**
+         * An array of higher rank can be reshaped into an array of lower rank,
+         * or vice versa, using the view_as member function. Example:
+         *
+         * @code{.cpp}
+         * array<float,1> a(100);
+         * array_view<float,2> av = a.view_as(extent<2>(2,50));
+         * @endcode
+         *
+         * @return Returns an array_view from this array<T,N> with the rank
+         *         changed to K from N.
+         */
+        template<int m>
+        array_view<T, m> view_as(const hc::extent<m>& view_extent) [[cpu]]
+        {
+            if (extent_.size() < view_extent.size()) {
+                throw runtime_exception{"errorMsg_throw", 0};
+            }
+
+            return array_view<T, m>{view_extent, data()};
+        }
+        template<int m>
+        array_view<T, m> view_as(const hc::extent<m>& view_extent) [[hc]]
+        {
+            return array_view<T, m>{view_extent, data()};
+        }
+
+        template<int m>
+        array_view<const T, m> view_as(
+            const hc::extent<m>& view_extent) const [[cpu]]
+        {
+            if (extent_.size() < view_extent.size()) {
+                throw runtime_exception{"errorMsg_throw", 0};
+            }
+
+            return array_view<const T, m>{view_extent, data()};
+        }
+        template<int m>
+        array_view<const T, m> view_as(
+            const hc::extent<m>& view_extent) const [[hc]]
+        {
+            return array_view<const T, m>{view_extent, data()};
+        }
+
+        /** @} */
+
+        ~array()
+        {
+            [[maybe_unused]]
+            static constexpr auto force_emission_ = &array::add_to_captured_;
+
+            if (writers_for_this_ != max_array_cnt_) {
+                writers_()[writers_for_this_].first.clear();
+            }
+            if (locked_ptr_cache_().erase(this) == 0u) return;
+
+            if (hsa_amd_memory_unlock(this) != HSA_STATUS_SUCCESS) {
+                std::cerr << "Failed to unlock locked array pointer; HC runtime"
+                    << " may be in an inconsistent state." << std::endl;
+            }
+        }
+    };
+
+    // ------------------------------------------------------------------------
+    // array_view
+    // ------------------------------------------------------------------------
+    /**
+     * The array_view<T, N> type represents a possibly cached view into the data
+     * held in an array<T, N>, or a section thereof. It also provides such views
+     * over native CPU data. It exposes an indexing interface congruent to that
+     * of array<T, N>.
+     */
+    struct array_view_base {
+        using GuardedWriterConcurrentList_ =
+            array_base::GuardedWriterConcurrentList_;
+
+        static constexpr std::size_t max_array_view_cnt_{65536};
+
+        static
+        std::unordered_map<const void*, std::shared_ptr<void>>& cache_()
+        {
+            static std::unordered_map<const void*, std::shared_ptr<void>> r;
+
+            return r;
+        }
+
+        const std::shared_ptr<void>& cache_for_(
+            const void* ptr, std::size_t byte_cnt)
+        {
+            if (ptr == this) return cache_for_sourceless_(this, byte_cnt);
+
+            const auto info = detail::pointer_info(ptr);
+
+            std::lock_guard<std::mutex> lck{mutex_()};
+
+            const auto it = cache_().find(ptr);
+
+            if (it != cache_().cend()) return it->second;
+
+            switch (info.type) {
+            case HSA_EXT_POINTER_TYPE_HSA: case HSA_EXT_POINTER_TYPE_LOCKED:
+                return cache_().emplace(
+                    std::piecewise_construct,
+                    std::make_tuple(ptr),
+                    std::make_tuple(info.agentBaseAddress, [](void*) { return HSA_STATUS_SUCCESS; })).first->second;
+            default:
+                void* tmp{};
+                detail::throwing_hsa_result_check(
+                    hsa_amd_memory_lock(
+                        const_cast<void*>(ptr), byte_cnt, nullptr, 0, &tmp),
+                    __FILE__, __func__, __LINE__);
+
+                return cache_().emplace(
+                    std::piecewise_construct,
+                    std::make_tuple(ptr),
+                    std::make_tuple(tmp, hsa_amd_memory_unlock)).first->second;
+            }
+        }
+
+        static
+        const std::shared_ptr<void>& cache_for_sourceless_(
+            void* ptr, std::size_t byte_cnt)
+        {
+            static const accelerator acc{};
+
+            detail::throwing_hsa_result_check(
+                hsa_memory_allocate(
+                    *static_cast<hsa_region_t*>(acc.get_hsa_am_system_region()),
+                    byte_cnt,
+                    &ptr),
+                __FILE__, __func__, __LINE__);
+
+            std::lock_guard<std::mutex> lck{mutex_()};
+
+            return cache_().emplace(
+                std::piecewise_construct,
+                std::make_tuple(ptr),
+                std::make_tuple(ptr, hsa_memory_free)).first->second;
+        }
+
+        static
+        std::unordered_set<std::size_t>& captured_()
+        {
+            thread_local static std::unordered_set<std::size_t> r{};
+
+            return r;
+        }
+
+        static
+        std::mutex& mutex_()
+        {
+            static std::mutex r{}; // TODO: use shared_mutex if C++17 feasible.
+
+            return r;
+        }
+
+        static
+        std::vector<GuardedWriterConcurrentList_>& writers_() noexcept
+        {
+            static std::vector<GuardedWriterConcurrentList_> r{
+                max_array_view_cnt_};
+
+            return r;
+        }
+
+        static
+        std::vector<std::vector<hsa_signal_t>>& writer_signals_() noexcept
+        {
+            static std::vector<std::vector<hsa_signal_t>> r{
+                max_array_view_cnt_};
+
+            return r;
+        }
+
+        static
+        std::size_t writers_for_()
+        {   // TODO: should be fused with the definition in array_base.
+            auto it = writers_().begin();
+            do {
+                if (it->first.test_and_set()) continue;
+
+                return std::distance(writers_().begin(), it);
+            } while (++it != writers_().end());
+
+            throw std::runtime_error{
+                "Failed to associate writers for array_view."};
+        }
+    };
+
+    template <typename T, int N = 1>
+    class array_view : private array_view_base {
+        static_assert(
+            std::is_trivially_copyable<T>{},
+            "Only trivially copyable types are supported.");
+        static_assert(
+            std::is_trivially_destructible<T>{},
+            "Only trivially destructible types are supported.");
+
+        using ValT_ = typename std::remove_const<T>::type;
+
+        // TODO: compress data layout to make array_view more pointer like in cost.
+        #if !defined(__HCC_ACCELERATOR__) // TODO: temporary, assess shared_ptr use.
+            std::shared_ptr<void> data_;
+        #else
+            struct {
+                typename std::aligned_storage<
+                    sizeof(std::shared_ptr<void>),
+                    alignof(std::shared_ptr<void>)>::type pad_;
+
+                void* get() const [[cpu, hc]] { return nullptr; }
+            } data_;
+        #endif
+        const accelerator* owner_;
+        hc::extent<N> extent_;
+        T* base_ptr_;
+        typename std::conditional<
+            std::is_const<T>{}, const void*, void*>::type source_;
+        std::size_t writers_for_this_;
+        hsa_amd_pointer_type_t source_type_;
+
+        template<typename, int> friend class array;
+        template<typename, int> friend class array_view;
+
+        template<typename Q, int K>
+        friend
+        void copy(const array<Q, K>&, const array_view<Q, K>&);
+        template<typename InputIter, typename Q, int K>
+        friend
+        void copy(InputIter, InputIter, const array_view<Q, K>&);
+        template<typename Q, int K>
+        friend
+        void copy(const array_view<const Q, K>&, array<Q, K>&);
+        template<typename OutputIter, typename Q, int K>
+        friend
+        void copy(const array_view<Q, K>&, OutputIter);
+        template<typename Q, int K>
+        friend
+        void copy(const array_view<const Q, K>&, const array_view<Q, K>&);
+
+        T* updated_data_() const [[cpu]]
+        {
+            decltype(writers_()[writers_for_this_].second.second) tmp;
+            if (writers_for_this_ != max_array_view_cnt_) {
+                std::lock_guard<std::mutex> lck{
+                    writers_()[writers_for_this_].second.first};
+
+                tmp = std::move(writers_()[writers_for_this_].second.second);
+            }
+            for (auto&& x : tmp) if (x.valid()) x.wait();
+
+            return static_cast<T*>(
+                detail::pointer_info(base_ptr_).hostBaseAddress);
+        }
+
+        T* updated_data_() const [[hc]]
+        {
+            return base_ptr_;
+        }
+    public:
+        /**
+         * The rank of this array.
+         */
+        static const int rank = N;
+
+        /**
+         * The element type of this array.
+         */
+        typedef T value_type;
+
+        /**
+         * There is no default constructor for array_view<T,N>.
+         */
+        array_view() = delete;
+
+        /**
+         * Constructs an array_view which is bound to the data contained in the
+         * "src" array. The extent of the array_view is that of the src array,
+         * and the origin of the array view is at zero.
+         *
+         * @param[in] src An array which contains the data that this array_view
+         *                is bound to.
+         */
+        array_view(hc::array<T, N>& src) [[cpu]]
+            : array_view{src.get_extent(), src.data()}
+        {   // TODO: refactor to pass owner directly to delegated to ctor.
+            static const auto accs = accelerator::get_all();
+
+            for (auto&& acc : accs) {
+                if (acc == src.get_accelerator_view().get_accelerator()) {
+                    owner_ = &acc;
+                    break;
+                }
+            }
+        }
+        array_view(hc::array<T, N>& src) [[hc]]
+            : array_view{src.get_extent(), src.data()}
+        {}
+
+        template<
+            typename Container,
+            typename std::enable_if<
+                N == 1 && __is_container<Container>::value>::type* = nullptr>
+        explicit
+        array_view(Container& src) : array_view{hc::extent<1>(src.size()), src}
+        {}
+        template<int m>
+        explicit
+        array_view(value_type (&src)[m]) [[cpu, hc]]
+            : array_view{hc::extent<1>{m}, src}
+        {}
+
+        /**
+         * Constructs an array_view which is bound to the data contained in the
+         * "src" container. The extent of the array_view is that given by the
+         * "extent" argument, and the origin of the array view is at zero.
+         *
+         * @param[in] src A template argument that must resolve to a linear
+         *                container that supports .data() and .size() members
+         *                (such as std::vector or std::array)
+         * @param[in] extent The extent of this array_view.
+         */
+        template<   // TODO: redo the type predicates.
+            typename Container,
+            typename std::enable_if<
+                __is_container<Container>::value>::type* = nullptr>
+        array_view(const hc::extent<N>& extent, Container& src)
+            : array_view{extent, src.data()}
+        {
+            static_assert(
+                std::is_same<typename Container::value_type, ValT_>::value,
+                "container element type and array view element type must "
+                    "match");
+        }
+
+        /**
+         * Constructs an array_view which is bound to the data contained in the
+         * "src" container. The extent of the array_view is that given by the
+         * "extent" argument, and the origin of the array view is at zero.
+         *
+         * @param[in] src A pointer to the source data this array_view will bind
+         *                to. If the number of elements pointed to is less than
+         *                the size of extent, the behavior is undefined.
+         * @param[in] ext The extent of this array_view.
+         */
+        array_view(const hc::extent<N>& ext, value_type* src) [[cpu]]
+        try :
+            data_{cache_for_(src, ext.size() * sizeof(T))},
+            owner_{nullptr},
+            extent_{ext},
+            base_ptr_{static_cast<T*>(data_.get())},
+            source_{
+                (src == reinterpret_cast<value_type*>(this)) ? base_ptr_ : src},
+            writers_for_this_{
+                std::is_const<T>{} ? max_array_view_cnt_ : writers_for_()},
+            source_type_{detail::pointer_info(source_).type}
+        {
+            if (source_ == base_ptr_) return;
+
+            auto s = hsa_memory_copy(
+                const_cast<ValT_*>(base_ptr_),
+                source_,
+                extent_.size() * sizeof(T));
+
+            if (s == HSA_STATUS_SUCCESS) return;
+
+            throw std::runtime_error{
+                "Failed to copy source data into array_view."};
+        }
+        catch (const std::exception& ex) {
+            if (ext.size() != 0) throw ex;
+
+            throw
+                std::domain_error{"Tried to construct zero-sized array_view."};
+        }
+        array_view(const hc::extent<N>& ext, value_type* src) [[hc]]
+            :
+            owner_{nullptr},
+            extent_{ext},
+            base_ptr_{src},
+            source_{nullptr},
+            writers_for_this_{max_array_view_cnt_}
+        {}
+
+        /**
+         * Constructs an array_view which is not bound to a data source. The
+         * extent of the array_view is that given by the "extent" argument, and
+         * the origin of the array view is at zero. An array_view thus
+         * constructed represents uninitialized data and the underlying
+         * allocations are created lazily as the array_view is accessed on
+         * different locations (on an accelerator_view or on the CPU).
+         *
+         * @param[in] ext The extent of this array_view.
+         */
+        explicit
+        array_view(const hc::extent<N>& ext)
+            : array_view{ext, reinterpret_cast<value_type*>(this)}
+        {}
+
+        /**
+         * Equivalent to construction using
+         * "array_view(extent<N>(e0 [, e1 [, e2 ]]), src)".
+         *
+         * @param[in] e0,e1,e2 The component values that will form the extent of
+         *                     this array_view.
+         * @param[in] src A template argument that must resolve to a contiguous
+         *                container that supports .data() and .size() members
+         *                (such as std::vector or std::array)
+         */
+        template<
+            typename Container,
+            typename std::enable_if<
+                N == 1 && __is_container<Container>::value>::type* = nullptr>
+        array_view(int e0, Container& src)
+            : array_view{hc::extent<N>{e0}, src}
+        {}
+        template<
+            typename Container,
+            typename std::enable_if<
+                N == 2 && __is_container<Container>::value>::type* = nullptr>
+        array_view(int e0, int e1, Container& src)
+            : array_view{hc::extent<N>{e0, e1}, src}
+        {}
+        template<
+            typename Container,
+            typename std::enable_if<
+                N == 3 && __is_container<Container>::value>::type* = nullptr>
+        array_view(int e0, int e1, int e2, Container& src)
+            : array_view{hc::extent<N>{e0, e1, e2}, src}
+        {}
+
+        /**
+         * Equivalent to construction using
+         * "array_view(extent<N>(e0 [, e1 [, e2 ]]), src)".
+         *
+         * @param[in] e0,e1,e2 The component values that will form the extent of
+         *                     this array_view.
+         * @param[in] src A pointer to the source data this array_view will bind
+         *                to. If the number of elements pointed to is less than
+         *                the size of extent, the behavior is undefined.
+         */
+        template<int m = N, typename std::enable_if<m == 1>::type* = nullptr>
+        array_view(int e0, value_type *src) [[cpu, hc]]
+            : array_view{hc::extent<N>{e0}, src}
+        {}
+        template<int m = N, typename std::enable_if<m == 2>::type* = nullptr>
+        array_view(int e0, int e1, value_type *src) [[cpu, hc]]
+            : array_view{hc::extent<N>{e0, e1}, src}
+        {}
+        template<int m = N, typename std::enable_if<m == 3>::type* = nullptr>
+        array_view(int e0, int e1, int e2, value_type *src) [[cpu, hc]]
+            : array_view{hc::extent<N>{e0, e1, e2}, src}
+        {}
+
+        /**
+         * Equivalent to construction using
+         * "array_view(extent<N>(e0 [, e1 [, e2 ]]))".
+         *
+         * @param[in] e0,e1,e2 The component values that will form the extent of
+         *                     this array_view.
+         */
+        template<int m = N, typename std::enable_if<m == 1>::type* = nullptr>
+        explicit
+        array_view(int e0) : array_view{hc::extent<N>{e0}}
+        {}
+        template<int m = N, typename std::enable_if<m == 2>::type* = nullptr>
+        array_view(int e0, int e1) : array_view{hc::extent<N>{e0, e1}}
+        {}
+        template<int m = N, typename std::enable_if<m == 3>::type* = nullptr>
+        array_view(int e0, int e1, int e2)
+            : array_view{hc::extent<N>{e0, e1, e2}}
+        {}
+
+        /**
+         * Copy constructor. Constructs an array_view from the supplied argument
+         * other. A shallow copy is performed.
+         *
+         * @param[in] other An object of type array_view<T,N> or
+         *                  array_view<const T,N> from which to initialize this
+         *                  new array_view.
+         */
+        template<
+            typename U = T,
+            typename std::enable_if<!std::is_const<U>{}>::type* = nullptr>
+        array_view(const array_view& other) [[cpu]]
+            :
+            data_{other.data_},
+            owner_{other.owner_},
+            extent_{other.extent_},
+            base_ptr_{other.base_ptr_},
+            source_{other.source_},
+            writers_for_this_{other.writers_for_this_},
+            source_type_{other.source_type_}
+        {   // N.B.: this is coupled with make_registered_kernel, and relies on
+            //       it copying the user provided Callable.
+            captured_().insert(writers_for_this_);
+        }
+        template<
+            typename U = T,
+            typename std::enable_if<std::is_const<U>{}>::type* = nullptr>
+        array_view(const array_view& other) [[cpu]] // TODO: use = default.
+            :
+            data_{other.data_},
+            owner_{other.owner_},
+            extent_{other.extent_},
+            base_ptr_{other.base_ptr_},
+            source_{other.source_},
+            writers_for_this_{other.writers_for_this_},
+            source_type_{other.source_type_}
+        {}
+
+        array_view(const array_view& other) [[hc]]
+            :
+            owner_{nullptr},
+            extent_{other.extent_},
+            base_ptr_{other.base_ptr_},
+            writers_for_this_{max_array_view_cnt_}
+        {}
+
+        template<
+            typename U,
+            typename V = T,
+            typename std::enable_if<
+                !std::is_const<U>{} && std::is_const<V>{}>::type* = nullptr>
+        array_view(const array_view<U, N>& other) [[cpu]]
+            :
+            data_{other.data_},
+            owner_{other.owner_},
+            extent_{other.extent_},
+            base_ptr_{other.base_ptr_},
+            source_{other.source_},
+            writers_for_this_{other.writers_for_this_},
+            source_type_{other.source_type_}
+        {}
+        template<
+            typename U,
+            typename V = T,
+            typename std::enable_if<
+                !std::is_const<U>{} && std::is_const<V>{}>::type* = nullptr>
+        array_view(const array_view<U, N>& other) [[hc]]
+            :
+            owner_{nullptr},
+            extent_{other.extent_},
+            base_ptr_{other.base_ptr_},
+            writers_for_this_{max_array_view_cnt_}
+        {}
+        /**
+         * Move constructor. Constructs an array_view from the supplied argument
+         * other.
+         *
+         * @param[in] other An object of type array_view<T,N> or
+         *                  array_view<const T,N> from which to initialize this
+         *                  new array_view.
+         */
+        array_view(array_view&& other) [[cpu, hc]]
+            :
+            data_{std::move(other.data_)},
+            owner_{other.owner_},
+            extent_{std::move(other.extent_)},
+            base_ptr_{other.base_ptr_},
+            source_{other.source_},
+            writers_for_this_{other.writers_for_this_},
+            source_type_{other.source_type_}
+        {
+            other.base_ptr_ = nullptr;
+            other.source_ = nullptr;
+            other.writers_for_this_ = max_array_view_cnt_;
+        }
+
+        /**
+         * Access the extent that defines the shape of this array_view.
+         */
+        hc::extent<N> get_extent() const [[cpu, hc]]
+        {
+            return extent_;
+        }
+
+        /**
+         * Access the accelerator_view where the data source of the array_view
+         * is located.
+         *
+         * When the data source of the array_view is native CPU memory, the
+         * method returns
+         * accelerator{accelerator::cpu_accelerator}.default_view. When the data
+         * source underlying the array_view is an array, the method returns the
+         * accelerator_view where the source array is located.
+         */
+        accelerator_view get_source_accelerator_view() const
+        {
+            static const auto cpu_av{
+                accelerator{accelerator::cpu_accelerator()}.get_default_view()};
+
+            return owner_ ? owner_->get_default_view() : cpu_av;
+        }
+
+        /**
+         * Assigns the contents of the array_view "other" to this array_view,
+         * using a shallow copy. Both array_views will refer to the same data.
+         *
+         * @param[in] other An object of type array_view<T,N> from which to copy
+         *                  into this array.
+         * @return Returns *this.
+         */
+        array_view& operator=(const array_view& other) [[cpu, hc]]
+        {
+            using std::swap;
+
+            array_view tmp{other};
+            swap(*this, tmp);
+
+            return *this;
+        }
+
+        /**
+         * Moves the contents of the array_view "other" to this array_view,
+         * leaving "other" in a moved-from state.
+         *
+         * @param[in] other An object of type array_view<T,N> from which to move
+         *                  into this array.
+         * @return Returns *this.
+         */
+        array_view& operator=(array_view&& other) [[cpu]]
+        {   // TODO: redo.
+            using std::swap;
+
+            swap(data_, other.data_);
+            swap(owner_, other.owner_);
+            swap(extent_, other.extent_);
+            swap(base_ptr_, other.base_ptr_);
+            swap(source_, other.source_);
+            swap(writers_for_this_, other.writers_for_this_);
+
+            return *this;
+        }
+        array_view& operator=(array_view&& other) [[hc]]
+        {   // TODO: redo.
+            using std::swap;
+
+            swap(owner_, other.owner_);
+            swap(extent_, other.extent_);
+            swap(base_ptr_, other.base_ptr_);
+
+            return *this;
+        }
+
+        /**
+         * Copies the data referred to by this array_view to the array given by
+         * "dest", as if by calling "copy(*this, dest)"
+         *
+         * @param[in] dest An object of type array <T,N> to which to copy data
+         *                 from this array.
+         */
+        void copy_to(array<T, N>& dest) const
+        {
+            copy(*this, dest);
+        }
+
+        /**
+         * Copies the contents of this array_view to the array_view given by
+         * "dest", as if by calling "copy(*this, dest)"
+         *
+         * @param[in] dest An object of type array_view<T,N> to which to copy
+         *                 data from this array.
+         */
+        void copy_to(const array_view& dest) const
+        {
+            copy(*this, dest);
+        }
+
+        /**
+         * Returns a pointer to the first data element underlying this
+         * array_view. This is only available on array_views of rank 1.
+         *
+         * When the data source of the array_view is native CPU memory, the
+         * pointer returned by data() is valid for the lifetime of the data
+         * source.
+         *
+         * When the data source underlying the array_view is an array, or the
+         * array_view is created without a data source, the pointer returned by
+         * data() in CPU context is ephemeral and is invalidated when the
+         * original data source or any of its views are accessed on an
+         * accelerator_view through a parallel_for_each or a copy operation.
+         *
+         * @return A pointer to the first element in the linearised array.
+         */
+        T* data() const [[cpu]]
+        {
+            static_assert(
+                N == 1, "data() is only permissible on array views of rank 1");
+
+            return updated_data_();
+        }
+        T* data() const [[hc]]
+        {
+            static_assert(
+                N == 1, "data() is only permissible on array views of rank 1");
+
+            return base_ptr_;
+        }
+
+        /**
+         * Returns a pointer to the device memory underlying this array_view.
+         *
+         * @return A (const) pointer to the first element in the array_view on
+         *         the device memory.
+         */
+        T* accelerator_pointer() const [[cpu, hc]] // TODO: this should be removed.
+        {
+            return base_ptr_;
+        }
+
+        /**
+         * Calling this member function informs the array_view that its bound
+         * memory has been modified outside the array_view interface. This will
+         * render all cached information stale.
+         */
+        void refresh() const
+        {
+            static const accelerator cpu{accelerator::cpu_accelerator()};
+
+            if (owner_ && *owner_ == cpu) return;
+            if (base_ptr_ == source_) return;
+
+            auto s = hsa_memory_copy(
+                const_cast<ValT_*>(base_ptr_),
+                source_,
+                extent_.size() * sizeof(T));
+            if (s == HSA_STATUS_SUCCESS) return;
+
+            throw std::runtime_error{"Failed to refresh cache for array_view."};
+        }
+
+        /**
+         * Calling this member function synchronizes any modifications made to
+         * the data underlying "this" array_view to its source data container.
+         * For example, for an array_view on system memory, if the data
+         * underlying the view are modified on a remote accelerator_view through
+         * a parallel_for_each invocation, calling synchronize ensures that the
+         * modifications are synchronized to the source data and will be visible
+         * through the system memory pointer which the array_view was created
+         * over.
+         *
+         * For writable array_view objects, callers of this functional can
+         * optionally specify the type of access desired on the source data
+         * container through the "type" parameter. For example specifying a
+         * "access_type_read" (which is also the default value of the parameter)
+         * indicates that the data has been synchronized to its source location
+         * only for reading. On the other hand, specifying an access_type of
+         * "access_type_read_write" synchronizes the data to its source location
+         * both for reading and writing; i.e. any modifications to the source
+         * data directly through the source data container are legal after
+         * synchronizing the array_view with write access and before
+         * subsequently accessing the array_view on another remote location.
+         *
+         * It is advisable to be precise about the access_type specified in the
+         * synchronize call; i.e. if only write access it required, specifying
+         * access_type_write may yield better performance that calling synchronize
+         * with "access_type_read_write" since the later may require any
+         * modifications made to the data on remote locations to be synchronized to
+         * the source location, which is unnecessary if the contents are
+         * intended to be overwritten without reading.
+         *
+         * @param[in] type An argument of type "access_type" which specifies the
+         *                 type of access on the data source that the array_view
+         *                 is synchronized for.
+         */
+        template<
+            typename U = T,
+            typename std::enable_if<!std::is_const<U>{}>::type* = nullptr>
+        void synchronize(access_type type = access_type_read) const
+        {
+            if (type == access_type_none || type == access_type_write) return;
+
+            std::vector<hsa_signal_t> tmp;
+            std::forward_list<std::shared_future<void>> tmp1;
+            {
+                std::lock_guard<std::mutex> lck{
+                    writers_()[writers_for_this_].second.first};
+
+                tmp = std::move(writer_signals_()[writers_for_this_]);
+                tmp1 = std::move(writers_()[writers_for_this_].second.second);
+            }
+            for (auto&& x : tmp) {
+                if (x.handle != 0) detail::Signal_pool::wait(x);
+            }
+
+            if (source_ == base_ptr_) return;
+
+            auto s = hsa_memory_copy(
+                source_, base_ptr_, extent_.size() * sizeof(T));
+
+            if (s == HSA_STATUS_SUCCESS) return;
+
+            throw std::runtime_error{"Failed to synchronise array_view."};
+        }
+
+        template<
+            typename U = T,
+            typename std::enable_if<std::is_const<U>{}>::type* = nullptr>
+        void synchronize(access_type = access_type_read) const
+        {
+            return;
+        }
+
+        /**
+         * An asynchronous version of synchronize, which returns a completion
+         * future object. When the future is ready, the synchronization
+         * operation is complete.
+         *
+         * @return An object of type completion_future that can be used to
+         *         determine the status of the asynchronous operation or can be
+         *         used to chain other operations to be executed after the
+         *         completion of the asynchronous operation.
+         */
+        completion_future synchronize_async(
+            access_type type = access_type_read) const
+        {
+            if (type == access_type_none || type == access_type_write) {
+                return completion_future{
+                    std::async(std::launch::deferred, [](){}).share()};
+            }
+
+            return completion_future{
+                std::async([this]() { synchronize(); }).share()};
+        }
+
+        /**
+         * Calling this member function synchronizes any modifications made to
+         * the data underlying "this" array_view to the specified
+         * accelerator_view "av". For example, for an array_view on system
+         * memory, if the data underlying the view is modified on the CPU, and
+         * synchronize_to is called on "this" array_view, then the array_view
+         * contents are cached on the specified accelerator_view location.
+         *
+         * For writable array_view objects, callers of this functional can
+         * optionally specify the type of access desired on the specified target
+         * accelerator_view "av", through the "type" parameter. For example
+         * specifying a "access_type_read" (which is also the default value of
+         * the parameter) indicates that the data has been synchronized to "av"
+         * only for reading. On the other hand, specifying an access_type of
+         * "access_type_read_write" synchronizes the data to "av" both for
+         * reading and writing; i.e. any modifications to the data on "av" are
+         * legal after synchronizing the array_view with write access and before
+         * subsequently accessing the array_view on a location other than "av".
+         *
+         * It is advisable to be precise about the access_type specified in the
+         * synchronize call; i.e. if only write access it required, specifying
+         * access_type_write may yield better performance that calling
+         * synchronize with "access_type_read_write" since the later may require
+         * any modifications made to the data on remote locations to be
+         * synchronized to "av", which is unnecessary if the contents are
+         * intended to be immediately overwritten without reading.
+         *
+         * @param[in] av The target accelerator_view that "this" array_view is
+         *               synchronized for access on.
+         * @param[in] type An argument of type "access_type" which specifies the
+         *                 type of access on the data source that the array_view
+         *                 is synchronized for.
+         */
+        void synchronize_to(
+            const accelerator_view& av,
+            access_type type = access_type_read) const
+        {   // TODO: assess optimisation opportunities.
+            if (owner_ && av.get_accelerator() == *owner_) return;
+
+            synchronize(type);
+        }
+
+        /**
+         * An asynchronous version of synchronize_to, which returns a completion
+         * future object. When the future is ready, the synchronization
+         * operation is complete.
+         *
+         * @param[in] av The target accelerator_view that "this" array_view is
+         *               synchronized for access on.
+         * @param[in] type An argument of type "access_type" which specifies the
+         *                 type of access on the data source that the array_view
+         *                 is synchronized for.
+         * @return An object of type completion_future that can be used to
+         *         determine the status of the asynchronous operation or can be
+         *         used to chain other operations to be executed after the
+         *         completion of the asynchronous operation.
+         */
+        completion_future synchronize_to_async(
+            const accelerator_view& av,
+            access_type type = access_type_read) const
+        {
+            if (type == access_type_none || type == access_type_write) {
+                return completion_future{
+                    std::async(std::launch::deferred, [](){}).share()};
+            }
+            if (owner_ && av.get_accelerator() == *owner_) {
+                return completion_future{
+                    std::async(std::launch::deferred, [](){}).share()};
+            }
+
+            return synchronize_async(type);
+        }
+
+        /**
+         * Indicates to the runtime that it may discard the current logical
+         * contents of this array_view. This is an optimization hint to the
+         * runtime used to avoid copying the current contents of the view to a
+         * target accelerator_view, and its use is recommended if the existing
+         * content is not needed.
+         */
+        void discard_data() const
+        {
+            if (std::is_const<T>{}) return;
+
+            decltype(writers_()[writers_for_this_].second.second) tmp;
+            {
+                std::lock_guard<std::mutex> lck{
+                    writers_()[writers_for_this_].second.first};
+
+                tmp = std::move(writers_()[writers_for_this_].second.second);
+            }
+        }
+
+        /** @{ */
+        /**
+         * Returns a reference to the element of this array_view that is at the
+         * location in N-dimensional space specified by "idx".
+         *
+         * @param[in] idx An object of type index<N> that specifies the location
+         *                of the element.
+         */
+        T& operator[](const index<N>& idx) const [[cpu, hc]]
+        {
+            return updated_data_()[detail::amp_helper<
+                N, index<N>, hc::extent<N>>::flatten(idx, extent_)];
+        }
+
+        template<int m = N, typename std::enable_if<(m == 1)>::type* = nullptr>
+        T& operator[](int i0) const [[cpu]][[hc]]
+        {
+            return operator[](index<1>{i0});
+        }
+
+
+        T& operator()(const index<N>& idx) const [[cpu, hc]]
+        {
+            return operator[](idx);
+        }
+
+        /** @} */
+
+        /**
+         * Returns a reference to the element of this array_view that is at the
+         * location in N-dimensional space specified by "idx".
+         *
+         * Unlike the other indexing operators for accessing the array_view on
+         * the CPU, this method does not implicitly synchronize this
+         * array_view's contents to the CPU. After accessing the array_view on a
+         * remote location or performing a copy operation involving this
+         * array_view, users are responsible to explicitly synchronize the
+         * array_view to the CPU before calling this method. Failure to do so
+         * results in undefined behavior.
+         */
+        T& get_ref(const index<N>& idx) const [[cpu, hc]]
+        {
+            return base_ptr_[detail::amp_helper<N, index<N>, hc::extent<N>>::
+                flatten(idx, extent_)];
+        }
+
+        /** @{ */
+        /**
+         * Equivalent to
+         * "array_view<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]]))".
+         *
+         * @param[in] i0,i1,i2 The component values that will form the index 
+         *                     into this array.
+         */
+        T& operator()(int i0) const [[cpu, hc]]
+        {
+            static_assert(
+                N == 1,
+                "T& array_view::operator()(int) is only permissible on "
+                    "array_view<T, 1>");
+
+            return operator[](index<1>{i0});
+        }
+        T& operator()(int i0, int i1) const [[cpu, hc]]
+        {
+            static_assert(
+                N == 2,
+                "T& array_view::operator()(int, int) is only permissible on "
+                    "array_view<T, 2>");
+
+            return operator[](index<2>{i0, i1});
+        }
+        T& operator()(int i0, int i1, int i2) const [[cpu, hc]]
+        {
+            static_assert(
+                N == 3,
+                "T& array_view::operator()(int, int, int) is only permissible "
+                    "on array_view<T, 3>");
+
+            return operator[](index<3>{i0, i1, i2});
+        }
+
+        /** @} */
+
+        /** @{ */
+        /**
+         * This overload is defined for array_view<T,N> where @f$N \ge 2@f$.
+         *
+         * This mode of indexing is equivalent to projecting on the
+         * most-significant dimension. It allows C-style indexing. For example:
+         *
+         * @code{.cpp}
+         * array<float,4> myArray(myExtents, ...);
+         *
+         * myArray[index<4>(5,4,3,2)] = 7;
+         * assert(myArray[5][4][3][2] == 7);
+         * @endcode
+         *
+         * @param[in] i0 An integer that is the index into the most-significant
+         *               dimension of this array.
+         * @return Returns an array_view whose dimension is one lower than that
+         *         of this array_view.
+         */
+        template<int m = N, typename std::enable_if<(m > 1)>::type* = nullptr>
+        array_view<T, N - 1> operator[](int i0) const [[cpu, hc]]
+        {
+            hc::extent<N - 1> ext;
+            for (auto i = 1; i != N; ++i) ext[i - 1] = extent_[i];
+
+            array_view<T, N - 1> tmp{ext, static_cast<T*>(base_ptr_)}; // TODO: this is incorrect.
+            tmp.base_ptr_ += i0 * ext.size();
+
+            return tmp;
+        }
+
+        template<int m = N, typename std::enable_if<(m > 1)>::type* = nullptr>
+        array_view<T, N - 1> operator()(int i0) const [[cpu, hc]]
+        {
+            return operator[](i0);
+        }
+        /** @} */
+
+        /**
+         * Returns a subsection of the source array view at the origin specified
+         * by "idx" and with the extent specified by "ext".
+         *
+         * Example:
+         *
+         * @code{.cpp}
+         * array<float,2> a(extent<2>(200,100));
+         * array_view<float,2> v1(a); // v1.extent = <200,100>
+         * array_view<float,2> v2 =
+         *     v1.section(index<2>(15,25), extent<2>(40,50));
+         * assert(v2(0,0) == v1(15,25));
+         * @endcode
+         *
+         * @param[in] idx Provides the offset/origin of the resulting section.
+         * @param[in] ext Provides the extent of the resulting section.
+         * @return Returns a subsection of the source array at specified origin,
+         *         and with the specified extent.
+         */
+        array_view<T, N> section(
+            const index<N>& origin, const hc::extent<N>& ext) const [[cpu]]
+        {
+            if (extent_.size() < (ext + origin).size()) {
+                throw runtime_exception{"errorMsg_throw", 0};
+            }
+
+            const auto dx = detail::amp_helper<N, index<N>, hc::extent<N>>::
+                flatten(origin, extent_);
+
+            array_view<T, N> tmp{*this};
+            tmp.extent_ = ext;
+            tmp.base_ptr_ += dx;
+            tmp.source_ = static_cast<T*>(tmp.source_) + dx;
+
+            return tmp;
+        }
+        array_view<T, N> section(
+            const index<N>& origin, const hc::extent<N>& ext) const [[hc]]
+        {
+            const auto dx = detail::amp_helper<N, index<N>, hc::extent<N>>::
+                flatten(origin, extent_);
+
+            array_view<T, N> tmp{*this};
+            tmp.extent_ = ext;
+            tmp.base_ptr_ += dx;
+            tmp.source_ = static_cast<T*>(tmp.source_) + dx;
+
+            return tmp;
+        }
+
+        /**
+         * Equivalent to "section(idx, this->extent – idx)".
+         */
+        array_view<T, N> section(const index<N>& idx) const [[cpu, hc]]
+        {
+            hc::extent<N> ext{extent_};
+            detail::amp_helper<N, index<N>, hc::extent<N>>::minus(idx, ext);
+
+            return section(idx, ext);
+        }
+
+        /**
+         * Equivalent to "section(index<N>(), ext)".
+         */
+        array_view<T, N> section(const hc::extent<N>& ext) const [[cpu, hc]]
+        {
+            return section(index<N>{}, ext);
+        }
+
+        /** @{ */
+        /**
+         * Equivalent to
+         * "section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [, e2 ]]))".
+         *
+         * @param[in] i0,i1,i2 The component values that will form the origin of
+         *                     the section
+         * @param[in] e0,e1,e2 The component values that will form the extent of
+         *                     the section
+         */
+        array_view<T, 1> section(int i0, int e0) const [[cpu, hc]]
+        {
+            static_assert(N == 1, "Rank must be 1.");
+
+            return section(index<1>{i0}, hc::extent<1>{e0});
+        }
+
+        array_view<T, 2> section(
+            int i0, int i1, int e0, int e1) const [[cpu, hc]]
+        {
+            static_assert(N == 2, "Rank must be 2.");
+
+            return section(index<2>{i0, i1}, hc::extent<2>{e0, e1});
+        }
+
+        array_view<T, 3> section(
+            int i0, int i1, int i2, int e0, int e1, int e2) const [[cpu, hc]]
+        {
+            static_assert(N == 3, "Rank must be 3.");
+
+            return section(index<3>{i0, i1, i2}, hc::extent<3>{e0, e1, e2});
+        }
+
+        /** @} */
+
+        /**
+         * This member function is similar to "array<T,N>::reinterpret_as",
+         * although it only supports array_views of rank 1 (only those guarantee
+         * that all elements are laid out contiguously).
+         *
+         * The size of the reinterpreted ElementType must evenly divide into the
+         * total size of this array_view.
+         *
+         * @return Returns an array_view from this array_view<T,1> with the
+         *         element type reinterpreted from T to ElementType.
+         */
+        template<typename U>
+        array_view<U, 1> reinterpret_as() const [[cpu]]
+        {
+            static_assert(
+                N == 1,
+                "reinterpret_as is only permissible on array views of rank 1.");
+
+            hc::extent<1> tmp{extent_.size() / sizeof(U)};
+
+            if (extent_.size() * sizeof(T) != tmp.size() * sizeof(U)) {
+                throw runtime_exception{"errorMsg_throw", 0};
+            }
+
+            if (source_) return array_view<U, 1>{tmp, source_};
+            return array_view<U, 1>{tmp};
+        }
+        template<typename U>
+        array_view<U, 1> reinterpret_as() const [[hc]]
+        {
+            static_assert(
+                N == 1,
+                "reinterpret_as is only permissible on array views of rank 1.");
+
+            hc::extent<1> tmp{extent_.size() / sizeof(U)};
+
+            return array_view<U, 1>{tmp, base_ptr_};
+        }
+
+        /**
+         * This member function is similar to "array<T,N>::view_as", although it
+         * only supports array_views of rank 1 (only those guarantee that all
+         * elements are laid out contiguously).
+         *
+         * @return Returns an array_view from this array_view<T,1> with the rank
+         * changed to K from 1.
+         */
+        template<int m>
+        array_view<T, m> view_as(const hc::extent<m>& view_extent) const [[cpu]]
+        {
+            static_assert(
+                N == 1, "view_as is only permissible on array views of rank 1");
+
+            if (extent_.size() < view_extent.size()) {
+                throw runtime_exception{"errorMsg_throw", 0};
+            }
+
+            return array_view<T, m>{view_extent, source_};
+        }
+        template<int m>
+        array_view<T, m> view_as(const hc::extent<m>& view_extent) const [[hc]]
+        {
+            static_assert(
+                N == 1, "view_as is only permissible on array views of rank 1");
+
+            return array_view<T, m>{view_extent, source_};
+        }
+
+        ~array_view() [[cpu, hc]]
+        {
+            #if __HCC_ACCELERATOR__ != 1
+                if (!data_) return;
+
+                std::size_t n{0u};
+                if (writers_for_this_ != max_array_view_cnt_) {
+                    auto& writers = writers_()[writers_for_this_];
+                    std::lock_guard<std::mutex> lck{writers.second.first};
+
+                    n = std::distance(
+                        writers.second.second.cbegin(),
+                        writers.second.second.cend());
+
+                    if (data_.use_count() - n > 2) return;
+                }
+
+                try {
+                    synchronize(access_type_read_write);
+                }
+                catch (const std::exception& ex) {
+                    std::cerr << ex.what() << std::endl;
+                }
+
+                {
+                    std::lock_guard<std::mutex> lck{mutex_()};
+
+                    cache_().erase(source_);
+                }
+
+                if (writers_for_this_ == max_array_view_cnt_) return;
+
+                std::lock_guard<std::mutex> lck{
+                    writers_()[writers_for_this_].second.first};
+                writers_()[writers_for_this_].second.second.clear();
+                writer_signals_()[writers_for_this_].clear();
+                writers_()[writers_for_this_].first.clear();
+            #endif
+        }
+    };
+
+    // ------------------------------------------------------------------------
+    // copy
+    // ------------------------------------------------------------------------
+
+    /**
+     * The contents of "src" are copied into "dest". The source and destination
+     * may reside on different accelerators. If the extents of "src" and "dest"
+     * don't match, a runtime exception is thrown.
+     *
+     * @param[in] src An object of type array<T,N> to be copied from.
+     * @param[out] dest An object of type array<T,N> to be copied to.
+     */
+    template<typename T, int N>
+    inline
+    void copy(const array<T, N>& src, array<T, N>& dest)
+    {
+        if (src.get_extent() != dest.get_extent()) {
+            throw std::logic_error{
+                "Tried to copy arrays of mismatched extents."};
+        }
+
+        src.wait_for_all_pending_writers_();
+
+        auto s = hsa_memory_copy(
+            dest.data(), src.data(), src.get_extent().size() * sizeof(T));
+
+        if (s == HSA_STATUS_SUCCESS) return;
+
+        throw std::runtime_error{"Array copy failed."};
+    }
+
+    /** @{ */
+    /**
+     * The contents of "src" are copied into "dest". If the extents of "src" and
+     * "dest" don't match, a runtime exception is thrown.
+     *
+     * @param[in] src An object of type array<T,N> to be copied from.
+     * @param[out] dest An object of type array_view<T,N> to be copied to.
+     */
+    template<typename T, int N>
+    inline
+    void copy(const array<T, N>& src, const array_view<T, N>& dest)
+    {   // TODO: assess optimisation opportunities.
+        if (src.get_extent() != dest.get_extent()) {
+            throw std::logic_error{
+                "Tried to copy array to an array_view with a mismatched "
+                "extent."};
+        }
+
+        src.wait_for_all_pending_writers_();
+
+        auto s = hsa_memory_copy(
+            dest.data(), src.base_ptr_, src.get_extent().size() * sizeof(T));
+
+        if (s == HSA_STATUS_SUCCESS) return;
+
+        throw std::runtime_error{"array_view to array copy failed."};
+    }
+    /** @} */
+
+    /** @{ */
+    /**
+     * The contents of "src" are copied into "dest". If the extents of "src" and
+     * "dest" don't match, a runtime exception is thrown.
+     *
+     * @param[in] src An object of type array_view<T,N> (or
+     *                array_view<const T, N>) to be copied from.
+     * @param[out] dest An object of type array<T,N> to be copied to.
+     */
+    template<typename T, int N>
+    inline
+    void copy(const array_view<const T, N>& src, array<T, N>& dest)
+    {
+        if (src.get_extent() != dest.get_extent()) {
+            throw std::logic_error{
+                "Tried to copy array_view to an array with a mismatched "
+                "extent."};
+        }
+
+        auto s = hsa_memory_copy(
+            dest.data(), src.data(), src.get_extent().size() * sizeof(T));
+
+        if (s == HSA_STATUS_SUCCESS) return;
+
+        throw std::runtime_error{"array_view to array copy failed."};
+    }
+
+    template<typename T, int N>
+    inline
+    void copy(const array_view<T, N>& src, array<T, N>& dest)
+    {
+        copy(array_view<const T, N>{src}, dest);
+    }
+    /** @} */
+
+    /** @{ */
+    /**
+     * The contents of "src" are copied into "dest". If the extents of "src" and
+     * "dest" don't match, a runtime exception is thrown.
+     *
+     * @param[in] src An object of type array_view<T,N> (or
+     *                array_view<const T, N>) to be copied from.
+     * @param[out] dest An object of type array_view<T,N> to be copied to.
+     */
+    template<typename T, int N>
+    inline
+    void copy(const array_view<const T, N>& src, const array_view<T, N>& dest)
+    {
+        if (src.get_extent() != dest.get_extent()) {
+            throw std::logic_error{
+                "Tried to copy array_views with mismatched extents."};
+        }
+
+        auto s = hsa_memory_copy(
+            dest.base_ptr_, src.data(), src.get_extent().size() * sizeof(T));
+
+        if (s == HSA_STATUS_SUCCESS) return;
+
+        throw std::runtime_error{"array_view to array_view copy failed."};
+    }
+
+    template <typename T, int N>
+    inline
+    void copy(const array_view<T, N>& src, const array_view<T, N>& dest)
+    {
+        copy(array_view<const T, N>{src}, dest);
+    }
+    /** @} */
+
+    /** @{ */
+    /**
+     * The contents of a source container from the iterator range
+     * [srcBegin,srcEnd) are copied into "dest". If the number of elements in
+     * the iterator range is not equal to "dest.extent.size()", an exception is
+     * thrown.
+     *
+     * In the overloads which don't take an end-iterator it is assumed that the
+     * source iterator is able to provide at least dest.extent.size() elements,
+     * but no checking is performed (nor possible).
+     *
+     * @param[in] srcBegin An iterator to the first element of a source
+     *            container.
+     * @param[in] srcEnd An interator to the end of a source container.
+     * @param[out] dest An object of type array<T,N> to be copied to.
+     */
+    template<typename InputIterator, typename T, int N>
+    inline
+    void copy(InputIterator first, InputIterator last, array<T, N>& dst)
+    {
+        static_assert(
+            std::is_same<
+                typename std::iterator_traits<InputIterator>::iterator_category,
+                std::random_access_iterator_tag>{},
+            "Only contiguous random access iterators supported.");
+        static_assert(
+            std::is_same<
+                typename std::iterator_traits<InputIterator>::value_type, T>{},
+            "Only same type copies supported.");
+
+        if (first == last) return;
+
+        if (std::distance(first, last) != dst.get_extent().size()) {
+            throw std::logic_error{"Mismatched copy sizes."};
+        }
+
+        copy(first, dst);
+    }
+
+    template<typename InputIterator, typename T, int N>
+    inline
+    void copy(InputIterator first, array<T, N>& dst)
+    {
+        static_assert(
+            std::is_same<
+                typename std::iterator_traits<InputIterator>::iterator_category,
+                std::random_access_iterator_tag>{},
+            "Only contiguous random access iterators supported.");
+        static_assert(
+            std::is_same<
+                typename std::iterator_traits<InputIterator>::value_type, T>{},
+            "Only same type copies supported.");
+
+        auto s = hsa_memory_copy( // TODO: add to_address(), use it and not &*.
+            dst.data(), &*first, dst.get_extent().size() * sizeof(T));
+
+        if (s == HSA_STATUS_SUCCESS) return;
+
+        throw std::runtime_error{"Failed iterator range to array copy."};
+    }
+
+    /** @} */
+
+    /** @{ */
+    /**
+     * The contents of a source container from the iterator range
+     * [srcBegin,srcEnd) are copied into "dest". If the number of elements in
+     * the iterator range is not equal to "dest.extent.size()", an exception is
+     * thrown.
+     *
+     * In the overloads which don't take an end-iterator it is assumed that the
+     * source iterator is able to provide at least dest.extent.size() elements,
+     * but no checking is performed (nor possible).
+     *
+     * @param[in] srcBegin An iterator to the first element of a source
+     *            container.
+     * @param[in] srcEnd An interator to the end of a source container.
+     * @param[out] dest An object of type array_view<T,N> to be copied to.
+     */
+    template<typename InputIterator, typename T, int N>
+    inline
+    void copy(
+        InputIterator first, InputIterator last, const array_view<T, N>& dst)
+    {
+        static_assert(
+            std::is_same<
+                typename std::iterator_traits<InputIterator>::iterator_category,
+                std::random_access_iterator_tag>{},
+            "Only contiguous random access iterators supported.");
+        static_assert(
+            std::is_same<
+                typename std::iterator_traits<InputIterator>::value_type, T>{},
+            "Only same type copies supported.");
+
+        if (first == last) return;
+
+        if (std::distance(first, last) != dst.get_extent().size()) {
+            throw std::logic_error{"Mismatched copy sizes."};
+        }
+
+        auto s = hsa_memory_copy( // TODO: add to_address(), use it and not &*.
+            dst.base_ptr_, &*first, dst.get_extent().size() * sizeof(T));
+
+        if (s == HSA_STATUS_SUCCESS) return;
+
+        throw std::runtime_error{"Failed iterator range to array_view copy."};
+    }
+
+    template<typename InputIterator, typename T, int N>
+    inline
+    void copy(InputIterator first, const array_view<T, N>& dst)
+    {
+        static_assert(
+            std::is_same<
+                typename std::iterator_traits<InputIterator>::iterator_category,
+                std::random_access_iterator_tag>{},
+            "Only contiguous random access iterators supported.");
+        static_assert(
+            std::is_same<
+                typename std::iterator_traits<InputIterator>::value_type, T>{},
+            "Only same type copies supported.");
+
+        copy(first, first + dst.get_extent().size(), dst);
+    }
+
+    /** @} */
+
+    /**
+     * The contents of a source array are copied into "dest" starting with
+     * iterator destBegin. If the number of elements in the range starting
+     * destBegin in the destination container is smaller than
+     * "src.extent.size()", the behavior is undefined.
+     *
+     * @param[in] src An object of type array<T,N> to be copied from.
+     * @param[out] destBegin An output iterator addressing the position of the
+     *                       first element in the destination container.
+     */
+    template<typename OutputIterator, typename T, int N>
+    inline
+    void copy(const array<T, N>& src, OutputIterator first_out)
+    {
+        static_assert(
+            std::is_same<
+                typename std::iterator_traits<
+                    OutputIterator>::iterator_category,
+                std::random_access_iterator_tag>{},
+            "Only contiguous random access iterators supported.");
+        static_assert(
+            std::is_same<
+                typename std::iterator_traits<OutputIterator>::value_type, T>{},
+            "Only same type copies supported.");
+
+        src.wait_for_all_pending_writers_();
+
+        // TODO: must add to_address() and use instead of &*.
+        auto s = hsa_memory_copy(
+            &*first_out, src.data(), src.get_extent().size() * sizeof(T));
+
+        if (s == HSA_STATUS_SUCCESS) return;
+
+        throw std::runtime_error{"array to iterator range copy failed."};
+    }
+
+    /**
+     * The contents of a source array are copied into "dest" starting with
+     * iterator destBegin. If the number of elements in the range starting
+     * destBegin in the destination container is smaller than
+     * "src.extent.size()", the behavior is undefined.
+     *
+     * @param[in] src An object of type array_view<T,N> to be copied from.
+     * @param[out] destBegin An output iterator addressing the position of the
+     *                       first element in the destination container.
+     */
+    template<typename OutputIterator, typename T, int N>
+    inline
+    void copy(const array_view<T, N>& src, OutputIterator first_out)
+    {
+        static_assert(
+            std::is_same<
+                typename std::iterator_traits<
+                    OutputIterator>::iterator_category,
+                std::random_access_iterator_tag>{},
+            "Only contiguous random access iterators supported.");
+        static_assert(
+            std::is_same<
+                typename std::iterator_traits<OutputIterator>::value_type, T>{},
+            "Only same type copies supported.");
+
+        src.synchronize(); // TODO: conservative, temporary.
+
+        // TODO: add to_address() and use it and not &*.
+        auto s = hsa_memory_copy(
+            &*first_out, src.data(), src.get_extent().size() * sizeof(T));
+
+        if (s == HSA_STATUS_SUCCESS) return;
+
+        throw std::runtime_error{"array_view to iterator range copy failed."};
+    }
+
+    // ------------------------------------------------------------------------
+    // copy_async
+    // ------------------------------------------------------------------------
+
+    /**
+     * The contents of "src" are copied into "dest". The source and destination
+     * may reside on different accelerators. If the extents of "src" and "dest"
+     * don't match, a runtime exception is thrown.
+     *
+     * @param[in] src An object of type array<T,N> to be copied from.
+     * @param[out] dest An object of type array<T,N> to be copied to.
+     */
+    template<typename T, int N>
+    inline
+    completion_future copy_async(const array<T, N>& src, array<T, N>& dst)
+    {
+        return completion_future{std::async([&]() { copy(src, dst); }).share()};
+    }
+
+    /**
+     * The contents of "src" are copied into "dest". If the extents of "src" and
+     * "dest" don't match, a runtime exception is thrown.
+     *
+     * @param[in] src An object of type array<T,N> to be copied from.
+     * @param[out] dest An object of type array_view<T,N> to be copied to.
+     */
+    template<typename T, int N>
+    inline
+    completion_future copy_async(
+        const array<T, N>& src, const array_view<T, N>& dst)
+    {   // TODO: should this count as a writer to the array_view?
+        return completion_future{
+            std::async([&, dst]() { copy(src, dst); }).share()};
+    }
+
+    /** @{ */
+    /**
+     * The contents of "src" are copied into "dest". If the extents of "src" and
+     * "dest" don't match, a runtime exception is thrown.
+     *
+     * @param[in] src An object of type array_view<T,N> (or
+     *                array_view<const T, N>) to be copied from.
+     * @param[out] dest An object of type array<T,N> to be copied to.
+     */
+    template<typename T, int N>
+    inline
+    completion_future copy_async(
+        const array_view<const T, N>& src, array<T, N>& dst)
+    {
+        return completion_future{
+            std::async([&, src]() { copy(src, dst); }).share()};
+    }
+
+    template<typename T, int N>
+    inline
+    completion_future copy_async(const array_view<T, N>& src, array<T, N>& dst)
+    {
+        return completion_future{
+            std::async([&, src]() { copy(src, dst); }).share()};
+    }
+
+    /** @} */
+
+    /** @{ */
+    /**
+     * The contents of "src" are copied into "dest". If the extents of "src" and
+     * "dest" don't match, a runtime exception is thrown.
+     *
+     * @param[in] src An object of type array_view<T,N> (or
+     *                array_view<const T, N>) to be copied from.
+     * @param[out] dest An object of type array_view<T,N> to be copied to.
+     */
+    template<typename T, int N>
+    inline
+    completion_future copy_async(
+        const array_view<const T, N>& src, const array_view<T, N>& dst)
+    {   // TODO: should this count as a writer to the array_view?
+        return completion_future{std::async([=]() { copy(src, dst); }).share()};
+    }
+
+    template<typename T, int N>
+    inline
+    completion_future copy_async(
+        const array_view<T, N>& src, const array_view<T, N>& dst)
+    {   // TODO: should this count as a writer to the array_view?
+        return completion_future{std::async([=]() { copy(src, dst); }).share()};
+    }
+
+    /** @} */
+
+    /** @{ */
+    /**
+     * The contents of a source container from the iterator range
+     * [srcBegin,srcEnd) are copied into "dest". If the number of elements in
+     * the iterator range is not equal to "dest.extent.size()", an exception is
+     * thrown.
+     *
+     * In the overloads which don't take an end-iterator it is assumed that the
+     * source iterator is able to provide at least dest.extent.size() elements,
+     * but no checking is performed (nor possible).
+     *
+     * @param[in] srcBegin An iterator to the first element of a source
+     * container.
+     * @param[in] srcEnd An interator to the end of a source container.
+     * @param[out] dest An object of type array<T,N> to be copied to.
+     */
+    template<typename InputIterator, typename T, int N>
+    inline
+    completion_future copy_async(
+        InputIterator first, InputIterator last, array<T, N>& dst)
+    {
+        static_assert(
+            std::is_same<
+                typename std::iterator_traits<InputIterator>::iterator_category,
+                std::random_access_iterator_tag>{},
+            "Only contiguous random access iterators supported.");
+        static_assert(
+            std::is_same<
+                typename std::iterator_traits<InputIterator>::value_type, T>{},
+            "Only same type copies supported.");
+
+        if (std::distance(first, last) != dst.get_extent().size()) {
+            throw std::logic_error{"Mismatched copy sizes."};
+        }
+
+        return completion_future{
+            std::async([=, &dst]() { copy(first, last, dst); }).share()};
+    }
+
+    template<typename InputIterator, typename T, int N>
+    inline
+    completion_future copy_async(InputIterator first, array<T, N>& dst)
+    {
+        static_assert(
+            std::is_same<
+                typename std::iterator_traits<InputIterator>::iterator_category,
+                std::random_access_iterator_tag>{},
+            "Only contiguous random access iterators supported.");
+        static_assert(
+            std::is_same<
+                typename std::iterator_traits<InputIterator>::value_type, T>{},
+            "Only same type copies supported.");
+
+        return copy_async(first, first + dst.get_extent().size(), dst);
+    }
+
+    /** @} */
+
+    /** @{ */
+    /**
+     * The contents of a source container from the iterator range
+     * [srcBegin,srcEnd) are copied into "dest". If the number of elements in
+     * the iterator range is not equal to "dest.extent.size()", an exception is
+     * thrown.
+     *
+     * In the overloads which don't take an end-iterator it is assumed that the
+     * source iterator is able to provide at least dest.extent.size() elements,
+     * but no checking is performed (nor possible).
+     *
+     * @param[in] srcBegin An iterator to the first element of a source
+     *            container.
+     * @param[in] srcEnd An interator to the end of a source container.
+     * @param[out] dest An object of type array_view<T,N> to be copied to.
+     */
+    template<typename InputIterator, typename T, int N>
+    inline
+    completion_future copy_async(
+        InputIterator first, InputIterator last, const array_view<T, N>& dst)
+    {
+        static_assert(
+            std::is_same<
+                typename std::iterator_traits<InputIterator>::iterator_category,
+                std::random_access_iterator_tag>{},
+            "Only contiguous random access iterators supported.");
+        static_assert(
+            std::is_same<
+                typename std::iterator_traits<InputIterator>::value_type, T>{},
+            "Only same type copies supported.");
+
+        if (std::distance(first, last) != dst.get_extent().size()) {
+            throw std::logic_error{"Mismatched copy sizes."};
+        }
+
+        return completion_future{
+            std::async([=]() { copy(first, last, dst); }).share()};
+    }
+
+    template<typename InputIterator, typename T, int N>
+    inline
+    completion_future copy_async(
+        InputIterator first, const array_view<T, N>& dst)
+    {
+        static_assert(
+            std::is_same<
+                typename std::iterator_traits<InputIterator>::iterator_category,
+                std::random_access_iterator_tag>{},
+            "Only contiguous random access iterators supported.");
+        static_assert(
+            std::is_same<
+                typename std::iterator_traits<InputIterator>::value_type, T>{},
+            "Only same type copies supported.");
+
+        return copy_async(first, first + dst.get_extent().size(), dst);
+    }
+
+    /** @} */
+
+    /**
+     * The contents of a source array are copied into "dest" starting with
+     * iterator destBegin. If the number of elements in the range starting
+     * destBegin in the destination container is smaller than
+     * "src.extent.size()", the behavior is undefined.
+     *
+     * @param[in] src An object of type array<T,N> to be copied from.
+     * @param[out] destBegin An output iterator addressing the position of the
+     *                       first element in the destination container.
+     */
+    template<typename OutputIterator, typename T, int N>
+    inline
+    completion_future copy_async(
+        const array<T, N>& src, OutputIterator first_out)
+    {
+        static_assert(
+            std::is_same<
+                typename std::iterator_traits<
+                    OutputIterator>::iterator_category,
+                std::random_access_iterator_tag>{},
+            "Only contiguous random access iterators supported.");
+        static_assert(
+            std::is_same<
+                typename std::iterator_traits<OutputIterator>::value_type, T>{},
+            "Only same type copies supported.");
+
+        return completion_future{
+            std::async([&, first_out]() { copy(src, first_out); }).share()};
+    }
+
+    /**
+     * The contents of a source array are copied into "dest" starting with
+     * iterator destBegin. If the number of elements in the range starting
+     * destBegin in the destination container is smaller than
+     * "src.extent.size()", the behavior is undefined.
+     *
+     * @param[in] src An object of type array_view<T,N> to be copied from.
+     * @param[out] destBegin An output iterator addressing the position of the
+     *                       first element in the destination container.
+     */
+    template<typename OutputIterator, typename T, int N>
+    inline
+    completion_future copy_async(
+        const array_view<T, N>& src, OutputIterator first_out)
+    {
+        static_assert(
+            std::is_same<
+                typename std::iterator_traits<
+                    OutputIterator>::iterator_category,
+                std::random_access_iterator_tag>{},
+            "Only contiguous random access iterators supported.");
+        static_assert(
+            std::is_same<
+                typename std::iterator_traits<OutputIterator>::value_type, T>{},
+            "Only same type copies supported.");
+
+        return completion_future{
+            std::async([=]() { copy(src, first_out); }).share()};
+    }
+
+    // ------------------------------------------------------------------------
+    // parallel_for_each
+    // ------------------------------------------------------------------------
+
+    template<typename Kernel, int n>
+    completion_future parallel_for_each(
+        const accelerator_view&, const hc::extent<n>&, const Kernel&);
+
+    template<typename Kernel, int n>
+    completion_future parallel_for_each(
+        const accelerator_view&, const tiled_extent<n>&, const Kernel&);
+
+    template<typename Kernel, int n>
+    inline
+    completion_future parallel_for_each(
+        const hc::extent<n>& compute_domain, const Kernel& f)
+    {
+        return parallel_for_each(
+            accelerator::get_auto_selection_view(), compute_domain, f);
+    }
+
+    template<int n, typename Kernel>
+    inline
+    completion_future parallel_for_each(
+        const tiled_extent<n>& compute_domain, const Kernel& f) {
+        return parallel_for_each(
+            accelerator::get_auto_selection_view(), compute_domain, f);
+    }
+
+    template<int n>
+    inline
+    void validate_compute_domain(const hc::extent<n>& compute_domain)
+    {
+        std::size_t sz{1};
+        for (auto i = 0; i != n; ++i) {
+            sz *= compute_domain[i];
+
+            if (sz < 1) throw invalid_compute_domain{"Extent is not positive."};
+            if (sz > UINT_MAX) {
+                throw invalid_compute_domain{"Extent is too large."};
+            }
+        }
+    }
+
+    template<typename Kernel>
+    inline
+    std::forward_list<std::shared_future<void>> predecessors_for(
+        const Kernel&)
+    {   // TODO: cleanup & optimise; the iteration can be collapsed.
+        using AR = array_base;
+        using AV = array_view_base;
+
+        std::forward_list<std::shared_future<void>> r;
+        for (auto&& widx : AR::captured_()) {
+            std::lock_guard<std::mutex> lck{AR::writers_()[widx].second.first};
+
+            r.splice_after(
+                r.before_begin(),
+                std::move(AR::writers_()[widx].second.second),
+                AR::writers_()[widx].second.second.before_begin());
+        }
+        for (auto&& widx : AV::captured_()) {
+            std::lock_guard<std::mutex> lck{AV::writers_()[widx].second.first};
+
+            r.splice_after(
+                r.before_begin(),
+                std::move(AV::writers_()[widx].second.second),
+                AV::writers_()[widx].second.second.before_begin());
+        }
+
+        return r;
+    }
+
+    inline
+    void register_writer(
+        const std::pair<std::shared_future<void>, hsa_signal_t>& writer)
+    {   // TODO: cleanup & optimise; the iteration can be collapsed.
+        using AR = array_base;
+        using AV = array_view_base;
+
+        for (auto&& widx : AR::captured_()) {
+            std::lock_guard<std::mutex> lck{AR::writers_()[widx].second.first};
+
+            AR::writers_()[widx].second.second.emplace_front(writer.first);
+        }
+        for (auto&& widx : AV::captured_()) {
+            std::lock_guard<std::mutex> lck{AV::writers_()[widx].second.first};
+
+            AV::writers_()[widx].second.second.emplace_front(writer.first);
+            AV::writer_signals_()[widx].push_back(writer.second);
+        }
+
+        AR::captured_().clear();
+    }
+
+    //ND parallel_for_each, nontiled
+    template<typename Kernel, int n>
+    inline
+    __attribute__((annotate("__HC_PFE__"), warn_unused_result))
+    completion_future parallel_for_each(
+        const accelerator_view& av,
+        const hc::extent<n>& compute_domain,
+        const Kernel& f)
+    {   // TODO: unify with tiled, everything is essentially tiled
+        if (compute_domain.size() == 0) {
+            return completion_future{
+                std::async(std::launch::deferred, [](){}).share()};
+        }
+
+        if (av.get_accelerator().get_device_path() == L"cpu") {
+        throw hc::runtime_exception{
+            detail::__errorMsg_UnsupportedAccelerator, detail::E_FAIL};
+        }
+
+        validate_compute_domain(compute_domain);
+
+        array_view_base::captured_().clear();
+        {
+            [[maybe_unused]]
+            const auto register_captured_avs = f;
+        }
+        for (auto&& x : predecessors_for(f)) if (x.valid()) x.wait();
+
+        auto tmp = detail::launch_kernel_async(av, compute_domain, f);
+
+        av.add_pending_task_(tmp.first);
+        register_writer(tmp);
+
+        return tmp.first;
+    }
+
+    template<int n>
+    inline
+    void validate_tiled_compute_domain(const tiled_extent<n>& compute_domain)
+    {
+        validate_compute_domain(compute_domain);
+
+        size_t sz{1};
+        for (auto i = 0u; i != n; ++i) {
+            if (compute_domain.tile_dim[i] < 0) {
+                throw invalid_compute_domain{
+                    "The extent of the tile must be positive."};
+            }
+
+            constexpr int max_tile_dim{1024}; // Should be read via the HSArt.
+            sz *= compute_domain.tile_dim[i];
+            if (max_tile_dim < sz) {
+                throw invalid_compute_domain{
+                    "The extent of the tile exceeds the device limit"};
+            }
+
+            if (compute_domain[i] < compute_domain.tile_dim[i]) {
+                throw invalid_compute_domain{
+                    "The extent of the tile exceeds the compute grid extent"};
+            }
+        }
+    }
+
+    //ND parallel_for_each, tiled
+    template <typename Kernel, int n>
+    inline
+    __attribute__((annotate("__HC_PFE__"), warn_unused_result))
+    completion_future parallel_for_each(
+        const accelerator_view& av,
+        const tiled_extent<n>& compute_domain,
+        const Kernel& f)
+    {
+        if (compute_domain.size() == 0) {
+            return completion_future{
+                std::async(std::launch::deferred, [](){}).share()};
+        }
+
+        if (av.get_accelerator().get_device_path() == L"cpu") {
+            throw hc::runtime_exception{
+                detail::__errorMsg_UnsupportedAccelerator, detail::E_FAIL};
+        }
+
+        validate_tiled_compute_domain(compute_domain);
+
+        array_view_base::captured_().clear();
+        {
+            [[maybe_unused]]
+            const auto register_captured_avs = f;
+        }
+        for (auto&& x : predecessors_for(f)) if (x.valid()) x.wait();
+
+        auto tmp = detail::launch_kernel_async(av, compute_domain, f);
+
+        av.add_pending_task_(tmp.first);
+        register_writer(tmp);
+
+        return tmp.first;
+    }
+} // namespace hc
\ No newline at end of file
diff --git a/include/hc/hc_agent_pool.hpp b/include/hc/hc_agent_pool.hpp
new file mode 100644
index 00000000000..ff0a5bdb14c
--- /dev/null
+++ b/include/hc/hc_agent_pool.hpp
@@ -0,0 +1,472 @@
+//===----------------------------------------------------------------------===//
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+
+#include "hc_runtime.hpp"
+
+#include <hsa/hsa.h>
+#include <hsa/hsa_ext_amd.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <mutex>
+#include <string>
+#include <unordered_map>
+#include <utility>
+
+namespace std
+{
+    template<>
+    struct hash<hsa_agent_t> {
+        std::size_t operator()(hsa_agent_t x) const noexcept
+        {
+            return std::hash<decltype(x.handle)>{}(x.handle);
+        }
+    };
+}
+
+inline
+bool operator==(hsa_agent_t x, hsa_agent_t y) noexcept
+{
+    return x.handle == y.handle;
+}
+
+inline
+bool operator==(hsa_region_t x, hsa_region_t y) noexcept
+{
+    return x.handle == y.handle;
+}
+
+namespace hc
+{
+    namespace detail
+    {
+        class Agent_pool {
+            // IMPLEMENTATION - TYPES
+            class HSA_agent;
+
+            // IMPLEMENTATION - STATICS
+            static
+            const std::vector<hsa_agent_t>& agents_();
+            static
+            hsa_agent_t cpu_agent_();
+            static
+            hsa_agent_t default_agent_();
+            static
+            hsa_region_t system_cg_();
+        public:
+            // STATICS
+            static
+            std::unordered_map<hsa_agent_t, HSA_agent>& pool();
+            static
+            hsa_agent_t cpu_agent();
+            static
+            hsa_agent_t& default_agent();
+        };
+
+        class Agent_pool::HSA_agent {
+            friend class Agent_pool;
+
+            // IMPLEMENTATION - DATA
+            hsa_agent_t agent_;
+
+            // IMPLEMENTATION - STATICS
+            static
+            std::vector<hsa_region_t> global_regions_(hsa_agent_t x)
+            {
+                using C = std::vector<hsa_region_t>;
+
+                C r;
+                throwing_hsa_result_check(
+                    hsa_agent_iterate_regions(x, [](hsa_region_t rg, void* pr) {
+                        hsa_region_segment_t s{};
+                        throwing_hsa_result_check(
+                            hsa_region_get_info(
+                                rg, HSA_REGION_INFO_SEGMENT, &s),
+                            __FILE__, __func__, __LINE__);
+
+                        if (s == HSA_REGION_SEGMENT_GLOBAL) {
+                            static_cast<C*>(pr)->push_back(rg);
+                        }
+
+                        return HSA_STATUS_SUCCESS;
+                    }, &r),
+                    __FILE__, __func__, __LINE__);
+
+                return r;
+            }
+
+            static
+            std::uint32_t cu_count_(hsa_agent_t x)
+            {
+                std::uint32_t r{};
+                throwing_hsa_result_check(
+                    hsa_agent_get_info(
+                        x,
+                        static_cast<hsa_agent_info_t>(
+                            HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT),
+                        &r),
+                    __FILE__, __func__, __LINE__);
+
+                return r;
+            }
+
+            static
+            hsa_region_t fine_grained_(hsa_agent_t x)
+            {
+                for (auto&& region : global_regions_(x)) {
+                    std::uint32_t f{};
+                    throwing_hsa_result_check(
+                        hsa_region_get_info(
+                            region, HSA_REGION_INFO_GLOBAL_FLAGS, &f),
+                        __FILE__, __func__, __LINE__);
+
+                    if (f & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) return region;
+                }
+
+                return {};
+            }
+
+            static
+            hsa_region_t group_(hsa_agent_t x)
+            {
+                hsa_region_t g{};
+                throwing_hsa_result_check(
+                    hsa_agent_iterate_regions(x, [](hsa_region_t r, void* pg) {
+                        hsa_region_segment_t s{};
+                        throwing_hsa_result_check(
+                            hsa_region_get_info(r, HSA_REGION_INFO_SEGMENT, &s),
+                            __FILE__, __func__, __LINE__);
+
+                        if (s == HSA_REGION_SEGMENT_GROUP) {
+                            *static_cast<hsa_region_t*>(pg) = r;
+                        }
+
+                        return HSA_STATUS_SUCCESS;
+                    }, &g),
+                    __FILE__, __func__, __LINE__);
+
+                return g;
+            }
+
+            static
+            bool is_cpu_accessible_(hsa_region_t x)
+            {
+                if (x.handle == 0) return false;
+
+                bool r{false};
+                throwing_hsa_result_check(
+                    hsa_region_get_info(
+                        x,
+                        static_cast<hsa_region_info_t>(
+                            HSA_AMD_REGION_INFO_HOST_ACCESSIBLE),
+                        &r),
+                    __FILE__, __func__, __LINE__);
+
+                return r;
+            }
+
+            static
+            std::uint32_t max_queue_cnt_(hsa_agent_t x)
+            {   // We assume that 8 queues per SE, out of which 3 / 4 are
+                // dedicated to compute. TODO: assess if we need to subtract the
+                // queues implicitly created by ROCr.
+                static constexpr double compute_dedicated{0.75};
+                static constexpr std::uint32_t queues_per_se{8u};
+
+                std::uint32_t se_cnt{};
+                throwing_hsa_result_check(
+                    hsa_agent_get_info(
+                        x,
+                        static_cast<hsa_agent_info_t>(
+                            HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES),
+                        &se_cnt),
+                    __FILE__, __func__, __LINE__);
+
+                return se_cnt * queues_per_se * compute_dedicated;
+            }
+
+            static
+            std::uint32_t max_queue_sz_(hsa_agent_t x)
+            {
+                std::uint32_t r{};
+                throwing_hsa_result_check(
+                    hsa_agent_get_info(x, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &r),
+                    __FILE__, __func__, __LINE__);
+
+                return r;
+            }
+
+            static
+            std::uint32_t min_queue_sz_(hsa_agent_t x)
+            {
+                std::uint32_t r{};
+                throwing_hsa_result_check(
+                    hsa_agent_get_info(x, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &r),
+                    __FILE__, __func__, __LINE__);
+
+                return r;
+            }
+
+            static
+            std::wstring name_(hsa_agent_t x)
+            {
+                static constexpr std::size_t max_name_length{64};
+
+                char tmp[max_name_length]{};
+                throwing_hsa_result_check(
+                    hsa_agent_get_info(x, HSA_AGENT_INFO_NAME, tmp),
+                    __FILE__, __func__, __LINE__);
+
+                return std::wstring{tmp, tmp + max_name_length};
+            }
+
+            static
+            enums::accelerator_profile profile_(hsa_agent_t x)
+            {   // N.B.: AMD is not going to expose more than one ISA per agent
+                //       at this point in time.
+                bool p[2]{};
+                throwing_hsa_result_check(
+                    hsa_agent_iterate_isas(x, [](hsa_isa_t i, void* pp) {
+                        throwing_hsa_result_check(
+                            hsa_isa_get_info_alt(i, HSA_ISA_INFO_PROFILES, pp),
+                            __FILE__, __func__, __LINE__);
+
+                        return HSA_STATUS_SUCCESS;
+                    }, p),
+                    __FILE__, __func__, __LINE__);
+
+                if (p[HSA_PROFILE_BASE]) return enums::accelerator_profile_base;
+                if (p[HSA_PROFILE_FULL]) return enums::accelerator_profile_full;
+                return enums::accelerator_profile_none;
+            }
+
+            static
+            std::size_t size_(hsa_region_t x)
+            {
+                if (x.handle == 0) return 0u;
+
+                std::size_t r{};
+                throwing_hsa_result_check(
+                    hsa_region_get_info(x, HSA_REGION_INFO_SIZE, &r),
+                    __FILE__, __func__, __LINE__);
+
+                return r;
+            }
+
+            static
+            hsa_region_t agent_allocated_cg_(hsa_agent_t x)
+            {
+                std::vector<hsa_region_t> r{global_regions_(x)};
+
+                for (auto&& agent : agents_()) {
+                    if (agent == x) continue;
+
+                    auto tmp = global_regions_(agent);
+                    r.erase(
+                        std::remove_if(r.begin(), r.end(), [&](hsa_region_t a) {
+                            return std::find(
+                                tmp.cbegin(), tmp.cend(), a) != tmp.cend();
+                        }),
+                        r.end());
+                }
+
+                if (r.empty()) return {};
+
+                return r.front();
+            }
+
+            static
+            hsa_device_type_t type_(hsa_agent_t x)
+            {
+                hsa_device_type_t r{};
+                throwing_hsa_result_check(
+                    hsa_agent_get_info(x, HSA_AGENT_INFO_DEVICE, &r),
+                    __FILE__, __func__, __LINE__);
+
+                return r;
+            }
+
+            static
+            std::uint32_t version_(hsa_agent_t x)
+            {
+                std::uint16_t hi{};
+                throwing_hsa_result_check(
+                    hsa_agent_get_info(x, HSA_AGENT_INFO_VERSION_MAJOR, &hi),
+                    __FILE__, __func__, __LINE__);
+
+                std::uint16_t lo{};
+                throwing_hsa_result_check(
+                    hsa_agent_get_info(x, HSA_AGENT_INFO_VERSION_MINOR, &lo),
+                    __FILE__, __func__, __LINE__);
+
+                return (hi << 16u) | lo;
+            }
+
+            // IMPLEMENTATION - CREATORS
+            explicit
+            HSA_agent(hsa_agent_t x)
+                :
+                agent_{x},
+                agent_allocated_coarse_grained_region{agent_allocated_cg_(x)},
+                compute_unit_count{cu_count_(x)},
+                dedicated_memory{size_(agent_allocated_coarse_grained_region)},
+                default_cpu_access{(type_(x) == HSA_DEVICE_TYPE_CPU) ?
+                    enums::access_type_read_write : enums::access_type_auto},
+                fine_grained_region{fine_grained_(x)},
+                has_cpu_accessible_agent_allocated_coarse_grained{
+                    is_cpu_accessible_(agent_allocated_coarse_grained_region)},
+                has_cpu_shared_memory{size_(fine_grained_region) > 0},
+                is_cpu{type_(x) == HSA_DEVICE_TYPE_CPU},
+                is_gpu{type_(x) == HSA_DEVICE_TYPE_GPU},
+                max_queue_count{max_queue_cnt_(x)},
+                max_queue_size{max_queue_sz_(x)},
+                max_tile_static_size{size_(group_(x))},
+                min_queue_size{min_queue_sz_(x)},
+                name{name_(x)},
+                profile{is_gpu ? profile_(x) : enums::accelerator_profile_none},
+                system_coarse_grained_region{system_cg_()},
+                version{version_(x)}
+            {}
+        public:
+            // DATA
+            hsa_region_t agent_allocated_coarse_grained_region{};
+            std::uint32_t compute_unit_count{};
+            std::size_t dedicated_memory{};
+            enums::access_type default_cpu_access{};
+            hsa_region_t fine_grained_region{};
+            bool has_cpu_accessible_agent_allocated_coarse_grained{};
+            bool has_cpu_shared_memory{};
+            bool is_cpu{};
+            bool is_gpu{};
+            std::uint32_t max_queue_count{};
+            std::uint32_t max_queue_size{};
+            std::size_t max_tile_static_size{};
+            std::uint32_t min_queue_size{};
+            std::wstring name{};
+            enums::accelerator_profile profile{};
+            hsa_region_t system_coarse_grained_region{};
+            std::uint32_t version{};
+
+            // CREATORS
+            HSA_agent() = default;
+            HSA_agent(const HSA_agent&) = default;
+            HSA_agent(HSA_agent&&) = default;
+            ~HSA_agent() = default;
+
+            // MANIPULATORS
+            HSA_agent& operator=(const HSA_agent&) = default;
+            HSA_agent& operator=(HSA_agent&&) = default;
+        };
+
+        inline
+        const std::vector<hsa_agent_t>& Agent_pool::agents_()
+        {
+            static std::vector<hsa_agent_t> r;
+            static std::once_flag f;
+
+            std::call_once(f, []() {
+                throwing_hsa_result_check(
+                    hsa_iterate_agents([](hsa_agent_t agent, void*) {
+                        r.push_back(agent);
+
+                        return HSA_STATUS_SUCCESS;
+                    }, nullptr),
+                    __FILE__, __func__, __LINE__);
+            });
+
+            return r;
+        }
+
+        inline
+        hsa_agent_t Agent_pool::cpu_agent_()
+        {   // TODO: for e.g. multi-socket there can be multiple CPU agents.
+            for (auto&& x : agents_()) {
+                if (HSA_agent::type_(x) == HSA_DEVICE_TYPE_CPU) return x;
+            }
+
+            return {};
+        }
+
+        inline
+        hsa_agent_t Agent_pool::default_agent_()
+        {
+            std::vector<HSA_agent> tmp;
+            for (auto&& x : pool()) tmp.push_back(x.second);
+
+            tmp.erase(
+                std::remove_if(tmp.begin(), tmp.end(), [](const HSA_agent& x) {
+                    return x.is_cpu;
+                }),
+                tmp.end());
+
+            if (tmp.empty()) return cpu_agent_();
+
+            return std::max_element(
+                tmp.cbegin(),
+                tmp.cend(),
+                [](const HSA_agent& x, const HSA_agent& y) {
+                return x.dedicated_memory < y.dedicated_memory;
+            })->agent_;
+        }
+
+        inline
+        hsa_region_t Agent_pool::system_cg_()
+        {
+            static hsa_region_t sys_cg{};
+            static std::once_flag f;
+
+            std::call_once(f, []() {
+                for (auto&& region : HSA_agent::global_regions_(cpu_agent_())) {
+                    std::uint32_t f{};
+                    throwing_hsa_result_check(
+                        hsa_region_get_info(
+                            region, HSA_REGION_INFO_GLOBAL_FLAGS, &f),
+                        __FILE__, __func__, __LINE__);
+
+                    if (f & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) {
+                        sys_cg = region;
+
+                        return;
+                    }
+                }
+            });
+
+            return sys_cg;
+        }
+
+        inline
+        std::unordered_map<hsa_agent_t, Agent_pool::HSA_agent>& Agent_pool::
+            pool()
+        {
+            static std::unordered_map<hsa_agent_t, HSA_agent> r;
+            static std::once_flag f;
+
+            std::call_once(f, []() {
+                for (auto&& x : agents_()) r.emplace(x, HSA_agent{x});
+            });
+
+            return r;
+        }
+
+        inline
+        hsa_agent_t Agent_pool::cpu_agent()
+        {
+            static const hsa_agent_t r{cpu_agent_()};
+
+            return r;
+        }
+
+        inline
+        hsa_agent_t& Agent_pool::default_agent()
+        {
+            static hsa_agent_t r{default_agent_()};
+
+            return r;
+        }
+    } // Namespace hc::detail.
+} // Namespace hc.
\ No newline at end of file
diff --git a/include/hc/hc_aligned_alloc.hpp b/include/hc/hc_aligned_alloc.hpp
new file mode 100644
index 00000000000..f4cafba2eb9
--- /dev/null
+++ b/include/hc/hc_aligned_alloc.hpp
@@ -0,0 +1,49 @@
+//===----------------------------------------------------------------------===//
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+
+#include <cassert>
+#include <cstdlib>
+#include <memory>
+#include <type_traits>
+
+/** \cond HIDDEN_SYMBOLS */
+namespace detail {
+
+inline
+constexpr
+bool hc_is_alignment(std::size_t value) noexcept
+{
+    return (value > 0) && ((value & (value - 1)) == 0);
+}
+
+inline
+void* hc_aligned_alloc(std::size_t alignment, std::size_t size) noexcept
+{
+    assert(hc_is_alignment(alignment));
+
+    if (alignment < alignof(void*)) {
+        alignment = alignof(void*);
+    }
+    void* memptr = NULL;
+    // posix_memalign shall return 0 upon successfully allocate aligned memory
+    posix_memalign(&memptr, alignment, size);
+    assert(memptr);
+
+    return memptr;
+}
+
+inline
+void hc_aligned_free(void* ptr) noexcept
+{
+    if (ptr) {
+        std::free(ptr);
+    }
+}
+
+} // namespace detail
+/** \endcond */
diff --git a/include/hc/hc_am.hpp b/include/hc/hc_am.hpp
new file mode 100644
index 00000000000..e3275447a00
--- /dev/null
+++ b/include/hc/hc_am.hpp
@@ -0,0 +1,517 @@
+//===----------------------------------------------------------------------===//
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+
+#include <hc/hc.hpp>
+#include <hc/hc_runtime.hpp>
+
+#include <hsa/hsa.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <initializer_list>
+#include <mutex>
+
+// TODO: this shouldn't be squatting in the global namespace.
+enum am_status_t { AM_ERROR_MISC = -1, AM_SUCCESS };
+enum am_memory_t {
+    am_device, am_host_pinned, am_host_noncoherent, am_host_coherent
+};
+
+namespace hc
+{
+    namespace detail
+    {
+        class auto_voidp {
+            // Provide automatic type conversion for void*.
+            // TODO: not very robust, replace.
+            void* ptr_{};
+        public:
+            auto_voidp(void* ptr) : ptr_{ptr} {}
+
+            template<typename T>
+            operator T*() const { return static_cast<T*>(ptr_); }
+        };
+    } // Namespace detail.
+
+    struct AmPointerInfo {
+        // Info for each pointer in the memory tracker.
+        // TODO: ROCr already tracks all of this, making it redundant.
+        void* host_pointer{};             ///< Host pointer. If host access is
+                                          ///  not allowed, NULL.
+        void* device_pointer{};           ///< Device pointer.
+        void* unaligned_device_pointer{}; ///< Unaligned device pointer
+        std::size_t size_bytes{};         ///< Size of allocation.
+        hc::accelerator* acc{};           ///< Accelerator where allocation is
+                                          ///  physically located.
+        bool is_in_device_mem{false};     ///< Memory is physically resident on
+                                          ///  a device (if false, memory is
+                                          /// located on host)
+        bool is_am_managed{false};        ///< Memory was allocated by AM and
+                                          ///  should be freed when am_reset is
+                                          ///  called.
+        std::uint64_t alloc_seq_num{};    ///< Sequence number of allocation.
+        void* app_pointer{};              ///< App-specific pointer to
+                                          ///  additional information.
+
+        // creates a dummy copy of AmPointerInfo
+        AmPointerInfo() = default;
+        AmPointerInfo(
+            void* host_ptr,
+            void* device_ptr,
+            void* unaligned_device_ptr,
+            std::size_t size,
+            hc::accelerator& acc,
+            bool is_device_mem = false,
+            bool is_am_mem = false)
+            :
+            host_pointer{host_ptr},
+            device_pointer{device_ptr},
+            unaligned_device_pointer{unaligned_device_ptr},
+            size_bytes{size},
+            acc{&acc},
+            is_in_device_mem{is_device_mem},
+            is_am_managed{is_am_mem},
+            alloc_seq_num{0},
+            app_pointer{nullptr}
+        {}
+        AmPointerInfo(const AmPointerInfo&) = default;
+        AmPointerInfo(AmPointerInfo&&) = default;
+        ~AmPointerInfo() = default;
+
+        AmPointerInfo& operator=(const AmPointerInfo&) = default;
+        AmPointerInfo& operator=(AmPointerInfo&&) = default;
+    };
+
+    /**
+     * Allocate a block of @p size bytes of memory on the specified @p acc.
+     *
+     * The contents of the newly allocated block of memory are not initialized.
+     *
+     * If @p size == 0, 0 is returned.
+     *
+     * Flags:
+     *  am_host_pinned : Allocated pinned host memory and map it into the
+     *                   address space of the specified accelerator.
+     *
+     * @return : On success, pointer to the newly allocated memory is returned.
+     * The pointer is typecast to the desired return type.
+     *
+     * If an error occurred trying to allocate the requested memory, 0 is
+     * returned.
+     *
+     * @see am_free, am_copy
+     */
+    template<typename Accelerator>
+    inline
+    detail::auto_voidp am_aligned_alloc(
+        std::size_t size,
+        Accelerator& acc,
+        std::uint32_t flags,
+        std::size_t alignment = 0)
+    {   // TODO: this logic should be reviewed, it is interesting.
+        if (size == 0u) return nullptr;
+        if (!acc.is_hsa_accelerator()) return nullptr;
+
+        hsa_region_t* region{};
+        switch (flags) {
+        case am_host_pinned : case am_host_noncoherent :
+            region = static_cast<hsa_region_t*>(acc.get_hsa_am_system_region());
+            break;
+        case am_host_coherent :
+            region = static_cast<hsa_region_t*>(
+                acc.get_hsa_am_finegrained_system_region());
+            break;
+        default :
+            region = static_cast<hsa_region_t*>(acc.get_hsa_am_system_region());
+        }
+
+        if (!region || region->handle == 0) {
+            region = static_cast<hsa_region_t*>(acc.get_hsa_am_system_region());
+        }
+
+        size = (alignment == 0) ? size : (size + alignment);
+        void* r{nullptr};
+        detail::throwing_hsa_result_check(
+            hsa_memory_allocate(*region, size, &r),
+            __FILE__, __func__, __LINE__);
+
+        static const auto round_up_to_next_multiple =
+            [](std::uintptr_t x, std::uintptr_t y) {
+            x = x + y - 1;
+            return x - x % y;
+        };
+
+        return reinterpret_cast<void*>(round_up_to_next_multiple(
+            reinterpret_cast<std::uintptr_t>(r), alignment ? alignment : 1));
+    }
+
+    /**
+     * Allocate a block of @p size bytes of memory on the specified @p acc.
+     *
+     * The contents of the newly allocated block of memory are not initialized.
+     *
+     * If @p size == 0, 0 is returned.
+     *
+     * Flags:
+     *  amHostPinned : Allocated pinned host memory and map it into the address
+     *                 space of the specified accelerator.
+     *
+     * @return : On success, pointer to the newly allocated memory is returned.
+     * The pointer is typecast to the desired return type.
+     *
+     * If an error occurred trying to allocate the requested memory, 0 is
+     * returned.
+     *
+     * @see am_free, am_copy
+     */
+    template<typename Accelerator>
+    inline
+    detail::auto_voidp am_alloc(
+        std::size_t size, Accelerator& acc, std::uint32_t flags)
+    {
+        return am_aligned_alloc(size, acc, flags, 0u);
+    }
+
+    namespace detail
+    {
+        inline
+        hsa_amd_pointer_info_t hsa_pointer_info(void* ptr)
+        {
+            hsa_amd_pointer_info_t r{};
+            r.size = sizeof(hsa_amd_pointer_info_t);
+            detail::throwing_hsa_result_check(
+                hsa_amd_pointer_info(ptr, &r, nullptr, nullptr, nullptr),
+                __FILE__, __func__, __LINE__);
+
+            return r;
+        }
+    }
+    /**
+     * Free a block of memory previously allocated with am_alloc.
+     *
+     * @return AM_SUCCESS
+     * @see am_alloc, am_copy
+     */
+    inline
+    am_status_t am_free(void* ptr)
+    {
+        if (!ptr) return AM_SUCCESS;
+
+        auto tmp = detail::hsa_pointer_info(ptr);
+
+        if (tmp.type != HSA_EXT_POINTER_TYPE_HSA) return AM_ERROR_MISC;
+
+        detail::throwing_hsa_result_check(
+            hsa_memory_free(tmp.agentBaseAddress),
+            __FILE__, __func__, __LINE__);
+
+        return AM_SUCCESS;
+    }
+
+    /**
+     * Copy @p size bytes of memory from @p src to @ dst. The memory areas
+     * (src+size and dst+size) must not overlap.
+     *
+     * @return AM_SUCCESS on error or AM_ERROR_MISC if an error occurs.
+     * @see am_alloc, am_free
+     */
+    __attribute__((deprecated(
+        "use accelerator_view::copy instead (and note src/dst order"
+        "reversal)")))
+    am_status_t am_copy(void* dst, const void* src, std::size_t size);
+
+    /**
+     * Return information about tracked pointer.
+     *
+     * AM tracks pointers when they are allocated or added to tracker with
+     * am_track_pointer.
+     * The tracker tracks the base pointer as well as the size of the
+     * allocation, and will find the information for a pointer anywhere in the
+     * tracked range.
+     *
+     * @returns AM_ERROR_MISC if pointer is not currently being tracked. In this
+     * case, @p info is not modified.
+
+    * @returns AM_SUCCESS if pointer is tracked and writes info to @p info. If
+    * @info is NULL, no info is written but the returned status indicates if the
+    * pointer was tracked.
+    *
+    * @see AM_memtracker_add
+    */
+    inline
+    am_status_t am_memtracker_get_info(hc::AmPointerInfo* info, const void* ptr)
+    {
+        if (!ptr) return AM_SUCCESS;
+
+        auto tmp = detail::hsa_pointer_info(const_cast<void*>(ptr));
+
+        if (tmp.type == HSA_EXT_POINTER_TYPE_UNKNOWN) return AM_ERROR_MISC;
+
+        info->host_pointer = tmp.hostBaseAddress;
+        info->device_pointer = tmp.agentBaseAddress;
+        info->unaligned_device_pointer = tmp.agentBaseAddress;
+        info->size_bytes = tmp.sizeInBytes;
+        // hc::accelerator* acc{};           ///< Accelerator where allocation is
+        //                                   ///  physically located.
+        // bool is_in_device_mem{false};     ///< Memory is physically resident on
+        //                                   ///  a device (if false, memory is
+        //                                   /// located on host)
+        // bool is_am_managed{false};        ///< Memory was allocated by AM and
+        //                                   ///  should be freed when am_reset is
+        //                                   ///  called.
+        //std::uint64_t alloc_seq_num{};    ///< Sequence number of allocation.
+        info->app_pointer = tmp.userData;
+
+        return AM_SUCCESS;
+    }
+
+    /**
+     * Add a pointer to the memory tracker.
+     *
+     * @return AM_ERROR_MISC : If @p ptr is NULL, or info._sizeBytes = 0, the
+     *                         info is not added to the tracker and
+     *                         AM_ERROR_MISC is returned.
+     * @return AM_SUCCESS
+     * @see am_memtracker_getinfo
+     */
+    am_status_t am_memtracker_add(void* ptr, hc::AmPointerInfo &info);
+
+    /*
+    * Update info for an existing pointer in the memory tracker.
+    *
+    * @returns AM_ERROR_MISC if pointer is not found in tracker.
+    * @returns AM_SUCCESS if pointer is not found in tracker.
+    *
+    * @see am_memtracker_getinfo, am_memtracker_add
+    */
+    am_status_t am_memtracker_update(
+        const void* ptr,
+        std::int32_t appId,
+        std::uint32_t allocationFlags,
+        void* appPtr = nullptr);
+
+    /**
+     * Remove @ptr from the tracker structure.
+     *
+     * @p ptr may be anywhere in a tracked memory range.
+     *
+     * @returns AM_ERROR_MISC if pointer is not found in tracker.
+     * @returns AM_SUCCESS if pointer is not found in tracker.
+     *
+     * @see am_memtracker_getinfo, am_memtracker_add
+     */
+    am_status_t am_memtracker_remove(void* ptr);
+
+    /**
+     * Remove all memory allocations associated with specified accelerator from
+     * the memory tracker.
+     *
+     * @returns Number of entries reset.
+     * @see am_memtracker_getinfo
+     */
+    std::size_t am_memtracker_reset(const hc::accelerator& acc);
+
+    /**
+     * Print the entries in the memory tracker table.
+     *
+     * Intended primarily for debug purposes.
+     * @see am_memtracker_getinfo
+     **/
+    inline
+    void am_memtracker_print(void* targetAddress = nullptr)
+    {
+        if (!targetAddress) return;
+
+        // const char* targetAddressP = static_cast<const char*>(targetAddress);
+        // std::ostream &os = std::cerr;
+
+        // uint64_t beforeD = std::numeric_limits<uint64_t>::max();
+        // uint64_t afterD = std::numeric_limits<uint64_t>::max();
+        // auto closestBefore = g_amPointerTracker.end();
+        // auto closestAfter = g_amPointerTracker.end();
+        // bool foundMatch = false;
+
+        // for (auto iter = g_amPointerTracker.readerLockBegin() ; iter != g_amPointerTracker.end(); iter++) {
+        //     const auto basePointer = static_cast<const char*> (iter->first._basePointer);
+        //     const auto endPointer = static_cast<const char*> (iter->first._endPointer);
+        //     if ((targetAddressP >= basePointer) && (targetAddressP < endPointer)) {
+        //         ptrdiff_t offset = targetAddressP - basePointer;
+        //         os << "db: memtracker found pointer:" << targetAddress << " offset:" << offset << " bytes inside this allocation:\n";
+        //         os << "   " << iter->first._basePointer << "-" << iter->first._endPointer << "::  ";
+        //         os << iter->second << std::endl;
+        //         foundMatch = true;
+        //         break;
+        //     } else {
+        //         if ((targetAddressP < basePointer) && (basePointer - targetAddressP < beforeD)) {
+        //             beforeD = (basePointer - targetAddressP);
+        //             closestBefore = iter;
+        //         }
+        //         if ((targetAddressP > endPointer) && (targetAddressP - endPointer < afterD)) {
+        //             afterD = (targetAddressP - endPointer);
+        //             closestAfter = iter;
+        //         }
+        //     };
+        //     }
+
+        //     if (!foundMatch) {
+        //         os << "db: memtracker did not find pointer:" << targetAddress << ".  However, it is closest to the following allocations:\n";
+        //         if (closestBefore != g_amPointerTracker.end()) {
+        //             os << "db: closest before: " << beforeD << " bytes before base of: " << closestBefore->second << std::endl;
+        //         }
+        //         if (closestAfter != g_amPointerTracker.end()) {
+        //             os << "db: closest after: " << afterD << " bytes after end of " << closestAfter->second << std::endl ;
+        //         }
+        //     }
+        // } else {
+        //     using namespace std;
+        //     os <<  setw(PTRW) << "base" << "-" << setw(PTRW) << "end" << ": ";
+        //     os  << setw(6+1) << "#SeqNum"
+        //         << setw(PTRW+1) << "HostPtr"
+        //         << setw(PTRW+1) << "DevPtr"
+        //         << setw(12+1) << "SizeBytes"
+        //         << setw(8+1) << "SizeMB"
+        //         << setw(5) << "Dev?"
+        //         << setw(6) << "Reg?"
+        //         << setw(6) << " AppId"
+        //         << setw(7) << " AppFlags"
+        //         << setw(12) << left << " Peers" << right
+        //         << "\n";
+
+        //     for (auto iter = g_amPointerTracker.readerLockBegin() ; iter != g_amPointerTracker.end(); iter++) {
+        //         os << setw(PTRW) << iter->first._basePointer << "-" << setw(PTRW) << iter->first._endPointer << ": ";
+        //         printShortPointerInfo(os, iter->second);
+        //         printRocrPointerInfo(os, iter->first._basePointer);
+        //         os << "\n";
+        //     }
+        // }
+
+        // g_amPointerTracker.readerUnlock();
+    }
+
+    /**
+     * Return total sizes of device, host, and user memory allocated by the
+     * application.
+     *
+     * User memory is registered with am_tracker_add.
+     **/
+    void am_memtracker_sizeinfo(
+        const hc::accelerator& acc,
+        std::size_t* deviceMemSize,
+        std::size_t* hostMemSize,
+        std::size_t* userMemSize);
+
+
+    void am_memtracker_update_peers(
+        const hc::accelerator& acc, int peerCnt, hsa_agent_t* agents);
+
+    /*
+    * Map device memory or hsa allocated host memory pointed to by @p ptr to the
+    * peers.
+    *
+    * @p ptr pointer which points to device memory or host memory
+    * @p num_peer number of peers to map
+    * @p peers pointer to peer accelerator list.
+    * @return AM_SUCCESS if mapped successfully.
+    * @return AM_ERROR_MISC if @p ptr is nullptr or @p num_peer is 0 or @p peers
+    *                       is nullptr.
+    * @return AM_ERROR_MISC if @p ptr is not am managed.
+    * @return AM_ERROR_MISC if @p ptr is not found in the pointer tracker.
+    * @return AM_ERROR_MISC if @p peers includes a non peer accelerator.
+    */
+    template<typename Accelerator>
+    inline
+    am_status_t am_map_to_peers(
+        void* ptr, std::size_t num_peer, const Accelerator* peers)
+    {
+        if (!ptr) return AM_ERROR_MISC;
+        if (num_peer == 0u) return AM_ERROR_MISC;
+        if (!peers) return AM_ERROR_MISC;
+
+        auto tmp = detail::hsa_pointer_info(ptr);
+
+        if (tmp.type != HSA_EXT_POINTER_TYPE_HSA) return AM_ERROR_MISC;
+
+        std::vector<hsa_agent_t> as{num_peer};
+        while (num_peer--) {
+            as[num_peer] =
+                *static_cast<hsa_agent_t*>(peers[num_peer].get_hsa_agent());
+        }
+        const auto s =
+            hsa_amd_agents_allow_access(as.size(), as.data(), nullptr, ptr);
+
+        if (s == HSA_STATUS_SUCCESS) return AM_SUCCESS;
+
+        return AM_ERROR_MISC;
+    }
+
+    /*
+    * Locks a host pointer to a vector of agents
+    *
+    * @p ac accelerator corresponding to current device
+    * @p hostPtr pointer to host memory which should be page-locked
+    * @p size size of hostPtr to be page-locked
+    * @p visibleAc pointer to hcc accelerators to which the hostPtr should be
+    *    visible
+    * @p numVisibleAc number of elements in visibleAc
+    * @return AM_SUCCESS if lock is successfully.
+    * @return AM_ERROR_MISC if lock is unsuccessful.
+    */
+    template<typename Accelerator>
+    inline
+    am_status_t am_memory_host_lock(
+        Accelerator& acc,
+        void* hostPtr,
+        std::size_t size,
+        Accelerator* visibleAcc,
+        std::size_t numVisibleAcc)
+    {
+        (void)acc;
+
+        if (!hostPtr) return AM_SUCCESS;
+
+        std::vector<hsa_agent_t> ag{numVisibleAcc};
+        while (numVisibleAcc--) {
+            ag[numVisibleAcc] = *static_cast<hsa_agent_t*>(
+                visibleAcc[numVisibleAcc].get_hsa_agent());
+        }
+
+        void* p{};
+        const auto s =
+            hsa_amd_memory_lock(hostPtr, size, ag.data(), ag.size(), &p);
+
+        (void)p;
+
+        if (s == HSA_STATUS_SUCCESS) return AM_SUCCESS;
+
+        return AM_ERROR_MISC;
+    }
+
+    /*
+    * Unlock page locked host memory
+    *
+    * @p ac current device accelerator
+    * @p hostPtr host pointer
+    * @return AM_SUCCESS if unlocked successfully.
+    * @return AM_ERROR_MISC if @p hostPtr unlock is un-successful.
+    */
+    template<typename Accelerator>
+    inline
+    am_status_t am_memory_host_unlock(Accelerator& acc, void* hostPtr)
+    {
+        (void)acc;
+
+        if (!hostPtr) return AM_SUCCESS;
+
+        if (hsa_amd_memory_unlock(hostPtr) == HSA_STATUS_SUCCESS) {
+            return AM_SUCCESS;
+        }
+
+        return AM_ERROR_MISC;
+    }
+} // namespace hc
+
diff --git a/include/hc/hc_atomics.hpp b/include/hc/hc_atomics.hpp
new file mode 100644
index 00000000000..89deea6d7cc
--- /dev/null
+++ b/include/hc/hc_atomics.hpp
@@ -0,0 +1,300 @@
+//===----------------------------------------------------------------------===//
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+
+#include <cstdint>
+#include <cstring>
+#include <type_traits>
+
+namespace hc
+{
+    namespace atomics
+    {
+        /** @{ */
+        /**
+         * Atomically read the value stored in dest , replace it with the value
+         * given in val and return the old value to the caller. This function
+         * provides overloads for int, unsigned int, int64_t, uint64_t, float
+         * and double parameters.
+         *
+         * @param[out] dest A pointer to the location which needs to be
+         *                  atomically modified. The location may reside within
+         *                  an array, an array_view, global or tile_static
+         *                  memory.
+         * @param[in] val The new value to be stored in the location pointed to
+         *                be dest.
+         * @return These functions return the old value which was previously
+         *         stored at dest, and that was atomically replaced. These
+         *         functions always succeed.
+         */
+        template<
+            typename T,
+            typename std::enable_if<
+                std::is_integral<T>{} &&
+                sizeof(T) >= sizeof(std::int32_t)>::type* = nullptr>
+        inline
+        T atomic_exchange(T* dest, T val) [[cpu]][[hc]]
+        {
+            return __atomic_exchange_n(dest, val, __ATOMIC_RELAXED);
+        }
+        inline
+        float atomic_exchange(float* dest, float val) [[cpu]][[hc]]
+        {
+            static_assert(sizeof(float) == sizeof(unsigned int), "");
+
+            unsigned int ui{};
+            __builtin_memcpy(&ui, &val, sizeof(val));
+
+            unsigned int tmp{
+                atomic_exchange(reinterpret_cast<unsigned int*>(dest), ui)};
+
+            float r{};
+            __builtin_memcpy(&r, &tmp, sizeof(tmp));
+
+            return r;
+        }
+        inline
+        double atomic_exchange(double* dest, double val) [[cpu]][[hc]]
+        {
+            static_assert(sizeof(double) == sizeof(std::uint64_t), "");
+
+            std::uint64_t ui{};
+            __builtin_memcpy(&ui, &val, sizeof(val));
+
+            std::uint64_t tmp{
+                atomic_exchange(reinterpret_cast<std::uint64_t*>(dest), ui)};
+
+            double r{};
+            __builtin_memcpy(&r, &tmp, sizeof(tmp));
+
+            return r;
+        }
+        /** @} */
+
+        /** @{ */
+        /**
+         * These functions attempt to perform these three steps atomically:
+         * 1. Read the value stored in the location pointed to by dest
+         * 2. Compare the value read in the previous step with the value
+         *    contained in the location pointed by expected_val
+         * 3. Carry the following operations depending on the result of the
+         *    comparison of the previous step:
+         *    a. If the values are identical, then the function tries to
+         *       atomically change the value pointed by dest to the value in
+         *       val. The function indicates by its return value whether this
+         *       transformation has been successful or not.
+         *    b. If the values are not identical, then the function stores the
+         *       value read in step (1) into the location pointed to by
+         *       expected_val, and returns false.
+         *
+         * @param[out] dest A pointer to the location which needs to be
+         *                  atomically modified. The location may reside within
+         *                  an array, an array_view, global or tile_static
+         *                  memory.
+         * @param[out] expected_val A pointer to a local variable or function
+         *                          parameter. Upon calling the function, the
+         *                          location pointed by expected_val contains
+         *                          the value the caller expects dest to
+         *                          contain. Upon return from the function,
+         *                          expected_val will contain the most recent
+         *                          value read from dest.
+         * @param[in] val The new value to be stored in the location pointed to
+         *                be dest.
+         * @return The return value indicates whether the function has been
+         *         successful in atomically reading, comparing and modifying the
+         *         contents of the memory location.
+         */
+        template<
+            typename T,
+            typename std::enable_if<
+                std::is_integral<T>{} &&
+                sizeof(T) >= sizeof(std::int32_t)>::type* = nullptr>
+        inline
+        bool atomic_compare_exchange(
+            T* dest, T* expected_val, T val) [[cpu]][[hc]]
+        {
+            return __atomic_compare_exchange_n(
+                dest,
+                expected_val,
+                val,
+                false,
+                __ATOMIC_RELAXED,
+                __ATOMIC_RELAXED);
+        }
+        /** @} */
+
+        /** @{ */
+        /**
+         * Atomically read the value stored in dest, apply the binary numerical
+         * operation specific to the function with the read value and val
+         * serving as input operands, and store the result back to the location
+         * pointed by dest.
+         *
+         * In terms of sequential semantics, the operation performed by any of
+         * the above function is described by the following piece of
+         * pseudo-code:
+         *
+         * *dest = *dest @f$\otimes@f$ val;
+         *
+         * Where the operation denoted by @f$\otimes@f$ is one of: addition
+         * (atomic_fetch_add), subtraction (atomic_fetch_sub), find maximum
+         * (atomic_fetch_max), find minimum (atomic_fetch_min), bit-wise AND
+         * (atomic_fetch_and), bit-wise OR (atomic_fetch_or), bit-wise XOR
+         * (atomic_fetch_xor).
+         *
+         * @param[out] dest A pointer to the location which needs to be
+         *                  atomically modified. The location may reside within
+         *                  an array, an array_view, global or tile_static
+         *                  memory.
+         * @param[in] val The second operand which participates in the
+         *                calculation of the binary operation whose result is
+         *                stored into the location pointed to be dest.
+         * @return These functions return the old value which was previously
+         *         stored at dest, and that was atomically replaced. These
+         *         functions always succeed.
+         */
+
+        /** @} */
+        template<
+            typename T,
+            typename std::enable_if<
+                std::is_integral<T>{} &&
+                sizeof(T) >= sizeof(std::int32_t)>::type* = nullptr>
+        inline
+        T atomic_fetch_add(T* dest, T val) [[cpu]][[hc]]
+        {
+            return __atomic_fetch_add(dest, val, __ATOMIC_RELAXED);
+        }
+
+        template<
+            typename T,
+            typename std::enable_if<
+                std::is_integral<T>{} &&
+                sizeof(T) >= sizeof(std::int32_t)>::type* = nullptr>
+        inline
+        T atomic_fetch_sub(T* dest, T val) [[cpu]][[hc]]
+        {
+            return __atomic_fetch_sub(dest, val, __ATOMIC_RELAXED);
+        }
+
+        template<
+            typename T,
+            typename std::enable_if<
+                std::is_integral<T>{} &&
+                std::is_signed<T>{} &&
+                sizeof(T) >= sizeof(std::int32_t)>::type* = nullptr>
+        inline
+        T atomic_fetch_max(T* dest, T val) [[cpu]][[hc]]
+        {
+            return __sync_fetch_and_max(dest, val);
+        }
+        template<
+            typename T,
+            typename std::enable_if<
+                std::is_integral<T>{} &&
+                std::is_unsigned<T>{} &&
+                sizeof(T) >= sizeof(std::uint32_t)>::type* = nullptr>
+        inline
+        T atomic_fetch_max(T* dest, T val) [[cpu]][[hc]]
+        {
+            return __sync_fetch_and_umax(dest, val);
+        }
+
+        template<
+            typename T,
+            typename std::enable_if<
+                std::is_integral<T>{} &&
+                std::is_signed<T>{} &&
+                sizeof(T) >= sizeof(std::int32_t)>::type* = nullptr>
+        inline
+        T atomic_fetch_min(T* dest, T val) [[cpu]][[hc]]
+        {
+            return __sync_fetch_and_min(dest, val);
+        }
+        template<
+            typename T,
+            typename std::enable_if<
+                std::is_integral<T>{} &&
+                std::is_unsigned<T>{} &&
+                sizeof(T) >= sizeof(std::uint32_t)>::type* = nullptr>
+        inline
+        T atomic_fetch_min(T* dest, T val) [[cpu]][[hc]]
+        {
+            return __sync_fetch_and_umin(dest, val);
+        }
+
+        template<
+            typename T,
+            typename std::enable_if<
+                std::is_integral<T>{} &&
+                sizeof(T) >= sizeof(std::int32_t)>::type* = nullptr>
+        inline
+        T atomic_fetch_and(T* dest, T val) [[cpu]][[hc]]
+        {
+            return __atomic_fetch_and(dest, val, __ATOMIC_RELAXED);
+        }
+
+        template<
+            typename T,
+            typename std::enable_if<
+                std::is_integral<T>{} &&
+                sizeof(T) >= sizeof(std::int32_t)>::type* = nullptr>
+        inline
+        T atomic_fetch_or(T* dest, T val) [[cpu]][[hc]]
+        {
+            return __atomic_fetch_or(dest, val, __ATOMIC_RELAXED);
+        }
+
+        template<
+            typename T,
+            typename std::enable_if<
+                std::is_integral<T>{} &&
+                sizeof(T) >= sizeof(std::int32_t)>::type* = nullptr>
+        inline
+        T atomic_fetch_xor(T* dest, T val) [[cpu]][[hc]]
+        {
+            return __atomic_fetch_xor(dest, val, __ATOMIC_RELAXED);
+        }
+
+        /** @{ */
+        /**
+         * Atomically increment or decrement the value stored at the location
+         * point to by dest.
+         *
+         * @param[out] dest A pointer to the location which needs to be
+         *                  atomically modified. The location may reside within
+         *                  an array, an array_view, global or tile_static
+         *                  memory.
+         * @return These functions return the old value which was previously
+         *         stored at dest, and that was atomically replaced. These
+         *         functions always succeed.
+         */
+
+        template<
+            typename T,
+            typename std::enable_if<
+                std::is_integral<T>{} &&
+                sizeof(T) >= sizeof(std::int32_t)>::type* = nullptr>
+        inline
+        T atomic_fetch_inc(T* dest) [[cpu]][[hc]]
+        {
+            return __atomic_fetch_add(dest, T{1}, __ATOMIC_RELAXED);
+        }
+
+        template<
+            typename T,
+            typename std::enable_if<
+                std::is_integral<T>{} &&
+                sizeof(T) >= sizeof(std::int32_t)>::type* = nullptr>
+        inline
+        T atomic_fetch_dec(T* dest) [[cpu]][[hc]]
+        {
+            return __atomic_fetch_sub(dest, T{1}, __ATOMIC_RELAXED);
+        }
+        /** @} */
+    } // Namespace atomics.
+} // Namespace hc.
\ No newline at end of file
diff --git a/include/hc/hc_callable_attributes.hpp b/include/hc/hc_callable_attributes.hpp
new file mode 100644
index 00000000000..a42efacbaf4
--- /dev/null
+++ b/include/hc/hc_callable_attributes.hpp
@@ -0,0 +1,170 @@
+//===----------------------------------------------------------------------===//
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+
+#include <cstddef>
+#include <tuple>
+#include <utility>
+
+namespace hc
+{
+    namespace attr_impl
+    {
+        template<typename, typename...> class Callable_with_AMDGPU_attributes;
+
+        struct Flat_wg_tag {};
+        struct Max_wg_dim_tag {};
+        struct Waves_per_EU_tag {};
+    } // Namespace attr_impl.
+
+    namespace detail
+    {
+        template<typename, typename> struct Kernel_emitter;
+    }
+
+    template<unsigned int min_size = 0, unsigned int max_size = 0>
+    class Flat_workgroup_size : public attr_impl::Flat_wg_tag {
+        static_assert(
+            min_size <= max_size,
+            "Minimum workgroup size must not be greater than maximum size.");
+
+        static constexpr Flat_workgroup_size* flat_workgroup_size_{};
+
+        template<typename, typename...>
+        friend class attr_impl::Callable_with_AMDGPU_attributes;
+    public:
+        static
+        constexpr
+        unsigned int minimum() noexcept [[cpu, hc]] { return min_size; }
+        static
+        constexpr
+        unsigned int maximum() noexcept [[cpu, hc]] { return max_size; }
+    };
+
+    template<
+        unsigned int max_dim_z = 1,
+        unsigned int max_dim_y = 1,
+        unsigned int max_dim_x = 1>
+    class Max_workgroup_dim : public attr_impl::Max_wg_dim_tag {
+        static_assert(
+            max_dim_z * max_dim_y * max_dim_x > 0 &&
+            max_dim_z * max_dim_y * max_dim_x <= 1024u,
+            "Flattened required workgroup size must be in (0, 1024].");
+
+        static constexpr Max_workgroup_dim* max_workgroup_dim_{};
+
+        template<typename, typename...>
+        friend class attr_impl::Callable_with_AMDGPU_attributes;
+    public:
+        static
+        constexpr
+        unsigned int maximum_x() noexcept [[cpu, hc]] { return max_dim_x; }
+        static
+        constexpr
+        unsigned int maximum_y() noexcept [[cpu, hc]] { return max_dim_y; }
+        static
+        constexpr
+        unsigned int maximum_z() noexcept [[cpu, hc]] { return max_dim_z; }
+    };
+
+    template<unsigned int min_wave_cnt = 0, unsigned int max_wave_cnt = 0>
+    class Waves_per_eu : public attr_impl::Waves_per_EU_tag {
+        static_assert(
+            max_wave_cnt == 0 || min_wave_cnt <= max_wave_cnt,
+            "Minimum number of waves per EU must not be greater than maximum, "
+                "if the latter is specified.");
+
+        static constexpr Waves_per_eu* waves_per_eu_{};
+
+        template<typename, typename...>
+        friend class attr_impl::Callable_with_AMDGPU_attributes;
+    public:
+        static
+        constexpr
+        unsigned int minimum() noexcept [[cpu, hc]] { return min_wave_cnt; }
+        static
+        constexpr
+        unsigned int maximum() noexcept [[cpu, hc]] { return max_wave_cnt; }
+    };
+
+    namespace attr_impl
+    {
+        template<typename Callable, typename... Attrs>
+        class Callable_with_AMDGPU_attributes : private Callable {
+            struct Triple_ {
+                std::size_t m0;
+                std::size_t m1;
+                std::size_t m2;
+            };
+
+            using AttrTuple_ = std::tuple<Attrs..., void>;
+
+            template<std::size_t>
+            static
+            constexpr
+            Triple_ attr_idx_(Triple_ tmp) noexcept [[cpu, hc]]
+            {
+                return tmp;
+            }
+            template<std::size_t n, typename T, typename... As>
+            static
+            constexpr
+            Triple_ attr_idx_(Triple_ tmp) noexcept [[cpu, hc]]
+            {
+                return std::is_base_of<Flat_wg_tag, T>{} ?
+                    attr_idx_<n + 1, As...>({n, tmp.m1, tmp.m2}) :
+                    (std::is_base_of<Max_wg_dim_tag, T>{} ?
+                        attr_idx_<n + 1, As...>({tmp.m0, n, tmp.m2}) :
+                        (std::is_base_of<Waves_per_EU_tag, T>{} ?
+                            attr_idx_<n + 1, As...>({tmp.m0, tmp.m1, n}) :
+                            attr_idx_<n + 1, As...>(tmp)));
+            }
+
+            static constexpr Triple_ idxs_{attr_idx_<0u, Attrs...>({
+                sizeof...(Attrs), sizeof...(Attrs), sizeof...(Attrs)})};
+
+            using Flat_wg_size_ = typename std::conditional<
+                idxs_.m0 != sizeof...(Attrs),
+                typename std::tuple_element<idxs_.m0, AttrTuple_>::type,
+                Flat_workgroup_size<>>::type;
+            using Max_wg_dim_ = typename std::conditional<
+                idxs_.m1 != sizeof...(Attrs),
+                typename std::tuple_element<idxs_.m1, AttrTuple_>::type,
+                Max_workgroup_dim<>>::type;
+            using Waves_per_EU_ = typename std::conditional<
+                idxs_.m2 != sizeof...(Attrs),
+                typename std::tuple_element<idxs_.m2, AttrTuple_>::type,
+                Waves_per_eu<>>::type;
+
+            template<typename, typename>
+            friend struct detail::Kernel_emitter;
+        public:
+            // CREATORS
+            Callable_with_AMDGPU_attributes() [[cpu, hc]] = default;
+            explicit
+            Callable_with_AMDGPU_attributes(Callable callable)
+                : Callable{std::move(callable)} {}
+            Callable_with_AMDGPU_attributes(
+                const Callable_with_AMDGPU_attributes&) [[cpu, hc]] = default;
+            Callable_with_AMDGPU_attributes(
+                Callable_with_AMDGPU_attributes&&) [[cpu, hc]] = default;
+            ~Callable_with_AMDGPU_attributes() [[cpu, hc]] = default;
+
+            // ACCESSORS
+            using Callable::operator();
+        };
+    } // Namespace hc::attr_impl.
+
+    template<typename... Attrs, typename Callable>
+    inline
+    attr_impl::Callable_with_AMDGPU_attributes<
+        Callable, Attrs...> make_callable_with_AMDGPU_attributes(Callable f)
+    {
+        return attr_impl::Callable_with_AMDGPU_attributes<Callable, Attrs...>{
+            std::move(f)};
+    }
+} // Namespace hc.
\ No newline at end of file
diff --git a/include/hc/hc_completion_future.hpp b/include/hc/hc_completion_future.hpp
new file mode 100644
index 00000000000..3f3db9ad2ab
--- /dev/null
+++ b/include/hc/hc_completion_future.hpp
@@ -0,0 +1,281 @@
+#pragma once
+
+#include <hsa/hsa.h>
+
+#include <chrono>
+#include <future>
+#include <memory>
+#include <mutex>
+#include <utility>
+
+namespace hc
+{
+    class accelerator_view;
+    template<typename, int> class array;
+    template<typename, int> class array_view;
+    template<int> class extent;
+    template<int> class tiled_extent;
+
+    // ------------------------------------------------------------------------
+    // completion_future
+    // ------------------------------------------------------------------------
+
+    /**
+     * This class is the return type of all asynchronous APIs and has an
+     * interface analogous to std::shared_future<void>. Similar to
+     * std::shared_future, this type provides member methods such as wait and
+     * get to wait for asynchronous operations to finish, and the type
+     * additionally provides a member method then(), to specify a completion
+     * callback functor to be executed upon completion of an asynchronous
+     * operation.
+     */
+    class completion_future {
+        struct State_ {
+            std::shared_future<void> future{};
+            std::once_flag maybe_then{};
+
+            State_(std::shared_future<void> fut)
+                : future{std::move(fut)}, maybe_then{}
+            {}
+        };
+
+        std::shared_ptr<State_> state_{};
+
+        friend class accelerator_view;
+        template<typename, int> friend class array_view;
+
+        // non-tiled parallel_for_each
+        // generic version
+        template<typename Kernel, int n>
+        friend
+        completion_future parallel_for_each(
+            const accelerator_view&, const extent<n>&, const Kernel&);
+
+        // tiled parallel_for_each
+        // generic version
+        template<typename Kernel, int n>
+        friend
+        completion_future parallel_for_each(
+            const accelerator_view&, const tiled_extent<n>&, const Kernel&);
+
+        // copy_async
+        template<typename T, int N>
+        friend
+        completion_future copy_async(
+            const array_view<const T, N>& src, const array_view<T, N>& dest);
+        template<typename T, int N>
+        friend
+        completion_future copy_async(const array<T, N>& src, array<T, N>& dest);
+        template<typename T, int N>
+        friend
+        completion_future copy_async(
+            const array<T, N>& src, const array_view<T, N>& dest);
+        template<typename T, int N>
+        friend
+        completion_future copy_async(
+            const array_view<T, N>& src, const array_view<T, N>& dest);
+        template<typename T, int N>
+        friend
+        completion_future copy_async(
+            const array_view<const T, N>& src, array<T, N>& dest);
+
+        template<typename InputIter, typename T, int N>
+        friend
+        completion_future copy_async(
+            InputIter srcBegin, InputIter srcEnd, array<T, N>& dest);
+        template<typename InputIter, typename T, int N>
+        friend
+        completion_future copy_async(
+            InputIter srcBegin, InputIter srcEnd, const array_view<T, N>& dest);
+        template<typename InputIter, typename T, int N>
+        friend
+        completion_future copy_async(InputIter srcBegin, array<T, N>& dest);
+        template<typename InputIter, typename T, int N>
+        friend
+        completion_future copy_async(
+            InputIter srcBegin, const array_view<T, N>& dest);
+        template<typename OutputIter, typename T, int N>
+        friend
+        completion_future copy_async(
+            const array<T, N>& src, OutputIter destBegin);
+        template<typename OutputIter, typename T, int N>
+        friend
+        completion_future copy_async(
+            const array_view<T, N>& src, OutputIter destBegin);
+
+        // CREATORS
+        completion_future(std::shared_future<void> future)
+            : state_{std::make_shared<State_>(std::move(future))}
+        {}
+    public:
+
+        /**
+         * Default constructor. Constructs an empty uninitialized
+         * completion_future object which does not refer to any asynchronous
+         * operation. Default constructed completion_future objects have valid()
+         * == false
+         */
+        completion_future() = default;
+
+        /**
+         * Copy constructor. Constructs a new completion_future object that
+         * refers to the same asynchronous operation as the other
+         * completion_future object.
+         *
+         * @param[in] other An object of type completion_future from which to
+         *                  initialize this.
+         */
+        completion_future(const completion_future&) = default;
+
+        /**
+         * Move constructor. Move constructs a new completion_future object that
+         * refers to the same asynchronous operation as originally referred by
+         * the other completion_future object. After this constructor returns,
+         * other.valid() == false
+         *
+         * @param[in] other An object of type completion_future which the new
+         *                  completion_future
+         */
+        completion_future(completion_future&&) = default;
+
+        ~completion_future()
+        {
+            if (!state_) return;
+            if (state_.use_count() > 1) return;
+
+            if (state_->future.valid()) state_->future.wait();
+        }
+        /**
+         * Copy assignment. Copy assigns the contents of other to this. This
+         * method causes this to stop referring its current asynchronous
+         * operation and start referring the same asynchronous operation as
+         * other.
+         *
+         * @param[in] other An object of type completion_future which is copy
+         *                  assigned to this.
+         */
+        completion_future& operator=(const completion_future&) = default;
+
+        /**
+         * Move assignment. Move assigns the contents of other to this. This
+         * method causes this to stop referring its current asynchronous
+         * operation and start referring the same asynchronous operation as
+         * other. After this method returns, other.valid() == false
+         *
+         * @param[in] other An object of type completion_future which is move
+         *                  assigned to this.
+         */
+        completion_future& operator=(completion_future&&) = default;
+
+        /**
+         * This method is functionally identical to
+         * std::shared_future<void>::get. This method waits for the associated
+         * asynchronous operation to finish and returns only upon the completion
+         * of the asynchronous operation. If an exception was encountered during
+         * the execution of the asynchronous operation, this method throws that
+         * stored exception.
+         */
+        void get() const
+        {
+            if (state_) state_->future.get();
+        }
+
+        /**
+         * This method is functionally identical to
+         * std::shared_future<void>::valid. This returns true if this
+         * completion_future is associated with an asynchronous operation.
+         */
+        bool valid() const
+        {
+            return state_ ? state_->future.valid() : false;
+        }
+
+        /** @{ */
+        /**
+         * These methods are functionally identical to the corresponding
+         * std::shared_future<void> methods.
+         *
+         * The wait method waits for the associated asynchronous operation to
+         * finish and returns only upon completion of the associated
+         * asynchronous operation or if an exception was encountered when
+         * executing the asynchronous operation.
+         *
+         * The other variants are functionally identical to the
+         * std::shared_future<void> member methods with same names.
+         *
+         * @param waitMode[in] An optional parameter to specify the wait mode.
+         *                     By default it would be hcWaitModeBlocked.
+         *                     hcWaitModeActive would be used to reduce latency
+         *                     with the expense of using one CPU core for active
+         *                     waiting.
+         */
+        void wait() const
+        {
+            if (state_) state_->future.wait();
+
+            // TODO: printf:(
+            //detail::getContext()->flushPrintfBuffer();
+        }
+
+        template<typename Rep, typename Period>
+        std::future_status wait_for(
+            const std::chrono::duration<Rep, Period>& rel_time) const
+        {   // TODO: this should probably be an exception if !state_.
+            return state_ ?
+                state_->future.wait_for(rel_time) :
+                std::future_status::deferred;
+        }
+
+        template<typename Clock, typename Duration>
+        std::future_status wait_until(
+            const std::chrono::time_point<Clock, Duration>& abs_time) const
+        {
+            return state_ ?
+                state_->future.wait_until(abs_time) :
+                std::future_status::deferred;
+        }
+
+        /** @} */
+
+        /**
+         * Conversion operator to std::shared_future<void>. This method returns
+         * a shared_future<void> object corresponding to this completion_future
+         * object and refers to the same asynchronous operation.
+         */
+        operator std::shared_future<void>() const
+        {
+            return state_ ? state_->future : std::shared_future<void>{};
+        }
+
+        /**
+         * This method enables specification of a completion callback func which
+         * is executed upon completion of the asynchronous operation associated
+         * with this completion_future object. The completion callback func
+         * should have an operator() that is valid when invoked with non
+         * arguments, i.e., "func()".
+         */
+        template<typename F>
+        void then(const F& func) const
+        {   // TODO: this is probably incorrect; then() was underspecified in
+            //       C++AMP, and subtle to get right; we may want to remove it
+            //       or extend it to return a future, otherwise it is
+            //       intractable to provide guarantees about when the
+            //       continuation executes and, respectively, when it completes.
+            std::call_once(
+                state_->maybe_then, [=](std::shared_future<void> fut) {
+                std::thread{[=]() { fut.wait(); func(); }}.detach();
+            }, state_->future);
+        }
+
+        /**
+         * Get if the async operations has been completed.
+         *
+         * @return True if the async operation has been completed, false if not.
+         */
+        bool is_ready()
+        {
+            return state_->future.wait_for(std::chrono::nanoseconds{0}) ==
+                std::future_status::ready;
+        }
+    };
+} // Namespace hc.
\ No newline at end of file
diff --git a/include/hc/hc_defines.hpp b/include/hc/hc_defines.hpp
new file mode 100644
index 00000000000..8ff7daa16dd
--- /dev/null
+++ b/include/hc/hc_defines.hpp
@@ -0,0 +1,56 @@
+//===----------------------------------------------------------------------===//
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+
+#include <cstdint>
+
+namespace hc
+{
+    // TODO: assess why this exists.
+    typedef _Float16 half;
+}
+
+//
+// work-item related builtin functions
+//
+extern "C"
+__attribute__((const))
+std::uint32_t hc_get_grid_size(std::uint32_t n) [[hc]];
+extern "C"
+__attribute__((const))
+std::uint32_t hc_get_workitem_absolute_id(std::uint32_t n) [[hc]];
+extern "C"
+__attribute__((const))
+std::uint32_t hc_get_group_size(std::uint32_t n) [[hc]];
+extern "C"
+__attribute__((const))
+std::uint32_t hc_get_workitem_id(std::uint32_t n) [[hc]];
+extern "C"
+__attribute__((const))
+std::uint32_t hc_get_num_groups(std::uint32_t n) [[hc]];
+extern "C"
+__attribute__((const))
+std::uint32_t hc_get_group_id(std::uint32_t n) [[hc]];
+
+// TODO: this should be implemented as a keyword (+possibly storage class).
+#define tile_static __attribute__((tile_static))
+
+extern "C"
+__attribute__((noduplicate, nothrow))
+void hc_barrier(unsigned int n) [[hc]];
+
+#ifndef CLK_LOCAL_MEM_FENCE
+    #define CLK_LOCAL_MEM_FENCE (1)
+#endif
+
+#ifndef CLK_GLOBAL_MEM_FENCE
+    #define CLK_GLOBAL_MEM_FENCE (2)
+#endif
+
+// Valid values for__hcc_backend__ to indicate the
+// compiler backend
+#define HCC_BACKEND_AMDGPU (1)
\ No newline at end of file
diff --git a/include/hc/hc_exception.hpp b/include/hc/hc_exception.hpp
new file mode 100644
index 00000000000..dc98a414404
--- /dev/null
+++ b/include/hc/hc_exception.hpp
@@ -0,0 +1,81 @@
+//===----------------------------------------------------------------------===//
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+
+#include <exception>
+#include <string>
+
+namespace hc
+{
+    namespace detail
+    {
+        #ifndef E_FAIL
+            static constexpr auto E_FAIL = 0x80004005;
+        #endif
+
+        static constexpr const char __errorMsg_UnsupportedAccelerator[]{
+            "hc::parallel_for_each is not supported on the selected accelerator"
+            " \"CPU accelerator\"."};
+
+        // TODO: this should use standard error_code / error_category.
+        using HRESULT = typename std::remove_const<decltype(E_FAIL)>::type;
+        class runtime_exception : public std::exception {
+            std::string message_;
+            HRESULT code_;
+        public:
+            // TODO: noexcept is somewhat debateable, given the string.
+            runtime_exception(
+                const char * message,
+                HRESULT hresult) noexcept : message_{message}, code_{hresult}
+            {}
+            explicit
+            runtime_exception(HRESULT hresult) noexcept : code_{hresult} {}
+            runtime_exception(const runtime_exception& other) = default;
+            runtime_exception(runtime_exception&&) = default;
+            virtual
+            ~runtime_exception() = default;
+
+            runtime_exception& operator=(const runtime_exception&) = default;
+            runtime_exception& operator=(runtime_exception&&) = default;
+
+            virtual
+            const char* what() const noexcept
+            {
+                return message_.c_str();
+            }
+
+            HRESULT get_error_code() const noexcept
+            {
+                return code_;
+            }
+        };
+
+        struct invalid_compute_domain : public runtime_exception {
+            explicit
+            invalid_compute_domain(const char* message) noexcept
+                : runtime_exception{message, E_FAIL}
+            {}
+            invalid_compute_domain() noexcept : runtime_exception{E_FAIL} {}
+        };
+
+        struct accelerator_view_removed : public runtime_exception {
+            explicit
+            accelerator_view_removed(
+                const char* message, HRESULT view_removed_reason) noexcept
+                : runtime_exception{message, view_removed_reason}
+            {}
+            accelerator_view_removed(HRESULT view_removed_reason) noexcept
+                : runtime_exception{view_removed_reason}
+            {}
+
+            HRESULT get_view_removed_reason() const noexcept
+            {
+                return get_error_code();
+            }
+        };
+    } // Namespace hc::detail.
+} // Namespace hc.
\ No newline at end of file
diff --git a/include/hc/hc_index.hpp b/include/hc/hc_index.hpp
new file mode 100644
index 00000000000..1fdad5d23d3
--- /dev/null
+++ b/include/hc/hc_index.hpp
@@ -0,0 +1,644 @@
+//===----------------------------------------------------------------------===//
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+
+namespace hc
+{
+    template<int> class extent;
+
+    namespace detail
+    {
+        /** \cond HIDDEN_SYMBOLS */
+        template <int...> struct __indices {};
+
+        template <int _Sp, class _IntTuple, int _Ep>
+        struct __make_indices_imp;
+
+        template <int _Sp, int ..._Indices, int _Ep>
+        struct __make_indices_imp<_Sp, __indices<_Indices...>, _Ep> {
+            using type = typename __make_indices_imp<
+                _Sp+1, __indices<_Indices..., _Sp>, _Ep>::type;
+        };
+
+        template <int _Ep, int ..._Indices>
+        struct __make_indices_imp<_Ep, __indices<_Indices...>, _Ep> {
+            typedef __indices<_Indices...> type;
+        };
+
+        template <int _Ep, int _Sp = 0>
+        struct __make_indices {
+            static_assert(_Sp <= _Ep, "__make_indices input error");
+            using type =
+                typename __make_indices_imp<_Sp, __indices<>, _Ep>::type;
+        };
+
+        template <int _Ip>
+        class __index_leaf {
+            int __idx;
+            int dummy;
+        public:
+            explicit
+            __index_leaf(int __t) noexcept [[cpu, hc]] : __idx(__t) {}
+
+            __index_leaf& operator=(const int __t) noexcept [[cpu, hc]]
+            {
+                __idx = __t;
+                return *this;
+            }
+            __index_leaf& operator+=(const int __t) noexcept [[cpu, hc]]
+            {
+                __idx += __t;
+                return *this;
+            }
+            __index_leaf& operator-=(const int __t) noexcept [[cpu, hc]]
+            {
+                __idx -= __t;
+                return *this;
+            }
+            __index_leaf& operator*=(const int __t) noexcept [[cpu, hc]]
+            {
+                __idx *= __t;
+                return *this;
+            }
+            __index_leaf& operator/=(const int __t) noexcept [[cpu, hc]]
+            {
+                __idx /= __t;
+                return *this;
+            }
+            __index_leaf& operator%=(const int __t) noexcept [[cpu, hc]]
+            {
+                __idx %= __t;
+                return *this;
+            }
+            int& get() noexcept [[cpu, hc]] { return __idx; }
+            const int& get() const noexcept [[cpu, hc]] { return __idx; }
+        };
+
+        template <class _Indx> struct index_impl;
+
+        template <int ...N>
+        struct index_impl<__indices<N...> > : public __index_leaf<N>...  {
+            index_impl() [[cpu, hc]] : __index_leaf<N>(0)... {}
+
+            template<class ..._Up>
+                explicit
+                index_impl(_Up... __u) [[cpu, hc]]
+                    : __index_leaf<N>(__u)... {}
+
+            index_impl(const index_impl& other) [[cpu, hc]]
+                :
+                index_impl(static_cast<const __index_leaf<N>&>(other).get()...)
+            {}
+
+            index_impl(int component) [[cpu, hc]]
+                : __index_leaf<N>(component)... {}
+            index_impl(int components[]) [[cpu, hc]]
+                : __index_leaf<N>(components[N])... {}
+            index_impl(const int components[]) [[cpu, hc]]
+                : __index_leaf<N>(components[N])... {}
+
+            template<class ..._Tp>
+            inline
+            void __swallow(_Tp...) [[cpu, hc]] {}
+
+            int operator[](unsigned int c) const [[cpu, hc]]
+            {
+                return static_cast<const __index_leaf<0>&>(
+                    *((__index_leaf<0> *)this + c)).get();
+            }
+            int& operator[](unsigned int c) [[cpu, hc]]
+            {
+                return static_cast<__index_leaf<0>&>(
+                    *((__index_leaf<0> *)this + c)).get();
+            }
+            index_impl& operator=(const index_impl& __t) [[cpu, hc]]
+            {
+                __swallow(__index_leaf<N>::operator=(
+                    static_cast<const __index_leaf<N>&>(__t).get())...);
+                return *this;
+            }
+            index_impl& operator+=(const index_impl& __t) [[cpu, hc]]
+            {
+                __swallow(__index_leaf<N>::operator+=(
+                    static_cast<const __index_leaf<N>&>(__t).get())...);
+                return *this;
+            }
+            index_impl& operator-=(const index_impl& __t) [[cpu, hc]]
+            {
+                __swallow(__index_leaf<N>::operator-=(
+                    static_cast<const __index_leaf<N>&>(__t).get())...);
+                return *this;
+            }
+            index_impl& operator*=(const index_impl& __t) [[cpu, hc]]
+            {
+                __swallow(__index_leaf<N>::operator*=(
+                    static_cast<const __index_leaf<N>&>(__t).get())...);
+                return *this;
+            }
+            index_impl& operator/=(const index_impl& __t) [[cpu, hc]]
+            {
+                __swallow(__index_leaf<N>::operator/=(
+                    static_cast<const __index_leaf<N>&>(__t).get())...);
+                return *this;
+            }
+            index_impl& operator%=(const index_impl& __t) [[cpu, hc]]
+            {
+                __swallow(__index_leaf<N>::operator%=(
+                    static_cast<const __index_leaf<N>&>(__t).get())...);
+                return *this;
+            }
+            index_impl& operator+=(const int __t) [[cpu, hc]]
+            {
+                __swallow(__index_leaf<N>::operator+=(__t)...);
+                return *this;
+            }
+            index_impl& operator-=(const int __t) [[cpu, hc]]
+            {
+                __swallow(__index_leaf<N>::operator-=(__t)...);
+                return *this;
+            }
+            index_impl& operator*=(const int __t) [[cpu, hc]]
+            {
+                __swallow(__index_leaf<N>::operator*=(__t)...);
+                return *this;
+            }
+            index_impl& operator/=(const int __t) [[cpu, hc]]
+            {
+                __swallow(__index_leaf<N>::operator/=(__t)...);
+                return *this;
+            }
+            index_impl& operator%=(const int __t) [[cpu, hc]]
+            {
+                __swallow(__index_leaf<N>::operator%=(__t)...);
+                return *this;
+            }
+        };
+
+        template<int N, typename _Tp>
+        struct index_helper {
+            static
+            inline
+            void set(_Tp& now) [[cpu, hc]]
+            {
+                now[N - 1] = hc_get_global_id(_Tp::rank - N);
+                index_helper<N - 1, _Tp>::set(now);
+            }
+            static
+            inline
+            bool equal(const _Tp& _lhs, const _Tp& _rhs) [[cpu, hc]]
+            {
+                return (_lhs[N - 1] == _rhs[N - 1]) &&
+                    (index_helper<N - 1, _Tp>::equal(_lhs, _rhs));
+            }
+            static
+            inline
+            int count_size(const _Tp& now) [[cpu, hc]]
+            {
+                return now[N - 1] * index_helper<N - 1, _Tp>::count_size(now);
+            }
+        };
+
+        template<typename _Tp>
+        struct index_helper<1, _Tp> {
+            static
+            inline
+            void set(_Tp& now) [[cpu, hc]]
+            {
+                now[0] = hc_get_global_id(_Tp::rank - 1);
+            }
+            static
+            inline
+            bool equal(const _Tp& _lhs, const _Tp& _rhs) [[cpu, hc]]
+            {
+                return (_lhs[0] == _rhs[0]);
+            }
+            static
+            inline
+            int count_size(const _Tp& now) [[cpu, hc]]
+            {
+                return now[0];
+            }
+        };
+
+        template<int N, typename T, typename U>
+        struct amp_helper {
+            static
+            bool
+            inline contains(const T& idx, const U& ext) [[cpu, hc]]
+            {
+                return idx[N - 1] >= 0 && idx[N - 1] < ext[N - 1] &&
+                    amp_helper<N - 1, T, U>::contains(idx, ext);
+            }
+
+            static
+            bool
+            inline contains(
+                const T& idx, const U& ext,const U& ext2) [[cpu, hc]]
+            {
+                return idx[N - 1] >= 0 &&
+                    ext[N - 1] > 0 &&
+                    (idx[N - 1] + ext[N - 1]) <= ext2[N - 1] &&
+                    amp_helper<N - 1, T, U>::contains(idx, ext, ext2);
+            }
+
+            static
+            inline
+            int flatten(const T& idx, const U& ext) [[cpu, hc]]
+            {
+                return idx[N - 1] +
+                    ext[N - 1] * amp_helper<N - 1, T, U>::flatten(idx, ext);
+            }
+            static
+            inline
+            void minus(const T& idx, U& ext) [[cpu, hc]]
+            {
+                ext.base_ -= idx.base_;
+            }
+        };
+
+        template<typename T, typename U>
+        struct amp_helper<1, T, U> {
+            static
+            inline
+            bool contains(const T& idx, const U& ext) [[cpu, hc]]
+            {
+                return idx[0] >= 0 && idx[0] < ext[0];
+            }
+
+            static
+            inline
+            bool contains(const T& idx, const U& ext,const U& ext2) [[cpu, hc]]
+            {
+                return
+                    idx[0] >= 0 && ext[0] > 0 && (idx[0] + ext[0]) <= ext2[0];
+            }
+
+            static
+            inline
+            int flatten(const T& idx, const U&) [[cpu, hc]]
+            {
+                return idx[0];
+            }
+            static
+            inline
+            void minus(const T& idx, U& ext) [[cpu, hc]]
+            {
+                ext.base_ -= idx.base_;
+            }
+        };
+        /** \endcond */
+
+        /**
+         * Represents a unique position in N-dimensional space.
+         *
+         * @tparam N The dimensionality space into which this index applies.
+         *           Special constructors are supplied for the cases where
+         *           @f$N \in \{1,2,3\}@f$, but N can be any integer greater
+         *           than 0.
+         */
+        template<int N>
+        class index {
+            static_assert(N > 0, "rank should greater than 0.");
+
+            using base = index_impl<typename __make_indices<N>::type>;
+            base base_;
+
+            template<int> friend class hc::extent;
+            template<int, typename> friend struct index_helper;
+            template<int, typename, typename> friend struct amp_helper;
+        public:
+            /**
+             * A static member of index<N> that contains the rank of this index.
+             */
+            static constexpr int rank = N;
+
+            /**
+             * The element type of index<N>.
+             */
+            using value_type = int;
+
+            /**
+             * Default constructor. The value at each dimension is initialized
+             * to zero. Thus, "index<3> ix;" initializes the variable to the
+             * position (0,0,0).
+             */
+            index() [[cpu, hc]] = default;
+
+            /**
+             * Copy constructor. Constructs a new index<N> from the supplied
+             * argument "other".
+             *
+             * @param[in] other An object of type index<N> from which to
+             *                  initialize this new index.
+             */
+            index(const index&) [[cpu, hc]] = default;
+            index(index&&) [[cpu, hc]] = default;
+
+            /** @{ */
+            /**
+             * Constructs an index<N> with the coordinate values provided by
+             * @f$i_{0..2}@f$. These are specialized constructors that are only
+             * valid when the rank of the index @f$N \in \{1,2,3\}@f$. Invoking
+             * a specialized constructor whose argument @f$count \ne N@f$ will
+             * result in a compilation error.
+             *
+             * @param[in] i0 The component values of the index vector.
+             */
+            template<
+                typename... Ts,
+                typename std::enable_if<sizeof...(Ts) == N>::type* = nullptr>
+            explicit
+            index(Ts... i_n) [[cpu, hc]] : base_{static_cast<int>(i_n)...}
+            {
+                static_assert(
+                    sizeof...(Ts) <= 3,
+                    "Explicit constructor with rank greater than 3 is not "
+                        "allowed");
+            }
+
+            /** @} */
+
+            /**
+             * Constructs an index<N> with the coordinate values provided the
+             * array of int component values. If the coordinate array length
+             * @f$\ne@f$ N, the behavior is undefined. If the array value is
+             * NULL or not a valid pointer, the behavior is undefined.
+             *
+             * @param[in] components An array of N int values.
+             */
+            explicit
+            index(const int components[]) [[cpu, hc]] : base_{components} {}
+
+            /**
+             * Assigns the component values of "other" to this index<N> object.
+             *
+             * @param[in] other An object of type index<N> from which to copy
+             *                  into this index.
+             * @return Returns *this.
+             */
+            index& operator=(const index&) [[cpu, hc]] = default;
+            index& operator=(index&&) [[cpu, hc]] = default;
+
+            /** @{ */
+            /**
+             * Returns the index component value at position c.
+             *
+             * @param[in] c The dimension axis whose coordinate is to be
+             *              accessed.
+             * @return A the component value at position c.
+             */
+            int operator[](unsigned int c) const [[cpu, hc]]
+            {
+                return base_[c];
+            }
+            int& operator[](unsigned int c) [[cpu, hc]]
+            {
+                return base_[c];
+            }
+
+            /** @} */
+
+            /** @{ */
+            /**
+             * Compares two objects of index<N>.
+             *
+             * The expression
+             * @f$leftIdx \oplus rightIdx@f$
+             * is true if @f$leftIdx[i] \oplus rightIdx[i]@f$ for every i from 0
+             * to N-1.
+             *
+             * @param[in] other The right-hand index<N> to be compared.
+             */
+            // FIXME: the signature is not entirely the same as defined in:
+            //        C++AMP spec v1.2 #1137
+            bool operator==(const index& other) const [[cpu, hc]]
+            {
+                return index_helper<N, index<N> >::equal(*this, other);
+            }
+            bool operator!=(const index& other) const [[cpu, hc]]
+            {
+                return !(*this == other);
+            }
+
+            /** @} */
+
+            /** @{ */
+            /**
+             * For a given operator @f$\oplus@f$, produces the same effect as
+             * (*this) = (*this) @f$\oplus@f$ rhs;
+             * The return value is "*this".
+             *
+             * @param[in] rhs The right-hand index<N> of the arithmetic
+             *                operation.
+             */
+            index& operator+=(const index& rhs) [[cpu, hc]]
+            {
+                base_ += rhs.base_;
+                return *this;
+            }
+            index& operator-=(const index& rhs) [[cpu, hc]]
+            {
+                base_ -= rhs.base_;
+                return *this;
+            }
+
+            /** @} */
+
+            /** @{ */
+            /**
+             * For a given operator @f$\oplus@f$, produces the same effect as
+             * (*this) = (*this) @f$\oplus@f$ value;
+             * The return value is "*this".
+             *
+             * @param[in] value The right-hand int of the arithmetic operation.
+             */
+            index& operator+=(int value) [[cpu, hc]]
+            {
+                base_  += value;
+                return *this;
+            }
+            index& operator-=(int value) [[cpu, hc]]
+            {
+                base_ -= value;
+                return *this;
+            }
+            index& operator*=(int value) [[cpu, hc]]
+            {
+                base_ *= value;
+                return *this;
+            }
+            index& operator/=(int value) [[cpu, hc]]
+            {
+                base_ /= value;
+                return *this;
+            }
+            index& operator%=(int value) [[cpu, hc]]
+            {
+                base_ %= value;
+                return *this;
+            }
+
+            /** @} */
+
+            /** @{ */
+            /**
+             * For a given operator @f$\oplus@f$, produces the same effect as
+             * (*this) = (*this) @f$\oplus@f$ 1;
+             *
+             * For prefix increment and decrement, the return value is "*this".
+             * Otherwise a new index<N> is returned.
+             */
+            index& operator++() [[cpu, hc]]
+            {
+                return *this += 1;
+            }
+            index operator++(int) [[cpu, hc]]
+            {
+                index ret = *this;
+                ++*this;
+                return ret;
+            }
+            index& operator--() [[cpu, hc]]
+            {
+                return *this -= 1;
+            }
+            index operator--(int) [[cpu, hc]]
+            {
+                index ret = *this;
+                --*this;
+                return ret;
+            }
+
+            /** @} */
+        };
+
+
+        ////////////////////////////////////////////////////////////////////////
+        // operators for index<N>
+        ////////////////////////////////////////////////////////////////////////
+
+        /** @{ */
+        /**
+         * Binary arithmetic operations that produce a new index<N> that is the
+         * result of performing the corresponding pair-wise binary arithmetic
+         * operation on the elements of the operands. The result index<N> is
+         * such that for a given operator @f$\oplus@f$,
+         * @f$result[i] = leftIdx[i] \oplus rightIdx[i]@f$
+         * for every i from 0 to N-1.
+         *
+         * @param[in] lhs The left-hand index<N> of the arithmetic operation.
+         * @param[in] rhs The right-hand index<N> of the arithmetic operation.
+         */
+        // FIXME: the signature is not entirely the same as defined in:
+        //        C++AMP spec v1.2 #1138
+        template<int N>
+        index<N> operator+(const index<N>& lhs, const index<N>& rhs) [[cpu, hc]]
+        {
+            index<N> __r = lhs;
+            __r += rhs;
+            return __r;
+        }
+        template<int N>
+        index<N> operator-(const index<N>& lhs, const index<N>& rhs) [[cpu, hc]]
+        {
+            index<N> __r = lhs;
+            __r -= rhs;
+            return __r;
+        }
+
+        /** @} */
+
+        /** @{ */
+        /**
+         * Binary arithmetic operations that produce a new index<N> that is the
+         * result of performing the corresponding binary arithmetic operation on
+         * the elements of the index operands. The result index<N> is such that
+         * for a given operator @f$\oplus@f$,
+         * result[i] = idx[i] @f$\oplus@f$ value
+         * or
+         * result[i] = value @f$\oplus@f$ idx[i]
+         * for every i from 0 to N-1.
+         *
+         * @param[in] idx The index<N> operand
+         * @param[in] value The integer operand
+         */
+        // FIXME: the signature is not entirely the same as defined in:
+        //        C++AMP spec v1.2 #1141
+        template<int N>
+        index<N> operator+(const index<N>& idx, int value) [[cpu, hc]]
+        {
+            index<N> __r = idx;
+            __r += value;
+            return __r;
+        }
+        template<int N>
+        index<N> operator+(int value, const index<N>& idx) [[cpu, hc]]
+        {
+            index<N> __r = idx;
+            __r += value;
+            return __r;
+        }
+        template<int N>
+        index<N> operator-(const index<N>& idx, int value) [[cpu, hc]]
+        {
+            index<N> __r = idx;
+            __r -= value;
+            return __r;
+        }
+        template<int N>
+        index<N> operator-(int value, const index<N>& idx) [[cpu, hc]]
+        {
+            index<N> __r(value);
+            __r -= idx;
+            return __r;
+        }
+        template<int N>
+        index<N> operator*(const index<N>& idx, int value) [[cpu, hc]]
+        {
+            index<N> __r = idx;
+            __r *= value;
+            return __r;
+        }
+        template<int N>
+        index<N> operator*(int value, const index<N>& idx) [[cpu, hc]]
+        {
+            index<N> __r(value);
+            __r *= idx;
+            return __r;
+        }
+        template<int N>
+        index<N> operator/(const index<N>& idx, int value) [[cpu, hc]]
+        {
+            index<N> __r = idx;
+            __r /= value;
+            return __r;
+        }
+        template<int N>
+        index<N> operator/(int value, const index<N>& idx) [[cpu, hc]]
+        {
+            index<N> __r(value);
+            __r /= idx;
+            return __r;
+        }
+        template<int N>
+        index<N> operator%(const index<N>& idx, int value) [[cpu, hc]]
+        {
+            index<N> __r = idx;
+            __r %= value;
+            return __r;
+        }
+        template<int N>
+        index<N> operator%(int value, const index<N>& idx) [[cpu, hc]]
+        {
+            index<N> __r(value);
+            __r %= idx;
+            return __r;
+        }
+
+        /** @} */
+    } // Namespace hc::detail.
+} // Namespace hc.
\ No newline at end of file
diff --git a/include/hc/hc_kernel_emitter.hpp b/include/hc/hc_kernel_emitter.hpp
new file mode 100644
index 00000000000..c0829acea3c
--- /dev/null
+++ b/include/hc/hc_kernel_emitter.hpp
@@ -0,0 +1,316 @@
+//===----------------------------------------------------------------------===//
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+
+#include "hc_agent_pool.hpp"
+#include "hc_callable_attributes.hpp"
+#include "hc_defines.hpp"
+#include "hc_index.hpp"
+#include "implementation/hc_program_state.hpp"
+
+#include <elfio/elfio.hpp>
+
+#include <link.h>
+
+#include <cstdint>
+#include <exception>
+#include <mutex>
+#include <type_traits>
+#include <unordered_map>
+
+namespace hc
+{
+    template<int> class tiled_index;
+
+    namespace detail
+    {
+       struct Indexer {
+            template<int n>
+            operator index<n>() const noexcept [[hc]]
+            {
+                index<n> tmp;
+                for (auto i = 0; i != n; ++i) {
+                    tmp[n - i - 1] = hc_get_workitem_absolute_id(i);
+                }
+
+                return tmp;
+            }
+
+            template<int n>
+            operator hc::tiled_index<n>() const noexcept [[hc]]
+            {
+                return {};
+            }
+        };
+
+        template<typename Kernel>
+        inline
+        const char* linker_name_for()
+        {
+            static std::once_flag f{};
+            static std::string r{};
+
+            std::call_once(f, [&]() {
+                dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void* pr) {
+                    const auto base = info->dlpi_addr;
+                    ELFIO::elfio elf;
+
+                    if (!elf.load(base ? info->dlpi_name : "/proc/self/exe")) {
+                        return 0;
+                    }
+
+                    struct Symbol {
+                        std::string name;
+                        ELFIO::Elf64_Addr value;
+                        ELFIO::Elf_Xword size;
+                        unsigned char bind;
+                        unsigned char type;
+                        ELFIO::Elf_Half section_index;
+                        unsigned char other;
+                    } tmp{};
+                    for (auto&& section : elf.sections) {
+                        if (section->get_type() != SHT_SYMTAB) continue;
+
+                        ELFIO::symbol_section_accessor fn{elf, section};
+
+                        static const auto k_addr = reinterpret_cast<
+                            std::uintptr_t>(&Kernel::entry_point);
+                        auto n = fn.get_symbols_num();
+                        while (n--) {
+                            fn.get_symbol(
+                            n,
+                            tmp.name,
+                            tmp.value,
+                            tmp.size,
+                            tmp.bind,
+                            tmp.type,
+                            tmp.section_index,
+                            tmp.other);
+
+                            if (tmp.type != STT_FUNC) continue;
+
+                            if (tmp.value + base == k_addr) {
+                                *static_cast<std::string*>(pr) = tmp.name;
+
+                                return 1;
+                            }
+                        }
+                    }
+
+                    return 0;
+                }, &r);
+            });
+
+            if (!r.empty()) return r.c_str();
+
+            throw std::runtime_error{
+                std::string{"Kernel: "} +
+                typeid(&Kernel::entry_point).name() +
+                " is not available."};
+        }
+
+        template<typename Kernel>
+        class HSA_kernel {
+            template<typename, typename, typename>
+            friend
+            class Kernel_emitter_base;
+
+            // IMPLEMENTATION - DATA
+            hsa_executable_symbol_t kernel_{};
+
+            // IMPLEMENTATION - STATICS
+            static
+            const std::string& name_()
+            {
+                static const std::string r{linker_name_for<Kernel>()};
+
+                return r;
+            }
+            static
+            std::string symbol_name_(hsa_executable_symbol_t x)
+            {   // TODO: this uses deprecated HSA APIs because ROCr did not
+                //       implement the updated ones.
+                std::size_t sz{};
+                throwing_hsa_result_check(
+                    hsa_executable_symbol_get_info(
+                        x, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &sz),
+                    __FILE__, __func__, __LINE__);
+
+                std::string r(sz, '\0');
+                throwing_hsa_result_check(
+                    hsa_executable_symbol_get_info(
+                        x, HSA_EXECUTABLE_SYMBOL_INFO_NAME, &r[0]),
+                    __FILE__, __func__, __LINE__);
+
+                return r;
+            }
+
+            static
+            std::uint32_t group_size_(hsa_executable_symbol_t x)
+            {
+                std::uint32_t r{};
+                throwing_hsa_result_check(
+                    hsa_executable_symbol_get_info(
+                        x,
+                        HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
+                        &r),
+                    __FILE__, __func__, __LINE__);
+
+                return r;
+            }
+
+            static
+            std::uint64_t kernel_object_(hsa_executable_symbol_t x)
+            {
+                std::uint64_t r{};
+                throwing_hsa_result_check(
+                    hsa_executable_symbol_get_info(
+                        x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &r),
+                    __FILE__, __func__, __LINE__);
+
+                return r;
+            }
+
+            static
+            hsa_executable_symbol_t kernel_symbol_(hsa_agent_t x)
+            {
+                for (auto&& kernel : Program_state::kernels()[x]) {
+                    if (name_() == symbol_name_(kernel)) return kernel;
+                }
+
+                throw std::runtime_error{
+                    "Code for kernel " + name_() + " is unavailable."};
+            }
+
+            static
+            std::uint32_t private_size_(hsa_executable_symbol_t x)
+            {
+                std::uint32_t r{};
+                throwing_hsa_result_check(
+                    hsa_executable_symbol_get_info(
+                        x,
+                        HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
+                        &r),
+                    __FILE__, __func__, __LINE__);
+
+                return r;
+            }
+
+            // IMPLEMENTATION - CREATORS
+            explicit
+            HSA_kernel(hsa_agent_t x)
+            try :
+                kernel_{kernel_symbol_(x)},
+                group_size{group_size_(kernel_)},
+                kernel_object{kernel_object_(kernel_)},
+                private_size{private_size_(kernel_)}
+            {}
+            catch (const std::exception& ex) {
+                std::cerr << ex.what() << std::endl;
+
+                throw;
+            }
+        public:
+            // DATA
+            std::uint32_t group_size{};
+            std::uint64_t kernel_object{};
+            std::uint32_t private_size{};
+
+            // CREATORS
+            HSA_kernel() = default;
+        };
+
+        template<typename Index, typename Kernel, typename Emitter>
+        class Kernel_emitter_base {
+            // TODO: this validation should be done further above, in pfe
+            //       itself, for more clarity. It is also a placeholder.
+            static
+            std::false_type is_callable_(...) noexcept [[cpu, hc]];
+            template<typename I, typename K>
+            static
+            auto is_callable_(I* idx, const K* f) noexcept [[cpu, hc]]
+                -> decltype((*f)(*idx), std::true_type{});
+
+            static_assert(
+                decltype(is_callable_(
+                    std::declval<Index*>(), std::declval<const Kernel*>())){},
+                "Invalid Callable passed to parallel_for_each.");
+        public:
+            static
+            std::unordered_map<hsa_agent_t, HSA_kernel<Emitter>>& kernel()
+            {
+                static std::unordered_map<hsa_agent_t, HSA_kernel<Emitter>> r;
+                static std::once_flag f;
+
+                std::call_once(f, []() {
+                    for (auto&& agent : Agent_pool::pool()) {
+                        if (agent.second.is_cpu) continue;
+
+                        r.emplace(
+                            agent.first, HSA_kernel<Emitter>{agent.first});
+                    }
+                });
+
+                return r;
+            }
+        };
+
+        template<typename T>
+        inline
+        void ignore_arg(T&&)
+        {}
+
+        template<typename Index, typename Kernel>
+        struct Kernel_emitter :
+            public Kernel_emitter_base<
+                Index, Kernel, Kernel_emitter<Index, Kernel>> {
+            static
+            __attribute__((used, annotate("__HCC_KERNEL__")))
+            void entry_point(Kernel f) noexcept [[cpu, hc]]
+            {
+                #if __HCC_ACCELERATOR__ != 0
+                    Index tmp = Indexer{};
+                    f(tmp);
+                #else
+                    ignore_arg(f);
+                #endif
+            }
+        };
+
+        template<typename Kernel, typename... Attrs>
+        using Kernel_with_attributes =
+            hc::attr_impl::Callable_with_AMDGPU_attributes<Kernel, Attrs...>;
+
+        template<typename Index, typename Kernel, typename... Attrs>
+        struct Kernel_emitter<Index, Kernel_with_attributes<Kernel, Attrs...>> :
+            public Kernel_emitter_base<
+                Index,
+                Kernel_with_attributes<Kernel, Attrs...>,
+                Kernel_emitter<
+                    Index, Kernel_with_attributes<Kernel, Attrs...>>> {
+            using K = Kernel_with_attributes<Kernel, Attrs...>;
+
+            static
+            __attribute__((
+                used,
+                annotate("__HCC_KERNEL__"),
+                amdgpu_flat_work_group_size(
+                    K::Flat_wg_size_::minimum(), K::Flat_wg_size_::maximum()),
+                amdgpu_waves_per_eu(
+                    K::Waves_per_EU_::minimum(), K::Waves_per_EU_::maximum())))
+            void entry_point(K f) noexcept [[cpu, hc]]
+            {
+                #if __HCC_ACCELERATOR__ != 0
+                    Index tmp = Indexer{};
+                    f(tmp);
+                #else
+                    ignore_arg(f);
+                #endif
+            }
+        };
+    } // Namespace hc::detail.
+} // Namespace hc.
\ No newline at end of file
diff --git a/include/hc/hc_launch.hpp b/include/hc/hc_launch.hpp
new file mode 100644
index 00000000000..a8d56af5030
--- /dev/null
+++ b/include/hc/hc_launch.hpp
@@ -0,0 +1,295 @@
+//===----------------------------------------------------------------------===//
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+
+#include "hc_callable_attributes.hpp"
+#include "hc_index.hpp"
+#include "hc_kernel_emitter.hpp"
+#include "hc_queue_pool.hpp"
+#include "hc_runtime.hpp"
+#include "hc_signal_pool.hpp"
+
+#include <hsa/hsa.h>
+#include <hsa/hsa_ext_amd.h>
+
+#include <link.h>
+
+#include <array>
+#include <cstdint>
+#include <future>
+#include <memory>
+#include <mutex>
+#include <stdexcept>
+#include <string>
+#include <typeinfo>
+#include <type_traits>
+#include <utility>
+
+namespace hc
+{
+    class accelerator_view;
+    template<int> class tiled_extent;
+    template<int> class tiled_index;
+}
+
+/** \cond HIDDEN_SYMBOLS */
+namespace hc
+{
+    namespace detail
+    {
+        template<typename T>
+        struct Index_type;
+
+        template<int n>
+        struct Index_type<hc::extent<n>> {
+            using index_type = index<n>;
+        };
+
+        template<int n>
+        struct Index_type<hc::tiled_extent<n>> {
+            using index_type = hc::tiled_index<n>;
+        };
+
+        template<typename T>
+        using IndexType = typename Index_type<T>::index_type;
+
+        template<typename Kernel>
+        inline
+        std::unique_ptr<void, void (*)(void*)>  make_kernel_state(
+            const Kernel& f)
+        {
+            static const auto del = [](void* p) {
+                if (hsa_amd_memory_unlock(p) != HSA_STATUS_SUCCESS) {
+                    std::cerr << "Failed to unlock locked kernel memory; "
+                        << "HC Runtime may be in an inconsistent state."
+                        << std::endl;
+                }
+
+                delete static_cast<Kernel*>(p);
+            };
+
+            return std::unique_ptr<void, decltype(del)>{new Kernel{f}, del};
+        }
+
+        constexpr
+        inline
+        std::array<std::uint16_t, 1> local_dimensions(const hc::extent<1>&)
+        {
+            return std::array<std::uint16_t, 1>{64};
+        }
+        constexpr
+        inline
+        std::array<std::uint16_t, 2> local_dimensions(const hc::extent<2>&)
+        {
+            return std::array<std::uint16_t, 2>{8, 8};
+        }
+        constexpr
+        inline
+        std::array<std::uint16_t, 3> local_dimensions(const hc::extent<3>&)
+        {
+            return std::array<std::uint16_t, 3>{4, 4, 4};
+        }
+
+        template<int n>
+        inline
+        std::array<std::uint16_t, n> local_dimensions(
+            const hc::tiled_extent<n>& domain)
+        {
+            std::array<std::uint16_t, n> r{};
+            for (auto i = 0; i != n; ++i) r[i] = domain.tile_dim[i];
+
+            return r;
+        }
+
+        template<typename T>
+        constexpr
+        inline
+        std::uint32_t dynamic_lds(const T&) noexcept
+        {
+            return 0;
+        }
+
+        template<int n>
+        inline
+        std::uint32_t dynamic_lds(const hc::tiled_extent<n>& domain) noexcept
+        {
+            return domain.get_dynamic_group_segment_size();
+        }
+
+        template<typename Domain>
+        inline
+        std::pair<
+            std::array<std::uint32_t, Domain::rank>,
+            std::array<std::uint16_t, Domain::rank>> dimensions(
+                const Domain& domain)
+        {   // TODO: optimise.
+            using R = std::pair<
+                std::array<std::uint32_t, Domain::rank>,
+                std::array<std::uint16_t, Domain::rank>>;
+
+            R r{};
+            auto tmp = local_dimensions(domain);
+            for (auto i = 0; i != Domain::rank; ++i) {
+                r.first[i] = domain[i];
+                r.second[i] = tmp[i];
+            }
+
+            return r;
+        }
+
+        enum Packet_type{ barrier, kernel, n };
+
+        template<Packet_type packet>
+        inline
+        std::uint16_t make_packet_header() noexcept
+        {
+            constexpr std::array<std::uint16_t, Packet_type::n> type{{
+                HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE,
+                HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE
+            }};
+            constexpr std::uint16_t fence_scope{
+                (HSA_FENCE_SCOPE_SYSTEM <<
+                    HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
+                (HSA_FENCE_SCOPE_SYSTEM <<
+                    HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE)};
+            constexpr std::uint16_t barrier{
+                (packet == Packet_type::barrier) << HSA_PACKET_HEADER_BARRIER};
+
+            return type[packet] | fence_scope | barrier;
+        }
+
+        template<typename Kernel, typename Domain>
+        inline
+        hsa_signal_t make_kernel_dispatch(
+            const Domain& domain,
+            hsa_kernel_dispatch_packet_t* slot,
+            hsa_agent_t agent,
+            void* locked_kernel) noexcept
+        {
+            if (!locked_kernel || !slot) return {};
+
+            *slot = {};
+
+            slot->header = HSA_PACKET_TYPE_INVALID;
+            slot->setup =
+                Domain::rank << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
+
+            const auto dims = dimensions(domain);
+
+            slot->grid_size_x = dims.first[Domain::rank - 1];
+            slot->grid_size_y =
+                (Domain::rank > 1) ? dims.first[Domain::rank - 2] : 1;
+            slot->grid_size_z =
+                (Domain::rank > 2) ? dims.first[Domain::rank - 3] : 1;
+            slot->workgroup_size_x = std::min<std::uint32_t>(
+                dims.second[Domain::rank - 1], slot->grid_size_x);
+            slot->workgroup_size_y = std::min<std::uint32_t>(
+                (Domain::rank > 1) ? dims.second[Domain::rank - 2] : 1,
+                slot->grid_size_y);
+            slot->workgroup_size_z = std::min<std::uint32_t>(
+                (Domain::rank > 2) ? dims.second[Domain::rank - 3] : 1,
+                slot->grid_size_z);
+
+            using K = Kernel_emitter<IndexType<Domain>, Kernel>;
+
+            slot->private_segment_size = K::kernel()[agent].private_size;
+            slot->group_segment_size =
+                K::kernel()[agent].group_size + dynamic_lds(domain);
+            slot->kernel_object = K::kernel()[agent].kernel_object;
+            slot->kernarg_address = locked_kernel;
+
+            slot->reserved2 = make_packet_header<Packet_type::kernel>();
+            slot->completion_signal = Signal_pool::allocate();
+
+            return slot->completion_signal;
+        }
+
+        template<typename AcceleratorView, typename Domain, typename Kernel>
+        inline
+        void launch_kernel(
+            const AcceleratorView& av,
+            const Domain& domain,
+            const Kernel& f)
+        {
+            launch_kernel_async(av, domain, f).wait();
+        }
+
+        template<typename Kernel>
+        inline
+        void* lock_callable(hsa_agent_t agent, void* ptr)
+        {
+            throwing_hsa_result_check(
+                hsa_amd_memory_lock(ptr, sizeof(Kernel), &agent, 1, &ptr),
+                __FILE__, __func__, __LINE__);
+
+            return ptr;
+        }
+
+        template<typename AcceleratorView, typename Domain, typename Kernel>
+        inline
+        std::pair<std::shared_future<void>, hsa_signal_t> launch_kernel_async(
+            const AcceleratorView& av,
+            const Domain& domain,
+            const Kernel& f)
+        {
+            const auto agent = *static_cast<hsa_agent_t*>(
+                av.get_accelerator().get_hsa_agent());
+            auto queue = static_cast<hsa_queue_t*>(av.get_hsa_queue());
+
+            auto ks = make_kernel_state(f);
+            auto slot = Queue_pool::queue_slot(queue);
+            auto signal = make_kernel_dispatch<Kernel>(
+                domain,
+                static_cast<hsa_kernel_dispatch_packet_t*>(slot.first),
+                agent,
+                lock_callable<Kernel>(agent, ks.get()));
+            Queue_pool::enable(slot, queue);
+
+            return {
+                std::async([=](decltype(ks)) {
+                    Signal_pool::wait(signal);
+                    Signal_pool::deallocate(signal);
+                }, std::move(ks)).share(),
+                signal};
+        }
+
+        inline
+        hsa_signal_t make_barrier(hsa_barrier_and_packet_t* slot) noexcept
+        {
+            if (!slot) return {};
+
+            *slot = {};
+
+            slot->header = HSA_PACKET_TYPE_INVALID;
+            slot->reserved2 = make_packet_header<Packet_type::barrier>();
+            slot->completion_signal = Signal_pool::allocate();
+
+            return slot->completion_signal;
+        }
+
+        template<typename AcceleratorView>
+        inline
+        std::pair<std::shared_future<void>, hsa_signal_t> insert_barrier(
+            const AcceleratorView& av)
+        {
+            auto slot = Queue_pool::queue_slot(
+                static_cast<hsa_queue_t*>(av.get_hsa_queue()));
+            auto signal = make_barrier(
+                static_cast<hsa_barrier_and_packet_t*>(slot.first));
+            Queue_pool::enable(
+                slot, static_cast<hsa_queue_t*>(av.get_hsa_queue()));
+
+            return {
+                std::async([=]() {
+                    Signal_pool::wait(signal);
+                    Signal_pool::deallocate(signal);
+                }).share(),
+                signal};
+        }
+    } // Namespace hc::detail.
+} // Namespace hc.
+/** \endcond */
diff --git a/include/hc/hc_math.hpp b/include/hc/hc_math.hpp
new file mode 100644
index 00000000000..1df26e5031b
--- /dev/null
+++ b/include/hc/hc_math.hpp
@@ -0,0 +1,1721 @@
+//===----------------------------------------------------------------------===//
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+
+#include "hc_defines.hpp"
+
+#include <cmath>
+#include <stdexcept>
+
+extern "C" _Float16 __ocml_acos_f16(_Float16 x) [[hc]];
+extern "C" float __ocml_acos_f32(float x) [[hc]];
+extern "C" double __ocml_acos_f64(double x) [[hc]];
+
+extern "C" _Float16 __ocml_acosh_f16(_Float16 x) [[hc]];
+extern "C" float __ocml_acosh_f32(float x) [[hc]];
+extern "C" double __ocml_acosh_f64(double x) [[hc]];
+
+extern "C" _Float16 __ocml_asin_f16(_Float16 x) [[hc]];
+extern "C" float __ocml_asin_f32(float x) [[hc]];
+extern "C" double __ocml_asin_f64(double x) [[hc]];
+
+extern "C" _Float16 __ocml_asinh_f16(_Float16 x) [[hc]];
+extern "C" float __ocml_asinh_f32(float x) [[hc]];
+extern "C" double __ocml_asinh_f64(double x) [[hc]];
+
+extern "C" _Float16 __ocml_atan_f16(_Float16 x) [[hc]];
+extern "C" float __ocml_atan_f32(float x) [[hc]];
+extern "C" double __ocml_atan_f64(double x) [[hc]];
+
+extern "C" _Float16 __ocml_atanh_f16(_Float16 x) [[hc]];
+extern "C" float __ocml_atanh_f32(float x) [[hc]];
+extern "C" double __ocml_atanh_f64(double x) [[hc]];
+
+extern "C" _Float16 __ocml_atan2_f16(_Float16 y, _Float16 x) [[hc]];
+extern "C" float __ocml_atan2_f32(float y, float x) [[hc]];
+extern "C" double __ocml_atan2_f64(double y, double x) [[hc]];
+
+extern "C" _Float16 __ocml_cbrt_f16(_Float16 x) [[hc]];
+extern "C" float __ocml_cbrt_f32(float x) [[hc]];
+extern "C" double __ocml_cbrt_f64(double x) [[hc]];
+
+extern "C" _Float16 __ocml_ceil_f16(_Float16 x) [[hc]];
+extern "C" float __ocml_ceil_f32(float x) [[hc]];
+extern "C" double __ocml_ceil_f64(double x) [[hc]];
+
+extern "C" _Float16 __ocml_copysign_f16(_Float16 x, _Float16 y) [[hc]];
+extern "C" float __ocml_copysign_f32(float x, float y) [[hc]];
+extern "C" double __ocml_copysign_f64(double x, double y) [[hc]];
+
+extern "C" _Float16 __ocml_cos_f16(_Float16 x) [[hc]];
+extern "C" _Float16 __ocml_native_cos_f16(_Float16 x) [[hc]];
+extern "C" float __ocml_cos_f32(float x) [[hc]];
+extern "C" float __ocml_native_cos_f32(float x) [[hc]];
+extern "C" double __ocml_cos_f64(double x) [[hc]];
+
+extern "C" _Float16 __ocml_cosh_f16(_Float16 x) [[hc]];
+extern "C" float __ocml_cosh_f32(float x) [[hc]];
+extern "C" double __ocml_cosh_f64(double x) [[hc]];
+
+extern "C" _Float16 __ocml_cospi_f16(_Float16 x) [[hc]];
+extern "C" float __ocml_cospi_f32(float x) [[hc]];
+extern "C" double __ocml_cospi_f64(double x) [[hc]];
+
+extern "C" _Float16 __ocml_erf_f16(_Float16 x) [[hc]];
+extern "C" float __ocml_erf_f32(float x) [[hc]];
+extern "C" double __ocml_erf_f64(double x) [[hc]];
+
+extern "C" _Float16 __ocml_erfc_f16(_Float16 x) [[hc]];
+extern "C" float __ocml_erfc_f32(float x) [[hc]];
+extern "C" double __ocml_erfc_f64(double x) [[hc]];
+
+extern "C" _Float16 __ocml_erfcinv_f16(_Float16 x) [[hc]];
+extern "C" float __ocml_erfcinv_f32(float x) [[hc]];
+extern "C" double __ocml_erfcinv_f64(double x) [[hc]];
+
+extern "C" _Float16 __ocml_erfinv_f16(_Float16 x) [[hc]];
+extern "C" float __ocml_erfinv_f32(float x) [[hc]];
+extern "C" double __ocml_erfinv_f64(double x) [[hc]];
+
+extern "C" _Float16 __ocml_exp_f16(_Float16 x) [[hc]];
+extern "C" float __ocml_exp_f32(float x) [[hc]];
+extern "C" double __ocml_exp_f64(double x) [[hc]];
+
+extern "C" _Float16 __ocml_exp10_f16(_Float16 x) [[hc]];
+extern "C" float __ocml_exp10_f32(float x) [[hc]];
+extern "C" double __ocml_exp10_f64(double x) [[hc]];
+
+extern "C" _Float16 __ocml_native_exp2_f16(_Float16 x) [[hc]];
+extern "C" _Float16 __ocml_exp2_f16(_Float16 x) [[hc]];
+extern "C" float __ocml_exp2_f32(float x) [[hc]];
+extern "C" float __ocml_native_exp2_f32(float x) [[hc]];
+extern "C" double __ocml_exp2_f64(double x) [[hc]];
+
+extern "C" _Float16 __ocml_expm1_f16(_Float16 x) [[hc]];
+extern "C" float __ocml_expm1_f32(float x) [[hc]];
+extern "C" double __ocml_expm1_f64(double x) [[hc]];
+
+extern "C" _Float16 __ocml_fabs_f16(_Float16 x) [[hc]];
+extern "C" float __ocml_fabs_f32(float x) [[hc]];
+extern "C" double __ocml_fabs_f64(double x) [[hc]];
+
+extern "C" _Float16 __ocml_fdim_f16(_Float16 x, _Float16 y) [[hc]];
+extern "C" float __ocml_fdim_f32(float x, float y) [[hc]];
+extern "C" double __ocml_fdim_f64(double x, double y) [[hc]];
+
+extern "C" _Float16 __ocml_floor_f16(_Float16 x) [[hc]];
+extern "C" float __ocml_floor_f32(float x) [[hc]];
+extern "C" double __ocml_floor_f64(double x) [[hc]];
+
+extern "C" _Float16 __ocml_fma_f16(_Float16 x, _Float16 y, _Float16 z) [[hc]];
+extern "C" float __ocml_fma_f32(float x, float y, float z) [[hc]];
+extern "C" double __ocml_fma_f64(double x, double y, double z) [[hc]];
+
+extern "C" _Float16 __ocml_fmax_f16(_Float16 x, _Float16 y) [[hc]];
+extern "C" float __ocml_fmax_f32(float x, float y) [[hc]];
+extern "C" double __ocml_fmax_f64(double x, double y) [[hc]];
+
+extern "C" _Float16 __ocml_fmin_f16(_Float16 x, _Float16 y) [[hc]];
+extern "C" float __ocml_fmin_f32(float x, float y) [[hc]];
+extern "C" double __ocml_fmin_f64(double x, double y) [[hc]];
+
+extern "C" _Float16 __ocml_fmod_f16(_Float16 x, _Float16 y) [[hc]];
+extern "C" float __ocml_fmod_f32(float x, float y) [[hc]];
+extern "C" double __ocml_fmod_f64(double x, double y) [[hc]];
+
+extern "C" int __ocml_fpclassify_f16(_Float16 x) [[hc]];
+extern "C" int __ocml_fpclassify_f32(float x) [[hc]];
+extern "C" int __ocml_fpclassify_f64(double x) [[hc]];
+
+extern "C" _Float16 __ocml_frexp_f16(_Float16 x, __attribute__((address_space(5))) int *exp) [[hc]];
+extern "C" float __ocml_frexp_f32(float x, __attribute__((address_space(5))) int *exp) [[hc]];
+extern "C" double __ocml_frexp_f64(double x, __attribute__((address_space(5))) int *exp) [[hc]];
+
+extern "C" _Float16 __ocml_hypot_f16(_Float16 x, _Float16 y) [[hc]];
+extern "C" float __ocml_hypot_f32(float x, float y) [[hc]];
+extern "C" double __ocml_hypot_f64(double x, double y) [[hc]];
+
+extern "C" int __ocml_ilogb_f16(_Float16 x) [[hc]];
+extern "C" int __ocml_ilogb_f32(float x) [[hc]];
+extern "C" int __ocml_ilogb_f64(double x) [[hc]];
+
+extern "C" int __ocml_isfinite_f16(_Float16 x) [[hc]];
+extern "C" int __ocml_isfinite_f32(float x) [[hc]];
+extern "C" int __ocml_isfinite_f64(double x) [[hc]];
+
+extern "C" int __ocml_isinf_f16(_Float16 x) [[hc]];
+extern "C" int __ocml_isinf_f32(float x) [[hc]];
+extern "C" int __ocml_isinf_f64(double x) [[hc]];
+
+extern "C" int __ocml_isnan_f16(_Float16 x) [[hc]];
+extern "C" int __ocml_isnan_f32(float x) [[hc]];
+extern "C" int __ocml_isnan_f64(double x) [[hc]];
+
+extern "C" int __ocml_isnormal_f16(_Float16 x) [[hc]];
+extern "C" int __ocml_isnormal_f32(float x) [[hc]];
+extern "C" int __ocml_isnormal_f64(double x) [[hc]];
+
+extern "C" _Float16 __ocml_ldexp_f16(_Float16 x, std::int16_t exp) [[hc]];
+extern "C" float __ocml_ldexp_f32(float x, int exp) [[hc]];
+extern "C" double __ocml_ldexp_f64(double x, int exp) [[hc]];
+
+extern "C" _Float16 __ocml_lgamma_f16(_Float16 x) [[hc]];
+extern "C" float __ocml_lgamma_f32(float x) [[hc]];
+extern "C" double __ocml_lgamma_f64(double x) [[hc]];
+
+extern "C" _Float16 __ocml_log_f16(_Float16 x) [[hc]];
+extern "C" float __ocml_log_f32(float x) [[hc]];
+extern "C" double __ocml_log_f64(double x) [[hc]];
+
+extern "C" _Float16 __ocml_log10_f16(_Float16 x) [[hc]];
+extern "C" float __ocml_log10_f32(float x) [[hc]];
+extern "C" double __ocml_log10_f64(double x) [[hc]];
+
+extern "C" _Float16 __ocml_log2_f16(_Float16 x) [[hc]];
+extern "C" _Float16 __ocml_native_log2_f16(_Float16 x) [[hc]];
+extern "C" float __ocml_log2_f32(float x) [[hc]];
+extern "C" float __ocml_native_log2_f32(float x) [[hc]];
+extern "C" double __ocml_log2_f64(double x) [[hc]];
+
+extern "C" _Float16 __ocml_log1p_f16(_Float16 x) [[hc]];
+extern "C" float __ocml_log1p_f32(float x) [[hc]];
+extern "C" double __ocml_log1p_f64(double x) [[hc]];
+
+extern "C" _Float16 __ocml_logb_f16(_Float16 x) [[hc]];
+extern "C" float __ocml_logb_f32(float x) [[hc]];
+extern "C" double __ocml_logb_f64(double x) [[hc]];
+
+extern "C" _Float16 __ocml_modf_f16(_Float16 x, __attribute__((address_space(5))) _Float16 *iptr) [[hc]];
+extern "C" float __ocml_modf_f32(float x, __attribute__((address_space(5))) float *iptr) [[hc]];
+extern "C" double __ocml_modf_f64(double x, __attribute__((address_space(5))) double *iptr) [[hc]];
+
+extern "C" _Float16 __ocml_nan_f16(int tagp) [[hc]];
+extern "C" float __ocml_nan_f32(int tagp) [[hc]];
+extern "C" double __ocml_nan_f64(unsigned long tagp) [[hc]];
+
+extern "C" _Float16 __ocml_nearbyint_f16(_Float16 x) [[hc]];
+extern "C" float __ocml_nearbyint_f32(float x) [[hc]];
+extern "C" double __ocml_nearbyint_f64(double x) [[hc]];
+
+extern "C" _Float16 __ocml_nextafter_f16(_Float16 x, _Float16 y) [[hc]];
+extern "C" float __ocml_nextafter_f32(float x, float y) [[hc]];
+extern "C" double __ocml_nextafter_f64(double x, double y) [[hc]];
+
+extern "C" _Float16 __ocml_pow_f16(_Float16 x, _Float16 y) [[hc]];
+extern "C" float __ocml_pow_f32(float x, float y) [[hc]];
+extern "C" double __ocml_pow_f64(double x, double y) [[hc]];
+
+extern "C" _Float16 __ocml_rcbrt_f16(_Float16 x) [[hc]];
+extern "C" float __ocml_rcbrt_f32(float x) [[hc]];
+extern "C" double __ocml_rcbrt_f64(double x) [[hc]];
+
+// TODO: rcp is implementation only, it does not have a public interface.
+extern "C" __attribute__((const)) _Float16 __llvm_amdgcn_rcp_f16(_Float16) __asm("llvm.amdgcn.rcp.f16");
+extern "C" float __ocml_native_recip_f32(float x) [[hc]];
+
+extern "C" _Float16 __ocml_remainder_f16(_Float16 x, _Float16 y) [[hc]];
+extern "C" float __ocml_remainder_f32(float x, float y) [[hc]];
+extern "C" double __ocml_remainder_f64(double x, double y) [[hc]];
+
+extern "C" _Float16 __ocml_remquo_f16(_Float16 x, _Float16 y, __attribute__((address_space(5))) int *quo) [[hc]];
+extern "C" float __ocml_remquo_f32(float x, float y, __attribute__((address_space(5))) int *quo) [[hc]];
+extern "C" double __ocml_remquo_f64(double x, double y, __attribute__((address_space(5))) int *quo) [[hc]];
+
+extern "C" _Float16 __ocml_round_f16(_Float16 x) [[hc]];
+extern "C" float __ocml_round_f32(float x) [[hc]];
+extern "C" double __ocml_round_f64(double x) [[hc]];
+
+extern "C" _Float16 __ocml_rsqrt_f16(_Float16 x) [[hc]];
+extern "C" _Float16 __ocml_native_rsqrt_f16(_Float16 x) [[hc]];
+extern "C" float __ocml_rsqrt_f32(float x) [[hc]];
+extern "C" float __ocml_native_rsqrt_f32(float x) [[hc]];
+extern "C" double __ocml_rsqrt_f64(double x) [[hc]];
+
+extern "C" _Float16 __ocml_scalb_f16(_Float16 x, _Float16 exp) [[hc]];
+extern "C" float __ocml_scalb_f32(float x, float exp) [[hc]];
+extern "C" double __ocml_scalb_f64(double x, double exp) [[hc]];
+
+extern "C" _Float16 __ocml_scalbn_f16(_Float16 x, int exp) [[hc]];
+extern "C" float __ocml_scalbn_f32(float x, int exp) [[hc]];
+extern "C" double __ocml_scalbn_f64(double x, int exp) [[hc]];
+
+extern "C" _Float16 __ocml_sinpi_f16(_Float16 x) [[hc]];
+extern "C" float __ocml_sinpi_f32(float x) [[hc]];
+extern "C" double __ocml_sinpi_f64(double x) [[hc]];
+
+extern "C" int __ocml_signbit_f16(_Float16 x) [[hc]];
+extern "C" int __ocml_signbit_f32(float x) [[hc]];
+extern "C" int __ocml_signbit_f64(double x) [[hc]];
+
+extern "C" _Float16 __ocml_sin_f16(_Float16 x) [[hc]];
+extern "C" _Float16 __ocml_native_sin_f16(_Float16 x) [[hc]];
+extern "C" float __ocml_sin_f32(float x) [[hc]];
+extern "C" float __ocml_native_sin_f32(float x) [[hc]];
+extern "C" double __ocml_sin_f64(double x) [[hc]];
+
+extern "C" _Float16 __ocml_sincos_f16(_Float16 x, __attribute__((address_space(5))) _Float16 *c) [[hc]];
+extern "C" float __ocml_sincos_f32(float x, __attribute__((address_space(5))) float *c) [[hc]];
+extern "C" double __ocml_sincos_f64(double x, __attribute__((address_space(5))) double *c) [[hc]];
+
+extern "C" _Float16 __ocml_sinh_f16(_Float16 x) [[hc]];
+extern "C" float __ocml_sinh_f32(float x) [[hc]];
+extern "C" double __ocml_sinh_f64(double x) [[hc]];
+
+extern "C" _Float16 __ocml_sqrt_f16(_Float16 x) [[hc]];
+extern "C" _Float16 __ocml_native_sqrt_f16(_Float16 x) [[hc]];
+extern "C" float __ocml_sqrt_f32(float x) [[hc]];
+extern "C" float __ocml_native_sqrt_f32(float x) [[hc]];
+extern "C" double __ocml_sqrt_f64(double x) [[hc]];
+
+extern "C" _Float16 __ocml_tgamma_f16(_Float16 x) [[hc]];
+extern "C" float __ocml_tgamma_f32(float x) [[hc]];
+extern "C" double __ocml_tgamma_f64(double x) [[hc]];
+
+extern "C" _Float16 __ocml_tan_f16(_Float16 x) [[hc]];
+extern "C" float __ocml_tan_f32(float x) [[hc]];
+extern "C" double __ocml_tan_f64(double x) [[hc]];
+
+extern "C" _Float16 __ocml_tanh_f16(_Float16 x) [[hc]];
+extern "C" float __ocml_tanh_f32(float x) [[hc]];
+extern "C" double __ocml_tanh_f64(double x) [[hc]];
+
+extern "C" _Float16 __ocml_tanpi_f16(_Float16 x) [[hc]];
+extern "C" float __ocml_tanpi_f32(float x) [[hc]];
+extern "C" double __ocml_tanpi_f64(double x) [[hc]];
+
+extern "C" _Float16 __ocml_trunc_f16(_Float16 x) [[hc]];
+extern "C" float __ocml_trunc_f32(float x) [[hc]];
+extern "C" double __ocml_trunc_f64(double x) [[hc]];
+
+#define HCC_MATH_LIB_FN inline __attribute__((hc, nothrow))
+namespace hc
+{
+    namespace detail
+    {
+        namespace fast_math
+        {
+            using std::acos;
+            using ::acosf;
+            using std::asin;
+            using ::asinf;
+            using std::atan;
+            using ::atanf;
+            using std::atan2;
+            using ::atan2f;
+            using std::ceil;
+            using ::ceilf;
+            using std::cos;
+            using ::cosf;
+            using std::cosh;
+            using ::coshf;
+            using std::exp;
+            using ::exp10;
+            using std::exp2;
+            using ::exp10f;
+            using ::exp2f;
+            using ::expf;
+            using std::fabs;
+            using ::fabsf;
+            using std::floor;
+            using ::floorf;
+            using std::fmax;
+            using ::fmaxf;
+            using std::fmin;
+            using ::fminf;
+            using std::fmod;
+            using ::fmodf;
+            using std::frexp;
+            using ::frexpf;
+            using std::isfinite;
+            using std::isinf;
+            using std::isnan;
+            using std::isnormal;
+            using std::ldexp;
+            using ::ldexpf;
+            using std::log;
+            using ::logf;
+            using std::log10;
+            using ::log10f;
+            using std::log2;
+            using ::log2f;
+            using std::modf;
+            using ::modff;
+            using std::pow;
+            using ::powf;
+            using std::round;
+            using ::roundf;
+            using std::signbit;
+            using std::sin;
+            using ::sinf;
+            using std::sinh;
+            using ::sinhf;
+            using std::sqrt;
+            using ::sqrtf;
+            using std::tan;
+            using ::tanf;
+            using std::tanh;
+            using ::tanhf;
+            using std::trunc;
+            using ::truncf;
+
+            HCC_MATH_LIB_FN
+            float acosf(float x) { return __ocml_acos_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 acos(_Float16 x) { return __ocml_acos_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float acos(float x) { return fast_math::acosf(x); }
+
+            HCC_MATH_LIB_FN
+            float asinf(float x) { return __ocml_asin_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 asin(_Float16 x) { return __ocml_asin_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float asin(float x) { return fast_math::asinf(x); }
+
+            HCC_MATH_LIB_FN
+            float atanf(float x) { return __ocml_atan_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 atan(_Float16 x) { return __ocml_atan_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float atan(float x) { return fast_math::atanf(x); }
+
+            HCC_MATH_LIB_FN
+            float atan2f(float y, float x) { return __ocml_atan2_f32(y, x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 atan2(_Float16 y, _Float16 x)
+            {
+                return __ocml_atan2_f16(y, x);
+            }
+
+            HCC_MATH_LIB_FN
+            float atan2(float y, float x) { return fast_math::atan2f(y, x); }
+
+            HCC_MATH_LIB_FN
+            float ceilf(float x) { return __ocml_ceil_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 ceil(_Float16 x) { return __ocml_ceil_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float ceil(float x) { return fast_math::ceilf(x); }
+
+            HCC_MATH_LIB_FN
+            float cosf(float x) { return __ocml_native_cos_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 cos(_Float16 x) { return __ocml_native_cos_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float cos(float x) { return fast_math::cosf(x); }
+
+            HCC_MATH_LIB_FN
+            float coshf(float x) { return __ocml_cosh_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 cosh(_Float16 x) { return __ocml_cosh_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float cosh(float x) { return fast_math::coshf(x); }
+
+            HCC_MATH_LIB_FN
+            float expf(float x) { return __ocml_native_exp2_f32(M_LOG2E * x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 exp(_Float16 x) { return __ocml_native_exp2_f16(M_LOG2E * x); }
+
+            HCC_MATH_LIB_FN
+            float exp(float x) { return fast_math::expf(x); }
+
+            HCC_MATH_LIB_FN
+            float exp2f(float x) { return __ocml_native_exp2_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 exp2(_Float16 x) { return __ocml_native_exp2_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float exp2(float x) { return exp2f(x); }
+
+            HCC_MATH_LIB_FN
+            float fabsf(float x) { return __ocml_fabs_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 fabs(_Float16 x) { return __ocml_fabs_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float fabs(float x) { return fabsf(x); }
+
+            HCC_MATH_LIB_FN
+            float floorf(float x) { return __ocml_floor_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 floor(_Float16 x) { return __ocml_floor_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float floor(float x) { return floorf(x); }
+
+            HCC_MATH_LIB_FN
+            float fmaxf(float x, float y) { return __ocml_fmax_f32(x, y); }
+
+            HCC_MATH_LIB_FN
+            _Float16 fmax(_Float16 x, _Float16 y)
+            {
+                return __ocml_fmax_f16(x, y);
+            }
+
+            HCC_MATH_LIB_FN
+            float fmax(float x, float y) { return fmaxf(x, y); }
+
+            HCC_MATH_LIB_FN
+            float fminf(float x, float y) { return __ocml_fmin_f32(x, y); }
+
+            HCC_MATH_LIB_FN
+            _Float16 fmin(_Float16 x, _Float16 y)
+            {
+                return __ocml_fmin_f16(x, y);
+            }
+
+            HCC_MATH_LIB_FN
+            float fmin(float x, float y) { return fminf(x, y); }
+
+            HCC_MATH_LIB_FN
+            float fmodf(float x, float y) { return __ocml_fmod_f32(x, y); }
+
+            HCC_MATH_LIB_FN
+            _Float16 fmod(_Float16 x, _Float16 y)
+            {
+                return __ocml_fmod_f16(x, y);
+            }
+
+            HCC_MATH_LIB_FN
+            float fmod(float x, float y) { return fmodf(x, y); }
+
+            HCC_MATH_LIB_FN
+            float frexpf(float x, int *exp) {
+                int e;
+                float ret = __ocml_frexp_f32(
+                    x, (__attribute__((address_space(5))) int*) &e);
+                *exp = e;
+
+                return ret;
+            }
+
+            HCC_MATH_LIB_FN
+            _Float16 frexp(_Float16 x, int *exp)
+            {
+                int e;
+                _Float16 ret = __ocml_frexp_f16(
+                    x, (__attribute__((address_space(5))) int*) &e);
+                *exp = e;
+
+                return ret;
+            }
+
+            HCC_MATH_LIB_FN
+            float frexp(float x, int *exp) { return frexpf(x, exp); }
+
+            HCC_MATH_LIB_FN
+            int isfinite(_Float16 x) { return __ocml_isfinite_f16(x); }
+
+            HCC_MATH_LIB_FN
+            int isfinite(float x) { return __ocml_isfinite_f32(x); }
+
+            HCC_MATH_LIB_FN
+            int isinf(_Float16 x) { return __ocml_isinf_f16(x); }
+
+            HCC_MATH_LIB_FN
+            int isinf(float x) { return __ocml_isinf_f32(x); }
+
+            HCC_MATH_LIB_FN
+            int isnan(_Float16 x) { return __ocml_isnan_f16(x); }
+
+            HCC_MATH_LIB_FN
+            int isnan(float x) { return __ocml_isnan_f32(x); }
+
+            HCC_MATH_LIB_FN
+            float ldexpf(float x, int exp) { return __ocml_ldexp_f32(x,exp); }
+
+            HCC_MATH_LIB_FN
+            _Float16 ldexp(_Float16 x, std::uint16_t exp)
+            {
+                return __ocml_ldexp_f16(x, exp);
+            }
+
+            HCC_MATH_LIB_FN
+            float ldexp(float x, int exp) { return fast_math::ldexpf(x, exp); }
+
+            namespace
+            {   // TODO: this is temporary, lifted straight out of irif.h.
+                // Namespace is merely for documentation.
+                #define M_LOG2_10_F 0x1.a934f0p+1f
+                // Value of 1 / log2(10)
+                #define M_RLOG2_10_F 0x1.344136p-2f
+                // Value of 1 / M_LOG2E_F = 1 / log2(e)
+                #define M_RLOG2_E_F 0x1.62e430p-1f
+            }
+
+            HCC_MATH_LIB_FN
+            float logf(float x)
+            {
+                return __ocml_native_log2_f32(x) * M_RLOG2_E_F;
+            }
+
+            HCC_MATH_LIB_FN
+            _Float16 log(_Float16 x)
+            {
+                return __ocml_native_log2_f16(x) *
+                    static_cast<_Float16>(M_RLOG2_E_F);
+            }
+
+            HCC_MATH_LIB_FN
+            float log(float x) { return logf(x); }
+
+            HCC_MATH_LIB_FN
+            float log10f(float x)
+            {
+                return __ocml_native_log2_f32(x) * M_RLOG2_10_F;
+            }
+
+            HCC_MATH_LIB_FN
+            _Float16 log10(_Float16 x)
+            {
+                return __ocml_native_log2_f16(x) *
+                    static_cast<_Float16>(M_RLOG2_10_F);
+            }
+
+            HCC_MATH_LIB_FN
+            float log10(float x) { return log10f(x); }
+
+            HCC_MATH_LIB_FN
+            float log2f(float x) { return __ocml_native_log2_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 log2(_Float16 x) { return __ocml_native_log2_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float log2(float x) { return log2f(x); }
+
+            HCC_MATH_LIB_FN
+            float modff(float x, float* iptr) {
+                float i;
+                float ret = __ocml_modf_f32(
+                    x, (__attribute__((address_space(5))) float*)&i);
+                *iptr = i;
+
+                return ret;
+            }
+
+            HCC_MATH_LIB_FN
+            _Float16 modf(_Float16 x, _Float16* iptr) {
+                _Float16 i;
+                _Float16 ret = __ocml_modf_f16(
+                    x, (__attribute__((address_space(5))) _Float16*) &i);
+                *iptr = i;
+
+                return ret;
+            }
+
+            HCC_MATH_LIB_FN
+            float modf(float x, float* iptr) { return modff(x, iptr); }
+
+            HCC_MATH_LIB_FN
+            float powf(float x, float y) { return __ocml_pow_f32(x, y); }
+
+            HCC_MATH_LIB_FN
+            _Float16 pow(_Float16 x, _Float16 y)
+            {
+                return __ocml_pow_f16(x, y);
+            }
+
+            HCC_MATH_LIB_FN
+            float pow(float x, float y) { return powf(x, y); }
+
+            HCC_MATH_LIB_FN
+            float roundf(float x) { return __ocml_round_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 round(_Float16 x) { return __ocml_round_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float round(float x) { return roundf(x); }
+
+            HCC_MATH_LIB_FN
+            float rsqrtf(float x) { return __ocml_native_rsqrt_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 rsqrt(_Float16 x) { return __ocml_native_rsqrt_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float rsqrt(float x) { return rsqrtf(x); }
+
+            HCC_MATH_LIB_FN
+            int signbitf(float x) { return __ocml_signbit_f32(x); }
+
+            HCC_MATH_LIB_FN
+            int signbit(_Float16 x) { return __ocml_signbit_f16(x); }
+
+            HCC_MATH_LIB_FN
+            int signbit(float x) { return signbitf(x); }
+
+            HCC_MATH_LIB_FN
+            float sinf(float x) { return __ocml_native_sin_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 sin(_Float16 x) { return __ocml_native_sin_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float sin(float x) { return sinf(x); }
+
+            HCC_MATH_LIB_FN
+            void sincosf(float x, float *s, float *c) {
+                float lc;
+                *s = __ocml_sincos_f32(
+                    x, (__attribute__((address_space(5))) float*)&lc);
+                *c=lc;
+            }
+
+            HCC_MATH_LIB_FN
+            void sincos(_Float16 x, _Float16 *s, _Float16 *c)
+            {
+                _Float16 lc;
+                *s = __ocml_sincos_f16(
+                    x, (__attribute__((address_space(5))) _Float16*) &lc);
+                *c = lc;
+            }
+
+            HCC_MATH_LIB_FN
+            float sinhf(float x) { return __ocml_sinh_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 sinh(_Float16 x) { return __ocml_sinh_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float sinh(float x) { return sinhf(x); }
+
+            HCC_MATH_LIB_FN
+            float sqrtf(float x) { return __ocml_native_sqrt_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 sqrt(_Float16 x) { return __ocml_native_sqrt_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float sqrt(float x) { return sqrtf(x); }
+
+            HCC_MATH_LIB_FN
+            float tanf(float x) { return __ocml_tan_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 tan(_Float16 x) { return __ocml_tan_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float tan(float x) { return tanf(x); }
+
+            HCC_MATH_LIB_FN
+            float tanhf(float x) { return __ocml_tanh_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 tanh(_Float16 x) { return __ocml_tanh_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float tanh(float x) { return tanhf(x); }
+
+            HCC_MATH_LIB_FN
+            float truncf(float x) { return __ocml_trunc_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 trunc(_Float16 x) { return __ocml_trunc_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float trunc(float x) { return truncf(x); }
+        } // namespace hc::detail::fast_math
+        namespace precise_math
+        {
+            using std::acos;
+            using std::acosh;
+            using ::acoshf;
+            using ::acosf;
+            using std::asin;
+            using std::asinh;
+            using ::asinhf;
+            using ::asinf;
+            using std::atan;
+            using std::atan2;
+            using ::atan2f;
+            using std::atanh;
+            using ::atanhf;
+            using ::atanf;
+            using std::cbrt;
+            using ::cbrtf;
+            using std::ceil;
+            using ::ceilf;
+            using std::copysign;
+            using ::copysignf;
+            using std::cos;
+            using std::cosh;
+            using ::coshf;
+            using ::cosf;
+            using std::erf;
+            using std::erfc;
+            using ::erfcf;
+            using ::erff;
+            using std::exp;
+            using ::exp10;
+            using ::exp10f;
+            using std::exp2;
+            using ::exp2f;
+            using ::expf;
+            using std::expm1;
+            using ::expm1f;
+            using std::fabs;
+            using ::fabsf;
+            using std::fdim;
+            using ::fdimf;
+            using std::floor;
+            using ::floorf;
+            using std::fma;
+            using ::fmaf;
+            using std::fmax;
+            using ::fmaxf;
+            using std::fmin;
+            using ::fminf;
+            using std::fmod;
+            using ::fmodf;
+            using std::frexp;
+            using ::frexpf;
+            using std::hypot;
+            using ::hypotf;
+            using std::ilogb;
+            using ::ilogbf;
+            using std::isfinite;
+            using std::isinf;
+            using std::isnan;
+            using std::isnormal;
+            using std::ldexp;
+            using ::ldexpf;
+            using std::log;
+            using std::log10;
+            using std::log1p;
+            using std::log2;
+            using std::logb;
+            using ::log10f;
+            using ::log1pf;
+            using ::log2f;
+            using ::logbf;
+            using ::logf;
+            using std::modf;
+            using ::modff;
+            using std::nearbyint;
+            using ::nearbyintf;
+            using std::nextafter;
+            using ::nextafterf;
+            using std::pow;
+            using ::powf;
+            using std::remainder;
+            using ::remainderf;
+            using std::remquo;
+            using ::remquof;
+            using std::round;
+            using ::roundf;
+            using std::scalbn;
+            using ::scalbnf;
+            using std::signbit;
+            using std::sin;
+            using std::sinh;
+            using ::sinhf;
+            using ::sinf;
+            using std::sqrt;
+            using ::sqrtf;
+            using std::tan;
+            using std::tanh;
+            using ::tanhf;
+            using ::tanf;
+            using std::tgamma;
+            using ::tgammaf;
+            using std::trunc;
+            using ::truncf;
+
+            HCC_MATH_LIB_FN
+            float acosf(float x) { return __ocml_acos_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 acos(_Float16 x) { return __ocml_acos_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float acos(float x) { return __ocml_acos_f32(x); }
+
+            HCC_MATH_LIB_FN
+            double acos(double x) { return __ocml_acos_f64(x); }
+
+            HCC_MATH_LIB_FN
+            float acoshf(float x) { return __ocml_acosh_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 acosh(_Float16 x) { return __ocml_acosh_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float acosh(float x) { return __ocml_acosh_f32(x); }
+
+            HCC_MATH_LIB_FN
+            double acosh(double x) { return __ocml_acosh_f64(x); }
+
+            HCC_MATH_LIB_FN
+            float asinf(float x) { return __ocml_asin_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 asin(_Float16 x) { return __ocml_asin_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float asin(float x) { return __ocml_asin_f32(x); }
+
+            HCC_MATH_LIB_FN
+            double asin(double x) { return __ocml_asin_f64(x); }
+
+            HCC_MATH_LIB_FN
+            float asinhf(float x) { return __ocml_asinh_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 asinh(_Float16 x) { return __ocml_asinh_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float asinh(float x) { return __ocml_asinh_f32(x); }
+
+            HCC_MATH_LIB_FN
+            double asinh(double x) { return __ocml_asinh_f64(x); }
+
+            HCC_MATH_LIB_FN
+            float atanf(float x) { return __ocml_atan_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 atan(_Float16 x) { return __ocml_atan_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float atan(float x) { return __ocml_atan_f32(x); }
+
+            HCC_MATH_LIB_FN
+            double atan(double x) { return __ocml_atan_f64(x); }
+
+            HCC_MATH_LIB_FN
+            float atanhf(float x) { return __ocml_atanh_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 atanh(_Float16 x) { return __ocml_atanh_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float atanh(float x) { return __ocml_atanh_f32(x); }
+
+            HCC_MATH_LIB_FN
+            double atanh(double x) { return __ocml_atanh_f64(x); }
+
+            HCC_MATH_LIB_FN
+            float atan2f(float y, float x) { return __ocml_atan2_f32(y, x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 atan2(_Float16 x, _Float16 y) { return __ocml_atan2_f16(x, y); }
+
+            HCC_MATH_LIB_FN
+            float atan2(float y, float x) { return __ocml_atan2_f32(y, x); }
+
+            HCC_MATH_LIB_FN
+            double atan2(double y, double x) { return __ocml_atan2_f64(y, x); }
+
+            HCC_MATH_LIB_FN
+            float cbrtf(float x) { return __ocml_cbrt_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 cbrt(_Float16 x) { return __ocml_cbrt_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float cbrt(float x) { return __ocml_cbrt_f32(x); }
+
+            HCC_MATH_LIB_FN
+            double cbrt(double x) { return __ocml_cbrt_f64(x); }
+
+            HCC_MATH_LIB_FN
+            float ceilf(float x) { return __ocml_ceil_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 ceil(_Float16 x) { return __ocml_ceil_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float ceil(float x) { return __ocml_ceil_f32(x); }
+
+            HCC_MATH_LIB_FN
+            double ceil(double x) { return __ocml_ceil_f64(x); }
+
+            HCC_MATH_LIB_FN
+            float copysignf(float x, float y) { return __ocml_copysign_f32(x, y); }
+
+            HCC_MATH_LIB_FN
+            _Float16 copysign(_Float16 x, _Float16 y) { return __ocml_copysign_f16(x, y); }
+
+            HCC_MATH_LIB_FN
+            float copysign(float x, float y)
+            {
+                return __ocml_copysign_f32(x, y);
+            }
+
+            HCC_MATH_LIB_FN
+            double copysign(double x, double y)
+            {
+                return __ocml_copysign_f64(x, y);
+            }
+
+            HCC_MATH_LIB_FN
+            float cosf(float x) { return __ocml_cos_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 cos(_Float16 x) { return __ocml_cos_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float cos(float x) { return __ocml_cos_f32(x); }
+
+            HCC_MATH_LIB_FN
+            double cos(double x) { return __ocml_cos_f64(x); }
+
+            HCC_MATH_LIB_FN
+            float coshf(float x) { return __ocml_cosh_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 cosh(_Float16 x) { return __ocml_cosh_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float cosh(float x) { return __ocml_cosh_f32(x); }
+
+            HCC_MATH_LIB_FN
+            double cosh(double x) { return __ocml_cosh_f64(x); }
+
+            HCC_MATH_LIB_FN
+            float cospif(float x) { return __ocml_cospi_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 cospi(_Float16 x) { return __ocml_cospi_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float cospi(float x) { return __ocml_cospi_f32(x); }
+
+            HCC_MATH_LIB_FN
+            double cospi(double x) { return __ocml_cospi_f64(x); }
+
+            HCC_MATH_LIB_FN
+            float erff(float x) { return __ocml_erf_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 erf(_Float16 x) { return __ocml_erf_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float erf(float x) { return __ocml_erf_f32(x); }
+
+            HCC_MATH_LIB_FN
+            double erf(double x) { return __ocml_erf_f64(x); }
+
+            HCC_MATH_LIB_FN
+            float erfcf(float x) { return __ocml_erfc_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 erfc(_Float16 x) { return __ocml_erfc_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float erfc(float x) { return __ocml_erfc_f32(x); }
+
+            HCC_MATH_LIB_FN
+            double erfc(double x) { return __ocml_erfc_f64(x); }
+
+            HCC_MATH_LIB_FN
+            float erfcinvf(float x) { return __ocml_erfcinv_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 erfcinv(_Float16 x) { return __ocml_erfcinv_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float erfcinv(float x) { return __ocml_erfcinv_f32(x); }
+
+            HCC_MATH_LIB_FN
+            double erfcinv(double x) { return __ocml_erfcinv_f64(x); }
+
+            HCC_MATH_LIB_FN
+            float erfinvf(float x) { return __ocml_erfinv_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 erfinv(_Float16 x) { return __ocml_erfinv_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float erfinv(float x) { return __ocml_erfinv_f32(x); }
+
+            HCC_MATH_LIB_FN
+            double erfinv(double x) { return __ocml_erfinv_f64(x); }
+
+            HCC_MATH_LIB_FN
+            float expf(float x) { return __ocml_exp_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 exp(_Float16 x) { return __ocml_exp_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float exp(float x) { return __ocml_exp_f32(x); }
+
+            HCC_MATH_LIB_FN
+            double exp(double x) { return __ocml_exp_f64(x); }
+
+            HCC_MATH_LIB_FN
+            float exp2f(float x) { return __ocml_exp2_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 exp2(_Float16 x) { return __ocml_exp2_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float exp2(float x) { return __ocml_exp2_f32(x); }
+
+            HCC_MATH_LIB_FN
+            double exp2(double x) { return __ocml_exp2_f64(x); }
+
+            HCC_MATH_LIB_FN
+            float exp10f(float x) { return __ocml_exp10_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 exp10(_Float16 x) { return __ocml_exp10_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float exp10(float x) { return __ocml_exp10_f32(x); }
+
+            HCC_MATH_LIB_FN
+            double exp10(double x) { return __ocml_exp10_f64(x); }
+
+            HCC_MATH_LIB_FN
+            float expm1f(float x) { return __ocml_expm1_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 expm1(_Float16 x) { return __ocml_expm1_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float expm1(float x) { return __ocml_expm1_f32(x); }
+
+            HCC_MATH_LIB_FN
+            double expm1(double x) { return __ocml_expm1_f64(x); }
+
+            HCC_MATH_LIB_FN
+            float fabsf(float x) { return __ocml_fabs_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 fabs(_Float16 x) { return __ocml_fabs_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float fabs(float x) { return __ocml_fabs_f32(x); }
+
+            HCC_MATH_LIB_FN
+            double fabs(double x) { return __ocml_fabs_f64(x); }
+
+            HCC_MATH_LIB_FN
+            float fdimf(float x, float y) { return __ocml_fdim_f32(x, y); }
+
+            HCC_MATH_LIB_FN
+            _Float16 fdim(_Float16 x, _Float16 y) { return __ocml_fdim_f16(x, y); }
+
+            HCC_MATH_LIB_FN
+            float fdim(float x, float y) { return __ocml_fdim_f32(x, y); }
+
+            HCC_MATH_LIB_FN
+            double fdim(double x, double y) { return __ocml_fdim_f64(x, y); }
+
+            HCC_MATH_LIB_FN
+            float floorf(float x) { return __ocml_floor_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 floor(_Float16 x) { return __ocml_floor_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float floor(float x) { return __ocml_floor_f32(x); }
+
+            HCC_MATH_LIB_FN
+            double floor(double x) { return __ocml_floor_f64(x); }
+
+            HCC_MATH_LIB_FN
+            float fmaf(float x, float y, float z) { return __ocml_fma_f32(x, y, z); }
+
+            HCC_MATH_LIB_FN
+            _Float16 fma(_Float16 x, _Float16 y, _Float16 z)
+            {
+                return __ocml_fma_f16(x, y, z);
+            }
+
+            HCC_MATH_LIB_FN
+            float fma(float x, float y, float z)
+            {
+                return __ocml_fma_f32(x, y, z);
+            }
+
+            HCC_MATH_LIB_FN
+            double fma(double x, double y, double z)
+            {
+                return __ocml_fma_f64(x, y, z);
+            }
+
+            HCC_MATH_LIB_FN
+            float fmaxf(float x, float y) { return __ocml_fmax_f32(x, y); }
+
+            HCC_MATH_LIB_FN
+            _Float16 fmax(_Float16 x, _Float16 y) { return __ocml_fmax_f16(x, y); }
+
+            HCC_MATH_LIB_FN
+            float fmax(float x, float y) { return __ocml_fmax_f32(x, y); }
+
+            HCC_MATH_LIB_FN
+            double fmax(double x, double y) { return __ocml_fmax_f64(x, y); }
+
+            HCC_MATH_LIB_FN
+            float fminf(float x, float y) { return __ocml_fmin_f32(x, y); }
+
+            HCC_MATH_LIB_FN
+            _Float16 fmin(_Float16 x, _Float16 y) { return __ocml_fmin_f16(x, y); }
+
+            HCC_MATH_LIB_FN
+            float fmin(float x, float y) { return fminf(x, y); }
+
+            HCC_MATH_LIB_FN
+            double fmin(double x, double y) { return __ocml_fmin_f64(x, y); }
+
+            HCC_MATH_LIB_FN
+            float fmodf(float x, float y) { return __ocml_fmod_f32(x, y); }
+
+            HCC_MATH_LIB_FN
+            _Float16 fmod(_Float16 x, _Float16 y) { return __ocml_fmod_f16(x, y); }
+
+            HCC_MATH_LIB_FN
+            float fmod(float x, float y) { return __ocml_fmod_f32(x, y); }
+
+            HCC_MATH_LIB_FN
+            double fmod(double x, double y) { return __ocml_fmod_f64(x, y); }
+
+            HCC_MATH_LIB_FN
+            int fpclassify(_Float16 x) { return __ocml_fpclassify_f16(x); }
+
+            HCC_MATH_LIB_FN
+            int fpclassify(float x) { return __ocml_fpclassify_f32(x); }
+
+            HCC_MATH_LIB_FN
+            int fpclassify(double x) { return __ocml_fpclassify_f64(x); }
+
+            HCC_MATH_LIB_FN
+            float frexpf(float x, int *exp)
+            {
+                int e;
+                float ret =__ocml_frexp_f32(
+                    x, (__attribute__((address_space(5))) int*) &e);
+                *exp = e;
+
+                return ret;
+            }
+
+            HCC_MATH_LIB_FN
+            _Float16 frexp(_Float16 x, int* exp)
+            {
+                int e;
+                _Float16 ret = __ocml_frexp_f16(
+                    x, (__attribute__((address_space(5))) int*) &e);
+                *exp = e;
+
+                return ret;
+            }
+
+            HCC_MATH_LIB_FN
+            double frexp(double x, int *exp)
+            {
+                return precise_math::frexpf(x, exp);
+            }
+
+            HCC_MATH_LIB_FN
+            float hypotf(float x, float y) { return __ocml_hypot_f32(x, y); }
+
+            HCC_MATH_LIB_FN
+            _Float16 hypot(_Float16 x, _Float16 y) { return __ocml_hypot_f16(x, y); }
+
+            HCC_MATH_LIB_FN
+            float hypot(float x, float y) { return __ocml_hypot_f32(x, y); }
+
+            HCC_MATH_LIB_FN
+            double hypot(double x, double y) { return __ocml_hypot_f64(x, y); }
+
+            HCC_MATH_LIB_FN
+            int ilogbf(float x) { return __ocml_ilogb_f32(x); }
+
+            HCC_MATH_LIB_FN
+            int ilogb(_Float16 x) { return __ocml_ilogb_f16(x); }
+
+            HCC_MATH_LIB_FN
+            int ilogb(float x) { return __ocml_ilogb_f32(x); }
+
+            HCC_MATH_LIB_FN
+            int ilogb(double x) { return __ocml_ilogb_f64(x); }
+
+            HCC_MATH_LIB_FN
+            int isfinite(_Float16 x) { return __ocml_isfinite_f16(x); }
+
+            HCC_MATH_LIB_FN
+            int isfinite(float x) { return __ocml_isfinite_f32(x); }
+
+            HCC_MATH_LIB_FN
+            int isfinite(double x) { return __ocml_isfinite_f64(x); }
+
+            HCC_MATH_LIB_FN
+            int isinf(_Float16 x) { return __ocml_isinf_f16(x); }
+
+            HCC_MATH_LIB_FN
+            int isinf(float x) { return __ocml_isinf_f32(x); }
+
+            HCC_MATH_LIB_FN
+            int isinf(double x) { return __ocml_isinf_f64(x); }
+
+            HCC_MATH_LIB_FN
+            int isnan(_Float16 x) { return __ocml_isnan_f16(x); }
+
+            HCC_MATH_LIB_FN
+            int isnan(float x) { return __ocml_isnan_f32(x); }
+
+            HCC_MATH_LIB_FN
+            int isnan(double x) { return __ocml_isnan_f64(x); }
+
+            HCC_MATH_LIB_FN
+            int isnormal(_Float16 x) { return __ocml_isnormal_f16(x); }
+
+            HCC_MATH_LIB_FN
+            int isnormal(float x) { return __ocml_isnormal_f32(x); }
+
+            HCC_MATH_LIB_FN
+            int isnormal(double x) { return __ocml_isnormal_f64(x); }
+
+            HCC_MATH_LIB_FN
+            float ldexpf(float x, int exp) { return __ocml_ldexp_f32(x, exp); }
+
+            HCC_MATH_LIB_FN
+            _Float16 ldexp(_Float16 x, std::int16_t e) { return __ocml_ldexp_f16(x, e); }
+
+            HCC_MATH_LIB_FN
+            float ldexp(float x, int exp) { return __ocml_ldexp_f32(x, exp); }
+
+            HCC_MATH_LIB_FN
+            double ldexp(double x, int exp) { return __ocml_ldexp_f64(x,exp); }
+
+            HCC_MATH_LIB_FN
+            float lgammaf(float x) { return __ocml_lgamma_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 lgamma(_Float16 x) { return __ocml_lgamma_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float lgamma(float x) { return __ocml_lgamma_f32(x); }
+
+            HCC_MATH_LIB_FN
+            double lgamma(double x) { return __ocml_lgamma_f64(x); }
+
+            HCC_MATH_LIB_FN
+            float logf(float x) { return __ocml_log_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 log(_Float16 x) { return __ocml_log_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float log(float x) { return __ocml_log_f32(x); }
+
+            HCC_MATH_LIB_FN
+            double log(double x) { return __ocml_log_f64(x); }
+
+            HCC_MATH_LIB_FN
+            float log10f(float x) { return __ocml_log10_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 log10(_Float16 x) { return __ocml_log10_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float log10(float x) { return __ocml_log10_f32(x); }
+
+            HCC_MATH_LIB_FN
+            double log10(double x) { return __ocml_log10_f64(x); }
+
+            HCC_MATH_LIB_FN
+            float log2f(float x) { return __ocml_log2_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 log2(_Float16 x) { return __ocml_log2_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float log2(float x) { return __ocml_log2_f32(x); }
+
+            HCC_MATH_LIB_FN
+            double log2(double x) { return __ocml_log2_f64(x); }
+
+            HCC_MATH_LIB_FN
+            float log1pf(float x) { return __ocml_log1p_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 log1p(_Float16 x) { return __ocml_log1p_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float log1p(float x) { return __ocml_log1p_f32(x); }
+
+            HCC_MATH_LIB_FN
+            double log1p(double x) { return __ocml_log1p_f32(x); }
+
+            HCC_MATH_LIB_FN
+            float logbf(float x) { return __ocml_logb_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 logb(_Float16 x) { return __ocml_logb_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float logb(float x) { return __ocml_logb_f32(x); }
+
+            HCC_MATH_LIB_FN
+            double logb(double x) { return __ocml_logb_f64(x); }
+
+            HCC_MATH_LIB_FN
+            float modff(float x, float *iptr)
+            {
+                float i;
+                float ret = __ocml_modf_f32(
+                    x, (__attribute__((address_space(5))) float*)&i);
+                *iptr = i;
+
+                return ret;
+            }
+
+            HCC_MATH_LIB_FN
+            _Float16 modf(_Float16 x, _Float16* p)
+            {
+                _Float16 lp;
+                _Float16 ret = __ocml_modf_f16(
+                    x, (__attribute__((address_space(5))) _Float16*) &lp);
+                *p = lp;
+
+                return ret;
+            }
+
+            HCC_MATH_LIB_FN
+            double modf(double x, double* p)
+            {
+                double lp;
+                double ret = __ocml_modf_f64(
+                    x, (__attribute__((address_space(5))) double*) &lp);
+                *p = lp;
+
+                return ret;
+            }
+
+            HCC_MATH_LIB_FN
+            _Float16 nanh(int x) { return __ocml_nan_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float nanf(int tagp) { return __ocml_nan_f32(tagp); }
+
+            HCC_MATH_LIB_FN
+            double nan(int tagp)
+            {
+                return __ocml_nan_f64(static_cast<unsigned long>(tagp));
+            }
+
+            HCC_MATH_LIB_FN
+            float nearbyintf(float x) { return __ocml_nearbyint_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 nearbyint(_Float16 x) { return __ocml_nearbyint_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float nearbyint(float x) { return __ocml_nearbyint_f32(x); }
+
+            HCC_MATH_LIB_FN
+            double nearbyint(double x) { return __ocml_nearbyint_f64(x); }
+
+            HCC_MATH_LIB_FN
+            float nextafterf(float x, float y)
+            {
+                return __ocml_nextafter_f32(x, y);
+            }
+
+            HCC_MATH_LIB_FN
+            _Float16 nextafter(_Float16 x, _Float16 y)
+            {
+                return __ocml_nextafter_f16(x, y);
+            }
+
+            HCC_MATH_LIB_FN
+            float nextafter(float x, float y)
+            {
+                return __ocml_nextafter_f32(x, y);
+            }
+
+            HCC_MATH_LIB_FN
+            double nextafter(double x, double y)
+            {
+                return __ocml_nextafter_f64(x, y);
+            }
+
+            HCC_MATH_LIB_FN
+            float powf(float x, float y) { return __ocml_pow_f32(x, y); }
+
+            HCC_MATH_LIB_FN
+            _Float16 pow(_Float16 x, _Float16 y) { return __ocml_pow_f16(x, y); }
+
+            HCC_MATH_LIB_FN
+            float pow(float x, float y) { return __ocml_pow_f32(x, y); }
+
+            HCC_MATH_LIB_FN
+            double pow(double x, double y) { return __ocml_pow_f64(x, y); }
+
+            HCC_MATH_LIB_FN
+            float rcbrtf(float x) { return __ocml_rcbrt_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 rcbrt(_Float16 x) { return __ocml_rcbrt_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float rcbrt(float x) { return __ocml_rcbrt_f32(x); }
+
+            HCC_MATH_LIB_FN
+            double rcbrt(double x) { return __ocml_rcbrt_f64(x); }
+
+            HCC_MATH_LIB_FN
+            float remainderf(float x, float y) { return __ocml_remainder_f32(x, y); }
+
+            HCC_MATH_LIB_FN
+            _Float16 remainder(_Float16 x, _Float16 y)
+            {
+                return __ocml_remainder_f16(x, y);
+            }
+
+            HCC_MATH_LIB_FN
+            float remainder(float x, float y)
+            {
+                return __ocml_remainder_f32(x, y);
+            }
+
+            HCC_MATH_LIB_FN
+            double remainder(double x, double y)
+            {
+                return __ocml_remainder_f64(x, y);
+            }
+
+            HCC_MATH_LIB_FN
+            float remquof(float x, float y, int *quo)
+            {
+                int lq; float ret = __ocml_remquo_f32(x, y, (__attribute__((address_space(5))) int*) &lq);
+                *quo = lq; return ret;
+            }
+
+            HCC_MATH_LIB_FN
+            _Float16 remquo(_Float16 x, _Float16 y, int* q)
+            {
+                int lq; _Float16 ret = __ocml_remquo_f16(x, y, (__attribute__((address_space(5))) int*) &lq);
+                *q = lq; return ret;
+            }
+
+            HCC_MATH_LIB_FN
+            float remquo(float x, float y, int *quo) { return precise_math::remquof(x, y, quo); }
+
+            HCC_MATH_LIB_FN
+            double remquo(double x, double y, int *quo)
+            {
+                int lq; double ret = __ocml_remquo_f64(x, y, (__attribute__((address_space(5))) int*) &lq);
+                *quo = lq; return ret;
+            }
+
+            HCC_MATH_LIB_FN
+            float roundf(float x) { return __ocml_round_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 round(_Float16 x) { return __ocml_round_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float round(float x) { return __ocml_round_f32(x); }
+
+            HCC_MATH_LIB_FN
+            double round(double x) { return __ocml_round_f64(x); }
+
+            HCC_MATH_LIB_FN
+            float rsqrtf(float x) { return __ocml_rsqrt_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 rsqrt(_Float16 x) { return __ocml_rsqrt_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float rsqrt(float x) { return __ocml_rsqrt_f32(x); }
+
+            HCC_MATH_LIB_FN
+            double rsqrt(double x) { return __ocml_rsqrt_f64(x); }
+
+            HCC_MATH_LIB_FN
+            float sinpif(float x) { return __ocml_sinpi_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 sinpi(_Float16 x) { return __ocml_sinpi_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float sinpi(float x) { return __ocml_sinpi_f32(x); }
+
+            HCC_MATH_LIB_FN
+            double sinpi(double x) { return __ocml_sinpi_f64(x); }
+
+            HCC_MATH_LIB_FN
+            float scalbf(float x, float exp) { return __ocml_scalb_f32(x, exp); }
+
+            HCC_MATH_LIB_FN
+            _Float16 scalb(_Float16 x, _Float16 y) { return __ocml_scalb_f16(x, y); }
+
+            HCC_MATH_LIB_FN
+            float scalb(float x, float exp) { return __ocml_scalb_f32(x, exp); }
+
+            HCC_MATH_LIB_FN
+            double scalb(double x, double exp) { return __ocml_scalb_f64(x, exp); }
+
+            HCC_MATH_LIB_FN
+            float scalbnf(float x, int exp) { return __ocml_scalbn_f32(x, exp); }
+
+            HCC_MATH_LIB_FN
+            _Float16 scalbn(_Float16 x, int e) { return __ocml_scalbn_f16(x, e); }
+
+            HCC_MATH_LIB_FN
+            float scalbn(float x, int exp) { return __ocml_scalbn_f32(x, exp); }
+
+            HCC_MATH_LIB_FN
+            double scalbn(double x, int exp) { return __ocml_scalbn_f64(x, exp); }
+
+            HCC_MATH_LIB_FN
+            int signbitf(float x) { return __ocml_signbit_f32(x); }
+
+            HCC_MATH_LIB_FN
+            int signbit(_Float16 x) { return __ocml_signbit_f16(x); }
+
+            HCC_MATH_LIB_FN
+            int signbit(float x) { return __ocml_signbit_f32(x); }
+
+            HCC_MATH_LIB_FN
+            int signbit(double x) { return __ocml_signbit_f64(x); }
+
+            HCC_MATH_LIB_FN
+            float sinf(float x) { return __ocml_sin_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 sin(_Float16 x) { return __ocml_sin_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float sin(float x) { return __ocml_sin_f32(x); }
+
+            HCC_MATH_LIB_FN
+            double sin(double x) { return __ocml_sin_f64(x); }
+
+            HCC_MATH_LIB_FN
+            void sincosf(float x, float *s, float *c) {
+                float lc; *s = __ocml_sincos_f32(x, (__attribute__((address_space(5))) float*) &lc);
+                *c = lc;
+            }
+
+            HCC_MATH_LIB_FN
+            void sincos(_Float16 x, _Float16* s, _Float16* c)
+            {
+                _Float16 lc; *s = __ocml_sincos_f16(x, (__attribute__((address_space(5))) _Float16*) &lc);
+                *c = lc;
+            }
+
+            HCC_MATH_LIB_FN
+            void sincos(float x, float *s, float *c) { precise_math::sincosf(x, s, c); }
+
+            HCC_MATH_LIB_FN
+            void sincos(double x, double *s, double *c)
+            {
+                double lc; *s = __ocml_sincos_f64(x, (__attribute__((address_space(5))) double*) &lc);
+                *c = lc;
+            }
+
+            HCC_MATH_LIB_FN
+            float sinhf(float x) { return __ocml_sinh_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 sinh(_Float16 x) { return __ocml_sinh_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float sinh(float x) { return __ocml_sinh_f32(x); }
+
+            HCC_MATH_LIB_FN
+            double sinh(double x) { return __ocml_sinh_f64(x); }
+
+            HCC_MATH_LIB_FN
+            float sqrtf(float x) { return __ocml_sqrt_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 sqrt(_Float16 x) { return __ocml_sqrt_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float sqrt(float x) { return __ocml_sqrt_f32(x); }
+
+            HCC_MATH_LIB_FN
+            double sqrt(double x) { return __ocml_sqrt_f64(x); }
+
+            HCC_MATH_LIB_FN
+            float tgammaf(float x) { return __ocml_tgamma_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 tgamma(_Float16 x) { return __ocml_tgamma_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float tgamma(float x) { return __ocml_tgamma_f32(x); }
+
+            HCC_MATH_LIB_FN
+            double tgamma(double x) { return __ocml_tgamma_f64(x); }
+
+            HCC_MATH_LIB_FN
+            float tanf(float x) { return __ocml_tan_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 tan(_Float16 x) { return __ocml_tan_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float tan(float x) { return __ocml_tan_f32(x); }
+
+            HCC_MATH_LIB_FN
+            double tan(double x) { return __ocml_tan_f64(x); }
+
+            HCC_MATH_LIB_FN
+            float tanhf(float x) { return __ocml_tanh_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 tanh(_Float16 x) { return __ocml_tanh_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float tanh(float x) { return __ocml_tanh_f32(x); }
+
+            HCC_MATH_LIB_FN
+            double tanh(double x) { return __ocml_tanh_f32(x); }
+
+            HCC_MATH_LIB_FN
+            float tanpif(float x) { return __ocml_tanpi_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 tanpi(_Float16 x) { return __ocml_tanpi_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float tanpi(float x) { return __ocml_tanpi_f32(x); }
+
+            HCC_MATH_LIB_FN
+            double tanpi(double x) { return __ocml_tanpi_f64(x); }
+
+            HCC_MATH_LIB_FN
+            float truncf(float x) { return __ocml_trunc_f32(x); }
+
+            HCC_MATH_LIB_FN
+            _Float16 trunc(_Float16 x) { return __ocml_trunc_f16(x); }
+
+            HCC_MATH_LIB_FN
+            float trunc(float x) { return __ocml_trunc_f32(x); }
+
+            HCC_MATH_LIB_FN
+            double trunc(double x) { return __ocml_trunc_f64(x); }
+        } // namespace hc::detail::precise_math
+    } // namespace hc::detail
+} // namespace hc
diff --git a/include/hc/hc_norm_unorm.hpp b/include/hc/hc_norm_unorm.hpp
new file mode 100644
index 00000000000..18d595eba4c
--- /dev/null
+++ b/include/hc/hc_norm_unorm.hpp
@@ -0,0 +1,313 @@
+//===----------------------------------------------------------------------===//
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+
+namespace hc
+{
+    namespace short_vector
+    {
+        template<typename T>
+        constexpr
+        inline
+        T _clamp(T x, T x_min, T x_max) [[cpu, hc]]
+        {   // TODO: consider using med3 for [[hc]]
+            return (x < x_min) ? x_min : ((x_max < x) ? x_max : x);
+        }
+
+        class unorm;
+
+        class norm {
+            float x_{};
+
+            friend class unorm;
+
+            friend
+            inline
+            norm operator+(const norm& x, const norm& y) noexcept [[cpu, hc]]
+            {
+                return norm{x} += y;
+            }
+            friend
+            inline
+            norm operator-(const norm& x, const norm& y) noexcept [[cpu, hc]]
+            {
+                return norm{x} -= y;
+            }
+            friend
+            inline
+            norm operator*(const norm& x, const norm& y) noexcept [[cpu, hc]]
+            {
+                return norm{x} *= y;
+            }
+            friend
+            inline
+            norm operator/(const norm& x, const norm& y) [[cpu, hc]]
+            {
+                return norm{x} /= y;
+            }
+            friend
+            inline
+            bool operator==(const norm& x, const norm& y) noexcept [[cpu, hc]]
+            {
+                return x.x_ == y.x_;
+            }
+            friend
+            inline
+            bool operator!=(const norm& x, const norm& y) noexcept [[cpu, hc]]
+            {
+                return !(x == y);
+            }
+            friend
+            inline
+            bool operator<(const norm& x, const norm& y) noexcept [[cpu, hc]]
+            {
+                return x.x_ < y.x_;
+            }
+            friend
+            inline
+            bool operator<=(const norm& x, const norm& y) noexcept [[cpu, hc]]
+            {
+                return !(y < x);
+            }
+            friend
+            inline
+            bool operator>(const norm& x, const norm& y) noexcept [[cpu, hc]]
+            {
+                return y < x;
+            }
+            friend
+            inline
+            bool operator>=(const norm& x, const norm& y) noexcept [[cpu, hc]]
+            {
+                return !(x < y);
+            }
+        public:
+            // CREATORS
+            norm() [[cpu, hc]] = default;
+            norm(const norm&) [[cpu, hc]] = default;
+            norm(norm&&) [[cpu, hc]] = default;
+            constexpr
+            norm(const unorm& x) noexcept [[cpu, hc]];
+            constexpr
+            explicit
+            norm(float x) noexcept [[cpu, hc]] : x_{_clamp(x, -1.0f, 1.0f)} {}
+            constexpr
+            explicit
+            norm(unsigned int x) noexcept [[cpu, hc]]
+                : norm{static_cast<float>(x)}
+            {}
+            constexpr
+            explicit
+            norm(int x) noexcept [[cpu, hc]] : norm{static_cast<float>(x)} {}
+            constexpr
+            explicit
+            norm(double x) noexcept [[cpu, hc]] : norm{static_cast<float>(x)} {}
+            ~norm() [[cpu, hc]] = default;
+
+            // MANIPULATORS
+            norm& operator=(const norm&) [[cpu, hc]] = default;
+            norm& operator=(norm&&) [[cpu, hc]] = default;
+            norm& operator+=(const norm& x) noexcept [[cpu, hc]]
+            {
+                return *this = norm{x_ + x.x_};
+            }
+            norm& operator-=(const norm& x) noexcept [[cpu, hc]]
+            {
+                return *this = norm{x_ - x.x_};
+            }
+            norm& operator*=(const norm& x) noexcept [[cpu, hc]]
+            {
+                return *this = norm{x_ * x.x_};
+            }
+            norm& operator/=(const norm& x) [[cpu, hc]]
+            {
+                return *this = norm{x_ / x.x_};
+            }
+            norm& operator++() noexcept [[cpu, hc]]
+            {
+                return *this = norm{++x_};
+            }
+            norm operator++(int) noexcept [[cpu, hc]]
+            {
+                norm tmp{*this};
+                ++*this;
+                return tmp;
+            }
+            norm& operator--() noexcept [[cpu, hc]]
+            {
+                return *this = norm{--x_};
+            }
+            norm operator--(int) noexcept [[cpu, hc]]
+            {
+                norm tmp{*this};
+                --*this;
+                return tmp;
+            }
+
+            // ACCESSORS
+            constexpr
+            operator float() const noexcept [[cpu, hc]] { return x_; }
+            constexpr
+            norm operator-() const noexcept [[cpu, hc]] { return norm{-x_}; }
+        };
+
+        static constexpr norm NORM_MAX{1.0f};
+        static constexpr norm NORM_MIN{-1.0f};
+        static constexpr norm NORM_ZERO{0.0f};
+
+        class unorm {
+            float x_{};
+
+            friend class norm;
+
+            friend
+            inline
+            unorm operator+(const unorm& x, const unorm& y) noexcept [[cpu, hc]]
+            {
+                return unorm{x} += y;
+            }
+            friend
+            inline
+            unorm operator-(const unorm& x, const unorm& y) noexcept [[cpu, hc]]
+            {
+                return unorm{x} -= y;
+            }
+            friend
+            inline
+            unorm operator*(const unorm& x, const unorm& y) noexcept [[cpu, hc]]
+            {
+                return unorm{x} *= y;
+            }
+            friend
+            inline
+            unorm operator/(const unorm& x, const unorm& y) [[cpu, hc]]
+            {
+                return unorm{x} /= y;
+            }
+            friend
+            inline
+            bool operator==(const unorm& x, const unorm& y) noexcept [[cpu, hc]]
+            {
+                return x.x_ == y.x_;
+            }
+            friend
+            inline
+            bool operator!=(const unorm& x, const unorm& y) noexcept [[cpu, hc]]
+            {
+                return !(x == y);
+            }
+            friend
+            inline
+            bool operator<(const unorm& x, const unorm& y) noexcept [[cpu, hc]]
+            {
+                return x.x_ < y.x_;
+            }
+            friend
+            inline
+            bool operator<=(const unorm& x, const unorm& y) noexcept [[cpu, hc]]
+            {
+                return !(y < x);
+            }
+            friend
+            inline
+            bool operator>(const unorm& x, const unorm& y) noexcept [[cpu, hc]]
+            {
+                return y < x;
+            }
+            friend
+            inline
+            bool operator>=(const unorm& x, const unorm& y) noexcept [[cpu, hc]]
+            {
+                return !(x < y);
+            }
+        public:
+            // CREATORS
+            unorm() [[cpu, hc]] = default;
+            unorm(const unorm&) [[cpu, hc]] = default;
+            unorm(unorm&&) [[cpu, hc]] = default;
+            constexpr
+            explicit
+            unorm(const norm& x) noexcept [[cpu, hc]] : unorm{x.x_} {}
+            constexpr
+            explicit
+            unorm(float x) noexcept [[cpu, hc]] : x_{_clamp(x, 0.0f, 1.0f)} {}
+            constexpr
+            explicit
+            unorm(unsigned int x) noexcept [[cpu, hc]]
+                : unorm{static_cast<float>(x)}
+            {}
+            constexpr
+            explicit
+            unorm(int x) noexcept [[cpu, hc]] : unorm{static_cast<float>(x)} {}
+            constexpr
+            explicit
+            unorm(double x) noexcept [[cpu, hc]]
+                : unorm{static_cast<float>(x)}
+            {}
+            ~unorm() [[cpu, hc]] = default;
+
+            // MANIPULATORS
+            unorm& operator=(const unorm&) [[cpu, hc]] = default;
+            unorm& operator=(unorm&&) [[cpu, hc]] = default;
+            unorm& operator+=(const unorm& x) noexcept [[cpu, hc]]
+            {
+                return *this = unorm{x_ + x.x_};
+            }
+            unorm& operator-=(const unorm& x) noexcept [[cpu, hc]]
+            {
+                return *this = unorm{x_ - x.x_};
+            }
+            unorm& operator*=(const unorm& x) noexcept [[cpu, hc]]
+            {
+                return *this = unorm{x_ * x.x_};
+            }
+            unorm& operator/=(const unorm& x) [[cpu, hc]]
+            {
+                return *this = unorm{x_ / x.x_};
+            }
+            unorm& operator++() noexcept [[cpu, hc]]
+            {
+                return *this = unorm{++x_};
+            }
+            unorm operator++(int) noexcept [[cpu, hc]]
+            {
+                unorm tmp{*this};
+                ++*this;
+                return tmp;
+            }
+            unorm& operator--() noexcept [[cpu, hc]]
+            {
+                return *this = unorm{--x_};
+            }
+            unorm operator--(int) noexcept [[cpu, hc]]
+            {
+                unorm tmp{*this};
+                --*this;
+                return tmp;
+            }
+
+            // ACCESSORS
+            constexpr
+            operator float() const noexcept [[cpu, hc]] { return x_; }
+        };
+
+        // TODO: use levelisation to fix the weird late definition.
+        constexpr
+        inline
+        norm::norm(const unorm& x) noexcept [[cpu, hc]] : x_{x.x_} {}
+
+        static constexpr unorm UNORM_MAX{1.0f};
+        static constexpr unorm UNORM_MIN{0.0f};
+        static constexpr unorm UNORM_ZERO{0.0f};
+    } // Namespace hc::short_vector.
+} // Namespace hc.
+
+namespace std
+{   // TODO: add additional specialisations.
+    template<>
+    struct is_unsigned<hc::short_vector::unorm> : public std::true_type {};
+}
\ No newline at end of file
diff --git a/include/hc/hc_printf.hpp b/include/hc/hc_printf.hpp
new file mode 100644
index 00000000000..74eb46507e6
--- /dev/null
+++ b/include/hc/hc_printf.hpp
@@ -0,0 +1,415 @@
+//===----------------------------------------------------------------------===//
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+
+#include <hc/hc_am.hpp>
+
+#include <algorithm>
+#include <atomic>
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <regex>
+#include <string>
+#include <type_traits>
+
+// The printf on the accelerator is only enabled when
+// The HCC_ENABLE_ACCELERATOR_PRINTF is defined
+//
+//#define HCC_ENABLE_ACCELERATOR_PRINTF (1)
+
+// Indicate whether hc::printf is supported
+#define HC_FEATURE_PRINTF (1)
+
+// Enable extra debug messages
+#define HC_PRINTF_DEBUG  (0)
+
+namespace hc {
+
+/*
+* Supported Types
+* Pointer Types
+* void*
+* const void*
+* Integer Types
+* uint8_t, int8_t - unsigned char, char
+* uint16_t, int16_t - unsigned short, short, uchar16_t, char16_t
+* uint32_t, int32_t - unsigned int, int, unsigned long, long, uchar32_t, char32_t
+* uint64_t, int64_t - 64 bit uint/ints
+* unsigned long long, long long - at least 64 bits
+* Floating Point Types
+* half - 16 bit fp
+* float - 32 bit fp
+* double - 64 bit fp
+*/
+
+union PrintfPacketData {
+  uint32_t        ui;
+  int32_t         i;
+  uint64_t        uli;
+  int64_t         li;
+  hc::half        h;
+  float           f;
+  double          d;
+  void*           ptr;
+  const void*     cptr;
+
+  // Header offset members (union uses same memory)
+  // uia[0] - PrintfPacket buffer offset
+  // uia[1] - Printf String buffer offset
+  // ali - Using a single atomic offset of 8B, update
+  // both uias of 4B using single atomic operation.
+  // ull - used to load offsets non-atomically, and
+  // required to update atomic_ullong. Non-atomic
+  // use of ull will also run faster.
+  std::atomic<uint64_t> ali;
+  uint32_t        uia[2];
+};
+
+enum PrintfPacketDataType {
+  // Header types
+  PRINTF_BUFFER_SIZE = 0
+  ,PRINTF_STRING_BUFFER = 1
+  ,PRINTF_STRING_BUFFER_SIZE = 2
+  ,PRINTF_OFFSETS = 3
+  ,PRINTF_HEADER_SIZE = 4
+  ,PRINTF_MIN_SIZE = 5
+
+  // Packet Data types
+  ,PRINTF_UNUSED
+  ,PRINTF_UINT32_T
+  ,PRINTF_INT32_T
+  ,PRINTF_UINT64_T
+  ,PRINTF_INT64_T
+  ,PRINTF_HALF
+  ,PRINTF_FLOAT
+  ,PRINTF_DOUBLE
+  ,PRINTF_VOID_PTR
+  ,PRINTF_CONST_VOID_PTR
+  ,PRINTF_CHAR_PTR
+  ,PRINTF_CONST_CHAR_PTR
+};
+
+class PrintfPacket {
+public:
+  void clear()             [[hc,cpu]] { type = PRINTF_UNUSED; }
+  void set(uint32_t d)     [[hc,cpu]] { type = PRINTF_UINT32_T;       data.ui = d; }
+  void set(int32_t d)      [[hc,cpu]] { type = PRINTF_INT32_T;        data.i = d; }
+  void set(uint64_t d)     [[hc,cpu]] { type = PRINTF_UINT64_T;       data.uli = d; }
+  void set(int64_t d)      [[hc,cpu]] { type = PRINTF_INT64_T;        data.li = d; }
+  void set(unsigned long long d) [[hc,cpu]] { type = PRINTF_UINT64_T; data.uli = d; }
+  void set(long long d)    [[hc,cpu]] { type = PRINTF_INT64_T;        data.li = d; }
+  void set(hc::half d)     [[hc,cpu]] { type = PRINTF_HALF;           data.h = d; }
+  void set(float d)        [[hc,cpu]] { type = PRINTF_FLOAT;          data.f = d; }
+  void set(double d)       [[hc,cpu]] { type = PRINTF_DOUBLE;         data.d = d; }
+  void set(void* d)        [[hc,cpu]] { type = PRINTF_VOID_PTR;       data.ptr = d; }
+  void set(const void* d)  [[hc,cpu]] { type = PRINTF_CONST_VOID_PTR; data.cptr = d; }
+  void set(char* d)        [[hc,cpu]] { type = PRINTF_CHAR_PTR;       data.ptr = d; }
+  void set(const char* d)  [[hc,cpu]] { type = PRINTF_CONST_CHAR_PTR; data.cptr = d; }
+  PrintfPacketDataType type;
+  PrintfPacketData data;
+};
+
+// Global printf buffer
+// The actual variable is currently defined in mcwamp_hsa.cpp
+extern PrintfPacket* printf_buffer;
+
+enum PrintfError {
+   PRINTF_SUCCESS = 0
+  ,PRINTF_BUFFER_OVERFLOW = 1
+  ,PRINTF_STRING_BUFFER_OVERFLOW = 2
+  ,PRINTF_UNKNOWN_ERROR = 3
+  ,PRINTF_BUFFER_NULLPTR = 4
+};
+
+static inline PrintfPacket* createPrintfBuffer(const unsigned int numElements) {
+  PrintfPacket* printfBuffer = NULL;
+  if (numElements > PRINTF_MIN_SIZE) {
+    printfBuffer = hc::internal::am_alloc_host_coherent(sizeof(PrintfPacket) * numElements);
+
+    // Initialize the Header elements of the Printf Buffer
+    printfBuffer[PRINTF_BUFFER_SIZE].type = PRINTF_BUFFER_SIZE;
+    printfBuffer[PRINTF_BUFFER_SIZE].data.ui = numElements;
+
+    // Header includes a helper string buffer which holds all char* args
+    // PrintfPacket is 12 bytes, equivalent string buffer size used
+    printfBuffer[PRINTF_STRING_BUFFER].type = PRINTF_STRING_BUFFER;
+    printfBuffer[PRINTF_STRING_BUFFER].data.ptr = hc::internal::am_alloc_host_coherent(sizeof(char) * numElements * 12);
+    printfBuffer[PRINTF_STRING_BUFFER_SIZE].type = PRINTF_STRING_BUFFER_SIZE;
+    printfBuffer[PRINTF_STRING_BUFFER_SIZE].data.ui = numElements * 12;
+
+    // Using one atomic offset to maintain order and atomicity
+    printfBuffer[PRINTF_OFFSETS].type = PRINTF_OFFSETS;
+    printfBuffer[PRINTF_OFFSETS].data.uia[0] = PRINTF_HEADER_SIZE;
+    printfBuffer[PRINTF_OFFSETS].data.uia[1] = 0;
+  }
+  return printfBuffer;
+}
+
+static inline void deletePrintfBuffer(PrintfPacket*& buffer) {
+  if (buffer){
+    if (buffer[PRINTF_STRING_BUFFER].data.ptr)
+      hc::am_free(buffer[PRINTF_STRING_BUFFER].data.ptr);
+    hc::am_free(buffer);
+  }
+  buffer = NULL;
+}
+
+static inline unsigned int string_length(const char* str) [[hc,cpu]]{
+  unsigned int size = 0;
+  while(str[size]!='\0')
+    size++;
+  return size;
+}
+
+static inline void copy_n(char* dest, const char* src, const unsigned int len) [[hc,cpu]] {
+  for(unsigned int i=0; i < len; i++){
+    dest[i] = src[i];
+  }
+}
+
+// return the memory size (including '/0') if it's a C-string
+template <typename T>
+std::size_t mem_size_if_string(typename std::enable_if< std::is_same<T,const char*>::value
+                                                        || std::is_same<T,char*>::value, T>::type  s) [[hc,cpu]] {
+  return string_length(s) + 1;
+}
+
+template <typename T>
+std::size_t mem_size_if_string(typename std::enable_if< !std::is_same<T,const char*>::value
+                                                         && !std::is_same<T,char*>::value, T>::type  s) [[hc,cpu]] {
+  return 0;
+}
+
+// get the argument count
+static inline void countArg(unsigned int& count_arg, unsigned int& count_char) [[hc,cpu]] {}
+template <typename T>
+static inline void countArg(unsigned int& count_arg, unsigned int& count_char, const T t) [[hc,cpu]] {
+  ++count_arg;
+  count_char += mem_size_if_string<T>(t);
+}
+template <typename T, typename... Rest>
+static inline void countArg(unsigned int& count_arg, unsigned int& count_char, const T t, const Rest&... rest) [[hc,cpu]] {
+  ++count_arg;
+  count_char += mem_size_if_string<T>(t);
+  countArg(count_arg, count_char, rest...);
+}
+
+template<typename T>
+PrintfError process_str_batch(PrintfPacket* queue, int poffset, unsigned int& soffset
+, typename std::enable_if< std::is_same<T,const char*>::value || std::is_same<T,char*>::value, T>::type string) [[hc,cpu]] {
+
+  if (queue[poffset].type != PRINTF_CHAR_PTR && queue[poffset].type != PRINTF_CONST_CHAR_PTR)
+    return PRINTF_UNKNOWN_ERROR;
+
+  unsigned int str_len = string_length(string);
+  unsigned int sb_offset = soffset;
+  char* string_buffer = (char*) queue[PRINTF_STRING_BUFFER].data.ptr;
+  if (!string_buffer || soffset + str_len + 1 > queue[PRINTF_STRING_BUFFER_SIZE].data.ui){
+    return PRINTF_STRING_BUFFER_OVERFLOW;
+  }
+  copy_n(&string_buffer[sb_offset], string, str_len + 1);
+  queue[poffset].set(&string_buffer[sb_offset]);
+  soffset += str_len + 1;
+  return PRINTF_SUCCESS;
+}
+
+template<typename T>
+PrintfError process_str_batch(PrintfPacket* queue, int poffset, unsigned int& soffset
+, typename std::enable_if< !std::is_same<T,const char*>::value && !std::is_same<T,char*>::value, T>::type data) [[hc,cpu]] {
+
+  if (queue[poffset].type == PRINTF_CHAR_PTR || queue[poffset].type == PRINTF_CONST_CHAR_PTR)
+    return PRINTF_UNKNOWN_ERROR;
+  else
+    return PRINTF_SUCCESS;
+}
+
+template <typename T>
+static inline PrintfError set_batch(PrintfPacket* queue, int poffset, unsigned int& soffset, const T t) [[hc,cpu]] {
+  PrintfError err = PRINTF_SUCCESS;
+  queue[poffset].set(t);
+  err = process_str_batch<T>(queue, poffset, soffset, t);
+  return err;
+}
+
+template <typename T, typename... Rest>
+static inline PrintfError set_batch(PrintfPacket* queue, int poffset, unsigned int& soffset, const T t, Rest... rest) [[hc,cpu]] {
+  PrintfError err = PRINTF_SUCCESS;
+  queue[poffset].set(t);
+
+  if ((err = process_str_batch<T>(queue, poffset, soffset, t)) != PRINTF_SUCCESS)
+    return err;
+
+  return set_batch(queue, poffset + 1, soffset, rest...);
+}
+
+template <typename... All>
+static inline PrintfError printf(PrintfPacket* queue, All... all) [[hc,cpu]] {
+  unsigned int count_arg = 0;
+  unsigned int count_char = 0;
+  countArg(count_arg, count_char, all...);
+
+  PrintfError error = PRINTF_SUCCESS;
+  PrintfPacketData old_off, try_off;
+
+  if (!queue) {
+    error = PRINTF_BUFFER_NULLPTR;
+  }
+  else if (count_arg + 1 + queue[PRINTF_OFFSETS].data.uia[0] > queue[PRINTF_BUFFER_SIZE].data.ui) {
+    error = PRINTF_BUFFER_OVERFLOW;
+  }
+  else if (!queue[PRINTF_STRING_BUFFER].data.ptr || count_char + queue[PRINTF_OFFSETS].data.uia[1] > queue[PRINTF_STRING_BUFFER_SIZE].data.ui){
+    error = PRINTF_STRING_BUFFER_OVERFLOW;
+  }
+  else {
+    do {
+      // Suggest an offset and compete with other kernels for a spot.
+      // One kernel will make it through at a time. Attempt
+      // to win a portion of printf buffer and printf string buffer.
+      // Otherwise, update to latest offset values, and try again.
+      old_off.uli = queue[PRINTF_OFFSETS].data.ali.load();
+      try_off.uia[0] = old_off.uia[0] + count_arg + 1;
+      try_off.uia[1] = old_off.uia[1] + count_char;
+    } while(!(queue[PRINTF_OFFSETS].data.ali.compare_exchange_weak(old_off.uli, try_off.uli)));
+
+    unsigned int poffset = (unsigned int)old_off.uia[0];
+    unsigned int soffset = (unsigned int)old_off.uia[1];
+
+    if (poffset + count_arg + 1 > queue[PRINTF_BUFFER_SIZE].data.ui) {
+      error = PRINTF_BUFFER_OVERFLOW;
+    }
+    else if (soffset + count_char > queue[PRINTF_STRING_BUFFER_SIZE].data.ui){
+      error = PRINTF_STRING_BUFFER_OVERFLOW;
+    }
+    else {
+      if (set_batch(queue, poffset, soffset, count_arg, all...) != PRINTF_SUCCESS)
+        error = PRINTF_STRING_BUFFER_OVERFLOW;
+    }
+  }
+
+  return error;
+}
+
+
+// The presence of hc::printf may impact performance even when it's not being called.
+// Currently hcc's printf on accelerator is an opt-in feature.  This means that users
+// have to define HCC_ENABLE_ACCELERATOR_PRINTF to enable it.
+#ifdef HCC_ENABLE_ACCELERATOR_PRINTF
+
+template <typename... All>
+static inline PrintfError printf(const char* format_string, All... all) [[hc,cpu]] {
+  return printf(hc::printf_buffer, format_string, all...);
+}
+
+#else
+
+// this is just a stubs for printf that doesn't do anything
+template <typename... All>
+static inline PrintfError printf(const char* format_string, All... all) [[hc,cpu]] {
+  return PRINTF_SUCCESS;
+}
+
+#endif
+
+// regex for finding format string specifiers
+static const std::regex specifierPattern("(%){1}[-+#0]*[0-9]*((.)[0-9]+){0,1}([hl]*)([diuoxXfFeEgGaAcsp]){1}");
+static const std::regex signedIntegerPattern("(%){1}[-+#0]*[0-9]*((.)[0-9]+){0,1}([hl]*)([cdi]){1}");
+static const std::regex unsignedIntegerPattern("(%){1}[-+#0]*[0-9]*((.)[0-9]+){0,1}([hl]*)([uoxX]){1}");
+static const std::regex floatPattern("(%){1}[-+#0]*[0-9]*((.)[0-9]+){0,1}([fFeEgGaA]){1}");
+static const std::regex pointerPattern("(%){1}[ps]");
+static const std::regex doubleAmpersandPattern("(%){2}");
+static const std::string ampersand("%");
+
+static inline void processPrintfPackets(PrintfPacket* packets, const unsigned int numPackets) {
+
+  for (unsigned int i = 0; i < numPackets; ) {
+
+    unsigned int numPrintfArgs = packets[i++].data.ui;
+    if (numPrintfArgs == 0)
+      continue;
+
+    // get the format
+    unsigned int formatStringIndex = i++;
+    assert(packets[formatStringIndex].type == PRINTF_CHAR_PTR
+           || packets[formatStringIndex].type == PRINTF_CONST_CHAR_PTR);
+    std::string formatString((const char*)packets[formatStringIndex].data.cptr);
+    std::smatch specifierMatches;
+
+#if HC_PRINTF_DEBUG
+    std::printf("%s:%d \t number of matches = %d\n", __FUNCTION__, __LINE__, (int)specifierMatches.size());
+#endif
+
+    for (unsigned int j = 1; j < numPrintfArgs; ++j, ++i) {
+
+      if (!std::regex_search(formatString, specifierMatches, specifierPattern)) {
+        // More printf argument than format specifier??
+        // Just skip to the next printf request
+        i+=(numPrintfArgs - j);
+        break;
+      }
+
+      std::string specifier = specifierMatches.str();
+#if HC_PRINTF_DEBUG
+      std::cout << " (specifier found: " << specifier << ") ";
+#endif
+
+      // print the substring before the specifier
+      // clean up all the double ampersands
+      std::string prefix = specifierMatches.prefix();
+      prefix = std::regex_replace(prefix,doubleAmpersandPattern,ampersand);
+      std::printf("%s",prefix.c_str());
+
+      std::smatch specifierTypeMatch;
+      if (std::regex_search(specifier, specifierTypeMatch, unsignedIntegerPattern)) {
+        std::printf(specifier.c_str(), packets[i].data.ui);
+      } else if (std::regex_search(specifier, specifierTypeMatch, signedIntegerPattern)) {
+        std::printf(specifier.c_str(), packets[i].data.i);
+      } else if (std::regex_search(specifier, specifierTypeMatch, floatPattern)) {
+        if (packets[i].type == PRINTF_HALF)
+          std::cout << static_cast<float>(packets[i].data.h);
+        else if (packets[i].type == PRINTF_FLOAT)
+          std::printf(specifier.c_str(), packets[i].data.f);
+        else
+          std::printf(specifier.c_str(), packets[i].data.d);
+      } else if (std::regex_search(specifier, specifierTypeMatch, pointerPattern)) {
+        std::printf(specifier.c_str(), packets[i].data.cptr);
+      }
+      else {
+        assert(false);
+      }
+      formatString = specifierMatches.suffix();
+    }
+    // print the substring after the last specifier
+    // clean up all the double ampersands before printing
+    formatString = std::regex_replace(formatString,doubleAmpersandPattern,ampersand);
+    std::printf("%s",formatString.c_str());
+  }
+  std::flush(std::cout);
+}
+
+static inline void processPrintfBuffer(PrintfPacket* gpuBuffer) {
+
+  if (gpuBuffer == nullptr) return;
+
+  unsigned int cursor = gpuBuffer[PRINTF_OFFSETS].data.uia[0];
+
+  // check whether the printf buffer is non-empty
+  if (cursor !=  PRINTF_HEADER_SIZE) {
+    unsigned int bufferSize = gpuBuffer[PRINTF_BUFFER_SIZE].data.ui;
+    unsigned int numPackets = ((bufferSize<cursor)?bufferSize:cursor) - PRINTF_HEADER_SIZE;
+
+    processPrintfPackets(gpuBuffer+PRINTF_HEADER_SIZE, numPackets);
+
+    // reset the printf buffer and string buffer
+    gpuBuffer[PRINTF_OFFSETS].data.uia[0] = PRINTF_HEADER_SIZE;
+    gpuBuffer[PRINTF_OFFSETS].data.uia[1] = 0;
+  }
+}
+
+
+} // namespace hc
diff --git a/include/hc/hc_queue_pool.hpp b/include/hc/hc_queue_pool.hpp
new file mode 100644
index 00000000000..93b27e5b7ed
--- /dev/null
+++ b/include/hc/hc_queue_pool.hpp
@@ -0,0 +1,200 @@
+//===----------------------------------------------------------------------===//
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+
+#include "hc_agent_pool.hpp"
+
+#include <hsa/hsa.h>
+
+#include <algorithm>
+#include <climits>
+#include <cstddef>
+#include <cstdint>
+#include <iostream>
+#include <memory>
+#include <mutex>
+#include <stdexcept>
+#include <unordered_map>
+#include <utility>
+
+namespace hc
+{
+    namespace detail
+    {
+        class Queue_pool {
+            struct Deleter {
+                void operator()(hsa_queue_t* queue) const noexcept
+                {
+                    try {
+                        throwing_hsa_result_check(
+                            hsa_queue_destroy(queue),
+                            __FILE__, __func__, __LINE__);
+                    }
+                    catch (const std::exception& ex) {
+                        std::cerr << ex.what() << std::endl;
+                    }
+                }
+            };
+
+            using RAIIQueue_ = std::unique_ptr<hsa_queue_t, Deleter>;
+            using OnceRAIIQueue_ = std::pair<std::once_flag, RAIIQueue_>;
+
+            // IMPLEMENTATION - DATA - STATICS
+            static constexpr std::size_t default_queue_{0u};
+            static constexpr std::size_t first_queue_idx_{default_queue_ + 1};
+
+            // IMPLEMENTATION - STATICS
+            template<typename T>
+            static
+            const T& clamp_(const T& lo, const T& x, const T& hi) noexcept
+            {
+                if (x < lo) return lo;
+                if (hi < x) return hi;
+                return x;
+            }
+
+            static
+            RAIIQueue_ make_queue_(hsa_agent_t x)
+            {
+                static constexpr std::uint32_t default_sz{256u};
+
+                const auto sz = clamp_(
+                    Agent_pool::pool()[x].min_queue_size,
+                    default_sz,
+                    Agent_pool::pool()[x].max_queue_size);
+
+                hsa_queue_t* r{};
+                throwing_hsa_result_check(
+                    hsa_queue_create(
+                        x,
+                        sz,
+                        HSA_QUEUE_TYPE_MULTI,
+                        [](hsa_status_t status, hsa_queue_t*, void*) {
+                            try {
+                                throwing_hsa_result_check(
+                                    status, __FILE__, __func__, __LINE__);
+                            }
+                            catch (const std::exception& ex) {
+                                std::cerr << ex.what() << std::endl;
+
+                                throw;
+                            }
+                        },
+                        nullptr,
+                        UINT32_MAX,
+                        Agent_pool::pool()[x].max_tile_static_size,
+                        &r),
+                    __FILE__, __func__, __LINE__);
+
+                return RAIIQueue_{r, Deleter{}};
+            }
+
+            static
+            std::unordered_map<hsa_agent_t, std::vector<OnceRAIIQueue_>>&
+                pool_()
+            {
+                static std::unordered_map<
+                    hsa_agent_t, std::vector<OnceRAIIQueue_>> r;
+                static std::once_flag f;
+
+                std::call_once(f, []() {
+                    for (auto&& agent : Agent_pool::pool()) {
+                        r.emplace(
+                            std::piecewise_construct,
+                            std::make_tuple(agent.first),
+                            std::make_tuple(agent.second.max_queue_count));
+                    }
+                });
+
+                return r;
+            }
+
+            static
+            std::uint64_t read_index_(hsa_queue_t* x) noexcept
+            {
+                return hsa_queue_load_read_index_scacquire(x);
+            }
+
+            static
+            std::uint64_t write_index_(hsa_queue_t* x) noexcept
+            {
+                std::uint64_t r;
+                do {
+                    r = hsa_queue_load_write_index_scacquire(x);
+                }
+                while (hsa_queue_cas_write_index_scacq_screl(x, r, r + 1) != r);
+
+                return r;
+            }
+        public:
+            static
+            hsa_queue_t* default_queue(hsa_agent_t agent)
+            {
+                if (pool_()[agent].empty()) return nullptr;
+
+                std::call_once(pool_()[agent][default_queue_].first, [=]() {
+                    pool_()[agent][default_queue_].second = make_queue_(agent);
+                });
+
+                return pool_()[agent][default_queue_].second.get();
+            }
+
+            static
+            hsa_queue_t* defined_queue(hsa_agent_t agent)
+            {
+                if (pool_()[agent].empty()) return nullptr;
+
+                static std::unordered_map<
+                    hsa_agent_t, std::atomic<std::uint16_t>> cnt;
+
+                const auto defined_queue_cnt = pool_()[agent].size() - 1;
+                const auto idx =
+                    first_queue_idx_ + (cnt[agent]++ % defined_queue_cnt);
+
+                std::call_once(pool_()[agent][idx].first, [=]() {
+                    pool_()[agent][idx].second = make_queue_(agent);
+                });
+
+                return pool_()[agent][idx].second.get();
+            }
+
+            static
+            void enable(
+                std::pair<void*, std::uint64_t>& slot,
+                hsa_queue_t* queue) noexcept
+            {   // Precondition: reserved2 = fully formed packet header.
+                auto p = static_cast<hsa_barrier_and_packet_t*>(slot.first);
+                std::uint16_t h = p->reserved2;
+                p->reserved2 = 0;
+
+                __atomic_store(&p->header, &h, __ATOMIC_SEQ_CST);
+
+                hsa_signal_store_screlease(queue->doorbell_signal, slot.second);
+            }
+
+            static
+            std::pair<void*, std::uint64_t> queue_slot(hsa_queue_t* queue)
+            {   // TODO: add per-queue backoff.
+                if (!queue) {
+                    throw std::logic_error{
+                        "Tried to get slot in non-existing queue."};
+                }
+
+                auto p = static_cast<hsa_kernel_dispatch_packet_t*>(
+                    queue->base_address);
+                do {
+                    const auto f = read_index_(queue);
+                    const auto l = write_index_(queue);
+
+                    if (queue->size <= l - f) continue;
+
+                    return {p + (l % queue->size), l};
+                } while (true);
+            }
+        };
+    }
+} // Namespace hc.
\ No newline at end of file
diff --git a/include/hc/hc_rt_debug.hpp b/include/hc/hc_rt_debug.hpp
new file mode 100644
index 00000000000..9564328efdf
--- /dev/null
+++ b/include/hc/hc_rt_debug.hpp
@@ -0,0 +1,84 @@
+//===----------------------------------------------------------------------===//
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+
+#include <cstdlib>
+#include <cstdio>
+#ifndef USE_LIBCXX
+#include <cxxabi.h>
+#endif
+#include <iostream>
+#include <cstring>
+#include <sstream>
+#include <string>
+#include <vector>
+
+
+#define DB_API        0  /* 0x0001  HCC runtime API calls */
+#define DB_CMD        1  /* 0x0002  Kernel and Copy Commands and Barriers */
+#define DB_WAIT       2  /* 0x0004  Synchronization and waiting for commands to finish. */
+#define DB_AQL        3  /* 0x0008  Decode and display AQL packets  */
+#define DB_QUEUE      4  /* 0x0010  Queue creation and desruction commands, and queue contents after each command push. */
+#define DB_SIG        5  /* 0x0020  Signal creation, allocation, pool */
+#define DB_LOCK       6  /* 0x0040  Locks and HCC thread-safety code */
+#define DB_KERNARG    7  /* 0x0080  Show first 128 bytes of kernarg blocks passed to kernels */
+#define DB_COPY       8  /* 0x0100  Copy debug */
+#define DB_COPY2      9  /* 0x0200  Detailed copy debug */
+#define DB_RESOURCE  10  /* 0x0400  Resource (signal/kernarg/queue) allocation and growth, and other unusual potentially performance-impacting events. */
+#define DB_INIT      11  /* 0x0800  HCC initialization and shutdown. */
+#define DB_MISC      12  /* 0x1000  misc debug, not yet classified. */
+#define DB_AQL2      13  /* 0x2000  Show raw bytes of AQL packet */
+#define DB_CODE      14  /* 0x4000  Show CreateKernel and code creation debug */
+#define DB_CMD2      15  /* 0x8000  More detailed command info, including barrier commands created by hcc rt. */
+// If adding new define here update the table below:
+
+extern unsigned HCC_DB;
+
+#define DBPARM(x) #x << "=" << x
+
+
+// Keep close to debug defs above since these have to be kept in-sync
+static std::vector<std::string> g_DbStr = {"api", "cmd", "wait", "aql", "queue", "sig", "lock", "kernarg", "copy", "copy2", "resource", "init", "misc", "aql2", "code", "cmd2"};
+
+
+// Macro for prettier debug messages, use like:
+// DBOUT(" Something happened" << myId() << " i= " << i << "\n");
+#define COMPILE_HCC_DB 1
+
+#define DBFLAG(db_flag) (COMPILE_HCC_DB && (HCC_DB & (1<<(db_flag))))
+
+#define DBSTREAM  std::cerr
+#define DBWSTREAM std::wcerr
+
+// Use str::stream so output is atomic wrt other threads:
+#define DBOUT(db_flag, msg) \
+if (DBFLAG(db_flag)) { \
+    std::stringstream sstream;\
+    sstream << "   hcc-" << g_DbStr[db_flag] << " tid:" << hcc_tlsShortTid._shortTid << " " << msg ; \
+    DBSTREAM << sstream.str();\
+};
+
+// Like DBOUT, but add newline:
+#define DBOUTL(db_flag, msg) DBOUT(db_flag, msg << "\n")
+
+// get a the current filename without the path
+#define __FILENAME__ (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/')+1 : __FILE__)
+
+// Class with a constructor that gets called when new thread is created:
+struct ShortTid {
+    ShortTid() ;
+    int _shortTid;
+};
+
+extern thread_local ShortTid hcc_tlsShortTid;
+
+namespace hc {
+
+  static void print_backtrace() {
+  }
+
+} // namespace hc
diff --git a/include/hc/hc_runtime.hpp b/include/hc/hc_runtime.hpp
new file mode 100644
index 00000000000..ce8edd924bc
--- /dev/null
+++ b/include/hc/hc_runtime.hpp
@@ -0,0 +1,174 @@
+//===----------------------------------------------------------------------===//
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+
+#include "hc_aligned_alloc.hpp"
+#include "hc_defines.hpp"
+
+#include <hsa/hsa.h>
+#include <hsa/hsa_ext_amd.h>
+
+#include <algorithm>
+#include <atomic>
+#include <cstring>
+#include <future>
+#include <map>
+#include <stdexcept>
+#include <string>
+#include <system_error>
+#include <vector>
+
+namespace hc
+{
+    namespace detail
+    {
+        namespace enums
+        {
+            /// access_type is used for accelerator that supports unified memory
+            /// Such accelerator can use access_type to control whether can
+            /// access data on it or not
+            enum access_type {
+                access_type_none = 0,
+                access_type_read = (1 << 0),
+                access_type_write = (1 << 1),
+                access_type_read_write = access_type_read | access_type_write,
+                access_type_auto = (1 << 31)
+            };
+
+            enum queuing_mode {
+                queuing_mode_immediate,
+                queuing_mode_automatic
+            };
+
+            enum execute_order {
+                execute_in_order,
+                execute_any_order
+            };
+
+            // Flags to specify visibility of previous commands after a marker
+            // is executed.
+            enum memory_scope {
+                no_scope=0,           // No release operation applied
+                accelerator_scope=1,  // Release to current accelerator
+                system_scope=2,       // Release to system (CPU + all
+                                      // accelerators)
+            };
+
+            static
+            inline
+            memory_scope greater_scope(memory_scope scope1, memory_scope scope2)
+            {
+                if ((scope1==system_scope) || (scope2 == system_scope)) {
+                    return system_scope;
+                }
+                if ((scope1==accelerator_scope) ||
+                    (scope2 == accelerator_scope)) {
+                    return accelerator_scope;
+                }
+                return no_scope;
+            }
+
+            enum hcCommandKind {
+                hcCommandInvalid= -1,
+
+                hcMemcpyHostToHost = 0,
+                hcMemcpyHostToDevice = 1,
+                hcMemcpyDeviceToHost = 2,
+                hcMemcpyDeviceToDevice = 3,
+                hcCommandKernel = 4,
+                hcCommandMarker = 5,
+            };
+
+            // Commands sent to copy queues:
+            static
+            inline
+            bool isCopyCommand(hcCommandKind k)
+            {
+                switch (k) {
+                    case hcMemcpyHostToHost:
+                    case hcMemcpyHostToDevice:
+                    case hcMemcpyDeviceToHost:
+                    case hcMemcpyDeviceToDevice:
+                        return true;
+                    default:
+                        return false;
+                }
+            }
+
+            // Commands sent to compute queue:
+            static
+            inline
+            bool isComputeQueueCommand(hcCommandKind k)
+            {
+                return (k == hcCommandKernel) || (k == hcCommandMarker);
+            }
+
+            enum hcWaitMode {
+                hcWaitModeBlocked = 0,
+                hcWaitModeActive = 1
+            };
+
+            enum accelerator_profile {
+                accelerator_profile_none = 0,
+                accelerator_profile_base = 1,
+                accelerator_profile_full = 2
+            };
+        } // namespace hc::detail::enums
+
+        template<std::size_t m, std::size_t n>
+        inline
+        void throwing_hsa_result_check(
+            hsa_status_t s,
+            const char (&file)[m],
+            const char (&fn)[n],
+            int line)
+        {
+            if (s == HSA_STATUS_SUCCESS || s == HSA_STATUS_INFO_BREAK) return;
+
+            const char* p{};
+            auto r = hsa_status_string(s, &p);
+
+            throw std::system_error{
+                (r == HSA_STATUS_SUCCESS) ? s : r,
+                std::system_category(),
+                "In " + (file +
+                    (", in function " + (fn +
+                    (((", on line " + std::to_string(line)) +
+                    ", HSA RT failed: ") + p))))
+            };
+        }
+
+        inline
+        hsa_amd_pointer_info_t pointer_info(const void* ptr)
+        {
+            hsa_amd_pointer_info_t r{};
+            r.size = sizeof(r);
+
+            throwing_hsa_result_check(
+                hsa_amd_pointer_info(
+                    const_cast<void*>(ptr), &r, nullptr, nullptr, nullptr),
+                __FILE__, __func__, __LINE__);
+
+            return r;
+        }
+
+        inline
+        __attribute__((constructor))
+        void construct_hc_runtime()
+        {
+            throwing_hsa_result_check(hsa_init(), __FILE__, __func__, __LINE__);
+        }
+
+        inline
+        __attribute__((destructor))
+        void destruct_hc_runtime()
+        {
+            throwing_hsa_result_check(
+                hsa_shut_down(), __FILE__, __func__, __LINE__);
+        }
+    } // Namespace hc::detail.
+} // Namespace hc.
\ No newline at end of file
diff --git a/include/hc/hc_short_vector.hpp b/include/hc/hc_short_vector.hpp
new file mode 100644
index 00000000000..266b3060d6d
--- /dev/null
+++ b/include/hc/hc_short_vector.hpp
@@ -0,0 +1,1171 @@
+//===----------------------------------------------------------------------===//
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+
+#include <hc/hc_defines.hpp>
+#include <hc/hc_norm_unorm.hpp>
+
+#include <type_traits>
+
+namespace hc
+{
+    namespace short_vector
+    {
+        template<typename T, int n>
+        class Vector_base {
+            using VecT = typename std::conditional<
+                std::is_same<T, norm>{} || std::is_same<T, unorm>{},
+                float,
+                T>::type __attribute__((ext_vector_type(n)));
+
+            union { // TODO: revise, this is only used for ref_n() functions.
+                VecT data_;
+                T components_[n]{};
+            };
+
+            friend class Vector_base<T, 2>;
+            friend class Vector_base<T, 3>;
+            friend class Vector_base<T, 4>;
+
+            friend
+            inline
+            Vector_base operator+(
+                const Vector_base& x, const Vector_base& y) noexcept [[cpu, hc]]
+            {
+                return Vector_base{x} += y;
+            }
+            friend
+            inline
+            Vector_base operator-(
+                const Vector_base& x, const Vector_base& y) noexcept [[cpu, hc]]
+            {
+                return Vector_base{x} -= y;
+            }
+            friend
+            inline
+            Vector_base operator*(
+                const Vector_base& x, const Vector_base& y) noexcept [[cpu, hc]]
+            {
+                return Vector_base{x} *= y;
+            }
+            friend
+            inline
+            Vector_base operator/(
+                const Vector_base& x, const Vector_base& y) [[cpu, hc]]
+            {
+                return Vector_base{x} /= y;
+            }
+            friend
+            inline
+            bool operator==(
+                const Vector_base& x, const Vector_base& y) noexcept [[cpu, hc]]
+            {
+                auto tmp = x.data_ == y.data_;
+                for (auto i = 0; i != n; ++i) if (tmp[i] == 0) return false;
+
+                return true;
+            }
+            friend
+            inline
+            bool operator!=(
+                const Vector_base& x, const Vector_base& y) noexcept [[cpu, hc]]
+            {
+                return !(x == y);
+            }
+            template<
+                typename U = T,
+                typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+            friend
+            inline
+            Vector_base operator%(
+                const Vector_base& x, const Vector_base& y) noexcept [[cpu, hc]]
+            {
+                return Vector_base{x} %= y;
+            }
+            template<
+                typename U = T,
+                typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+            friend
+            inline
+            Vector_base operator^(
+                const Vector_base& x, const Vector_base& y) noexcept [[cpu, hc]]
+            {
+                return Vector_base{x} ^= y;
+            }
+            template<
+                typename U = T,
+                typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+            friend
+            inline
+            Vector_base operator|(
+                const Vector_base& x, const Vector_base& y) noexcept [[cpu, hc]]
+            {
+                return Vector_base{x} |= y;
+            }
+            template<
+                typename U = T,
+                typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+            friend
+            inline
+            Vector_base operator&(
+                const Vector_base& x, const Vector_base& y) noexcept [[cpu, hc]]
+            {
+                return Vector_base{x} &= y;
+            }
+            template<
+                typename U = T,
+                typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+            friend
+            inline
+            Vector_base operator<<(
+                const Vector_base& x, const Vector_base& y) noexcept [[cpu, hc]]
+            {
+                return Vector_base{x} <<= y;
+            }
+            template<
+                typename U = T,
+                typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+            friend
+            inline
+            Vector_base operator>>(
+                const Vector_base& x, const Vector_base& y) noexcept [[cpu, hc]]
+            {
+                return Vector_base{x} >>= y;
+            }
+
+            explicit
+            Vector_base(VecT x) noexcept [[cpu, hc]] : data_{x} {}
+        public:
+            using value_type = T;
+
+            static constexpr int size{n};
+
+            // CREATORS
+            Vector_base() [[cpu, hc]] = default;
+            Vector_base(const Vector_base&) [[cpu, hc]] = default;
+            Vector_base(Vector_base&&) = default;
+            constexpr
+            Vector_base(T x) noexcept [[cpu, hc]] : data_(x) {}
+            template<
+                int m = n, typename std::enable_if<m == 2>::type* = nullptr>
+            constexpr
+            Vector_base(T x, T y) noexcept [[cpu, hc]] : data_{x, y} {}
+            template<
+                int m = n, typename std::enable_if<m == 3>::type* = nullptr>
+            constexpr
+            Vector_base(T x, T y, T z) noexcept [[cpu, hc]] : data_{x, y, z} {}
+            template<
+                int m = n, typename std::enable_if<m == 4>::type* = nullptr>
+            constexpr
+            Vector_base(T x, T y, T z, T w) noexcept [[cpu, hc]]
+                : data_{x, y, z, w}
+            {}
+            template<
+                typename U,
+                int m,
+                typename std::enable_if<
+                    std::is_convertible<U, T>{} && m == n>::type* = nullptr>
+            Vector_base(const Vector_base<U, m>& x)
+            {   // TODO: optimise.
+                for (auto i = 0; i != m; ++i) data_[i] = x.data_[i];
+            }
+            ~Vector_base() [[cpu, hc]] = default;
+
+            // MANIPULATORS
+            Vector_base& operator=(const Vector_base&) [[cpu, hc]] = default;
+            Vector_base& operator=(Vector_base&&) [[cpu, hc]] = default;
+            Vector_base& operator+=(const Vector_base& x) noexcept [[cpu, hc]]
+            {
+                data_ += x.data_;
+                return *this;
+            }
+            Vector_base& operator-=(const Vector_base& x) noexcept [[cpu, hc]]
+            {
+                data_ -= x.data_;
+                return *this;
+            }
+            Vector_base& operator*=(const Vector_base& x) noexcept [[cpu, hc]]
+            {
+                data_ *= x.data_;
+                return *this;
+            }
+            Vector_base& operator/=(const Vector_base& x) [[cpu, hc]]
+            {
+                data_ /= x.data_;
+                return *this;
+            }
+            Vector_base& operator++() noexcept [[cpu, hc]]
+            {
+                ++data_;
+                return *this;
+            }
+            Vector_base operator++(int) noexcept [[cpu, hc]]
+            {
+                Vector_base tmp{*this};
+                ++*this;
+                return tmp;
+            }
+            Vector_base& operator--() noexcept [[cpu, hc]]
+            {
+                --data_;
+                return *this;
+            }
+            Vector_base operator--(int) noexcept [[cpu, hc]]
+            {
+                Vector_base tmp{*this};
+                --*this;
+                return tmp;
+            }
+            template<
+                typename U = T,
+                typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+            Vector_base operator%=(const Vector_base& x) [[cpu, hc]]
+            {
+                data_ %= x.data_;
+                return *this;
+            }
+            template<
+                typename U = T,
+                typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+            Vector_base operator^=(const Vector_base& x) noexcept [[cpu, hc]]
+            {
+                data_ ^= x.data_;
+                return *this;
+            }
+            template<
+                typename U = T,
+                typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+            Vector_base operator|=(const Vector_base& x) noexcept [[cpu, hc]]
+            {
+                data_ |= x.data_;
+                return *this;
+            }
+            template<
+                typename U = T,
+                typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+            Vector_base operator&=(const Vector_base& x) noexcept [[cpu, hc]]
+            {
+                data_ &= x.data_;
+                return *this;
+            }
+            template<
+                typename U = T,
+                typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+            Vector_base operator>>=(const Vector_base& x) [[cpu, hc]]
+            {
+                data_ >>= x.data_;
+                return *this;
+            }
+            template<
+                typename U = T,
+                typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+            Vector_base operator<<=(const Vector_base& x) [[cpu, hc]]
+            {
+                data_ <<= x.data_;
+                return *this;
+            }
+
+            // one-component access
+            void set_x(T x) noexcept [[cpu, hc]] { data_.x = x; }
+            template<
+                int m = n, typename std::enable_if<m == 2>::type* = nullptr>
+            void set_y(T x) noexcept [[cpu, hc]] { data_.y = x; }
+            template<
+                int m = n, typename std::enable_if<m == 3>::type* = nullptr>
+            void set_z(T x) noexcept [[cpu, hc]] { data_.z = x; }
+            template<
+                int m = n, typename std::enable_if<m == 4>::type* = nullptr>
+            void set_w(T x) noexcept [[cpu, hc]] { data_.w = x; }
+            void set_r(T x) noexcept [[cpu, hc]] { set_x(x); }
+            template<
+                int m = n, typename std::enable_if<m == 2>::type* = nullptr>
+            void set_g(T x) noexcept [[cpu, hc]] { set_y(x); }
+            template<
+                int m = n, typename std::enable_if<m == 3>::type* = nullptr>
+            void set_b(T x) noexcept [[cpu, hc]] { set_z(x); }
+            template<
+                int m = n, typename std::enable_if<m == 4>::type* = nullptr>
+            void set_a(T x) noexcept [[cpu, hc]] { set_w(x); }
+
+            T& ref_x() noexcept [[cpu, hc]] { return components_[0]; }
+            template<
+                int m = n, typename std::enable_if<m == 2>::type* = nullptr>
+            T& ref_y() noexcept [[cpu, hc]] { return components_[1]; }
+            template<
+                int m = n, typename std::enable_if<m == 3>::type* = nullptr>
+            T& ref_z() noexcept [[cpu, hc]] { return components_[2]; }
+            template<
+                int m = n, typename std::enable_if<m == 4>::type* = nullptr>
+            T& ref_w() noexcept [[cpu, hc]] { return components_[3]; }
+            T& ref_r() noexcept [[cpu, hc]] { return ref_x(); }
+            template<
+                int m = n, typename std::enable_if<m == 2>::type* = nullptr>
+            T& ref_g() noexcept [[cpu, hc]] { return ref_y(); }
+            template<
+                int m = n, typename std::enable_if<m == 3>::type* = nullptr>
+            T& ref_b() noexcept [[cpu, hc]] { return ref_z(); }
+            template<
+                int m = n, typename std::enable_if<m == 4>::type* = nullptr>
+            T& ref_a() noexcept [[cpu, hc]] { return ref_w(); }
+
+            // two-component access
+            template<
+                int m = n, typename std::enable_if<(m > 1)>::type* = nullptr>
+            void set_xy(Vector_base<T, 2> x) noexcept [[cpu, hc]]
+            {
+                data_.xy = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 1)>::type* = nullptr>
+            void set_yx(Vector_base<T, 2> x) noexcept [[cpu, hc]]
+            {
+                data_.yx = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 2)>::type* = nullptr>
+            void set_xz(Vector_base<T, 2> x) noexcept [[cpu, hc]]
+            {
+                data_.xz = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 2)>::type* = nullptr>
+            void set_zx(Vector_base<T, 2> x) noexcept [[cpu, hc]]
+            {
+                data_.zx = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_xw(Vector_base<T, 2> x) noexcept [[cpu, hc]]
+            {
+                data_.xw = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_wx(Vector_base<T, 2> x) noexcept [[cpu, hc]]
+            {
+                data_.wx = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 2)>::type* = nullptr>
+            void set_yz(Vector_base<T, 2> x) noexcept [[cpu, hc]]
+            {
+                data_.yz = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 2)>::type* = nullptr>
+            void set_zy(Vector_base<T, 2> x) noexcept [[cpu, hc]]
+            {
+                data_.zy = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_yw(Vector_base<T, 2> x) noexcept [[cpu, hc]]
+            {
+                data_.yw = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_wy(Vector_base<T, 2> x) noexcept [[cpu, hc]]
+            {
+                data_.wy = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_zw(Vector_base<T, 2> x) noexcept [[cpu, hc]]
+            {
+                data_.zw = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_wz(Vector_base<T, 2> x) noexcept [[cpu, hc]]
+            {
+                data_.wz = x.data_;
+            }
+            // three-component access
+            template<
+                int m = n, typename std::enable_if<(m > 2)>::type* = nullptr>
+            void set_xyz(Vector_base<T, 3> x) noexcept [[cpu, hc]]
+            {
+                data_.xyz = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 2)>::type* = nullptr>
+            void set_yzx(Vector_base<T, 3> x) noexcept [[cpu, hc]]
+            {
+                data_.yzx = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 2)>::type* = nullptr>
+            void set_zxy(Vector_base<T, 3> x) noexcept [[cpu, hc]]
+            {
+                data_.zxy = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 2)>::type* = nullptr>
+            void set_xzy(Vector_base<T, 3> x) noexcept [[cpu, hc]]
+            {
+                data_.xzy = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 2)>::type* = nullptr>
+            void set_yxz(Vector_base<T, 3> x) noexcept [[cpu, hc]]
+            {
+                data_.yxz = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 2)>::type* = nullptr>
+            void set_zyx(Vector_base<T, 3> x) noexcept [[cpu, hc]]
+            {
+                data_.zyx = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_xyw(Vector_base<T, 3> x) noexcept [[cpu, hc]]
+            {
+                data_.xyw = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_ywx(Vector_base<T, 3> x) noexcept [[cpu, hc]]
+            {
+                data_.ywx = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_wxy(Vector_base<T, 3> x) noexcept [[cpu, hc]]
+            {
+                data_.wxy = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_xwy(Vector_base<T, 3> x) noexcept [[cpu, hc]]
+            {
+                data_.xwy = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_yxw(Vector_base<T, 3> x) noexcept [[cpu, hc]]
+            {
+                data_.yxw = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_wyx(Vector_base<T, 3> x) noexcept [[cpu, hc]]
+            {
+                data_.wyx = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_xzw(Vector_base<T, 3> x) noexcept [[cpu, hc]]
+            {
+                data_.xzw = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_zwx(Vector_base<T, 3> x) noexcept [[cpu, hc]]
+            {
+                data_.zwx = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_wxz(Vector_base<T, 3> x) noexcept [[cpu, hc]]
+            {
+                data_.wxz = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_xwz(Vector_base<T, 3> x) noexcept [[cpu, hc]]
+            {
+                data_.xwz = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_zxw(Vector_base<T, 3> x) noexcept [[cpu, hc]]
+            {
+                data_.zxw = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_wzx(Vector_base<T, 3> x) noexcept [[cpu, hc]]
+            {
+                data_.wzx = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_yzw(Vector_base<T, 3> x) noexcept [[cpu, hc]]
+            {
+                data_.yzw = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_zwy(Vector_base<T, 3> x) noexcept [[cpu, hc]]
+            {
+                data_.zwy = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_wyz(Vector_base<T, 3> x) noexcept [[cpu, hc]]
+            {
+                data_.wyz = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_wzy(Vector_base<T, 3> x) noexcept [[cpu, hc]]
+            {
+                data_.wzy = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_ywz(Vector_base<T, 3> x) noexcept [[cpu, hc]]
+            {
+                data_.ywz = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_zyw(Vector_base<T, 3> x) noexcept [[cpu, hc]]
+            {
+                data_.zyw = x.data_;
+            }
+
+            // four-component access
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_xyzw(Vector_base<T, 4> x) noexcept [[cpu, hc]]
+            {
+                data_.xyzw = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_xzwy(Vector_base<T, 4> x) noexcept [[cpu, hc]]
+            {
+                data_.xzwy = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_xwyz(Vector_base<T, 4> x) noexcept [[cpu, hc]]
+            {
+                data_.xwyz = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_xzyw(Vector_base<T, 4> x) noexcept [[cpu, hc]]
+            {
+                data_.xzyw = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_xywz(Vector_base<T, 4> x) noexcept [[cpu, hc]]
+            {
+                data_.xywz = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_xwzy(Vector_base<T, 4> x) noexcept [[cpu, hc]]
+            {
+                data_.xwzy = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_yzwx(Vector_base<T, 4> x) noexcept [[cpu, hc]]
+            {
+                data_.yzwx = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_ywxz(Vector_base<T, 4> x) noexcept [[cpu, hc]]
+            {
+                data_.ywxz = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_yxzw(Vector_base<T, 4> x) noexcept [[cpu, hc]]
+            {
+                data_.yxzw = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_yxwz(Vector_base<T, 4> x) noexcept [[cpu, hc]]
+            {
+                data_.yxwz = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_yzxw(Vector_base<T, 4> x) noexcept [[cpu, hc]]
+            {
+                data_.yzxw = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_ywzx(Vector_base<T, 4> x) noexcept [[cpu, hc]]
+            {
+                data_.ywzx = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_zwxy(Vector_base<T, 4> x) noexcept [[cpu, hc]]
+            {
+                data_.zwxy = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_zxyw(Vector_base<T, 4> x) noexcept [[cpu, hc]]
+            {
+                data_.zxyw = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_zywx(Vector_base<T, 4> x) noexcept [[cpu, hc]]
+            {
+                data_.zywx = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_zyxw(Vector_base<T, 4> x) noexcept [[cpu, hc]]
+            {
+                data_.zyxw = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_zwyx(Vector_base<T, 4> x) noexcept [[cpu, hc]]
+            {
+                data_.zwyx = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_zxwy(Vector_base<T, 4> x) noexcept [[cpu, hc]]
+            {
+                data_.zxwy = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_wxyz(Vector_base<T, 4> x) noexcept [[cpu, hc]]
+            {
+                data_.wxyz = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_wyzx(Vector_base<T, 4> x) noexcept [[cpu, hc]]
+            {
+                data_.wxyz = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_wzxy(Vector_base<T, 4> x) noexcept [[cpu, hc]]
+            {
+                data_.wxyz = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_wzyx(Vector_base<T, 4> x) noexcept [[cpu, hc]]
+            {
+                data_.wxyz = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_wxzy(Vector_base<T, 4> x) noexcept [[cpu, hc]]
+            {
+                data_.wxyz = x.data_;
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            void set_wyxz(Vector_base<T, 4> x) noexcept [[cpu, hc]]
+            {
+                data_.wxyz = x.data_;
+            }
+
+            // ACCESSORS
+            template<
+                typename U = T,
+                typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+            Vector_base operator~() const noexcept [[cpu, hc]]
+            {
+                Vector_base tmp{*this};
+                tmp.data_ = ~tmp.data_;
+                return tmp;
+            }
+            template<
+                typename U = T,
+                typename std::enable_if<std::is_signed<U>{}>::type* = nullptr>
+            Vector_base operator-() const noexcept [[cpu, hc]]
+            {
+                Vector_base tmp{*this};
+                tmp.data_ = -tmp.data_;
+                return tmp;
+            }
+
+            // one-component access
+            T get_x() const noexcept [[cpu, hc]] { return T{data_.x}; }
+            template<
+                int m = n, typename std::enable_if<(m > 1)>::type* = nullptr>
+            T get_y() const noexcept [[cpu, hc]] { return T{data_.y}; }
+            template<
+                int m = n, typename std::enable_if<(m > 2)>::type* = nullptr>
+            T get_z() const noexcept [[cpu, hc]] { return T{data_.z}; }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            T get_w() const noexcept [[cpu, hc]] { return T{data_.w}; }
+            T get_r() const noexcept [[cpu, hc]] { return get_x(); }
+            template<
+                int m = n, typename std::enable_if<(m > 1)>::type* = nullptr>
+            T get_g() const noexcept [[cpu, hc]] { return get_y(); }
+            template<
+                int m = n, typename std::enable_if<(m > 2)>::type* = nullptr>
+            T get_b() const noexcept [[cpu, hc]] { return get_z(); }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            T get_a() const noexcept [[cpu, hc]] { return get_w(); }
+
+            // two-component access
+            template<
+                int m = n, typename std::enable_if<(m > 1)>::type* = nullptr>
+            Vector_base<T, 2> get_xy() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 2>{data_.xy};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 1)>::type* = nullptr>
+            Vector_base<T, 2> get_yx() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 2>{data_.yx};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 2)>::type* = nullptr>
+            Vector_base<T, 2> get_xz() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 2>{data_.xz};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 2)>::type* = nullptr>
+            Vector_base<T, 2> get_zx() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 2>{data_.zx};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 2> get_xw() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 2>{data_.xw};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 2> get_wx() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 2>{data_.wx};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 2)>::type* = nullptr>
+            Vector_base<T, 2> get_yz() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 2>{data_.yz};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 2)>::type* = nullptr>
+            Vector_base<T, 2> get_zy() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 2>{data_.zy};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 2> get_yw() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 2>{data_.yw};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 2> get_wy() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 2>{data_.wy};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 2> get_zw() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 2>{data_.zw};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 2> get_wz() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 2>{data_.wz};
+            }
+
+            // three-component access
+            template<
+                int m = n, typename std::enable_if<(m > 2)>::type* = nullptr>
+            Vector_base<T, 3> get_xyz() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 3>{data_.xyz};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 2)>::type* = nullptr>
+            Vector_base<T, 3> get_yzx() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 3>{data_.yzx};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 2)>::type* = nullptr>
+            Vector_base<T, 3> get_zxy() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 3>{data_.zxy};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 2)>::type* = nullptr>
+            Vector_base<T, 3> get_xzy() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 3>{data_.xzy};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 2)>::type* = nullptr>
+            Vector_base<T, 3> get_yxz() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 3>{data_.yxz};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 2)>::type* = nullptr>
+            Vector_base<T, 3> get_zyx() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 3>{data_.zyx};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 3> get_xyw() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 3>{data_.xyw};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 3> get_ywx() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 3>{data_.ywx};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 3> get_wxy() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 3>{data_.wxy};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 3> get_xwy() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 3>{data_.xwy};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 3> get_yxw() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 3>{data_.yxw};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 3> get_wyx() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 3>{data_.wyx};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 3> get_xzw() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 3>{data_.xzw};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 3> get_zwx() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 3>{data_.zwx};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 3> get_wxz() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 3>{data_.wxz};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 3> get_xwz() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 3>{data_.xwz};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 3> get_zxw() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 3>{data_.zxw};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 3> get_wzx() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 3>{data_.wzx};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 3> get_yzw() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 3>{data_.yzw};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 3> get_zwy() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 3>{data_.zwy};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 3> get_wyz() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 3>{data_.wyz};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 3> get_wzy() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 3>{data_.wzy};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 3> get_ywz() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 3>{data_.ywz};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 3> get_zyw() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 3>{data_.zyw};
+            }
+
+            // four-component access
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 4> get_xyzw() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 4>{data_.xyzw};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 4> get_xzwy() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 4>{data_.xzwy};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 4> get_xwyz() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 4>{data_.xwyz};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 4> get_xzyw() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 4>{data_.xzyw};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 4> get_xywz() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 4>{data_.xywz};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 4> get_xwzy() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 4>{data_.xwzy};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 4> get_yzwx() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 4>{data_.yzwx};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 4> get_ywxz() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 4>{data_.ywxz};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 4> get_yxzw() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 4>{data_.yxzw};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 4> get_yxwz() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 4>{data_.yxwz};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 4> get_yzxw() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 4>{data_.yzxw};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 4> get_ywzx() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 4>{data_.ywzx};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 4> get_zwxy() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 4>{data_.zwxy};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 4> get_zxyw() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 4>{data_.zxyw};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 4> get_zywx() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 4>{data_.zywx};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 4> get_zyxw() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 4>{data_.zyxw};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 4> get_zwyx() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 4>{data_.zwyx};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 4> get_zxwy() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 4>{data_.zxwy};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 4> get_wxyz() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 4>{data_.wxyz};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 4> get_wyzx() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 4>{data_.wxyz};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 4> get_wzxy() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 4>{data_.wxyz};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 4> get_wzyx() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 4>{data_.wxyz};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 4> get_wxzy() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 4>{data_.wxyz};
+            }
+            template<
+                int m = n, typename std::enable_if<(m > 3)>::type* = nullptr>
+            Vector_base<T, 4> get_wyxz() const noexcept [[cpu, hc]]
+            {
+                return Vector_base<T, 4>{data_.wxyz};
+            }
+        };
+
+        template<typename, int> struct short_vector;
+        template<typename> struct short_vector_traits; // TODO: don't use macro.
+
+        #define MAKE_HC_VECTOR_TYPE(T, n) \
+            using T##_##n = Vector_base<T, n>;\
+            using T##n = Vector_base<T, n>;\
+            template<> struct short_vector<T, n> { using type = T##_##n; };\
+            template<>\
+            struct short_vector_traits<T##_##n> {\
+                using value_type = T;\
+                static constexpr int size{n};\
+            };
+
+        using uchar = unsigned char;
+        using ushort = unsigned short;
+        using uint = unsigned int;
+        using ulong = unsigned long;
+        using longlong = long long;
+        using ulonglong = unsigned long long;
+        using half = _Float16;
+
+        MAKE_HC_VECTOR_TYPE(char, 1)
+        MAKE_HC_VECTOR_TYPE(char, 2)
+        MAKE_HC_VECTOR_TYPE(char, 3)
+        MAKE_HC_VECTOR_TYPE(char, 4)
+        MAKE_HC_VECTOR_TYPE(uchar, 1)
+        MAKE_HC_VECTOR_TYPE(uchar, 2)
+        MAKE_HC_VECTOR_TYPE(uchar, 3)
+        MAKE_HC_VECTOR_TYPE(uchar, 4)
+        MAKE_HC_VECTOR_TYPE(short, 1)
+        MAKE_HC_VECTOR_TYPE(short, 2)
+        MAKE_HC_VECTOR_TYPE(short, 3)
+        MAKE_HC_VECTOR_TYPE(short, 4)
+        MAKE_HC_VECTOR_TYPE(ushort, 1)
+        MAKE_HC_VECTOR_TYPE(ushort, 2)
+        MAKE_HC_VECTOR_TYPE(ushort, 3)
+        MAKE_HC_VECTOR_TYPE(ushort, 4)
+        MAKE_HC_VECTOR_TYPE(int, 1)
+        MAKE_HC_VECTOR_TYPE(int, 2)
+        MAKE_HC_VECTOR_TYPE(int, 3)
+        MAKE_HC_VECTOR_TYPE(int, 4)
+        MAKE_HC_VECTOR_TYPE(uint, 1)
+        MAKE_HC_VECTOR_TYPE(uint, 2)
+        MAKE_HC_VECTOR_TYPE(uint, 3)
+        MAKE_HC_VECTOR_TYPE(uint, 4)
+        MAKE_HC_VECTOR_TYPE(long, 1)
+        MAKE_HC_VECTOR_TYPE(long, 2)
+        MAKE_HC_VECTOR_TYPE(long, 3)
+        MAKE_HC_VECTOR_TYPE(long, 4)
+        MAKE_HC_VECTOR_TYPE(ulong, 1)
+        MAKE_HC_VECTOR_TYPE(ulong, 2)
+        MAKE_HC_VECTOR_TYPE(ulong, 3)
+        MAKE_HC_VECTOR_TYPE(ulong, 4)
+        MAKE_HC_VECTOR_TYPE(longlong, 1)
+        MAKE_HC_VECTOR_TYPE(longlong, 2)
+        MAKE_HC_VECTOR_TYPE(longlong, 3)
+        MAKE_HC_VECTOR_TYPE(longlong, 4)
+        MAKE_HC_VECTOR_TYPE(ulonglong, 1)
+        MAKE_HC_VECTOR_TYPE(ulonglong, 2)
+        MAKE_HC_VECTOR_TYPE(ulonglong, 3)
+        MAKE_HC_VECTOR_TYPE(ulonglong, 4)
+        MAKE_HC_VECTOR_TYPE(half, 1)
+        MAKE_HC_VECTOR_TYPE(half, 2)
+        MAKE_HC_VECTOR_TYPE(half, 3)
+        MAKE_HC_VECTOR_TYPE(half, 4)
+        MAKE_HC_VECTOR_TYPE(float, 1)
+        MAKE_HC_VECTOR_TYPE(float, 2)
+        MAKE_HC_VECTOR_TYPE(float, 3)
+        MAKE_HC_VECTOR_TYPE(float, 4)
+        MAKE_HC_VECTOR_TYPE(double, 1)
+        MAKE_HC_VECTOR_TYPE(double, 2)
+        MAKE_HC_VECTOR_TYPE(double, 3)
+        MAKE_HC_VECTOR_TYPE(double, 4)
+        MAKE_HC_VECTOR_TYPE(norm, 1)
+        MAKE_HC_VECTOR_TYPE(norm, 2)
+        MAKE_HC_VECTOR_TYPE(norm, 3)
+        MAKE_HC_VECTOR_TYPE(norm, 4)
+        MAKE_HC_VECTOR_TYPE(unorm, 1)
+        MAKE_HC_VECTOR_TYPE(unorm, 2)
+        MAKE_HC_VECTOR_TYPE(unorm, 3)
+        MAKE_HC_VECTOR_TYPE(unorm, 4)
+    } // namespace short_vector
+} // namespace hc
\ No newline at end of file
diff --git a/include/hc/hc_signal_pool.hpp b/include/hc/hc_signal_pool.hpp
new file mode 100644
index 00000000000..c37d19403f2
--- /dev/null
+++ b/include/hc/hc_signal_pool.hpp
@@ -0,0 +1,115 @@
+//===----------------------------------------------------------------------===//
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+
+#include <hsa/hsa.h>
+
+#include <array>
+#include <atomic>
+#include <climits>
+#include <cstddef>
+#include <iostream>
+#include <mutex>
+#include <stdexcept>
+#include <utility>
+
+namespace hc
+{
+    namespace detail
+    {
+        class Signal_pool {
+            struct RAII_signal {
+                hsa_signal_t signal;
+
+                // CREATORS
+                ~RAII_signal()
+                {
+                    try {
+                        throwing_hsa_result_check(
+                            hsa_signal_destroy(signal),
+                            __FILE__, __func__, __LINE__);
+                    }
+                    catch (const std::exception& ex) {
+                        std::cerr << ex.what() << std::endl;
+                    }
+                }
+
+                // ACCESSORS
+                constexpr
+                operator hsa_signal_t() const noexcept { return signal; }
+            };
+
+            // IMPLEMENTATION - DATA - STATICS
+            static constexpr hsa_signal_value_t init_value_{1};
+            static constexpr std::size_t pool_size_{256u};
+
+            using PoolType =
+                std::vector<std::pair<std::atomic_flag, RAII_signal>>;
+
+            // IMPLEMENTATION - STATICS
+            static
+            PoolType& pool_()
+            {
+                static PoolType r{pool_size_};
+                static std::once_flag f;
+
+                std::call_once(f, []() {
+                    for (auto&& s : r) {
+                        hsa_signal_create(
+                            init_value_, 0u, nullptr, &s.second.signal);
+                    }
+                });
+
+                return r;
+            }
+        public:
+            // DATA - STATICS
+            static constexpr hsa_signal_value_t init_value{init_value_};
+
+            // STATICS
+            static
+            hsa_signal_t allocate() noexcept
+            {   // TODO: add backoff and termination.
+                do {
+                    for (auto&& s : pool_()) {
+                        if (s.first.test_and_set()) continue;
+
+                        hsa_signal_store_release(s.second.signal, init_value_);
+
+                        return s.second;
+                    }
+                } while (true);
+            }
+
+            static
+            void deallocate(hsa_signal_t x)
+            {
+                for (auto&& s : pool_()) {
+                    if (s.second.signal.handle != x.handle) continue;
+
+                    s.first.clear();
+
+                    return;
+                }
+
+                throw std::logic_error{
+                    "Tried to deallocate unallocated signal."};
+            }
+
+            static
+            void wait(hsa_signal_t x) noexcept
+            {
+                while (hsa_signal_wait_scacquire(
+                    x,
+                    HSA_SIGNAL_CONDITION_LT,
+                    init_value,
+                    UINT64_MAX,
+                    HSA_WAIT_STATE_BLOCKED) > init_value);
+            }
+        };
+    } // Namespace hc::detail.
+} // Namespace hc
\ No newline at end of file
diff --git a/include/hc/implementation/CMakeLists.txt b/include/hc/implementation/CMakeLists.txt
new file mode 100644
index 00000000000..cc1247c3a07
--- /dev/null
+++ b/include/hc/implementation/CMakeLists.txt
@@ -0,0 +1,31 @@
+set(
+    HC_impl_headers
+        hc_code_object_bundle.hpp
+        hc_n_way_set_associative_cache.hpp
+        hc_program_state.hpp
+        hc_raii_handle.hpp
+        hc_type_support.hpp)
+
+# Set location for output directory
+set(output_dir "${PROJECT_BINARY_DIR}/include/hc/implementation")
+set(out_files)
+foreach(f ${HC_impl_headers})
+    set(src ${CMAKE_CURRENT_SOURCE_DIR}/${f})
+    set(dst ${output_dir}/${f})
+    add_custom_command(
+        OUTPUT ${dst}
+        DEPENDS ${src}
+        COMMAND ${CMAKE_COMMAND} -E copy_if_different ${src} ${dst}
+        COMMENT "Copying HCC's ${f}...")
+    list(APPEND out_files ${dst})
+endforeach()
+
+# Create target for hcc-headers and set dependencies
+add_custom_target(hc-impl-headers ALL DEPENDS ${out_files})
+add_dependencies(world hc-impl-headers)
+
+# Install command for headers
+install(
+    FILES ${HC_impl_headers}
+    PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ
+    DESTINATION include/hc/implementation)
\ No newline at end of file
diff --git a/include/hc/implementation/hc_code_object_bundle.hpp b/include/hc/implementation/hc_code_object_bundle.hpp
new file mode 100644
index 00000000000..13758a8ef3f
--- /dev/null
+++ b/include/hc/implementation/hc_code_object_bundle.hpp
@@ -0,0 +1,202 @@
+//===----------------------------------------------------------------------===//
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+
+#include <hsa/hsa.h>
+
+#include <algorithm>
+#include <cstdint>
+#include <istream>
+#include <iterator>
+#include <string>
+#include <vector>
+
+namespace hc
+{
+    namespace detail
+    {
+        struct Bundled_code {
+            union {
+                struct {
+                    std::uint64_t offset;
+                    std::uint64_t bundle_sz;
+                    std::uint64_t triple_sz;
+                };
+                char cbuf[
+                    sizeof(offset) + sizeof(bundle_sz) + sizeof(triple_sz)];
+            };
+            std::string triple;
+            std::vector<char> blob;
+        };
+
+        class Bundled_code_header_base {
+            friend class Bundled_code_header;
+
+            static
+            constexpr
+            const char* magic_string_()
+            {
+                return "__CLANG_OFFLOAD_BUNDLE__";
+            }
+            static
+            constexpr
+            std::size_t strlen_(
+                const char* ptr, std::size_t n = 0u) noexcept
+            {
+                return ptr ? (*ptr ? strlen_(ptr + 1, n + 1) : n) : n;
+            }
+        };
+
+        class Bundled_code_header : private Bundled_code_header_base {
+            using Bundled_code_header_base::strlen_;
+
+            friend
+            inline
+            bool valid(const Bundled_code_header& x)
+            {
+                return std::equal(
+                    x.bundler_magic_string,
+                    x.bundler_magic_string + x.strlen_(x.magic_string_()),
+                    x.magic_string_());
+            }
+
+            friend
+            inline
+            const std::vector<Bundled_code>& bundles(
+                const Bundled_code_header& x)
+            {
+                return x.bundles;
+            }
+
+            template<typename RandomAccessIterator>
+            friend
+            inline
+            bool read(
+                RandomAccessIterator f,
+                RandomAccessIterator,
+                Bundled_code_header& x)
+            {
+                std::copy_n(f, sizeof(x.cbuf), x.cbuf);
+
+                if (!valid(x)) return false;
+
+                x.bundles.resize(x.bundle_cnt);
+
+                auto it = f + sizeof(x.cbuf);
+                for (auto&& y : x.bundles) {
+                    std::copy_n(it, sizeof(y.cbuf), y.cbuf);
+                    it += sizeof(y.cbuf);
+
+                    y.triple.assign(it, it + y.triple_sz);
+
+                    std::copy_n(
+                        f + y.offset, y.bundle_sz, std::back_inserter(y.blob));
+
+                    it += y.triple_sz;
+                }
+
+                return true;
+            }
+
+            friend
+            inline
+            bool read(const std::vector<char>& blob, Bundled_code_header& x)
+            {
+                return read(blob.cbegin(), blob.cend(), x);
+            }
+
+            friend
+            inline
+            bool read(std::istream& is, Bundled_code_header& x)
+            {
+                return read(std::vector<char>{
+                    std::istreambuf_iterator<char>{is},
+                    std::istreambuf_iterator<char>{}},
+                    x);
+            }
+
+            union {
+                struct {
+                    char bundler_magic_string[strlen_(magic_string_())];
+                    std::uint64_t bundle_cnt;
+                };
+                char cbuf[sizeof(bundler_magic_string) + sizeof(bundle_cnt)];
+            };
+            std::vector<Bundled_code> bundles;
+        public:
+            Bundled_code_header() = default;
+            Bundled_code_header(const Bundled_code_header&) = default;
+            Bundled_code_header(Bundled_code_header&&) = default;
+
+            template<typename RandomAccessIterator>
+            Bundled_code_header(RandomAccessIterator f, RandomAccessIterator l)
+                : Bundled_code_header{}
+            {
+                read(f, l, *this);
+            }
+
+            explicit
+            Bundled_code_header(const std::vector<char>& blob)
+                : Bundled_code_header{blob.cbegin(), blob.cend()}
+            {}
+        };
+
+        inline
+        std::string transmogrify_triple(const std::string& triple)
+        {
+            static constexpr const char old_prefix[]{"hcc-amdgcn--amdhsa-gfx"};
+            static constexpr const char new_prefix[]{
+                "hcc-amdgcn-amd-amdhsa--gfx"};
+
+            if (triple.find(old_prefix) == 0) {
+                return new_prefix + triple.substr(sizeof(old_prefix) - 1);
+            }
+
+            return (triple.find(new_prefix) == 0) ? triple : "";
+        }
+
+        inline
+        std::string isa_name(std::string triple)
+        {
+            static constexpr const char offload_prefix[]{"hcc-"};
+
+            triple = transmogrify_triple(triple);
+            if (triple.empty()) return {};
+
+            triple.erase(0, sizeof(offload_prefix) - 1);
+
+            static hsa_isa_t r{};
+            static const bool is_old_rocr{
+                hsa_isa_from_name(triple.c_str(), &r) != HSA_STATUS_SUCCESS};
+
+            if (!is_old_rocr) return triple;
+
+            auto tmp{triple.substr(triple.rfind('x') + 1)};
+            triple.replace(0, std::string::npos, "AMD:AMDGPU");
+
+            for (auto&& x : tmp) {
+                triple.push_back(':');
+                triple.push_back(x);
+            }
+
+            return triple;
+        }
+
+        inline
+        hsa_isa_t triple_to_hsa_isa(std::string triple)
+        {
+            const auto isa = isa_name(std::move(triple));
+
+            if (isa.empty()) return {};
+
+            hsa_isa_t r{};
+            const auto s = hsa_isa_from_name(isa.c_str(), &r);
+
+            return (s == HSA_STATUS_SUCCESS) ? r : hsa_isa_t{};
+        }
+    } // Namespace hc::detail.
+} // Namespace hc.
diff --git a/include/hc/implementation/hc_n_way_set_associative_cache.hpp b/include/hc/implementation/hc_n_way_set_associative_cache.hpp
new file mode 100644
index 00000000000..d80c2cafea1
--- /dev/null
+++ b/include/hc/implementation/hc_n_way_set_associative_cache.hpp
@@ -0,0 +1,179 @@
+//===----------------------------------------------------------------------===//
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+
+#include <atomic>
+#include <climits>
+#include <cstddef>
+#include <cstdint>
+#include <utility>
+
+namespace hc
+{
+    namespace detail
+    {
+        template<typename T, std::size_t n = 128, std::size_t size = 65536u>
+        class N_way_set_associative_cache {
+            static_assert(
+                n <= size,
+                "Number of sets must not be greater than cache size.");
+
+            using GuardedLockedPtr_ = std::pair<
+                std::atomic_flag, std::pair<const void*, void*>>;
+
+            // IMPLEMENTATION - DATA - STATICS
+            static constexpr std::uint8_t bit_cnt_{
+                sizeof(std::uintptr_t) * CHAR_BIT};
+            static constexpr std::uint8_t byte_offset_bits_{2u};
+            static constexpr std::uint8_t set_bits_{
+                bit_cnt_ - __builtin_clzll(n) - 1u};
+            static constexpr auto set_size_ = size / n;
+            static constexpr std::uint8_t tag_bits_{
+                bit_cnt_ - set_bits_ - byte_offset_bits_};
+
+            // IMPLEMENTATION - DATA
+            std::array<GuardedLockedPtr_, size> cache_{};
+
+            // IMPLEMENTATION - STATICS
+            static
+            constexpr
+            std::uintptr_t make_bitmask_(
+                std::uint8_t first, std::uint8_t last) noexcept [[cpu, hc]]
+            {
+                return (first == last) ?
+                    0u : ((UINTPTR_MAX >> (bit_cnt_ - (first - last))) << last);
+            }
+
+            static
+            std::uintptr_t byte_offset_(const void* p) noexcept [[cpu, hc]]
+            {
+                constexpr auto mask = make_bitmask_(byte_offset_bits_, 0u);
+
+                return reinterpret_cast<std::uintptr_t>(p) & mask;
+            }
+
+            static
+            std::uintptr_t set_(const void* p) noexcept [[cpu, hc]]
+            {
+                constexpr auto mask = make_bitmask_(
+                    set_bits_ + byte_offset_bits_, byte_offset_bits_);
+
+                return (reinterpret_cast<std::uintptr_t>(p) & mask) >>
+                    byte_offset_bits_;
+            }
+
+            static
+            std::uintptr_t tag_(const void* p) noexcept [[cpu, hc]]
+            {
+                constexpr auto mask = make_bitmask_(
+                    tag_bits_ + set_bits_ + byte_offset_bits_,
+                    set_bits_ + byte_offset_bits_);
+
+                return (reinterpret_cast<std::uintptr_t>(p) & mask) >>
+                    (set_bits_ + byte_offset_bits_);
+            }
+
+            static
+            constexpr
+            std::uint32_t flat_set_idx_(const void* ptr) noexcept [[cpu, hc]]
+            {
+                return set_(ptr) * size / n;
+            }
+
+            // IMPLEMENTATION - ACCESSORS
+            typename decltype(cache_)::size_type find_cache_entry_(
+                const void* ptr) const noexcept [[cpu, hc]]
+            {
+                const auto idx = flat_set_idx_(ptr);
+
+                for (auto i = 0u; i != set_size_; ++i) {
+                    if (cache_[idx + i].second.first == ptr) return idx + i;
+                }
+
+                return cache_.size();
+            }
+        public:
+            // TODO: these are not yet truly iterators, and proper iteration is
+            //       to be added in the future.
+            using const_iterator = const T*;
+            using iterator = T*;
+            using size_type = std::size_t;
+
+            // CREATORS
+            N_way_set_associative_cache() [[cpu, hc]] = default;
+            N_way_set_associative_cache(
+                const N_way_set_associative_cache&) [[cpu, hc]] = delete;
+            N_way_set_associative_cache(
+                N_way_set_associative_cache&&) [[cpu, hc]] = default;
+            ~N_way_set_associative_cache() [[cpu, hc]] = default;
+
+            // MANIPULATORS
+            N_way_set_associative_cache& operator=(
+                const N_way_set_associative_cache&) [[cpu, hc]] = delete;
+            N_way_set_associative_cache& operator=(
+                N_way_set_associative_cache&&) [[cpu, hc]] = default;
+
+            constexpr
+            // TODO: C++11 is odd with constexpr, if / when we move up revisit.
+            iterator end() const noexcept [[cpu, hc]]
+            {
+                return nullptr;
+            }
+
+            iterator find(const void* ptr) noexcept [[cpu, hc]]
+            {
+                const auto idx = find_cache_entry_(ptr);
+
+                if (idx == cache_.size()) return end();
+
+                return &cache_[idx].second.second;
+            }
+
+            size_type erase(const void* ptr) noexcept [[cpu, hc]]
+            {
+                auto idx = find_cache_entry_(ptr);
+
+                if (idx == cache_.size()) return 0u;
+
+                cache_[idx].second = {};
+                cache_[idx].first.clear();
+
+                return 1u;
+            }
+
+            std::pair<iterator, bool> insert(
+                const void* ptr, T x) noexcept [[cpu, hc]]
+            {
+                const auto idx = flat_set_idx_(ptr);
+
+                for (auto i = 0u; i != set_size_; ++i) {
+                    if (cache_[idx + i].first.test_and_set()) continue;
+
+                    cache_[idx + i].second.first = ptr;
+                    cache_[idx + i].second.second = std::move(x);
+
+                    return {&cache_[idx + i].second.second, true};
+                }
+
+                return {end(), false};
+            }
+
+            // ACCESSORS
+            constexpr
+            const_iterator cend() const noexcept [[cpu, hc]]
+            {
+                return nullptr;
+            }
+
+            const_iterator find(const void* ptr) const noexcept [[cpu, hc]]
+            {   // TODO: remove abusive usage of const_cast.
+                return
+                    const_cast<N_way_set_associative_cache&>(*this).find(ptr);
+            }
+        };
+    } // Namespace hc::detail.
+} // Namespace hc.
\ No newline at end of file
diff --git a/include/hc/implementation/hc_program_state.hpp b/include/hc/implementation/hc_program_state.hpp
new file mode 100644
index 00000000000..ff482cede41
--- /dev/null
+++ b/include/hc/implementation/hc_program_state.hpp
@@ -0,0 +1,491 @@
+//===----------------------------------------------------------------------===//
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+
+// TODO: this must be completely redone, it is representative of a stale
+//       iteration of the approach to code object retrieval.
+
+#include "hc_raii_handle.hpp"
+#include "hc_code_object_bundle.hpp"
+#include "../hc_agent_pool.hpp"
+#include "../hc_runtime.hpp"
+
+#include <hsa/hsa.h>
+
+#include <elfio/elfio.hpp>
+
+#include <link.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <iterator>
+#include <mutex>
+#include <ostream>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+
+inline
+bool operator==(hsa_code_object_reader_t x, hsa_code_object_reader_t y) noexcept
+{
+    return x.handle == y.handle;
+}
+
+inline
+bool operator==(hsa_isa_t x, hsa_isa_t y) noexcept
+{
+    return x.handle == y.handle;
+}
+
+namespace std
+{
+    template<>
+    struct hash<hsa_code_object_reader_t> {
+        std::size_t operator()(hsa_code_object_reader_t x) const noexcept
+        {
+            return hash<decltype(x.handle)>{}(x.handle);
+        }
+    };
+
+    template<>
+    struct hash<hsa_isa_t> {
+        std::size_t operator()(hsa_isa_t x) const noexcept
+        {
+            return hash<decltype(x.handle)>{}(x.handle);
+        }
+    };
+}
+
+namespace hc
+{
+    namespace detail
+    {
+        class Program_state {
+            struct Symbol_ {
+                std::string name;
+                ELFIO::Elf64_Addr value = 0;
+                ELFIO::Elf_Xword size = 0;
+                ELFIO::Elf_Half sect_idx = 0;
+                std::uint8_t bind = 0;
+                std::uint8_t type = 0;
+                std::uint8_t other = 0;
+            };
+
+            using RAIICodeObjectReader_ =
+                RAII_move_only_handle<
+                    hsa_code_object_reader_t,
+                    decltype(hsa_code_object_reader_destroy)*>;
+            using RAIIExecutable_ = RAII_move_only_handle<
+                hsa_executable_t, decltype(hsa_executable_destroy)*>;
+
+            using CodeObjectTable_ = std::unordered_map<
+                hsa_isa_t, std::vector<RAIICodeObjectReader_>>;
+            using ExecutableTable_ = std::unordered_map<
+                hsa_agent_t, std::vector<RAIIExecutable_>>;
+            using KernelTable_ = std::unordered_map<
+                hsa_agent_t, std::vector<hsa_executable_symbol_t>>;
+
+            // IMPLEMENTATION - STATICS
+            template<typename T = std::vector<std::vector<char>>>
+            static
+            int copy_kernel_sections_(dl_phdr_info* info, size_t, void* kernels)
+            {
+                static constexpr const char self[]{"/proc/self/exe"};
+
+                ELFIO::elfio reader;
+
+                const auto f{info->dlpi_addr ? info->dlpi_name : self};
+
+                if (!reader.load(f)) return 0;
+
+                static constexpr const char kernel[]{".kernel"};
+                const auto it{std::find_if(
+                    reader.sections.begin(),
+                    reader.sections.end(),
+                    [](const ELFIO::section* x) {
+                        return x->get_name() == kernel;
+                })};
+
+                if (it == reader.sections.end()) return 0;
+
+                static_cast<T*>(kernels)->emplace_back(
+                    (*it)->get_data(), (*it)->get_data() + (*it)->get_size());
+
+                return 0;
+            }
+
+            static
+            const std::vector<Bundled_code_header>& kernel_sections_()
+            {
+                static std::vector<Bundled_code_header> r;
+                static std::once_flag f;
+
+                std::call_once(f, []() {
+                    std::vector<std::vector<char>> ks;
+                    dl_iterate_phdr(copy_kernel_sections_<>, &ks);
+
+                    for (auto&& x : ks) {
+                        Bundled_code_header tmp{x};
+
+                        if (valid(tmp)) r.push_back(std::move(tmp));
+                    }
+                });
+
+                return r;
+            }
+
+            static
+            RAIICodeObjectReader_ make_code_object_reader_(
+                const std::vector<char>& x)
+            {
+                if (x.empty()) return {};
+
+                RAIICodeObjectReader_ r{{}, hsa_code_object_reader_destroy};
+                throwing_hsa_result_check(
+                    hsa_code_object_reader_create_from_memory(
+                        x.data(), x.size(), &handle(r)),
+                    __FILE__, __func__, __LINE__);
+
+                return r;
+            }
+
+            static
+            std::unordered_map<
+                hsa_code_object_reader_t,
+                const std::vector<char>*>& loaded_blobs_()
+            {
+                static std::unordered_map<
+                    hsa_code_object_reader_t, const std::vector<char>*> r;
+
+                return r;
+            }
+
+            static
+            void make_code_object_table_(
+                const Bundled_code_header& x, CodeObjectTable_& y)
+            {
+                for (auto&& z : bundles(x)) {
+                    if (z.blob.empty()) continue;
+
+                    const auto isa = triple_to_hsa_isa(z.triple);
+
+                    if (isa.handle == 0) continue;
+
+                    y[isa].push_back(make_code_object_reader_(z.blob));
+                    loaded_blobs_()[handle(y[isa].back())] = &z.blob;
+                }
+            }
+
+            static
+            const CodeObjectTable_& code_objects_()
+            {
+                static CodeObjectTable_ r;
+                static std::once_flag f;
+
+                std::call_once(f, []() {
+                    for (auto&& x : kernel_sections_()) {
+                        make_code_object_table_(x, r);
+                    }
+                });
+
+                return r;
+            }
+
+            static
+            hsa_isa_t agent_isa_(hsa_agent_t x)
+            {
+                hsa_isa_t r{};
+                throwing_hsa_result_check(
+                    hsa_agent_iterate_isas(x, [](hsa_isa_t isa, void* p) {
+                        *static_cast<hsa_isa_t*>(p) = isa;
+
+                        return HSA_STATUS_SUCCESS;
+                    }, &r),
+                    __FILE__, __func__, __LINE__);
+
+                return r;
+            }
+
+            static
+            Symbol_ read_symbol_(
+                const ELFIO::symbol_section_accessor& section, unsigned int idx)
+            {
+                Symbol_ r{};
+                section.get_symbol(
+                    idx,
+                    r.name,
+                    r.value,
+                    r.size,
+                    r.bind,
+                    r.type,
+                    r.sect_idx,
+                    r.other);
+
+                return r;
+            }
+
+            static
+            const std::unordered_map<
+                std::string, std::pair<ELFIO::Elf64_Addr, ELFIO::Elf_Xword>>&
+                    symbol_addresses_()
+            {
+                static std::unordered_map<
+                    std::string,
+                    std::pair<ELFIO::Elf64_Addr, ELFIO::Elf_Xword>> r;
+                static std::once_flag f;
+
+                std::call_once(f, []() {
+                    dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void*) {
+                        static constexpr const char self[]{"/proc/self/exe"};
+                        ELFIO::elfio reader;
+
+                        static unsigned int iter{0u};
+                        if (!reader.load(!iter++ ? self : info->dlpi_name)) {
+                            return 0;
+                        }
+
+                        auto it = std::find_if(
+                            reader.sections.begin(),
+                            reader.sections.end(),
+                            [](const ELFIO::section* x) {
+                                return x->get_type() == SHT_SYMTAB;
+                        });
+
+                        if (it == reader.sections.end()) return 0;
+
+                        const ELFIO::symbol_section_accessor symtab{
+                            reader, *it};
+
+                        for (auto i = 0u; i != symtab.get_symbols_num(); ++i) {
+                            auto tmp = read_symbol_(symtab, i);
+
+                            if (tmp.type != STT_OBJECT ||
+                                tmp.sect_idx == SHN_UNDEF) {
+                                continue;
+                            }
+
+                            r.emplace(
+                                std::move(tmp.name),
+                                std::make_pair(tmp.value, tmp.size));
+                        }
+
+                        return 0;
+                    }, nullptr);
+                });
+
+                return r;
+            }
+
+            static
+            std::vector<std::string> copy_names_of_undefined_symbols_(
+                const ELFIO::symbol_section_accessor& section)
+            {
+                std::vector<std::string> r;
+
+                for (auto i = 0u; i != section.get_symbols_num(); ++i) {
+                    // TODO: this is boyscout code, caching the temporaries
+                    //       may be of worth.
+
+                    auto tmp = read_symbol_(section, i);
+                    if (tmp.sect_idx != SHN_UNDEF || tmp.name.empty()) continue;
+
+                    r.push_back(std::move(tmp.name));
+                }
+
+                return r;
+            }
+
+            static
+            void* agent_ptr(void* host_ptr)
+            {
+                hsa_amd_pointer_info_t tmp{};
+                tmp.size = sizeof(hsa_amd_pointer_info_t);
+
+                throwing_hsa_result_check(
+                    hsa_amd_pointer_info(
+                        host_ptr, &tmp, nullptr, nullptr, nullptr),
+                    __FILE__, __func__, __LINE__);
+
+                return tmp.agentBaseAddress;
+            }
+
+            static
+            void associate_globals_with_host_allocation_(
+                hsa_agent_t agent,
+                hsa_executable_t executable,
+                hsa_code_object_reader_t cor)
+            {
+                ELFIO::elfio reader;
+
+                std::istringstream tmp{std::string{
+                    loaded_blobs_()[cor]->cbegin(),
+                    loaded_blobs_()[cor]->cend()}};
+                if (!reader.load(tmp)) return;
+
+                const auto it = std::find_if(
+                    reader.sections.begin(),
+                    reader.sections.end(),
+                    [](const ELFIO::section* x) {
+                    return x->get_type() == SHT_SYMTAB;
+                });
+                const auto undefined_symbols = copy_names_of_undefined_symbols_(
+                    ELFIO::symbol_section_accessor{reader, *it});
+
+                for (auto&& x : undefined_symbols) {
+                    using RAII_global =
+                        std::unique_ptr<void, decltype(hsa_amd_memory_unlock)*>;
+
+                    const auto it1 = symbol_addresses_().find(x);
+
+                    if (it1 == symbol_addresses_().cend()) {
+                        throw std::runtime_error{
+                            "Global symbol: " + x + " is undefined."};
+                    }
+
+                    static std::unordered_map<std::string, RAII_global> globals;
+
+                    static std::mutex mtx;
+                    std::lock_guard<std::mutex> lck{mtx};
+
+                    void* p{nullptr};
+                    if (globals.find(x) == globals.cend()) {
+                        void* host_ptr =
+                            reinterpret_cast<void*>(it1->second.first);
+                        throwing_hsa_result_check(
+                            hsa_amd_memory_lock(
+                                host_ptr, it1->second.second, nullptr, 0u, &p),
+                            __FILE__, __func__, __LINE__);
+
+                        globals.emplace(
+                            x, RAII_global{host_ptr, hsa_amd_memory_unlock});
+                    }
+                    else p = agent_ptr(globals.find(x)->second.get());
+
+                    throwing_hsa_result_check(
+                        hsa_executable_agent_global_variable_define(
+                            executable, agent, x.c_str(), p),
+                        __FILE__, __func__, __LINE__);
+                }
+            }
+
+            static
+            RAIIExecutable_ make_executable_(
+                const RAIICodeObjectReader_& x, hsa_agent_t a)
+            {
+                RAIIExecutable_ r{{}, hsa_executable_destroy};
+
+                throwing_hsa_result_check(
+                    hsa_executable_create_alt(
+                        HSA_PROFILE_FULL,// TODO: this is a bug.
+                        HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT,
+                        nullptr,
+                        &handle(r)),
+                    __FILE__, __func__, __LINE__);
+
+                associate_globals_with_host_allocation_(
+                    a, handle(r), handle(x));
+
+                throwing_hsa_result_check(
+                    hsa_executable_load_agent_code_object(
+                        handle(r), a, handle(x), nullptr, nullptr),
+                    __FILE__, __func__, __LINE__);
+
+                throwing_hsa_result_check(
+                    hsa_executable_freeze(handle(r), nullptr),
+                    __FILE__, __func__, __LINE__);
+
+                return r;
+            }
+
+            static
+            void make_executable_table_(
+                const CodeObjectTable_& x, ExecutableTable_& y)
+            {
+                for (auto&& agent : Agent_pool::pool()) {
+                    if (agent.second.is_cpu) continue;
+
+                    const auto it = x.find(agent_isa_(agent.first));
+
+                    if (it == x.cend()) continue;
+
+                    for (auto&& z : it->second) {
+                        y[agent.first].push_back(
+                            make_executable_(z, agent.first));
+                    }
+                }
+            }
+
+            static
+            const ExecutableTable_& executables_()
+            {
+                static ExecutableTable_ r;
+                static std::once_flag f;
+
+                std::call_once(f, []() {
+                    make_executable_table_(code_objects_(), r);
+                });
+
+                return r;
+            }
+
+            static
+            bool is_kernel_(hsa_executable_symbol_t x)
+            {
+                hsa_symbol_kind_t r{};
+                throwing_hsa_result_check(
+                    hsa_executable_symbol_get_info(
+                        x, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &r),
+                    __FILE__, __func__, __LINE__);
+
+                return r == HSA_SYMBOL_KIND_KERNEL;
+            }
+
+            static
+            hsa_status_t copy_kernel_symbols(
+                hsa_executable_t,
+                hsa_agent_t,
+                hsa_executable_symbol_t y,
+                void* z)
+            {
+                auto p = static_cast<typename KernelTable_::mapped_type*>(z);
+
+                if (is_kernel_(y)) p->push_back(y);
+
+                return HSA_STATUS_SUCCESS;
+            }
+
+            static
+            void make_kernel_table_(const ExecutableTable_& x, KernelTable_& y)
+            {
+                for (auto&& e : x) {
+                    for (auto&& ex : e.second) {
+                        throwing_hsa_result_check(
+                            hsa_executable_iterate_agent_symbols(
+                                handle(ex),
+                                e.first,
+                                copy_kernel_symbols,
+                                &y[e.first]),
+                            __FILE__, __func__, __LINE__);
+                    }
+                }
+            }
+        public:
+            static
+            KernelTable_& kernels()
+            {
+                static KernelTable_ r;
+                static std::once_flag f;
+
+                std::call_once(f, []() {
+                    make_kernel_table_(executables_(), r);
+                });
+
+                return r;
+            }
+        };
+    }// Namespace hc::detail.
+} // Namespace hc.
\ No newline at end of file
diff --git a/include/hc/implementation/hc_raii_handle.hpp b/include/hc/implementation/hc_raii_handle.hpp
new file mode 100644
index 00000000000..93859398be6
--- /dev/null
+++ b/include/hc/implementation/hc_raii_handle.hpp
@@ -0,0 +1,131 @@
+//===----------------------------------------------------------------------===//
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+
+#include "hc_type_support.hpp"
+
+#include <type_traits>
+#include <utility>
+
+namespace hc
+{
+    namespace detail
+    {
+        template<typename T, typename D>
+        class RAII_handle {
+            friend
+            inline
+            const T& handle(const RAII_handle& x) { return x.h_; }
+
+            friend
+            inline
+            T& handle(RAII_handle& x) { return x.h_; }
+
+            T h_;
+            D d_;
+        public:
+            RAII_handle() = default;
+            RAII_handle(const RAII_handle&) = default;
+            RAII_handle(RAII_handle&&) = default;
+
+            RAII_handle(T h, D d) : h_{std::move(h)}, d_{std::move(d)} {}
+
+            template<
+                typename E,
+                typename std::enable_if<
+                    std::is_convertible<E, D>{}>::type* = nullptr>
+            RAII_handle(T h, E d) : RAII_handle{std::move(h), std::move(d)} {}
+
+            RAII_handle& operator=(const RAII_handle&) = default;
+            RAII_handle& operator=(RAII_handle&&) = default;
+
+            operator T() const { return h_; }
+
+            ~RAII_handle() { d_(h_); }
+        };
+
+        template<typename T, typename D>
+        class RAII_move_only_handle :
+            public Swappable<RAII_move_only_handle<T, D>> {
+            friend class Swappable<RAII_move_only_handle>;
+
+            friend
+            inline
+            const T& handle(const RAII_move_only_handle& x) { return x.h_; }
+
+            friend
+            inline
+            T& handle(RAII_move_only_handle& x) { return x.h_; }
+
+            T h_;
+            D d_;
+            bool v_ = false;
+
+            void swp_(RAII_move_only_handle& x)
+            {
+                using std::swap;
+
+                swap(h_, x.h_);
+                swap(d_, x.d_);
+                swap(v_, x.v_);
+            }
+        public:
+            RAII_move_only_handle() = default;
+            RAII_move_only_handle(const RAII_move_only_handle&) = delete;
+            RAII_move_only_handle(RAII_move_only_handle&& x)
+                : RAII_move_only_handle{std::move(x.h_), std::move(x.d_)}
+            {
+                x.h_ = T{};
+                x.v_ = false;
+            }
+
+            RAII_move_only_handle(T h, D d)
+                : h_{std::move(h)}, d_{std::move(d)}, v_{true}
+            {}
+
+            template<
+                typename E,
+                typename std::enable_if<
+                    std::is_convertible<E, D>{}>::type* = nullptr>
+            RAII_move_only_handle(T h, E d)
+                : RAII_move_only_handle{std::move(h), std::move(d)}
+            {}
+
+            RAII_move_only_handle& operator=(RAII_move_only_handle x)
+            {
+                using std::swap;
+
+                swap(*this, x);
+
+                return *this;
+            }
+
+            ~RAII_move_only_handle() { if (v_) d_(h_); v_ = false; }
+        };
+
+        template<typename D>
+        class RAII_stateless_handle {
+            D d_;
+        public:
+            RAII_stateless_handle() = default;
+            RAII_stateless_handle(const RAII_stateless_handle&) = default;
+            RAII_stateless_handle(RAII_stateless_handle&&) = default;
+
+            template<typename C>
+            RAII_stateless_handle(const C& ctor, D dtor) : d_{std::move(dtor)}
+            {
+                ctor();
+            }
+
+            RAII_stateless_handle& operator=(
+                const RAII_stateless_handle&) = default;
+            RAII_stateless_handle& operator=(RAII_stateless_handle&&) = default;
+
+            ~RAII_stateless_handle() { d_(); }
+        };
+    } // Namespace hc::detail.
+} // Namespace hc.
\ No newline at end of file
diff --git a/include/hc/implementation/hc_type_support.hpp b/include/hc/implementation/hc_type_support.hpp
new file mode 100644
index 00000000000..8de7974d72c
--- /dev/null
+++ b/include/hc/implementation/hc_type_support.hpp
@@ -0,0 +1,23 @@
+//===----------------------------------------------------------------------===//
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+
+namespace hc
+{
+    namespace detail
+    {
+        template<typename T>
+        class Swappable {
+            friend
+            inline
+            void swap(T& x, T& y) { Swappable<T>::swap_(x, y); }
+        public:
+            static
+            void swap_(T& x, T& y) { x.swp_(y); }
+        };
+    } // Namespace hc::detail.
+} // Namespace hc.
\ No newline at end of file
diff --git a/include/hc_am.hpp b/include/hc_am.hpp
index 592efa6e59c..8f86d54c6b8 100644
--- a/include/hc_am.hpp
+++ b/include/hc_am.hpp
@@ -1,5 +1,9 @@
 #pragma once
 
+#if !defined(__HIPCC__)
+  #warning "This header is only intended for HIP usage, and not for direct inclusion."
+#endif
+
 #include "hc.hpp"
 #include <cstddef>
 #include <mutex>
diff --git a/include/hc_am_internal.hpp b/include/hc_am_internal.hpp
index 029d420e6b5..2341e23261f 100644
--- a/include/hc_am_internal.hpp
+++ b/include/hc_am_internal.hpp
@@ -1,5 +1,9 @@
 #pragma once
 
+#if !defined(__HIPCC__)
+  #warning "This header is only intended for HIP usage, and not for direct inclusion."
+#endif
+
 #include "hc_am.hpp"
 
 namespace hc {
diff --git a/include/hc_defines.h b/include/hc_defines.h
index 15e933aaf22..721af248b74 100644
--- a/include/hc_defines.h
+++ b/include/hc_defines.h
@@ -1,5 +1,9 @@
 #pragma once
 
+#if !defined(__HIPCC__)
+  #warning "This header is only intended for HIP usage, and not for direct inclusion."
+#endif
+
 // C++ headers
 #include <algorithm>
 #include <cassert>
@@ -19,12 +23,12 @@
 #include <vector>
 
 // CPU execution path
-#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
+#if __HCC_ACCELERATOR__ == 2 || __HCC_CPU__ == 2
 #include <ucontext.h>
 #endif
 
 namespace hc {
-  typedef __fp16 half;
+  typedef _Float16 half;
 }
 
 //
@@ -37,21 +41,9 @@ extern "C" __attribute__((const,hc)) uint32_t hc_get_workitem_id(unsigned int n)
 extern "C" __attribute__((const,hc)) uint32_t hc_get_num_groups(unsigned int n);
 extern "C" __attribute__((const,hc)) uint32_t hc_get_group_id(unsigned int n);
 
-extern "C" __attribute__((const,amp)) uint32_t amp_get_global_size(unsigned int n);
-extern "C" __attribute__((const,amp)) uint32_t amp_get_global_id(unsigned int n);
-extern "C" __attribute__((const,amp)) uint32_t amp_get_local_size(unsigned int n);
-extern "C" __attribute__((const,amp)) uint32_t amp_get_local_id(unsigned int n);
-extern "C" __attribute__((const,amp)) uint32_t amp_get_num_groups(unsigned int n);
-extern "C" __attribute__((const,amp)) uint32_t amp_get_group_id(unsigned int n);
-
-#if __KALMAR_ACCELERATOR__ == 2
-#define tile_static thread_local
-#else
 #define tile_static __attribute__((tile_static))
-#endif
 
-extern "C" __attribute__((noduplicate,hc)) void hc_barrier(unsigned int n);
-extern "C" __attribute__((noduplicate,amp)) void amp_barrier(unsigned int n) ;
+extern "C" __attribute__((noduplicate, hc)) void hc_barrier(unsigned int n);
 
 /// macro to set if we want default queue be thread-local or not
 #define TLS_QUEUE (1)
diff --git a/include/hc_math.hpp b/include/hc_math.hpp
index f27dd4ee76b..5f320879f51 100644
--- a/include/hc_math.hpp
+++ b/include/hc_math.hpp
@@ -1,5 +1,9 @@
 #pragma once
 
+#if !defined(__HIPCC__)
+  #warning "This header is only intended for HIP usage, and not for direct inclusion."
+#endif
+
 #include "hc.hpp"
 #include <cmath>
 
diff --git a/include/hc_norm_unorm.inl b/include/hc_norm_unorm.inl
deleted file mode 100644
index 4e7e82cf226..00000000000
--- a/include/hc_norm_unorm.inl
+++ /dev/null
@@ -1,199 +0,0 @@
-
-#pragma once
-
-#include <type_traits>
-
-#ifndef __CPU_GPU__
-
-#if __HCC_AMP__
-#define __CPU_GPU__   restrict(cpu,amp)
-#else
-#define __CPU_GPU__   [[cpu,hc]]
-#endif
-
-#endif
-
-template <bool isSigned> class __amp_norm_template;
-
-typedef __amp_norm_template<true>  __amp_norm;
-typedef __amp_norm_template<false> __amp_unorm;
-
-typedef __amp_norm   norm;
-typedef __amp_unorm unorm;
-
-template <bool isSigned>
-class __amp_norm_template {
-
-public:
-
-  typedef __amp_norm_template<isSigned> norm_type;
-
-  __amp_norm_template() __CPU_GPU__ : data(0.0f) { }
-
-  explicit __amp_norm_template(float v) __CPU_GPU__ {
-    set(v);
-  }
-  explicit __amp_norm_template(unsigned int v) __CPU_GPU__ {
-    set((float)v);
-  }
-  explicit __amp_norm_template(int v) __CPU_GPU__ {
-    set((float)v);
-  }
-  explicit __amp_norm_template(double v) __CPU_GPU__ {
-    set((float)v);
-  }
-  __amp_norm_template(const norm_type& other) __CPU_GPU__ {
-    data = other.data;
-  }
-
-  explicit __amp_norm_template(const __amp_norm_template<!isSigned>& other) __CPU_GPU__ {
-    set((float)other);
-  }
-
-  float get() __CPU_GPU__ {
-    return data;
-  }
-
-  void set(float f) __CPU_GPU__ {
-    data = clamp(f);
-  }
-
-  norm_type& operator=(const norm_type& other) __CPU_GPU__ {
-    data = other.data;
-    return *this;
-  }
-
-  norm_type& operator=(const float& other) __CPU_GPU__ {
-    set(other);
-    return *this;
-  }
-
-  operator float() const __CPU_GPU__ { return data; }
-
-  norm_type& operator+=(const norm_type& other) __CPU_GPU__ {  
-    set(data + other.data);
-    return *this;
-  }
-
-  norm_type& operator-=(const norm_type& other) __CPU_GPU__ {  
-    set(data - other.data);
-    return *this;
-  }
-
-  norm_type& operator*=(const norm_type& other) __CPU_GPU__ {  
-    set(data * other.data);
-    return *this;
-  }
-
-  norm_type& operator/=(const norm_type& other) __CPU_GPU__ {  
-    set(data / other.data);
-    return *this;
-  }
-  
-  norm_type& operator++() __CPU_GPU__ {
-    set(data + 1.0f);
-    return *this;
-  }
-
-  norm_type operator++(int) __CPU_GPU__ {
-    norm_type r(*this);
-    operator++();
-    return r;
-  }
-  
-  norm_type& operator--() __CPU_GPU__ {
-    set(data - 1.0f);
-    return *this;
-  }
-
-  norm_type operator--(int) __CPU_GPU__ {
-    norm_type r(*this);
-    operator--();
-    return r;
-  }
-
-  template <typename T = norm_type
-            , class = typename std::enable_if<T::isSigned,norm_type>::type >
-  T operator-() __CPU_GPU__ {
-    T r(-data);
-    return r;
-  }
-
-  static constexpr float min = isSigned?-1.0f:0.0f;
-  static constexpr float max = isSigned? 1.0f:1.0f;
-
-private:
-  float data;
-
-  float clamp(float v) __CPU_GPU__ {
-    return v>max?max:(v<min?min:v);
-  }
-};
-
-template <bool isSigned>
-__amp_norm_template<isSigned> operator+(const __amp_norm_template<isSigned>& lhs
-                                        , const __amp_norm_template<isSigned>& rhs) __CPU_GPU__ {
-  return __amp_norm_template<isSigned>((float)lhs + (float)rhs);
-}
- 
-template <bool isSigned>
-__amp_norm_template<isSigned> operator-(const __amp_norm_template<isSigned>& lhs
-                                        , const __amp_norm_template<isSigned>& rhs) __CPU_GPU__ {
-  return __amp_norm_template<isSigned>((float)lhs - (float)rhs);
-}
-
-template <bool isSigned>
-__amp_norm_template<isSigned> operator*(const __amp_norm_template<isSigned>& lhs
-                                        , const __amp_norm_template<isSigned>& rhs) __CPU_GPU__ {
-  return __amp_norm_template<isSigned>((float)lhs * (float)rhs);
-}
-
-template <bool isSigned>
-__amp_norm_template<isSigned> operator/(const __amp_norm_template<isSigned>& lhs
-                                        , const __amp_norm_template<isSigned>& rhs) __CPU_GPU__ {
-  return __amp_norm_template<isSigned>((float)lhs / (float)rhs);
-}
-
-template <bool isSigned>
-bool operator==(const __amp_norm_template<isSigned>& lhs
-               ,const __amp_norm_template<isSigned>& rhs) __CPU_GPU__ {
-  return ((float)lhs == (float)rhs);
-}
-
-template <bool isSigned>
-bool operator!=(const __amp_norm_template<isSigned>& lhs
-               ,const __amp_norm_template<isSigned>& rhs) __CPU_GPU__ {
-  return ((float)lhs != (float)rhs);
-}
-
-template <bool isSigned>
-bool operator>(const __amp_norm_template<isSigned>& lhs
-               ,const __amp_norm_template<isSigned>& rhs) __CPU_GPU__ {
-  return ((float)lhs > (float)rhs);
-}
-
-template <bool isSigned>
-bool operator<(const __amp_norm_template<isSigned>& lhs
-               ,const __amp_norm_template<isSigned>& rhs) __CPU_GPU__ {
-  return ((float)lhs < (float)rhs);
-}
-
-template <bool isSigned>
-bool operator>=(const __amp_norm_template<isSigned>& lhs
-               ,const __amp_norm_template<isSigned>& rhs) __CPU_GPU__ {
-  return ((float)lhs >= (float)rhs);
-}
-
-template <bool isSigned>
-bool operator<=(const __amp_norm_template<isSigned>& lhs
-               ,const __amp_norm_template<isSigned>& rhs) __CPU_GPU__ {
-  return ((float)lhs <= (float)rhs);
-}
-
-#define UNORM_MIN  ((unorm)0.0f)
-#define UNORM_MAX  ((unorm)1.0f)
-#define UNORM_ZERO ((norm)0.0f)
-#define NORM_ZERO  ((norm)0.0f)
-#define NORM_MIN   ((norm)-1.0f)
-#define NORM_MAX   ((norm)1.0f)
-
diff --git a/include/hc_printf.hpp b/include/hc_printf.hpp
index 954fa3490e6..ecbd8e32e15 100644
--- a/include/hc_printf.hpp
+++ b/include/hc_printf.hpp
@@ -1,5 +1,9 @@
 #pragma once
 
+#if !defined(__HIPCC__)
+  #warning "This header is only intended for HIP usage, and not for direct inclusion."
+#endif
+
 #include <type_traits>
 #include <cstdlib>
 #include <cstdio>
@@ -11,7 +15,6 @@
 #include <algorithm>
 
 #include "hc_am_internal.hpp"
-#include "hsa_atomic.h"
 
 // The printf on the accelerator is only enabled when
 // The HCC_ENABLE_ACCELERATOR_PRINTF is defined
diff --git a/include/hc_rt_debug.h b/include/hc_rt_debug.h
index e5c624ba3ad..182fee92df0 100644
--- a/include/hc_rt_debug.h
+++ b/include/hc_rt_debug.h
@@ -1,5 +1,9 @@
 #pragma once
 
+#if !defined(__HIPCC__)
+  #warning "This header is only intended for HIP usage, and not for direct inclusion."
+#endif
+
 #include <cstdlib>
 #include <cstdio>
 #ifndef USE_LIBCXX
diff --git a/include/hc_short_vector.hpp b/include/hc_short_vector.hpp
deleted file mode 100644
index 976f424fa05..00000000000
--- a/include/hc_short_vector.hpp
+++ /dev/null
@@ -1,35 +0,0 @@
-#pragma once
-
-#ifndef _HC_SHORT_VECTORS_HPP
-#define _HC_SHORT_VECTORS_HPP
-
-#include <cstddef>
-#include <type_traits>
-#include "kalmar_serialize.h"
-#include "hc_defines.h"
-
-namespace hc
-{
-
-namespace short_vector
-{
-
-#ifdef __HCC__
-#define __CPU_GPU__ [[cpu]] [[hc]]
-#else
-#define __CPU_GPU__
-#endif
-
-#if 1
-#include "hc_short_vector.inl"
-#else
-#include "kalmar_short_vectors.inl"
-#endif
-
-#undef __CPU_GPU__
-
-} // namespace short_vector
-
-} // namespace hc
-
-#endif // _HC_SHORT_VECTORS_H
diff --git a/include/hc_short_vector.inl b/include/hc_short_vector.inl
deleted file mode 100644
index 37d97adac7d..00000000000
--- a/include/hc_short_vector.inl
+++ /dev/null
@@ -1,1291 +0,0 @@
-
-#pragma once
-
-#include "hc_norm_unorm.inl"
-
-#ifndef __CPU_GPU__
-
-#if __HCC_AMP__
-#define __CPU_GPU__   restrict(cpu,amp)
-#else
-#define __CPU_GPU__   [[cpu,hc]]
-#endif
-
-#endif
-
-template <typename SCALAR_TYPE, unsigned int VECTOR_LENGTH>
-class __vector;
-
-// NOTE: A single-component vector (short vector with 1 component) in the hc namespace
-// is implemented with the __vector class with 1 component.
-// However, for C++AMP (Concurrency namespace), a single-component vector is mapped to a
-// scalar according to the C++AMP specification 
-#if !__HCC_AMP__
-
-#define DECLARE_VECTOR_TYPE_CLASS(SCALAR_TYPE, CLASS_PREFIX) \
-typedef __vector<SCALAR_TYPE, 1>    CLASS_PREFIX ## 1; \
-typedef __vector<SCALAR_TYPE, 2>    CLASS_PREFIX ## 2; \
-typedef __vector<SCALAR_TYPE, 3>    CLASS_PREFIX ## 3; \
-typedef __vector<SCALAR_TYPE, 4>    CLASS_PREFIX ## 4; \
-typedef __vector<SCALAR_TYPE, 8>    CLASS_PREFIX ## 8; \
-typedef __vector<SCALAR_TYPE, 16>   CLASS_PREFIX ## 16; 
-
-#else
-
-#define DECLARE_VECTOR_TYPE_CLASS(SCALAR_TYPE, CLASS_PREFIX) \
-typedef SCALAR_TYPE    CLASS_PREFIX ## 1; \
-typedef __vector<SCALAR_TYPE, 2>    CLASS_PREFIX ## 2; \
-typedef __vector<SCALAR_TYPE, 3>    CLASS_PREFIX ## 3; \
-typedef __vector<SCALAR_TYPE, 4>    CLASS_PREFIX ## 4; \
-typedef __vector<SCALAR_TYPE, 8>    CLASS_PREFIX ## 8; \
-typedef __vector<SCALAR_TYPE, 16>   CLASS_PREFIX ## 16; 
-
-#endif
-
-DECLARE_VECTOR_TYPE_CLASS(unsigned char, uchar);
-DECLARE_VECTOR_TYPE_CLASS(char, char);
-DECLARE_VECTOR_TYPE_CLASS(unsigned short, ushort);
-DECLARE_VECTOR_TYPE_CLASS(short, short);
-DECLARE_VECTOR_TYPE_CLASS(unsigned int, uint);
-DECLARE_VECTOR_TYPE_CLASS(int, int);
-DECLARE_VECTOR_TYPE_CLASS(unsigned long, ulong);
-DECLARE_VECTOR_TYPE_CLASS(long, long);
-DECLARE_VECTOR_TYPE_CLASS(unsigned long long, ulonglong);
-DECLARE_VECTOR_TYPE_CLASS(long long, longlong);
-#if !__HCC_AMP__
-DECLARE_VECTOR_TYPE_CLASS(hc::half, half);
-#endif
-DECLARE_VECTOR_TYPE_CLASS(float, float);
-DECLARE_VECTOR_TYPE_CLASS(double, double);
-DECLARE_VECTOR_TYPE_CLASS(norm, norm);
-DECLARE_VECTOR_TYPE_CLASS(unorm, unorm);
-
-typedef uchar1 uchar_1;
-typedef uchar2 uchar_2;
-typedef uchar3 uchar_3;
-typedef uchar4 uchar_4;
-typedef uchar8 uchar_8;
-typedef uchar16 uchar_16;
-
-typedef char1 char_1;
-typedef char2 char_2;
-typedef char3 char_3;
-typedef char4 char_4;
-typedef char8 char_8;
-typedef char16 char_16;
-
-typedef ushort1 ushort_1;
-typedef ushort2 ushort_2;
-typedef ushort3 ushort_3;
-typedef ushort4 ushort_4;
-typedef ushort8 ushort_8;
-typedef ushort16 ushort_16;
-
-typedef short1 short_1;
-typedef short2 short_2;
-typedef short3 short_3;
-typedef short4 short_4;
-typedef short8 short_8;
-typedef short16 short_16;
-
-typedef uint1 uint_1;
-typedef uint2 uint_2;
-typedef uint3 uint_3;
-typedef uint4 uint_4;
-typedef uint8 uint_8;
-typedef uint16 uint_16;
-
-typedef int1 int_1;
-typedef int2 int_2;
-typedef int3 int_3;
-typedef int4 int_4;
-typedef int8 int_8;
-typedef int16 int_16;
-
-typedef ulong1 ulong_1;
-typedef ulong2 ulong_2;
-typedef ulong3 ulong_3;
-typedef ulong4 ulong_4;
-typedef ulong8 ulong_8;
-typedef ulong16 ulong_16;
-
-typedef long1 long_1;
-typedef long2 long_2;
-typedef long3 long_3;
-typedef long4 long_4;
-typedef long8 long_8;
-typedef long16 long_16;
-
-typedef ulonglong1 ulonglong_1;
-typedef ulonglong2 ulonglong_2;
-typedef ulonglong3 ulonglong_3;
-typedef ulonglong4 ulonglong_4;
-typedef ulonglong8 ulonglong_8;
-typedef ulonglong16 ulonglong_16;
-
-typedef longlong1 longlong_1;
-typedef longlong2 longlong_2;
-typedef longlong3 longlong_3;
-typedef longlong4 longlong_4;
-typedef longlong8 longlong_8;
-typedef longlong16 longlong_16;
-
-#if !__HCC_AMP__
-typedef half1 half_1;
-typedef half2 half_2;
-typedef half3 half_3;
-typedef half4 half_4;
-typedef half8 half_8;
-typedef half16 half_16;
-#endif
-
-typedef float1 float_1;
-typedef float2 float_2;
-typedef float3 float_3;
-typedef float4 float_4;
-typedef float8 float_8;
-typedef float16 float_16;
-
-typedef double1 double_1;
-typedef double2 double_2;
-typedef double3 double_3;
-typedef double4 double_4;
-typedef double8 double_8;
-typedef double16 double_16;
-
-typedef norm1 norm_1;
-typedef norm2 norm_2;
-typedef norm3 norm_3;
-typedef norm4 norm_4;
-typedef norm8 norm_8;
-typedef norm16 norm_16;
-
-typedef unorm1 unorm_1;
-typedef unorm2 unorm_2;
-typedef unorm3 unorm_3;
-typedef unorm4 unorm_4;
-typedef unorm8 unorm_8;
-typedef unorm16 unorm_16;
-
-template<typename SCALAR_TYPE, int SIZE> 
-struct short_vector {
-#if !__HCC_AMP__
-  typedef typename __vector<SCALAR_TYPE,SIZE>::type type;
-#else
-  typedef typename std::conditional<SIZE==1
-                                  , SCALAR_TYPE
-                                  , __vector<SCALAR_TYPE,SIZE>>::type type;
-#endif
-};
-
-
-// short_vector_traits for single component vector
-template <typename SCALAR_TYPE>
-struct short_vector_traits {
-  static_assert((std::is_integral<SCALAR_TYPE>::value
-                || std::is_floating_point<SCALAR_TYPE>::value
-                || std::is_same<SCALAR_TYPE, norm>::value
-                || std::is_same<SCALAR_TYPE, unorm>::value
-#if !__HCC_AMP__
-                || std::is_same<SCALAR_TYPE,hc::half>::value
-#endif
-                )
-                , "short_vector of this data type is not supported");
-  typedef SCALAR_TYPE value_type;
-  static int const size = 1;
-};
-
-// short_vector_traits for non-single component vetor
-template <typename SCALAR_TYPE, int SIZE>
-struct short_vector_traits<__vector<SCALAR_TYPE, SIZE>> {
-  typedef typename __vector<SCALAR_TYPE, SIZE>::value_type value_type;
-  static int const size = __vector<SCALAR_TYPE, SIZE>::size;
-};
-
-
-
-template <typename SCALAR_TYPE, unsigned int VECTOR_LENGTH>
-class __vector_data_container {
-  static_assert((VECTOR_LENGTH==1 || VECTOR_LENGTH==2 || VECTOR_LENGTH==3 
-                || VECTOR_LENGTH==4 || VECTOR_LENGTH==8 || VECTOR_LENGTH==16)
-                , "short_vector of this size is not supported");
-};
-
-
-template <typename SCALAR_TYPE>
-class __vector_data_container<SCALAR_TYPE,1> {
-
-public:
-
-  static const unsigned int size = 1;
-  typedef SCALAR_TYPE value_type; 
-  typedef SCALAR_TYPE vector_value_type  __attribute__((ext_vector_type(size)));
-
-  union {
-    vector_value_type data;
-    SCALAR_TYPE           ar[size];
-    struct { SCALAR_TYPE  x; };
-  };
-
-  __vector_data_container() __CPU_GPU__ { 
-    data = static_cast<SCALAR_TYPE>(0); 
-  }
-
-  __vector_data_container(vector_value_type v) __CPU_GPU__ { 
-    data = v; 
-  }
-
-  __attribute__((annotate("user_deserialize")))
-  __vector_data_container(const SCALAR_TYPE x) __CPU_GPU__ {
-    data = { x };
-  }
-
-  __attribute__((annotate("serialize")))
-  void __cxxamp_serialize(Kalmar::Serialize& s) const {
-    for (auto &component : ar) {
-      s.Append(sizeof(SCALAR_TYPE), &component);
-    }
-  }
-};
-
-
-template <typename SCALAR_TYPE>
-class __vector_data_container<SCALAR_TYPE,2> {
-
-public:
-
-  static const unsigned int size = 2;
-  typedef SCALAR_TYPE value_type; 
-  typedef SCALAR_TYPE vector_value_type  __attribute__((ext_vector_type(size)));
-
-  union {
-    vector_value_type data;
-    SCALAR_TYPE           ar[size];
-    struct { SCALAR_TYPE  x,y; };
-  };
-
-  __vector_data_container() __CPU_GPU__ { 
-    data = static_cast<SCALAR_TYPE>(0); 
-  }
-
-  __vector_data_container(vector_value_type v) __CPU_GPU__ { 
-    data = v; 
-  }
-
-  __attribute__((annotate("user_deserialize")))
-  __vector_data_container(const SCALAR_TYPE x, const SCALAR_TYPE y) __CPU_GPU__ {
-    data = { x, y };
-  }
-
-  __attribute__((annotate("serialize")))
-  void __cxxamp_serialize(Kalmar::Serialize& s) const {
-    for (auto &component : ar) {
-      s.Append(sizeof(SCALAR_TYPE), &component);
-    }
-  }
-};
-
-
-template <typename SCALAR_TYPE>
-class __vector_data_container<SCALAR_TYPE,3> {
-
-public:
-
-  static const unsigned int size = 3;
-  typedef SCALAR_TYPE value_type; 
-  typedef SCALAR_TYPE vector_value_type  __attribute__((ext_vector_type(size)));
-
-  union {
-    vector_value_type data;
-    SCALAR_TYPE           ar[size];
-    struct { SCALAR_TYPE  x,y,z; };
-  };
-
-  __vector_data_container() __CPU_GPU__ { 
-    data = static_cast<SCALAR_TYPE>(0); 
-  }
-
-  __vector_data_container(vector_value_type v) __CPU_GPU__ { 
-    data = v; 
-  }
-
-  __attribute__((annotate("user_deserialize")))
-  __vector_data_container(const SCALAR_TYPE x, const SCALAR_TYPE y, const SCALAR_TYPE z) __CPU_GPU__ {
-    data = { x, y, z };
-  }
-
-  __attribute__((annotate("serialize")))
-  void __cxxamp_serialize(Kalmar::Serialize& s) const {
-    for (auto &component : ar) {
-      s.Append(sizeof(SCALAR_TYPE), &component);
-    }
-  }
-};
-
-
-template <typename SCALAR_TYPE>
-class __vector_data_container<SCALAR_TYPE,4> {
-
-public:
-
-  static const unsigned int size = 4;
-  typedef SCALAR_TYPE value_type; 
-  typedef SCALAR_TYPE vector_value_type  __attribute__((ext_vector_type(size)));
-
-  union {
-    vector_value_type data;
-    SCALAR_TYPE           ar[size];
-    struct { SCALAR_TYPE  x,y,z,w; };
-  };
-
-  __vector_data_container() __CPU_GPU__ { 
-    data = static_cast<SCALAR_TYPE>(0); 
-  }
-
-  __vector_data_container(vector_value_type v) __CPU_GPU__ { 
-    data = v; 
-  }
-
-  __attribute__((annotate("user_deserialize")))
-  __vector_data_container(const SCALAR_TYPE x, const SCALAR_TYPE y, const SCALAR_TYPE z, const SCALAR_TYPE w) __CPU_GPU__ {
-    data = { x,y,z,w };
-  }
-
-  __attribute__((annotate("serialize")))
-  void __cxxamp_serialize(Kalmar::Serialize& s) const {
-    for (auto &component : ar) {
-      s.Append(sizeof(SCALAR_TYPE), &component);
-    }
-  }
-};
-
-
-template <typename SCALAR_TYPE>
-class __vector_data_container<SCALAR_TYPE,8> {
-
-public:
-
-  static const unsigned int size = 8;
-  typedef SCALAR_TYPE value_type; 
-  typedef SCALAR_TYPE vector_value_type  __attribute__((ext_vector_type(size)));
-
-  union {
-    vector_value_type data;
-    SCALAR_TYPE           ar[size];
-    struct { SCALAR_TYPE  x,y,z,w,s4,s5,s6,s7; };
-  };
-
-  __vector_data_container() __CPU_GPU__ { 
-    data = static_cast<SCALAR_TYPE>(0); 
-  }
-
-  __vector_data_container(vector_value_type v) __CPU_GPU__ { 
-    data = v; 
-  }
-
-  __attribute__((annotate("user_deserialize")))
-  __vector_data_container(const SCALAR_TYPE x, const SCALAR_TYPE y, const SCALAR_TYPE z, const SCALAR_TYPE w
-     , const SCALAR_TYPE s4, const SCALAR_TYPE s5, const SCALAR_TYPE s6, const SCALAR_TYPE s7) __CPU_GPU__ {
-    data = { x,y,z,w,s4,s5,s6,s7 };
-  }
-
-  __attribute__((annotate("serialize")))
-  void __cxxamp_serialize(Kalmar::Serialize& s) const {
-    for (auto &component : ar) {
-      s.Append(sizeof(SCALAR_TYPE), &component);
-    }
-  }
-};
-
-
-template <typename SCALAR_TYPE>
-class __vector_data_container<SCALAR_TYPE,16> {
-
-public:
-
-  static const unsigned int size = 16;
-  typedef SCALAR_TYPE value_type; 
-  typedef SCALAR_TYPE vector_value_type  __attribute__((ext_vector_type(size)));
-
-  union {
-    vector_value_type data;
-    SCALAR_TYPE           ar[size];
-    struct { SCALAR_TYPE  x,y,z,w,s4,s5,s6,s7,s8,s9,sA,sB,sC,sD,sE,sF; };
-  };
-
-  __vector_data_container() __CPU_GPU__ { 
-    data = static_cast<SCALAR_TYPE>(0); 
-  }
-
-  __vector_data_container(vector_value_type v) __CPU_GPU__ { 
-    data = v; 
-  }
-
-  __attribute__((annotate("user_deserialize")))
-  __vector_data_container(const SCALAR_TYPE x, const SCALAR_TYPE y, const SCALAR_TYPE z, const SCALAR_TYPE w
-     , const SCALAR_TYPE s4, const SCALAR_TYPE s5, const SCALAR_TYPE s6, const SCALAR_TYPE s7
-     , const SCALAR_TYPE s8, const SCALAR_TYPE s9, const SCALAR_TYPE sA, const SCALAR_TYPE sB
-     , const SCALAR_TYPE sC, const SCALAR_TYPE sD, const SCALAR_TYPE sE, const SCALAR_TYPE sF) __CPU_GPU__ {
-    data = { x,y,z,w,s4,s5,s6,s7,s8,s9,sA,sB,sC,sD,sE,sF };
-  }
-
-  __attribute__((annotate("serialize")))
-  void __cxxamp_serialize(Kalmar::Serialize& s) const {
-    for (auto &component : ar) {
-      s.Append(sizeof(SCALAR_TYPE), &component);
-    }
-  }
-};
-
-
-
-// Implementation of a generic short vector
-template <typename SCALAR_TYPE, unsigned int VECTOR_LENGTH>
-class __vector : public __vector_data_container<SCALAR_TYPE, VECTOR_LENGTH>   {
-
-  static_assert((std::is_integral<SCALAR_TYPE>::value
-                || std::is_floating_point<SCALAR_TYPE>::value
-#if !__HCC_AMP__
-                || std::is_same<SCALAR_TYPE,hc::half>::value
-#endif
-                )
-                , "short_vector of this data type is not supported");
-
-  static_assert((VECTOR_LENGTH==1 || VECTOR_LENGTH==2 || VECTOR_LENGTH==3 
-                || VECTOR_LENGTH==4 || VECTOR_LENGTH==8 || VECTOR_LENGTH==16)
-                  , "short_vector of this size is not supported");
-
-public:
-  typedef SCALAR_TYPE value_type;
-  static const unsigned int size = VECTOR_LENGTH;
-  typedef __vector<value_type,size> __scalartype_N;
-  typedef value_type vector_value_type  __attribute__((ext_vector_type(size)));
-  typedef __vector_data_container<value_type,size> vector_container_type;
-
-private:
-  typedef value_type v1_type_internal  __attribute__((ext_vector_type(1)));
-  typedef value_type v2_type_internal  __attribute__((ext_vector_type(2)));
-  typedef value_type v3_type_internal  __attribute__((ext_vector_type(3)));
-  typedef value_type v4_type_internal  __attribute__((ext_vector_type(4)));
-  typedef value_type v8_type_internal  __attribute__((ext_vector_type(8)));
-  typedef value_type v16_type_internal  __attribute__((ext_vector_type(16)));
-
-
-public:
-
-  __vector() __CPU_GPU__ { } 
-
-  // the vector type overloaded constructor below already covers this scalar case
-  //__vector(value_type value) __CPU_GPU__ { data = { static_cast<value_type>(value), static_cast<value_type>(value)}; }
-  __vector(const vector_value_type& value) __CPU_GPU__ : vector_container_type(value) {}
-
-  __vector(const __scalartype_N& other) __CPU_GPU__ : vector_container_type(other.data) { }
-
-  // component-wise constructor
-  template<typename T = __scalartype_N
-          ,class = typename std::enable_if<T::size==2,value_type>::type > 
-  __vector(value_type x, value_type y) __CPU_GPU__ : vector_container_type(x,y) { }
-
-  template<typename T = __scalartype_N
-          ,class = typename std::enable_if<T::size==3,value_type>::type > 
-  __vector(value_type x, value_type y, value_type z) __CPU_GPU__ : vector_container_type(x,y,z) { }
-
-  template<typename T = __scalartype_N
-          ,class = typename std::enable_if<T::size==4,value_type>::type > 
-  __vector(value_type x, value_type y, value_type z, value_type w) __CPU_GPU__ : vector_container_type(x,y,z,w) { }
-
-  template<typename T = __scalartype_N
-          ,class = typename std::enable_if<T::size==8,value_type>::type > 
-  __vector(value_type x, value_type y
-           , value_type z, value_type w
-           , value_type s4, value_type s5
-           , value_type s6, value_type s7) __CPU_GPU__ : vector_container_type(x,y,z,w
-                                                                               ,s4,s5,s6,s7) { }
-
-  template<typename T = __scalartype_N
-          ,class = typename std::enable_if<T::size==16,value_type>::type > 
-  __vector(value_type x, value_type y
-          , value_type z, value_type w
-          , value_type s4, value_type s5
-          , value_type s6, value_type s7
-          , value_type s8, value_type s9
-          , value_type sA, value_type sB
-          , value_type sC, value_type sD
-          , value_type sE, value_type sF) __CPU_GPU__ : vector_container_type(x,y,z,w,s4,s5,s6,s7,s8
-                                                                              ,s9,sA,sB,sC,sD,sE,sF) { }
-
-  // conversion constructor from other short vector types
-  template <typename ST>
-  explicit __vector(const  __vector<ST,1>& other) __CPU_GPU__ 
-             : vector_container_type(other.x) {}
-
-  template <typename ST>
-  explicit __vector(const  __vector<ST,2>& other) __CPU_GPU__ 
-             : vector_container_type(other.x, other.y) { }
-
-  template < typename ST>
-  explicit __vector(const  __vector<ST,3>& other) __CPU_GPU__ 
-             : vector_container_type(other.x, other.y, other.z) { }
-  
-  template <typename ST>
-  explicit __vector(const  __vector<ST,4>& other) __CPU_GPU__
-             : vector_container_type(other.x, other.y, other.z, other.w) { }
-  
-  template <typename ST>
-  explicit __vector(const  __vector<ST,8>& other) __CPU_GPU__ 
-             : vector_container_type(other.x, other.y, other.z, other.w
-                                    , other.s4, other.s5, other.s6, other.s7) { }
-  
-   template <typename ST>
-  explicit __vector(const  __vector<ST,16>& other)  __CPU_GPU__ 
-             : vector_container_type(other.x, other.y, other.z, other.w
-                                    , other.s4, other.s5, other.s6, other.s7
-                                    , other.s8, other.s9, other.sA, other.sB
-                                    , other.sC, other.sD, other.sE, other.sF) { }
-
-  // one-component accessors
-
-#define DECLARE_VECTOR_ONE_COMPONENT_GET_SET(N,MIN_V_SIZE) \
-  template <typename T = __scalartype_N ,class = typename std::enable_if<T::size>=MIN_V_SIZE,value_type>::type > \
-  value_type get_s ##N() const __CPU_GPU__ { return this->data.s ##N; } \
-  template <typename T = __scalartype_N ,class = typename std::enable_if<T::size>=MIN_V_SIZE,value_type>::type > \
-  void set_s ##N(value_type v) __CPU_GPU__ { this->data.s ##N = v; }
-
-  DECLARE_VECTOR_ONE_COMPONENT_GET_SET(0,1)
-  DECLARE_VECTOR_ONE_COMPONENT_GET_SET(1,2)
-  DECLARE_VECTOR_ONE_COMPONENT_GET_SET(2,3)
-  DECLARE_VECTOR_ONE_COMPONENT_GET_SET(3,4)
-  DECLARE_VECTOR_ONE_COMPONENT_GET_SET(4,8)
-  DECLARE_VECTOR_ONE_COMPONENT_GET_SET(5,8)
-  DECLARE_VECTOR_ONE_COMPONENT_GET_SET(6,8)
-  DECLARE_VECTOR_ONE_COMPONENT_GET_SET(7,8)
-  DECLARE_VECTOR_ONE_COMPONENT_GET_SET(8,16)
-  DECLARE_VECTOR_ONE_COMPONENT_GET_SET(9,16)
-  DECLARE_VECTOR_ONE_COMPONENT_GET_SET(A,16)
-  DECLARE_VECTOR_ONE_COMPONENT_GET_SET(B,16)
-  DECLARE_VECTOR_ONE_COMPONENT_GET_SET(C,16)
-  DECLARE_VECTOR_ONE_COMPONENT_GET_SET(D,16)
-  DECLARE_VECTOR_ONE_COMPONENT_GET_SET(E,16)
-  DECLARE_VECTOR_ONE_COMPONENT_GET_SET(F,16)
-
-  value_type get_x() const __CPU_GPU__ { return get_s0(); }
-
-  template <typename T = __scalartype_N ,class = typename std::enable_if<T::size>=2,value_type>::type >
-  value_type get_y() const __CPU_GPU__ { return get_s1(); }
-
-  template <typename T = __scalartype_N ,class = typename std::enable_if<T::size>=3,value_type>::type >
-  value_type get_z() const __CPU_GPU__ { return get_s2(); }
-
-  template <typename T = __scalartype_N ,class = typename std::enable_if<T::size>=4,value_type>::type >
-  value_type get_w() const __CPU_GPU__ { return get_s3(); }
-
-  void set_x(value_type v) __CPU_GPU__ { set_s0(v); }
-
-  template <typename T = __scalartype_N ,class = typename std::enable_if<T::size>=2,value_type>::type >
-  void set_y(value_type v) __CPU_GPU__ { set_s1(v); }
-
-  template <typename T = __scalartype_N ,class = typename std::enable_if<T::size>=3,value_type>::type >
-  void set_z(value_type v) __CPU_GPU__ { set_s2(v); }
-
-  template <typename T = __scalartype_N ,class = typename std::enable_if<T::size>=4,value_type>::type >
-  void set_w(value_type v) __CPU_GPU__ { set_s3(v); }
-
-
-  // two-component accessors
-
-#define DECLARE_VECTOR_TWO_COMPONENT_GET_SET(C0,C1,MIN_V_SIZE) \
-  template <typename T = __scalartype_N ,class = typename std::enable_if<T::size>=MIN_V_SIZE,value_type>::type > \
-  __vector<value_type, 2> get_ ##C0 ##C1 () { return create_vector2(this->data.C0 ## C1); } \
-  template <typename T = __scalartype_N ,class = typename std::enable_if<T::size>=MIN_V_SIZE,value_type>::type > \
-  __vector<value_type, 2> get_ ##C1 ##C0 () { return create_vector2(this->data.C1 ## C0); } \
-  template <typename T = __scalartype_N ,class = typename std::enable_if<T::size>=MIN_V_SIZE,value_type>::type > \
-  void set_ ##C0 ##C1 (const __vector<value_type, 2>& v) { this->data.C0 ## C1 = v.get_vector();  } \
-  template <typename T = __scalartype_N ,class = typename std::enable_if<T::size>=MIN_V_SIZE,value_type>::type > \
-  void set_ ##C1 ##C0 (const __vector<value_type, 2>& v) { this->data.C1 ## C0 = v.get_vector();  } 
-
-  DECLARE_VECTOR_TWO_COMPONENT_GET_SET(x,y,2)
-  DECLARE_VECTOR_TWO_COMPONENT_GET_SET(x,z,3)
-  DECLARE_VECTOR_TWO_COMPONENT_GET_SET(x,w,4)
-  DECLARE_VECTOR_TWO_COMPONENT_GET_SET(y,z,3)
-  DECLARE_VECTOR_TWO_COMPONENT_GET_SET(y,w,4)
-  DECLARE_VECTOR_TWO_COMPONENT_GET_SET(w,z,4)
-
-
-  // three-component accessors
-#define DECLARE_VECTOR_THREE_COMPONENT_GET_SET_PAIR(C0,C1,C2,MIN_V_SIZE) \
-  template <typename T = __scalartype_N ,class = typename std::enable_if<T::size>=MIN_V_SIZE,value_type>::type > \
-  __vector<value_type, 3> get_ ##C0 ##C1 ## C2 () { return create_vector3(this->data.C0 ## C1 ## C2); } \
-  template <typename T = __scalartype_N ,class = typename std::enable_if<T::size>=MIN_V_SIZE,value_type>::type > \
-  void set_ ##C0 ##C1 ##C2 (const __vector<value_type, 3>& v) { this->data.C0 ## C1 ## C2 = v.get_vector(); }  
-
-#define DECLARE_VECTOR_THREE_COMPONENT_GET_SET(C0,C1,C2,MIN_V_SIZE) \
-  DECLARE_VECTOR_THREE_COMPONENT_GET_SET_PAIR(C0,C1,C2,MIN_V_SIZE) \
-  DECLARE_VECTOR_THREE_COMPONENT_GET_SET_PAIR(C0,C2,C1,MIN_V_SIZE) \
-  DECLARE_VECTOR_THREE_COMPONENT_GET_SET_PAIR(C1,C0,C2,MIN_V_SIZE) \
-  DECLARE_VECTOR_THREE_COMPONENT_GET_SET_PAIR(C1,C2,C0,MIN_V_SIZE) \
-  DECLARE_VECTOR_THREE_COMPONENT_GET_SET_PAIR(C2,C0,C1,MIN_V_SIZE) \
-  DECLARE_VECTOR_THREE_COMPONENT_GET_SET_PAIR(C2,C1,C0,MIN_V_SIZE) 
-
-  DECLARE_VECTOR_THREE_COMPONENT_GET_SET(x,y,z,3)
-  DECLARE_VECTOR_THREE_COMPONENT_GET_SET(x,y,w,4)
-  DECLARE_VECTOR_THREE_COMPONENT_GET_SET(x,z,w,4)
-  DECLARE_VECTOR_THREE_COMPONENT_GET_SET(y,z,w,4) 
-
-
-  // four-component accessors
-
-#define DECLARE_VECTOR_FOUR_COMPONENT_GET_SET_PAIR(C0,C1,C2,C3,MIN_V_SIZE) \
-  template <typename T = __scalartype_N ,class = typename std::enable_if<T::size>=MIN_V_SIZE,value_type>::type > \
-  __vector<value_type, 4> get_ ##C0 ##C1 ## C2 ## C3 () { return create_vector4(this->data.C0 ## C1 ## C2 ## C3); } \
-  template <typename T = __scalartype_N ,class = typename std::enable_if<T::size>=MIN_V_SIZE,value_type>::type > \
-  void set_ ##C0 ##C1 ##C2 ##C3 (const __vector<value_type, 4>& v) { this->data.C0 ## C1 ## C2 ## C3 = v.get_vector(); }  
-
-#define DECLARE_VECTOR_FOUR_COMPONENT_GET_SET(C0,C1,C2,C3,MIN_V_SIZE) \
-  DECLARE_VECTOR_FOUR_COMPONENT_GET_SET_PAIR(C0,C1,C2,C3,MIN_V_SIZE) \
-  DECLARE_VECTOR_FOUR_COMPONENT_GET_SET_PAIR(C0,C1,C3,C2,MIN_V_SIZE) \
-  DECLARE_VECTOR_FOUR_COMPONENT_GET_SET_PAIR(C0,C2,C1,C3,MIN_V_SIZE) \
-  DECLARE_VECTOR_FOUR_COMPONENT_GET_SET_PAIR(C0,C2,C3,C1,MIN_V_SIZE) \
-  DECLARE_VECTOR_FOUR_COMPONENT_GET_SET_PAIR(C0,C3,C1,C2,MIN_V_SIZE) \
-  DECLARE_VECTOR_FOUR_COMPONENT_GET_SET_PAIR(C0,C3,C2,C1,MIN_V_SIZE) \
-  DECLARE_VECTOR_FOUR_COMPONENT_GET_SET_PAIR(C1,C0,C2,C3,MIN_V_SIZE) \
-  DECLARE_VECTOR_FOUR_COMPONENT_GET_SET_PAIR(C1,C0,C3,C2,MIN_V_SIZE) \
-  DECLARE_VECTOR_FOUR_COMPONENT_GET_SET_PAIR(C1,C2,C0,C3,MIN_V_SIZE) \
-  DECLARE_VECTOR_FOUR_COMPONENT_GET_SET_PAIR(C1,C2,C3,C0,MIN_V_SIZE) \
-  DECLARE_VECTOR_FOUR_COMPONENT_GET_SET_PAIR(C1,C3,C0,C2,MIN_V_SIZE) \
-  DECLARE_VECTOR_FOUR_COMPONENT_GET_SET_PAIR(C1,C3,C2,C0,MIN_V_SIZE) \
-  DECLARE_VECTOR_FOUR_COMPONENT_GET_SET_PAIR(C2,C0,C1,C3,MIN_V_SIZE) \
-  DECLARE_VECTOR_FOUR_COMPONENT_GET_SET_PAIR(C2,C0,C3,C1,MIN_V_SIZE) \
-  DECLARE_VECTOR_FOUR_COMPONENT_GET_SET_PAIR(C2,C1,C0,C3,MIN_V_SIZE) \
-  DECLARE_VECTOR_FOUR_COMPONENT_GET_SET_PAIR(C2,C1,C3,C0,MIN_V_SIZE) \
-  DECLARE_VECTOR_FOUR_COMPONENT_GET_SET_PAIR(C2,C3,C0,C1,MIN_V_SIZE) \
-  DECLARE_VECTOR_FOUR_COMPONENT_GET_SET_PAIR(C2,C3,C1,C0,MIN_V_SIZE) \
-  DECLARE_VECTOR_FOUR_COMPONENT_GET_SET_PAIR(C3,C0,C1,C2,MIN_V_SIZE) \
-  DECLARE_VECTOR_FOUR_COMPONENT_GET_SET_PAIR(C3,C0,C2,C1,MIN_V_SIZE) \
-  DECLARE_VECTOR_FOUR_COMPONENT_GET_SET_PAIR(C3,C1,C0,C2,MIN_V_SIZE) \
-  DECLARE_VECTOR_FOUR_COMPONENT_GET_SET_PAIR(C3,C1,C2,C0,MIN_V_SIZE) \
-  DECLARE_VECTOR_FOUR_COMPONENT_GET_SET_PAIR(C3,C2,C0,C1,MIN_V_SIZE) \
-  DECLARE_VECTOR_FOUR_COMPONENT_GET_SET_PAIR(C3,C2,C1,C0,MIN_V_SIZE) 
-
-  DECLARE_VECTOR_FOUR_COMPONENT_GET_SET(x,y,z,w,4);
-
-
-  vector_value_type get_vector() const __CPU_GPU__ { return this->data;  }
-  void set_vector(vector_value_type v)  __CPU_GPU__ { this->data = v; }
-
-  __scalartype_N& operator=(const __scalartype_N& rhs) __CPU_GPU__ { 
-    this->data = rhs.data;
-    return *this;
-  }
-
-  __scalartype_N& operator++() __CPU_GPU__ { 
-     this->data += static_cast<vector_value_type>(static_cast<value_type>(1)); 
-     return *this; 
-  }
-  __scalartype_N operator++(int) __CPU_GPU__ { 
-    __scalartype_N r(*this);
-    operator++();
-    return r;
-  }
-  __scalartype_N& operator--() __CPU_GPU__ { 
-    this->data -= static_cast<vector_value_type>(static_cast<value_type>(1)); 
-    return *this;
-  }
-  __scalartype_N operator--(int) __CPU_GPU__ { 
-    __scalartype_N r(*this);
-    operator--();
-    return r;
-  }
-
-  __scalartype_N  operator+(const __scalartype_N& rhs) __CPU_GPU__ {
-    __scalartype_N r;   
-    r.data = this->data+rhs.data;
-    return r;
-  }
-  __scalartype_N& operator+=(const __scalartype_N& rhs) __CPU_GPU__ { 
-    this->data += rhs.data;
-    return *this;
-  }
-
-  __scalartype_N& operator-=(const __scalartype_N& rhs) __CPU_GPU__ { 
-    this->data -= rhs.data;
-    return *this;
-  }
- 
-  __scalartype_N& operator*=(const __scalartype_N& rhs) __CPU_GPU__ { 
-    this->data *= rhs.data;
-    return *this;
-  }
- 
-  __scalartype_N& operator/=(const __scalartype_N& rhs) __CPU_GPU__ { 
-    this->data /= rhs.data;
-    return *this;
-  }
-
-  __scalartype_N operator-() __CPU_GPU__ {
-    static_assert(std::is_signed<value_type>::value, "operator- can only support short vector of signed integral or floating-point types.");
-    __scalartype_N r;
-    r.data = -this->data;
-    return r;
-  }
-
-  __scalartype_N operator~() __CPU_GPU__ { 
-    static_assert(std::is_integral<value_type>::value, "operator~ can only support short vector of integral types.");
-    __scalartype_N r;
-    r.data = ~this->data;
-    return r;
-  }
-
-  __scalartype_N operator%(const __scalartype_N& lhs) __CPU_GPU__ { 
-    static_assert(std::is_integral<value_type>::value, "operator% can only support short vector of integral types.");
-    __scalartype_N r;
-    r.data = this->data%lhs.data;
-    return r;
-  }
-  __scalartype_N& operator%=(const __scalartype_N& lhs) __CPU_GPU__ { 
-    *this = *this%lhs;
-    return *this;
-  }
-
-  __scalartype_N operator^(const __scalartype_N& lhs) __CPU_GPU__ { 
-    static_assert(std::is_integral<value_type>::value, "operator^ can only support integral short vector.");
-    __scalartype_N r;
-    r.data = this->data^lhs.data;
-    return r;
-  }
-  __scalartype_N& operator^=(const __scalartype_N& lhs) __CPU_GPU__ { 
-    *this = *this^lhs;
-    return *this;
-  }
-
-  __scalartype_N operator|(const __scalartype_N& lhs) __CPU_GPU__ { 
-    static_assert(std::is_integral<value_type>::value, "operator| can only support integral short vector.");
-    __scalartype_N r;
-    r.data = this->data|lhs.data;
-    return r;
-  }
-  __scalartype_N& operator|=(const __scalartype_N& lhs) __CPU_GPU__ { 
-    *this = *this|lhs;
-    return *this;
-  }
-
-  __scalartype_N operator&(const __scalartype_N& lhs) __CPU_GPU__ { 
-   static_assert(std::is_integral<value_type>::value, "operator& can only support integral short vector.");
-    __scalartype_N r;
-    r.data = this->data&lhs.data;
-    return r;
-  }
-  __scalartype_N& operator&=(const __scalartype_N& lhs) __CPU_GPU__ { 
-    *this = *this&lhs;
-    return *this;
-  }
-
-  __scalartype_N operator>>(const __scalartype_N& lhs) __CPU_GPU__ { 
-    static_assert(std::is_integral<value_type>::value, "operator>> can only support integral short vector.");
-    __scalartype_N r;
-    r.data = this->data>>lhs.data;
-    return r;
-  }
-  __scalartype_N& operator>>=(const __scalartype_N& lhs) __CPU_GPU__ { 
-    *this = *this>>lhs;
-    return *this;
-  }
-
-  __scalartype_N operator<<(const __scalartype_N& lhs) __CPU_GPU__ { 
-    static_assert(std::is_integral<value_type>::value, "operator<< can only support integral short vector.");
-    __scalartype_N r;
-    r.data = this->data<<lhs.data;
-    return r;
-  }
-  __scalartype_N& operator<<=(const __scalartype_N& lhs) __CPU_GPU__ { 
-    *this = *this<<lhs;
-    return *this;
-  }
-
-  template <typename T = __scalartype_N
-            , class = typename std::enable_if<T::size==1,value_type>::type >
-  bool operator==(const __vector<value_type, 1>& rhs) __CPU_GPU__ { 
-    return (this->data.x == rhs.data.x); 
-  }
-
-  template <typename T = __scalartype_N
-            , class = typename std::enable_if<T::size==2,value_type>::type >
-  bool operator==(const __vector<value_type, 2>& rhs) __CPU_GPU__ { 
-    return (this->data.x == rhs.data.x 
-         && this->data.y == rhs.data.y); 
-  }
-
-  template <typename T = __scalartype_N
-            , class = typename std::enable_if<T::size==3,value_type>::type >
-  bool operator==(const __vector<value_type, 3>& rhs) __CPU_GPU__ { 
-    return   ((this->data.s0 == rhs.data.s0) && (this->data.s1 == rhs.data.s1))
-              && (this->data.s2 == rhs.data.s2);
-
-  }
-
-  template <typename T = __scalartype_N
-            , class = typename std::enable_if<T::size==4,value_type>::type >
-  bool operator==(const __vector<value_type, 4>& rhs) __CPU_GPU__ { 
-    return   ((this->data.s0 == rhs.data.s0) && (this->data.s1 == rhs.data.s1))
-              && ((this->data.s2 == rhs.data.s2) && (this->data.s3 == rhs.data.s3));
-
-  }
-
-  template <typename T = __scalartype_N
-            , class = typename std::enable_if<T::size==8,value_type>::type >
-  bool operator==(const __vector<value_type, 8>& rhs) __CPU_GPU__ {
-    return    (((this->data.s0 == rhs.data.s0) && (this->data.s1 == rhs.data.s1))
-              && ((this->data.s2 == rhs.data.s2) && (this->data.s3 == rhs.data.s3)))
-            &&  
-              (((this->data.s4 == rhs.data.s4) && (this->data.s5 == rhs.data.s5))
-              && ((this->data.s6 == rhs.data.s6) && (this->data.s7 == rhs.data.s7)))
-              ;
-  }
-
-  template <typename T = __scalartype_N
-            , class = typename std::enable_if<T::size==16,value_type>::type >
-  bool operator==(const __vector<value_type, 16>& rhs) __CPU_GPU__ {
-
-    return (   (((this->data.s0 == rhs.data.s0) && (this->data.s1 == rhs.data.s1))
-              && ((this->data.s2 == rhs.data.s2) && (this->data.s3 == rhs.data.s3)))
-            &&  
-              (((this->data.s4 == rhs.data.s4) && (this->data.s5 == rhs.data.s5))
-              && ((this->data.s6 == rhs.data.s6) && (this->data.s7 == rhs.data.s7)))
-           )
-           &&
-           (  (((this->data.s8 == rhs.data.s8) && (this->data.s9 == rhs.data.s9))
-              && ((this->data.sA == rhs.data.sA) && (this->data.sB == rhs.data.sB)))
-            &&  
-              (((this->data.sC == rhs.data.sC) && (this->data.sD == rhs.data.sD))
-              && ((this->data.sE == rhs.data.sE) && (this->data.sF == rhs.data.sF)))
-           )
-           ;
-  }
-
-  bool operator!=(const __scalartype_N& rhs) __CPU_GPU__ { return !(*this==rhs); }
-
-private:
-
-  __vector<value_type,2> create_vector2(v2_type_internal v) __CPU_GPU__ {
-    return __vector<value_type,2>(v);
-  }
-
-  __vector<value_type,3> create_vector3(v3_type_internal v) __CPU_GPU__ {
-    return __vector<value_type,3>(v);
-  }
-
-  __vector<value_type,4> create_vector4(v4_type_internal v) __CPU_GPU__ {
-    return __vector<value_type,4>(v);
-  }
-};
-
-
-template <typename SCALAR_TYPE, unsigned int VECTOR_LENGTH>
-__vector<SCALAR_TYPE,VECTOR_LENGTH> operator+(const __vector<SCALAR_TYPE,VECTOR_LENGTH>& lhs
-                                                          , const __vector<SCALAR_TYPE,VECTOR_LENGTH>& rhs) __CPU_GPU__ {
-  __vector<SCALAR_TYPE,VECTOR_LENGTH> r(lhs.get_vector() + rhs.get_vector());
-  return r;
-}
-
-
-template <typename SCALAR_TYPE, unsigned int VECTOR_LENGTH>
-__vector<SCALAR_TYPE,VECTOR_LENGTH> operator-(const __vector<SCALAR_TYPE,VECTOR_LENGTH>& lhs
-                                                          , const __vector<SCALAR_TYPE,VECTOR_LENGTH>& rhs) __CPU_GPU__ {
-  __vector<SCALAR_TYPE,VECTOR_LENGTH> r(lhs.get_vector() - rhs.get_vector());
-  return r;
-}
-
-template <typename SCALAR_TYPE, unsigned int VECTOR_LENGTH>
-__vector<SCALAR_TYPE,VECTOR_LENGTH> operator*(const __vector<SCALAR_TYPE,VECTOR_LENGTH>& lhs
-                                                          , const __vector<SCALAR_TYPE,VECTOR_LENGTH>& rhs) __CPU_GPU__ {
-  __vector<SCALAR_TYPE,VECTOR_LENGTH> r(lhs.get_vector() * rhs.get_vector());
-  return r;
-}
-
-template <typename SCALAR_TYPE, unsigned int VECTOR_LENGTH>
-__vector<SCALAR_TYPE,VECTOR_LENGTH> operator/(const __vector<SCALAR_TYPE,VECTOR_LENGTH>& lhs
-                                                          , const __vector<SCALAR_TYPE,VECTOR_LENGTH>& rhs) __CPU_GPU__ {
-  __vector<SCALAR_TYPE,VECTOR_LENGTH> r(lhs.get_vector() / rhs.get_vector());
-  return r;
-}
-
-// scalar * vector
-template <typename SCALAR_TYPE1, typename SCALAR_TYPE2, unsigned int VECTOR_LENGTH>
-typename std::enable_if<std::is_scalar<SCALAR_TYPE1>::value, __vector<SCALAR_TYPE2,VECTOR_LENGTH> >::type
-operator*(const SCALAR_TYPE1& lhs,
-          const __vector<SCALAR_TYPE2,VECTOR_LENGTH>& rhs) __CPU_GPU__ {
-  __vector<SCALAR_TYPE2,VECTOR_LENGTH> r(rhs.get_vector() * static_cast<SCALAR_TYPE2>(lhs));
-  return r;
-}
-
-// vector * scalar
-template <typename SCALAR_TYPE1, typename SCALAR_TYPE2, unsigned int VECTOR_LENGTH>
-typename std::enable_if<std::is_scalar<SCALAR_TYPE2>::value, __vector<SCALAR_TYPE1,VECTOR_LENGTH> >::type
-operator*(const __vector<SCALAR_TYPE1,VECTOR_LENGTH>& lhs,
-          const SCALAR_TYPE2& rhs) __CPU_GPU__ {
-  __vector<SCALAR_TYPE1,VECTOR_LENGTH> r(lhs.get_vector() * static_cast<SCALAR_TYPE1>(rhs));
-  return r;
-}
-
-// Specialization for norm, unorm
-template <bool normIsSigned, unsigned int VECTOR_LENGTH>
-class __vector<__amp_norm_template<normIsSigned>,VECTOR_LENGTH> :
-         public  __vector_data_container<float, VECTOR_LENGTH>  {
-
-  static_assert((VECTOR_LENGTH==1 || VECTOR_LENGTH==2 || VECTOR_LENGTH==3 
-                || VECTOR_LENGTH==4 || VECTOR_LENGTH==8 || VECTOR_LENGTH==16)
-                  , "short_vector of this size is not supported");
-
-public:
-  typedef __amp_norm_template<normIsSigned> value_type;
-  static const unsigned int size = VECTOR_LENGTH;
-  typedef __vector<value_type,size> __scalartype_N;
-  typedef float vector_value_type  __attribute__((ext_vector_type(size)));
-  typedef __vector_data_container<float,size> vector_container_type;
-
-private:
-  typedef float v1_type_internal  __attribute__((ext_vector_type(1)));
-  typedef float v2_type_internal  __attribute__((ext_vector_type(2)));
-  typedef float v3_type_internal  __attribute__((ext_vector_type(3)));
-  typedef float v4_type_internal  __attribute__((ext_vector_type(4)));
-  typedef float v8_type_internal  __attribute__((ext_vector_type(8)));
-  typedef float v16_type_internal  __attribute__((ext_vector_type(16)));
-
-  v1_type_internal clamp(v1_type_internal v) __CPU_GPU__ {
-    return { value_type(v.s0) };
-  }
-
-  v2_type_internal clamp(v2_type_internal v) __CPU_GPU__ {
-    return { value_type(v.s0)
-            ,value_type(v.s1)
-            };
-  }
-
-  v4_type_internal clamp(v4_type_internal v) __CPU_GPU__ {
-    return { value_type(v.s0)
-            ,value_type(v.s1)
-            ,value_type(v.s2)
-            ,value_type(v.s3)
-            };
-  }
-
-  v8_type_internal clamp(v8_type_internal v) __CPU_GPU__ {
-    return { value_type(v.s0)
-            ,value_type(v.s1)
-            ,value_type(v.s2)
-            ,value_type(v.s3)
-            ,value_type(v.s4)
-            ,value_type(v.s5)
-            ,value_type(v.s6)
-            ,value_type(v.s7)
-            };
-  }
-
-  v16_type_internal clamp(v16_type_internal v) __CPU_GPU__ {
-    return { value_type(v.s0)
-            ,value_type(v.s1)
-            ,value_type(v.s2)
-            ,value_type(v.s3)
-            ,value_type(v.s4)
-            ,value_type(v.s5)
-            ,value_type(v.s6)
-            ,value_type(v.s7)
-            ,value_type(v.s8)
-            ,value_type(v.s9)
-            ,value_type(v.sA)
-            ,value_type(v.sB)
-            ,value_type(v.sC)
-            ,value_type(v.sD)
-            ,value_type(v.sE)
-            ,value_type(v.sF)
-            };
-  }
-
-public:
-
-  __vector() __CPU_GPU__ { }
-
-  // the vector type overloaded constructor below already covers this scalar case
-  //__vector(value_type value) __CPU_GPU__ { data = { static_cast<value_type>(value), static_cast<value_type>(value)}; }
-  __vector(const vector_value_type& value) __CPU_GPU__   { set_vector(value); }
-
-  __vector(const __scalartype_N& other) __CPU_GPU__ : vector_container_type(other.data) {  }
-
-  // component-wise constructor
-  template<typename T = __scalartype_N
-          ,class = typename std::enable_if<T::size==2,value_type>::type > 
-  __vector(value_type x, value_type y) __CPU_GPU__ : vector_container_type(x,y) { }
-
-  template<typename T = __scalartype_N
-          ,class = typename std::enable_if<T::size==3,value_type>::type > 
-  __vector(value_type x, value_type y, value_type z) __CPU_GPU__ : vector_container_type(x,y,z) { }
-
-  template<typename T = __scalartype_N
-          ,class = typename std::enable_if<T::size==4,value_type>::type > 
-  __vector(value_type x, value_type y, value_type z, value_type w) __CPU_GPU__ : vector_container_type(x,y,z,w) { }
-
-  template<typename T = __scalartype_N
-          ,class = typename std::enable_if<T::size==8,value_type>::type > 
-  __vector(value_type x, value_type y
-           , value_type z, value_type w
-           , value_type s4, value_type s5
-           , value_type s6, value_type s7) __CPU_GPU__ : vector_container_type(x,y,z,w
-                                                                              ,s4,s5,s6,s7) { }
-
-  template<typename T = __scalartype_N
-          ,class = typename std::enable_if<T::size==16,value_type>::type > 
-  __vector(value_type x, value_type y
-          , value_type z, value_type w
-          , value_type s4, value_type s5
-          , value_type s6, value_type s7
-          , value_type s8, value_type s9
-          , value_type sA, value_type sB
-          , value_type sC, value_type sD
-          , value_type sE, value_type sF) __CPU_GPU__ : vector_container_type(x,y,z,w,s4,s5,s6,s7,s8
-                                                                              ,s9,sA,sB,sC,sD,sE,sF)  { }
-
-  
-  // conversion constructor from other short vector types
-  template <typename ST>
-  explicit __vector(const  __vector<ST,1>& other)  __CPU_GPU__ { this->data = { value_type(other.get_s0()) }; }
-
-  template <typename ST>
-  explicit __vector(const  __vector<ST,2>& other)  __CPU_GPU__ { this->data = { value_type(other.get_s0())
-                                                                               ,value_type(other.get_s1()) }; }
-
-  template < typename ST>
-  explicit __vector(const  __vector<ST,3>& other)  __CPU_GPU__ { this->data = { value_type(other.get_s0())
-                                                                               ,value_type(other.get_s1())
-                                                                               ,value_type(other.get_s2()) }; }
-
-  template <typename ST>
-  explicit __vector(const  __vector<ST,4>& other)  __CPU_GPU__ { this->data = { value_type(other.get_s0())
-                                                                               ,value_type(other.get_s1())
-                                                                               ,value_type(other.get_s2()) 
-                                                                               ,value_type(other.get_s3()) }; }
-
-  template <typename ST>
-  explicit __vector(const  __vector<ST,8>& other)  __CPU_GPU__ { this->data = { value_type(other.get_s0())
-                                                                               ,value_type(other.get_s1())
-                                                                               ,value_type(other.get_s2()) 
-                                                                               ,value_type(other.get_s3()) 
-                                                                               ,value_type(other.get_s4())
-                                                                               ,value_type(other.get_s5())
-                                                                               ,value_type(other.get_s6()) 
-                                                                               ,value_type(other.get_s7()) }; }
-
-  template <typename ST>
-  explicit __vector(const  __vector<ST,16>& other)  __CPU_GPU__ { this->data = { value_type(other.get_s0())
-                                                                                ,value_type(other.get_s1())
-                                                                                ,value_type(other.get_s2()) 
-                                                                                ,value_type(other.get_s3()) 
-                                                                                ,value_type(other.get_s4())
-                                                                                ,value_type(other.get_s5())
-                                                                                ,value_type(other.get_s6()) 
-                                                                                ,value_type(other.get_s7()) 
-                                                                                ,value_type(other.get_s8())
-                                                                                ,value_type(other.get_s9())
-                                                                                ,value_type(other.get_sA()) 
-                                                                                ,value_type(other.get_sB()) 
-                                                                                ,value_type(other.get_sC())
-                                                                                ,value_type(other.get_sD())
-                                                                                ,value_type(other.get_sE()) 
-                                                                                ,value_type(other.get_sF()) }; }
-
-
-
-  // one-component accessors
-  DECLARE_VECTOR_ONE_COMPONENT_GET_SET(0,1)
-  DECLARE_VECTOR_ONE_COMPONENT_GET_SET(1,2)
-  DECLARE_VECTOR_ONE_COMPONENT_GET_SET(2,3)
-  DECLARE_VECTOR_ONE_COMPONENT_GET_SET(3,4)
-  DECLARE_VECTOR_ONE_COMPONENT_GET_SET(4,8)
-  DECLARE_VECTOR_ONE_COMPONENT_GET_SET(5,8)
-  DECLARE_VECTOR_ONE_COMPONENT_GET_SET(6,8)
-  DECLARE_VECTOR_ONE_COMPONENT_GET_SET(7,8)
-  DECLARE_VECTOR_ONE_COMPONENT_GET_SET(8,16)
-  DECLARE_VECTOR_ONE_COMPONENT_GET_SET(9,16)
-  DECLARE_VECTOR_ONE_COMPONENT_GET_SET(A,16)
-  DECLARE_VECTOR_ONE_COMPONENT_GET_SET(B,16)
-  DECLARE_VECTOR_ONE_COMPONENT_GET_SET(C,16)
-  DECLARE_VECTOR_ONE_COMPONENT_GET_SET(D,16)
-  DECLARE_VECTOR_ONE_COMPONENT_GET_SET(E,16)
-  DECLARE_VECTOR_ONE_COMPONENT_GET_SET(F,16)
-
-  value_type get_x() const __CPU_GPU__ { return get_s0(); }
-
-  template <typename T = __scalartype_N ,class = typename std::enable_if<T::size>=2,value_type>::type >
-  value_type get_y() const __CPU_GPU__ { return get_s1(); }
-
-  template <typename T = __scalartype_N ,class = typename std::enable_if<T::size>=3,value_type>::type >
-  value_type get_z() const __CPU_GPU__ { return get_s2(); }
-
-  template <typename T = __scalartype_N ,class = typename std::enable_if<T::size>=4,value_type>::type >
-  value_type get_w() const __CPU_GPU__ { return get_s3(); }
-
-  void set_x(value_type v) __CPU_GPU__ { set_s0(v); }
-
-  template <typename T = __scalartype_N ,class = typename std::enable_if<T::size>=2,value_type>::type >
-  void set_y(value_type v) __CPU_GPU__ { set_s1(v); }
-
-  template <typename T = __scalartype_N ,class = typename std::enable_if<T::size>=3,value_type>::type >
-  void set_z(value_type v) __CPU_GPU__ { set_s2(v); }
-
-  template <typename T = __scalartype_N ,class = typename std::enable_if<T::size>=4,value_type>::type >
-  void set_w(value_type v) __CPU_GPU__ { set_s3(v); }
-
-
-  // two-component accessors
-  DECLARE_VECTOR_TWO_COMPONENT_GET_SET(x,y,2)
-  DECLARE_VECTOR_TWO_COMPONENT_GET_SET(x,z,3)
-  DECLARE_VECTOR_TWO_COMPONENT_GET_SET(x,w,4)
-  DECLARE_VECTOR_TWO_COMPONENT_GET_SET(y,z,3)
-  DECLARE_VECTOR_TWO_COMPONENT_GET_SET(y,w,4)
-  DECLARE_VECTOR_TWO_COMPONENT_GET_SET(w,z,4)
-
-
-  // three-component accessors
-  DECLARE_VECTOR_THREE_COMPONENT_GET_SET(x,y,z,3)
-  DECLARE_VECTOR_THREE_COMPONENT_GET_SET(x,y,w,4)
-  DECLARE_VECTOR_THREE_COMPONENT_GET_SET(x,z,w,4)
-  DECLARE_VECTOR_THREE_COMPONENT_GET_SET(y,z,w,4) 
-
-
-  // four-component accessors
-  DECLARE_VECTOR_FOUR_COMPONENT_GET_SET(x,y,z,w,4);
-
-  vector_value_type get_vector() const __CPU_GPU__ { return this->data; }
-  void set_vector(vector_value_type v)  __CPU_GPU__ { this->data = clamp(v); }
-
-  __scalartype_N& operator=(const __scalartype_N& rhs) __CPU_GPU__ { 
-    this->data = rhs.data;
-    return *this;
-  }
-
-  __scalartype_N& operator++() __CPU_GPU__ { 
-     set_vector(this->data + static_cast<vector_value_type>(static_cast<value_type>(1))); 
-     return *this; 
-  }
-  __scalartype_N operator++(int) __CPU_GPU__ { 
-    __scalartype_N r(*this);
-    operator++();
-    return r;
-  }
-  __scalartype_N& operator--() __CPU_GPU__ { 
-    set_vector(this->data - static_cast<vector_value_type>(static_cast<value_type>(1))); 
-    return *this;
-  }
-  __scalartype_N operator--(int) __CPU_GPU__ { 
-    __scalartype_N r(*this);
-    operator--();
-    return r;
-  }
-
-  __scalartype_N  operator+(const __scalartype_N& rhs) __CPU_GPU__ {
-    __scalartype_N r;   
-    r.set_vector(this->data+rhs.data);
-    return r;
-  }
-  __scalartype_N& operator+=(const __scalartype_N& rhs) __CPU_GPU__ { 
-    set_vector(this->data + rhs.data);
-    return *this;
-  }
-
-  __scalartype_N& operator-=(const __scalartype_N& rhs) __CPU_GPU__ { 
-    set_vector(this->data - rhs.data);
-    return *this;
-  }
- 
-  __scalartype_N& operator*=(const __scalartype_N& rhs) __CPU_GPU__ { 
-    set_vector(this->data * rhs.data);
-    return *this;
-  }
- 
-  __scalartype_N& operator/=(const __scalartype_N& rhs) __CPU_GPU__ { 
-    set_vector(this->data / rhs.data);
-    return *this;
-  }
-
-  __scalartype_N operator-() __CPU_GPU__ {
-    static_assert(normIsSigned, "operator- can only support short vector of signed integral or floating-point types.");
-    __scalartype_N r;
-    r.data = -this->data;
-    return r;
-  }
-
-  template <typename T = __scalartype_N
-            , class = typename std::enable_if<T::size==1,value_type>::type >
-  bool operator==(const __vector<value_type, 1>& rhs) __CPU_GPU__ { 
-    return (this->data.x == rhs.data.x); 
-  }
-
-  template <typename T = __scalartype_N
-            , class = typename std::enable_if<T::size==2,value_type>::type >
-  bool operator==(const __vector<value_type, 2>& rhs) __CPU_GPU__ { 
-    return (this->data.x == rhs.data.x 
-         && this->data.y == rhs.data.y); 
-  }
-
-  template <typename T = __scalartype_N
-            , class = typename std::enable_if<T::size==3,value_type>::type >
-  bool operator==(const __vector<value_type, 3>& rhs) __CPU_GPU__ { 
-    return   ((this->data.s0 == rhs.data.s0) && (this->data.s1 == rhs.data.s1))
-              && (this->data.s2 == rhs.data.s2);
-
-  }
-
-  template <typename T = __scalartype_N
-            , class = typename std::enable_if<T::size==4,value_type>::type >
-  bool operator==(const __vector<value_type, 4>& rhs) __CPU_GPU__ { 
-    return   ((this->data.s0 == rhs.data.s0) && (this->data.s1 == rhs.data.s1))
-              && ((this->data.s2 == rhs.data.s2) && (this->data.s3 == rhs.data.s3));
-
-  }
-
-  template <typename T = __scalartype_N
-            , class = typename std::enable_if<T::size==8,value_type>::type >
-  bool operator==(const __vector<value_type, 8>& rhs) __CPU_GPU__ {
-    return    (((this->data.s0 == rhs.data.s0) && (this->data.s1 == rhs.data.s1))
-              && ((this->data.s2 == rhs.data.s2) && (this->data.s3 == rhs.data.s3)))
-            &&  
-              (((this->data.s4 == rhs.data.s4) && (this->data.s5 == rhs.data.s5))
-              && ((this->data.s6 == rhs.data.s6) && (this->data.s7 == rhs.data.s7)))
-              ;
-  }
-
-  template <typename T = __scalartype_N
-            , class = typename std::enable_if<T::size==16,value_type>::type >
-  bool operator==(const __vector<value_type, 16>& rhs) __CPU_GPU__ {
-
-    return (   (((this->data.s0 == rhs.data.s0) && (this->data.s1 == rhs.data.s1))
-              && ((this->data.s2 == rhs.data.s2) && (this->data.s3 == rhs.data.s3)))
-            &&  
-              (((this->data.s4 == rhs.data.s4) && (this->data.s5 == rhs.data.s5))
-              && ((this->data.s6 == rhs.data.s6) && (this->data.s7 == rhs.data.s7)))
-           )
-           &&
-           (  (((this->data.s8 == rhs.data.s8) && (this->data.s9 == rhs.data.s9))
-              && ((this->data.sA == rhs.data.sA) && (this->data.sB == rhs.data.sB)))
-            &&  
-              (((this->data.sC == rhs.data.sC) && (this->data.sD == rhs.data.sD))
-              && ((this->data.sE == rhs.data.sE) && (this->data.sF == rhs.data.sF)))
-           )
-           ;
-  }
-
-  bool operator!=(const __scalartype_N& rhs) __CPU_GPU__ { return !(*this==rhs); }
-
-private:
-
-  __vector<value_type,2> create_vector2(v2_type_internal v) __CPU_GPU__ {
-    return __vector<value_type,2>(v);
-  }
-
-  __vector<value_type,3> create_vector3(v3_type_internal v) __CPU_GPU__ {
-    return __vector<value_type,3>(v);
-  }
-
-  __vector<value_type,4> create_vector4(v4_type_internal v) __CPU_GPU__ {
-    return __vector<value_type,4>(v);
-  }
-};
-
diff --git a/include/hcc_features.hpp b/include/hcc_features.hpp
index 4191f79cbae..9e620716395 100644
--- a/include/hcc_features.hpp
+++ b/include/hcc_features.hpp
@@ -4,6 +4,10 @@
 //
 #pragma once
 
+#if !defined(__HIPCC__)
+  #warning "This header is only intended for HIP usage, and not for direct inclusion."
+#endif
+
 //
 // If set, am_memtracker_update API accepts appPtr parm
 #define __HCC_HAS_EXTENDED_AM_MEMTRACKER_UPDATE (1)
diff --git a/include/hsa_atomic.h b/include/hsa_atomic.h
deleted file mode 100644
index 599dc2be568..00000000000
--- a/include/hsa_atomic.h
+++ /dev/null
@@ -1,143 +0,0 @@
-#pragma once
-
-#define HSAIL_BUILTIN_GPU __attribute__((hc)) 
-#define HSAIL_BUILTIN_CPU __attribute__((cpu)) inline
-
-#ifdef __KALMAR_ACCELERATOR__
-
-// fetch_add
-extern "C" HSAIL_BUILTIN_GPU int __hsail_atomic_fetch_add_int(int* dest, int val);
-extern "C" HSAIL_BUILTIN_GPU unsigned int __hsail_atomic_fetch_add_unsigned(unsigned int* dest, unsigned int val);
-extern "C" HSAIL_BUILTIN_GPU int64_t __hsail_atomic_fetch_add_int64(int64_t* dest, int64_t val);
-extern "C" HSAIL_BUILTIN_GPU uint64_t __hsail_atomic_fetch_add_uint64(uint64_t* dest, uint64_t val);
-
-// fetch_sub
-extern "C" HSAIL_BUILTIN_GPU int __hsail_atomic_fetch_sub_int(int* dest, int val);
-extern "C" HSAIL_BUILTIN_GPU unsigned int __hsail_atomic_fetch_sub_unsigned(unsigned int* dest, unsigned int val);
-extern "C" HSAIL_BUILTIN_GPU int64_t __hsail_atomic_fetch_sub_int64(int64_t* dest, int64_t val);
-extern "C" HSAIL_BUILTIN_GPU uint64_t __hsail_atomic_fetch_sub_uint64(uint64_t* dest, uint64_t val);
-
-// fetch_and
-extern "C" HSAIL_BUILTIN_GPU int __hsail_atomic_fetch_and_int(int* dest, int val);
-extern "C" HSAIL_BUILTIN_GPU unsigned int __hsail_atomic_fetch_and_unsigned(unsigned int* dest, unsigned int val);
-extern "C" HSAIL_BUILTIN_GPU int64_t __hsail_atomic_fetch_and_int64(int64_t* dest, int64_t val);
-extern "C" HSAIL_BUILTIN_GPU uint64_t __hsail_atomic_fetch_and_uint64(uint64_t* dest, uint64_t val);
-
-// fetch_or
-extern "C" HSAIL_BUILTIN_GPU int __hsail_atomic_fetch_or_int(int* dest, int val);
-extern "C" HSAIL_BUILTIN_GPU unsigned int __hsail_atomic_fetch_or_unsigned(unsigned int* dest, unsigned int val);
-extern "C" HSAIL_BUILTIN_GPU int64_t __hsail_atomic_fetch_or_int64(int64_t* dest, int64_t val);
-extern "C" HSAIL_BUILTIN_GPU uint64_t __hsail_atomic_fetch_or_uint64(uint64_t* dest, uint64_t val);
-
-// fetch_xor
-extern "C" HSAIL_BUILTIN_GPU int __hsail_atomic_fetch_xor_int(int* dest, int val);
-extern "C" HSAIL_BUILTIN_GPU unsigned int __hsail_atomic_fetch_xor_unsigned(unsigned int* dest, unsigned int val);
-extern "C" HSAIL_BUILTIN_GPU int64_t __hsail_atomic_fetch_xor_int64(int64_t* dest, int64_t val);
-extern "C" HSAIL_BUILTIN_GPU uint64_t __hsail_atomic_fetch_xor_uint64(uint64_t* dest, uint64_t val);
-
-// exchange
-extern "C" HSAIL_BUILTIN_GPU int __hsail_atomic_exchange_int(int* dest, int val);
-extern "C" HSAIL_BUILTIN_GPU unsigned int __hsail_atomic_exchange_unsigned(unsigned int* dest, unsigned int val);
-extern "C" HSAIL_BUILTIN_GPU int64_t __hsail_atomic_exchange_int64(int64_t* dest, int64_t val);
-extern "C" HSAIL_BUILTIN_GPU uint64_t __hsail_atomic_exchange_uint64(uint64_t* dest, uint64_t val);
-
-// compare_exchange
-extern "C" HSAIL_BUILTIN_GPU int __hsail_atomic_compare_exchange_int(int* dest, int compare, int val);
-extern "C" HSAIL_BUILTIN_GPU unsigned int __hsail_atomic_compare_exchange_unsigned(unsigned int* dest, unsigned int compare, unsigned int val);
-extern "C" HSAIL_BUILTIN_GPU int64_t __hsail_atomic_compare_exchange_int64(int64_t* dest, int64_t compare, int64_t val);
-extern "C" HSAIL_BUILTIN_GPU uint64_t __hsail_atomic_compare_exchange_uint64(uint64_t* dest, uint64_t compare, uint64_t val);
-
-#else
-
-// fetch_add
-extern "C" HSAIL_BUILTIN_CPU int __hsail_atomic_fetch_add_int(int* dest, int val)
-{ return __sync_fetch_and_add(dest, val); }
-
-extern "C" HSAIL_BUILTIN_CPU unsigned int __hsail_atomic_fetch_add_unsigned(unsigned int* dest, unsigned int val)
-{ return __sync_fetch_and_add(dest, val); }
-
-extern "C" HSAIL_BUILTIN_CPU int64_t __hsail_atomic_fetch_add_int64(int64_t* dest, int64_t val)
-{ return __sync_fetch_and_add(dest, val); }
-
-extern "C" HSAIL_BUILTIN_CPU uint64_t __hsail_atomic_fetch_add_uint64(uint64_t* dest, uint64_t val)
-{ return __sync_fetch_and_add(dest, val); }
-
-// fetch_sub
-extern "C" HSAIL_BUILTIN_CPU int __hsail_atomic_fetch_sub_int(int* dest, int val)
-{ return __sync_fetch_and_sub(dest, val); }
-
-extern "C" HSAIL_BUILTIN_CPU unsigned int __hsail_atomic_fetch_sub_unsigned(unsigned int* dest, unsigned int val)
-{ return __sync_fetch_and_sub(dest, val); }
-
-extern "C" HSAIL_BUILTIN_CPU int64_t __hsail_atomic_fetch_sub_int64(int64_t* dest, int64_t val)
-{ return __sync_fetch_and_sub(dest, val); }
-
-extern "C" HSAIL_BUILTIN_CPU uint64_t __hsail_atomic_fetch_sub_uint64(uint64_t* dest, uint64_t val)
-{ return __sync_fetch_and_sub(dest, val); }
-
-// fetch_and
-extern "C" HSAIL_BUILTIN_CPU int __hsail_atomic_fetch_and_int(int* dest, int val)
-{ return __sync_fetch_and_and(dest, val); }
-
-extern "C" HSAIL_BUILTIN_CPU unsigned int __hsail_atomic_fetch_and_unsigned(unsigned int* dest, unsigned int val)
-{ return __sync_fetch_and_and(dest, val); }
-
-extern "C" HSAIL_BUILTIN_CPU int64_t __hsail_atomic_fetch_and_int64(int64_t* dest, int64_t val)
-{ return __sync_fetch_and_and(dest, val); }
-
-extern "C" HSAIL_BUILTIN_CPU uint64_t __hsail_atomic_fetch_and_uint64(uint64_t* dest, uint64_t val)
-{ return __sync_fetch_and_and(dest, val); }
-
-// fetch_or
-extern "C" HSAIL_BUILTIN_CPU int __hsail_atomic_fetch_or_int(int* dest, int val)
-{ return __sync_fetch_and_or(dest, val); }
-
-extern "C" HSAIL_BUILTIN_CPU unsigned int __hsail_atomic_fetch_or_unsigned(unsigned int* dest, unsigned int val)
-{ return __sync_fetch_and_or(dest, val); }
-
-extern "C" HSAIL_BUILTIN_CPU int64_t __hsail_atomic_fetch_or_int64(int64_t* dest, int64_t val)
-{ return __sync_fetch_and_or(dest, val); }
-
-extern "C" HSAIL_BUILTIN_CPU uint64_t __hsail_atomic_fetch_or_uint64(uint64_t* dest, uint64_t val)
-{ return __sync_fetch_and_or(dest, val); }
-
-// fetch_xor
-extern "C" HSAIL_BUILTIN_CPU int __hsail_atomic_fetch_xor_int(int* dest, int val)
-{ return __sync_fetch_and_xor(dest, val); }
-
-extern "C" HSAIL_BUILTIN_CPU unsigned int __hsail_atomic_fetch_xor_unsigned(unsigned int* dest, unsigned int val)
-{ return __sync_fetch_and_xor(dest, val); }
-
-extern "C" HSAIL_BUILTIN_CPU int64_t __hsail_atomic_fetch_xor_int64(int64_t* dest, int64_t val)
-{ return __sync_fetch_and_xor(dest, val); }
-
-extern "C" HSAIL_BUILTIN_CPU uint64_t __hsail_atomic_fetch_xor_uint64(uint64_t* dest, uint64_t val)
-{ return __sync_fetch_and_xor(dest, val); }
-
-// exchange
-extern "C" HSAIL_BUILTIN_CPU int __hsail_atomic_exchange_int(int* dest, int val)
-{ return __sync_swap(dest, val); }
-
-extern "C" HSAIL_BUILTIN_CPU unsigned int __hsail_atomic_exchange_unsigned(unsigned int* dest, unsigned int val)
-{ return __sync_swap(dest, val); }
-
-extern "C" HSAIL_BUILTIN_CPU int64_t __hsail_atomic_exchange_int64(int64_t* dest, int64_t val)
-{ return __sync_swap(dest, val); }
-
-extern "C" HSAIL_BUILTIN_CPU uint64_t __hsail_atomic_exchange_uint64(uint64_t* dest, uint64_t val)
-{ return __sync_swap(dest, val); }
-
-// compare_exchange
-extern "C" HSAIL_BUILTIN_CPU int __hsail_atomic_compare_exchange_int(int* dest, int compare, int val)
-{ return __sync_val_compare_and_swap(dest, compare, val); }
-
-extern "C" HSAIL_BUILTIN_CPU unsigned int __hsail_atomic_compare_exchange_unsigned(unsigned int* dest, unsigned int compare, unsigned int val)
-{ return __sync_val_compare_and_swap(dest, compare, val); }
-
-extern "C" HSAIL_BUILTIN_CPU int64_t __hsail_atomic_compare_exchange_int64(int64_t* dest, int64_t compare, int64_t val)
-{ return __sync_val_compare_and_swap(dest, compare, val); }
-
-extern "C" HSAIL_BUILTIN_CPU uint64_t __hsail_atomic_compare_exchange_uint64(uint64_t* dest, uint64_t compare, uint64_t val)
-{ return __sync_val_compare_and_swap(dest, compare, val); }
-
-#endif
diff --git a/include/kalmar_aligned_alloc.h b/include/kalmar_aligned_alloc.h
index e2609c79569..e342775bce5 100644
--- a/include/kalmar_aligned_alloc.h
+++ b/include/kalmar_aligned_alloc.h
@@ -7,6 +7,11 @@
 
 #pragma once
 
+#if !defined(__HIPCC__)
+  #warning "This header is only intended for HIP usage, and not for direct inclusion."
+#endif
+
+#include <cassert>
 #include <memory>
 #include <stdlib.h>
 
diff --git a/include/kalmar_buffer.h b/include/kalmar_buffer.h
deleted file mode 100644
index 842c589f5d1..00000000000
--- a/include/kalmar_buffer.h
+++ /dev/null
@@ -1,110 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#pragma once
-
-#include "kalmar_runtime.h"
-#include "kalmar_serialize.h"
-
-/** \cond HIDDEN_SYMBOLS */
-namespace Kalmar {
-
-// Dummy interface that looks somewhat like std::shared_ptr<T>
-template <typename T>
-class _data {
-public:
-    _data() = delete;
-    _data(int count) : p_(nullptr) {}
-    _data(const _data& d) restrict(cpu, amp)
-        : p_(d.p_) {}
-    _data(int count, void* d) restrict(cpu, amp)
-        : p_(static_cast<T*>(d)) {}
-    template <typename U>
-        _data(const _data<U>& d) restrict(cpu, amp)
-        : p_(reinterpret_cast<T *>(d.get())) {}
-    __attribute__((annotate("user_deserialize")))
-        explicit _data(T* t) restrict(cpu, amp) { p_ = t; }
-    T* get(void) const restrict(cpu, amp) { return p_; }
-    T* get_device_pointer() const restrict(cpu, amp) { return p_; }
-    std::shared_ptr<KalmarQueue> get_av() const { return nullptr; }
-    void reset() const {}
-
-    T* map_ptr(bool modify, size_t count, size_t offset) const { return nullptr; }
-    void unmap_ptr(const void* addr, bool modify, size_t count, size_t offset) const {}
-    void synchronize(bool modify = false) const {}
-    void get_cpu_access(bool modify = false) const {}
-    void copy(_data<T> other, int, int, int) const {}
-    void write(const T*, int , int offset = 0, bool blocking = false) const {}
-    void read(T*, int , int offset = 0) const {}
-    void refresh() const {}
-    void set_const() const {}
-    access_type get_access() const { return access_type_auto; }
-    std::shared_ptr<KalmarQueue> get_stage() const { return nullptr; }
-
-private:
-    T* p_;
-};
-
-template <typename T>
-class _data_host {
-    mutable std::shared_ptr<rw_info> mm;
-    bool isArray;
-    template <typename U> friend class _data_host;
-public:
-    _data_host(size_t count, const void* src = nullptr)
-        : mm(std::make_shared<rw_info>(count*sizeof(T), const_cast<void*>(src))),
-        isArray(false) {}
-
-    _data_host(std::shared_ptr<KalmarQueue> av, std::shared_ptr<KalmarQueue> stage, int count,
-               access_type mode)
-        : mm(std::make_shared<rw_info>(av, stage, count*sizeof(T), mode)), isArray(true) {}
-
-    _data_host(std::shared_ptr<KalmarQueue> av, std::shared_ptr<KalmarQueue> stage, int count,
-               void* device_pointer, access_type mode)
-        : mm(std::make_shared<rw_info>(av, stage, count*sizeof(T), device_pointer, mode)), isArray(true) {}
-
-    _data_host(const _data_host& other) : mm(other.mm), isArray(false) {}
-
-    template <typename U>
-        _data_host(const _data_host<U>& other) : mm(other.mm), isArray(false) {}
-
-    T *get() const { return static_cast<T*>(mm->data); }
-    T* get_device_pointer() const { return static_cast<T*>(mm->get_device_pointer()); }
-    void synchronize(bool modify = false) const { mm->synchronize(modify); }
-    void discard() const { mm->disc(); }
-    void refresh() const {}
-    size_t size() const { return mm->count; }
-    void reset() const { mm.reset(); }
-    void get_cpu_access(bool modify = false) const { mm->get_cpu_access(modify); }
-    std::shared_ptr<KalmarQueue> get_av() const { return mm->master; }
-    std::shared_ptr<KalmarQueue> get_stage() const { return mm->stage; }
-    access_type get_access() const { return mm->mode; }
-    void copy(_data_host<T> other, int src_offset, int dst_offset, int size) const {
-        mm->copy(other.mm.get(), src_offset * sizeof(T), dst_offset * sizeof(T), size * sizeof(T));
-    }
-    void write(const T* src, int size, int offset = 0, bool blocking = false) const {
-        mm->write(src, size * sizeof(T), offset * sizeof(T), blocking);
-    }
-    void read(T* dst, int size, int offset = 0) const {
-        mm->read(dst, size * sizeof(T), offset * sizeof(T));
-    }
-    T* map_ptr(bool modify, size_t count, size_t offset) const {
-        return (T*)mm->map(count * sizeof(T), offset * sizeof(T), modify);
-    }
-    void unmap_ptr(const void* addr, bool modify, size_t count, size_t offset) const { return mm->unmap(const_cast<void*>(addr), count * sizeof(T), offset * sizeof(T), modify); }
-    void sync_to(std::shared_ptr<KalmarQueue> pQueue) const { mm->sync(pQueue, false); }
-
-    __attribute__((annotate("serialize")))
-        void __cxxamp_serialize(Serialize& s) const {
-            s.visit_buffer(mm.get(), !std::is_const<T>::value, isArray);
-        }
-    __attribute__((annotate("user_deserialize")))
-        explicit _data_host(typename std::remove_const<T>::type* t) {}
-};
-
-} // namespace Kalmar
-/** \endcond */
diff --git a/include/kalmar_cpu_launch.h b/include/kalmar_cpu_launch.h
deleted file mode 100644
index b442d14b23f..00000000000
--- a/include/kalmar_cpu_launch.h
+++ /dev/null
@@ -1,48 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#pragma once
-
-#include "hc_defines.h"
-#include "kalmar_runtime.h"
-#include "kalmar_serialize.h"
-
-namespace Kalmar {
-template <int D0, int D1=0, int D2=0> class tiled_extent;
-
-#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
-static const unsigned int NTHREAD = std::thread::hardware_concurrency();
-
-template <typename Kernel>
-class CPUKernelRAII
-{
-    const std::shared_ptr<Kalmar::KalmarQueue> pQueue;
-    const Kernel& f;
-    std::vector<std::thread> th;
-public:
-    CPUKernelRAII(const std::shared_ptr<Kalmar::KalmarQueue> pQueue, const Kernel& f)
-        : pQueue(pQueue), f(f), th(NTHREAD) {
-        CPUVisitor vis(pQueue);
-        Serialize s(&vis);
-        f.__cxxamp_serialize(s);
-        CLAMP::enter_kernel();
-    }
-    std::thread& operator[](int i) { return th[i]; }
-    ~CPUKernelRAII() {
-        for (auto& t : th)
-            if (t.joinable())
-                t.join();
-        CPUVisitor vis(pQueue);
-        Serialize ss(&vis);
-        f.__cxxamp_serialize(ss);
-        CLAMP::leave_kernel();
-    }
-};
-
-#endif
-
-}
diff --git a/include/kalmar_exception.h b/include/kalmar_exception.h
index b865540f9fc..227c1f6a08a 100644
--- a/include/kalmar_exception.h
+++ b/include/kalmar_exception.h
@@ -7,6 +7,10 @@
 
 #pragma once
 
+#if !defined(__HIPCC__)
+  #warning "This header is only intended for HIP usage, and not for direct inclusion."
+#endif
+
 #include <string>
 #include <exception>
 
diff --git a/include/kalmar_index.h b/include/kalmar_index.h
deleted file mode 100644
index c5ef82bb9c7..00000000000
--- a/include/kalmar_index.h
+++ /dev/null
@@ -1,587 +0,0 @@
-#pragma once
-
-//forward declaration
-namespace Concurrency {
-template <int N> class extent;
-} // namespace Concurrency
-
-//forward declaration
-namespace hc {
-template <int N> class extent;
-} // namespace hc
-
-namespace Kalmar {
-
-/** \cond HIDDEN_SYMBOLS */
-template <int...> struct __indices {};
-
-template <int _Sp, class _IntTuple, int _Ep>
-struct __make_indices_imp;
-
-template <int _Sp, int ..._Indices, int _Ep>
-struct __make_indices_imp<_Sp, __indices<_Indices...>, _Ep> {
-    typedef typename __make_indices_imp<_Sp+1, __indices<_Indices..., _Sp>, _Ep>::type type;
-};
-
-template <int _Ep, int ..._Indices>
-struct __make_indices_imp<_Ep, __indices<_Indices...>, _Ep> {
-    typedef __indices<_Indices...> type;
-};
-
-template <int _Ep, int _Sp = 0>
-struct __make_indices {
-    static_assert(_Sp <= _Ep, "__make_indices input error");
-    typedef typename __make_indices_imp<_Sp, __indices<>, _Ep>::type type;
-};
-
-template <int _Ip>
-class __index_leaf {
-    int __idx;
-    int dummy;
-public:
-    explicit __index_leaf(int __t) restrict(amp,cpu) : __idx(__t) {}
-
-    __index_leaf& operator=(const int __t) restrict(amp,cpu) {
-        __idx = __t;
-        return *this;
-    }
-    __index_leaf& operator+=(const int __t) restrict(amp,cpu) {
-        __idx += __t;
-        return *this;
-    }
-    __index_leaf& operator-=(const int __t) restrict(amp,cpu) {
-        __idx -= __t;
-        return *this;
-    }
-    __index_leaf& operator*=(const int __t) restrict(amp,cpu) {
-        __idx *= __t;
-        return *this;
-    }
-    __index_leaf& operator/=(const int __t) restrict(amp,cpu) {
-        __idx /= __t;
-        return *this;
-    }
-    __index_leaf& operator%=(const int __t) restrict(amp,cpu) {
-        __idx %= __t;
-        return *this;
-    }
-          int& get()       restrict(amp,cpu) { return __idx; }
-    const int& get() const restrict(amp,cpu) { return __idx; }
-};
-
-template <class _Indx> struct index_impl;
-
-template <int ...N>
-struct index_impl<__indices<N...> > : public __index_leaf<N>...  {
-    index_impl() restrict(amp,cpu) : __index_leaf<N>(0)... {}
-
-    template<class ..._Up>
-        explicit index_impl(_Up... __u) restrict(amp,cpu)
-        : __index_leaf<N>(__u)... {}
-
-    index_impl(const index_impl& other) restrict(amp,cpu)
-        : index_impl(static_cast<const __index_leaf<N>&>(other).get()...) {}
-
-    index_impl(int component) restrict(amp,cpu)
-        : __index_leaf<N>(component)... {}
-    index_impl(int components[]) restrict(amp,cpu)
-        : __index_leaf<N>(components[N])... {}
-    index_impl(const int components[]) restrict(amp,cpu)
-        : __index_leaf<N>(components[N])... {}
-
-    template<class ..._Tp>
-        inline void __swallow(_Tp...) restrict(amp,cpu) {}
-
-    int operator[] (unsigned int c) const restrict(amp,cpu) {
-        return static_cast<const __index_leaf<0>&>(*((__index_leaf<0> *)this + c)).get();
-    }
-    int& operator[] (unsigned int c) restrict(amp,cpu) {
-        return static_cast<__index_leaf<0>&>(*((__index_leaf<0> *)this + c)).get();
-    }
-    index_impl& operator=(const index_impl& __t) restrict(amp,cpu) {
-        __swallow(__index_leaf<N>::operator=(static_cast<const __index_leaf<N>&>(__t).get())...);
-        return *this;
-    }
-    index_impl& operator+=(const index_impl& __t) restrict(amp,cpu) {
-        __swallow(__index_leaf<N>::operator+=(static_cast<const __index_leaf<N>&>(__t).get())...);
-        return *this;
-    }
-    index_impl& operator-=(const index_impl& __t) restrict(amp,cpu) {
-        __swallow(__index_leaf<N>::operator-=(static_cast<const __index_leaf<N>&>(__t).get())...);
-        return *this;
-    }
-    index_impl& operator*=(const index_impl& __t) restrict(amp,cpu) {
-        __swallow(__index_leaf<N>::operator*=(static_cast<const __index_leaf<N>&>(__t).get())...);
-        return *this;
-    }
-    index_impl& operator/=(const index_impl& __t) restrict(amp,cpu) {
-        __swallow(__index_leaf<N>::operator/=(static_cast<const __index_leaf<N>&>(__t).get())...);
-        return *this;
-    }
-    index_impl& operator%=(const index_impl& __t) restrict(amp,cpu) {
-        __swallow(__index_leaf<N>::operator%=(static_cast<const __index_leaf<N>&>(__t).get())...);
-        return *this;
-    }
-    index_impl& operator+=(const int __t) restrict(amp,cpu) {
-        __swallow(__index_leaf<N>::operator+=(__t)...);
-        return *this;
-    }
-    index_impl& operator-=(const int __t) restrict(amp,cpu) {
-        __swallow(__index_leaf<N>::operator-=(__t)...);
-        return *this;
-    }
-    index_impl& operator*=(const int __t) restrict(amp,cpu) {
-        __swallow(__index_leaf<N>::operator*=(__t)...);
-        return *this;
-    }
-    index_impl& operator/=(const int __t) restrict(amp,cpu) {
-        __swallow(__index_leaf<N>::operator/=(__t)...);
-        return *this;
-    }
-    index_impl& operator%=(const int __t) restrict(amp,cpu) {
-        __swallow(__index_leaf<N>::operator%=(__t)...);
-        return *this;
-    }
-};
-
-template <int N, typename _Tp>
-struct index_helper
-{
-    static inline void set(_Tp& now) restrict(amp,cpu) {
-        now[N - 1] = amp_get_global_id(_Tp::rank - N);
-        index_helper<N - 1, _Tp>::set(now);
-    }
-    static inline bool equal(const _Tp& _lhs, const _Tp& _rhs) restrict(amp,cpu) {
-        return (_lhs[N - 1] == _rhs[N - 1]) &&
-            (index_helper<N - 1, _Tp>::equal(_lhs, _rhs));
-    }
-    static inline int count_size(const _Tp& now) restrict(amp,cpu) {
-        return now[N - 1] * index_helper<N - 1, _Tp>::count_size(now);
-    }
-};
-
-template<typename _Tp>
-struct index_helper<1, _Tp>
-{
-    static inline void set(_Tp& now) restrict(amp,cpu) {
-        now[0] = amp_get_global_id(_Tp::rank - 1);
-    }
-    static inline bool equal(const _Tp& _lhs, const _Tp& _rhs) restrict(amp,cpu) {
-        return (_lhs[0] == _rhs[0]);
-    }
-    static inline int count_size(const _Tp& now) restrict(amp,cpu) {
-        return now[0];
-    }
-};
-
-template <int N, typename _Tp1, typename _Tp2>
-struct amp_helper
-{
-    static bool inline contains(const _Tp1& idx, const _Tp2& ext) restrict(amp,cpu) {
-        return idx[N - 1] >= 0 && idx[N - 1] < ext[N - 1] &&
-            amp_helper<N - 1, _Tp1, _Tp2>::contains(idx, ext);
-    }
-
-    static bool inline contains(const _Tp1& idx, const _Tp2& ext,const _Tp2& ext2) restrict(amp,cpu) {
-        return idx[N - 1] >= 0 && ext[N - 1] > 0 && (idx[N - 1] + ext[N - 1]) <= ext2[N - 1] &&
-            amp_helper<N - 1, _Tp1, _Tp2>::contains(idx, ext,ext2);
-    }
-
-    static int inline flatten(const _Tp1& idx, const _Tp2& ext) restrict(amp,cpu) {
-        return idx[N - 1] + ext[N - 1] * amp_helper<N - 1, _Tp1, _Tp2>::flatten(idx, ext);
-    }
-    static void inline minus(const _Tp1& idx, _Tp2& ext) restrict(amp,cpu) {
-        ext.base_ -= idx.base_;
-    }
-};
-
-template <typename _Tp1, typename _Tp2>
-struct amp_helper<1, _Tp1, _Tp2>
-{
-    static bool inline contains(const _Tp1& idx, const _Tp2& ext) restrict(amp,cpu) {
-        return idx[0] >= 0 && idx[0] < ext[0];
-    }
-
-    static bool inline contains(const _Tp1& idx, const _Tp2& ext,const _Tp2& ext2) restrict(amp,cpu) {
-        return idx[0] >= 0 && ext[0] > 0 && (idx[0] + ext[0]) <= ext2[0] ;
-    }
-
-    static int inline flatten(const _Tp1& idx, const _Tp2& ext) restrict(amp,cpu) {
-        return idx[0];
-    }
-    static void inline minus(const _Tp1& idx, _Tp2& ext) restrict(amp,cpu) {
-        ext.base_ -= idx.base_;
-    }
-};
-/** \endcond */
-
-/**
- * Represents a unique position in N-dimensional space.
- *
- * @tparam N The dimensionality space into which this index applies. Special
- *           constructors are supplied for the cases where @f$N \in \{1,2,3\}@f$,
- *           but N can be any integer greater than 0.
- */
-template <int N>
-class index {
-public:
-    /**
-     * A static member of index<N> that contains the rank of this index.
-     */
-    static const int rank = N;
-
-    /**
-     * The element type of index<N>.
-     */
-    typedef int value_type;
-
-    /**
-     * Default constructor. The value at each dimension is initialized to zero.
-     * Thus, "index<3> ix;" initializes the variable to the position (0,0,0).
-     */
-    index() restrict(amp,cpu) : base_() {
-        static_assert( N>0, "rank should bigger than 0 ");
-    };
-
-    /**
-     * Copy constructor. Constructs a new index<N> from the supplied argument
-     * "other".
-     *
-     * @param[in] other An object of type index<N> from which to initialize
-     *                  this new index.
-     */
-    index(const index& other) restrict(amp,cpu)
-        : base_(other.base_) {}
-
-    /** @{ */
-    /**
-     * Constructs an index<N> with the coordinate values provided by @f$i_{0..2}@f$.
-     * These are specialized constructors that are only valid when the rank of
-     * the index @f$N \in \{1,2,3\}@f$. Invoking a specialized constructor whose argument
-     * @f$count \ne N@f$ will result in a compilation error.
-     *
-     * @param[in] i0 The component values of the index vector.
-     */
-    explicit index(int i0) restrict(amp,cpu)
-        : base_(i0) {}
-
-    template <typename ..._Tp>
-        explicit index(_Tp ... __t) restrict(amp,cpu)
-        : base_(__t...) {
-            static_assert(sizeof...(_Tp) <= 3, "Explicit constructor with rank greater than 3 is not allowed");
-            static_assert(sizeof...(_Tp) == N, "rank should be consistency");
-        }
-
-    /** @} */
-
-    /**
-     * Constructs an index<N> with the coordinate values provided the array of
-     * int component values. If the coordinate array length @f$\ne@f$ N, the
-     * behavior is undefined. If the array value is NULL or not a valid
-     * pointer, the behavior is undefined.
-     *
-     * @param[in] components An array of N int values.
-     */
-    explicit index(const int components[]) restrict(amp,cpu)
-        : base_(components) {}
-
-    /**
-     * Constructs an index<N> with the coordinate values provided the array of
-     * int component values. If the coordinate array length @f$\ne@f$ N, the
-     * behavior is undefined. If the array value is NULL or not a valid
-     * pointer, the behavior is undefined.
-     *
-     * @param[in] components An array of N int values.
-     */
-    // FIXME: this function is not defined in C++AMP specification.
-    explicit index(int components[]) restrict(amp,cpu)
-        : base_(components) {}
-
-    /**
-     * Assigns the component values of "other" to this index<N> object.
-     *
-     * @param[in] other An object of type index<N> from which to copy into this
-     *                  index.
-     * @return Returns *this.
-     */
-    index& operator=(const index& other) restrict(amp,cpu) {
-        base_.operator=(other.base_);
-        return *this;
-    }
-
-    /** @{ */
-    /**
-     * Returns the index component value at position c.
-     *
-     * @param[in] c The dimension axis whose coordinate is to be accessed.
-     * @return A the component value at position c.
-     */
-    int operator[] (unsigned int c) const restrict(amp,cpu) {
-        return base_[c];
-    }
-    int& operator[] (unsigned int c) restrict(amp,cpu) {
-        return base_[c];
-    }
-
-    /** @} */
-
-    /** @{ */
-    /**
-     * Compares two objects of index<N>.
-     *
-     * The expression
-     * @f$leftIdx \oplus rightIdx@f$
-     * is true if @f$leftIdx[i] \oplus rightIdx[i]@f$ for every i from 0 to N-1.
-     *
-     * @param[in] other The right-hand index<N> to be compared.
-     */
-    // FIXME: the signature is not entirely the same as defined in:
-    //        C++AMP spec v1.2 #1137
-    bool operator== (const index& other) const restrict(amp,cpu) {
-        return index_helper<N, index<N> >::equal(*this, other);
-    }
-    bool operator!= (const index& other) const restrict(amp,cpu) {
-        return !(*this == other);
-    }
-
-    /** @} */
-
-    /** @{ */
-    /**
-     * For a given operator @f$\oplus@f$, produces the same effect as
-     * (*this) = (*this) @f$\oplus@f$ rhs;
-     * The return value is "*this".
-     *
-     * @param[in] rhs The right-hand index<N> of the arithmetic operation.
-     */
-    index& operator+=(const index& rhs) restrict(amp,cpu) {
-        base_.operator+=(rhs.base_);
-        return *this;
-    }
-    index& operator-=(const index& rhs) restrict(amp,cpu) {
-        base_.operator-=(rhs.base_);
-        return *this;
-    }
-
-    // FIXME: this function is not defined in C++AMP specification.
-    index& operator*=(const index& __r) restrict(amp,cpu) {
-        base_.operator*=(__r.base_);
-        return *this;
-    }
-    // FIXME: this function is not defined in C++AMP specification.
-    index& operator/=(const index& __r) restrict(amp,cpu) {
-        base_.operator/=(__r.base_);
-        return *this;
-    }
-    // FIXME: this function is not defined in C++AMP specification.
-    index& operator%=(const index& __r) restrict(amp,cpu) {
-        base_.operator%=(__r.base_);
-        return *this;
-    }
-
-    /** @} */
-
-    /** @{ */
-    /**
-     * For a given operator @f$\oplus@f$, produces the same effect as
-     * (*this) = (*this) @f$\oplus@f$ value;
-     * The return value is "*this".
-     *
-     * @param[in] value The right-hand int of the arithmetic operation.
-     */
-    index& operator+=(int value) restrict(amp,cpu) {
-        base_.operator+=(value);
-        return *this;
-    }
-    index& operator-=(int value) restrict(amp,cpu) {
-        base_.operator-=(value);
-        return *this;
-    }
-    index& operator*=(int value) restrict(amp,cpu) {
-        base_.operator*=(value);
-        return *this;
-    }
-    index& operator/=(int value) restrict(amp,cpu) {
-        base_.operator/=(value);
-        return *this;
-    }
-    index& operator%=(int value) restrict(amp,cpu) {
-        base_.operator%=(value);
-        return *this;
-    }
-
-    /** @} */
-
-    /** @{ */
-    /**
-     * For a given operator @f$\oplus@f$, produces the same effect as
-     * (*this) = (*this) @f$\oplus@f$ 1;
-     *
-     * For prefix increment and decrement, the return value is "*this".
-     * Otherwise a new index<N> is returned.
-     */
-    index& operator++() restrict(amp,cpu) {
-        base_.operator+=(1);
-        return *this;
-    }
-    index operator++(int) restrict(amp,cpu) {
-        index ret = *this;
-        base_.operator+=(1);
-        return ret;
-    }
-    index& operator--() restrict(amp,cpu) {
-        base_.operator-=(1);
-        return *this;
-    }
-    index operator--(int) restrict(amp,cpu) {
-        index ret = *this;
-        base_.operator-=(1);
-        return ret;
-    }
-
-    /** @} */
-
-private:
-    typedef index_impl<typename __make_indices<N>::type> base;
-    base base_;
-    template <int T> friend class Concurrency::extent;
-    template <int T> friend class hc::extent;
-    template <int K, typename Q> friend struct index_helper;
-    template <int K, typename Q1, typename Q2> friend struct amp_helper;
-
-public:
-    __attribute__((annotate("__cxxamp_opencl_index")))
-    void __cxxamp_opencl_index() restrict(amp,cpu)
-#if __KALMAR_ACCELERATOR__ == 1
-    {
-        index_helper<N, index<N>>::set(*this);
-    }
-#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
-    {}
-#else
-    ;
-#endif
-};
-
-///////////////////////////////////////////////////////////////////////////////
-// explicit instantions
-///////////////////////////////////////////////////////////////////////////////
-template class index<1>;
-template class index<2>;
-template class index<3>;
-
-///////////////////////////////////////////////////////////////////////////////
-// operators for index<N>
-///////////////////////////////////////////////////////////////////////////////
-
-/** @{ */
-/**
- * Binary arithmetic operations that produce a new index<N> that is the result
- * of performing the corresponding pair-wise binary arithmetic operation on the
- * elements of the operands. The result index<N> is such that for a given
- * operator @f$\oplus@f$,
- * @f$result[i] = leftIdx[i] \oplus rightIdx[i]@f$
- * for every i from 0 to N-1.
- *
- * @param[in] lhs The left-hand index<N> of the arithmetic operation.
- * @param[in] rhs The right-hand index<N> of the arithmetic operation.
- */
-// FIXME: the signature is not entirely the same as defined in:
-//        C++AMP spec v1.2 #1138
-template <int N>
-index<N> operator+(const index<N>& lhs, const index<N>& rhs) restrict(amp,cpu) {
-    index<N> __r = lhs;
-    __r += rhs;
-    return __r;
-}
-template <int N>
-index<N> operator-(const index<N>& lhs, const index<N>& rhs) restrict(amp,cpu) {
-    index<N> __r = lhs;
-    __r -= rhs;
-    return __r;
-}
-
-/** @} */
-
-/** @{ */
-/**
- * Binary arithmetic operations that produce a new index<N> that is the result
- * of performing the corresponding binary arithmetic operation on the elements
- * of the index operands. The result index<N> is such that for a given
- * operator @f$\oplus@f$,
- * result[i] = idx[i] @f$\oplus@f$ value
- * or
- * result[i] = value @f$\oplus@f$ idx[i]
- * for every i from 0 to N-1.
- *
- * @param[in] idx The index<N> operand
- * @param[in] value The integer operand
- */
-// FIXME: the signature is not entirely the same as defined in:
-//        C++AMP spec v1.2 #1141
-template <int N>
-index<N> operator+(const index<N>& idx, int value) restrict(amp,cpu) {
-    index<N> __r = idx;
-    __r += value;
-    return __r;
-}
-template <int N>
-index<N> operator+(int value, const index<N>& idx) restrict(amp,cpu) {
-    index<N> __r = idx;
-    __r += value;
-    return __r;
-}
-template <int N>
-index<N> operator-(const index<N>& idx, int value) restrict(amp,cpu) {
-    index<N> __r = idx;
-    __r -= value;
-    return __r;
-}
-template <int N>
-index<N> operator-(int value, const index<N>& idx) restrict(amp,cpu) {
-    index<N> __r(value);
-    __r -= idx;
-    return __r;
-}
-template <int N>
-index<N> operator*(const index<N>& idx, int value) restrict(amp,cpu) {
-    index<N> __r = idx;
-    __r *= value;
-    return __r;
-}
-template <int N>
-index<N> operator*(int value, const index<N>& idx) restrict(amp,cpu) {
-    index<N> __r(value);
-    __r *= idx;
-    return __r;
-}
-template <int N>
-index<N> operator/(const index<N>& idx, int value) restrict(amp,cpu) {
-    index<N> __r = idx;
-    __r /= value;
-    return __r;
-}
-template <int N>
-index<N> operator/(int value, const index<N>& idx) restrict(amp,cpu) {
-    index<N> __r(value);
-    __r /= idx;
-    return __r;
-}
-template <int N>
-index<N> operator%(const index<N>& idx, int value) restrict(amp,cpu) {
-    index<N> __r = idx;
-    __r %= value;
-    return __r;
-}
-template <int N>
-index<N> operator%(int value, const index<N>& idx) restrict(amp,cpu) {
-    index<N> __r(value);
-    __r %= idx;
-    return __r;
-}
-
-/** @} */
-
-
-} // namespace Kalmar
-
diff --git a/include/kalmar_launch.h b/include/kalmar_launch.h
deleted file mode 100644
index e06f501d50b..00000000000
--- a/include/kalmar_launch.h
+++ /dev/null
@@ -1,126 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#pragma once
-
-#include "hc_defines.h"
-#include "kalmar_runtime.h"
-#include "kalmar_serialize.h"
-
-/** \cond HIDDEN_SYMBOLS */
-namespace Kalmar {
-
-template <typename Kernel>
-static void append_kernel(const std::shared_ptr<KalmarQueue>& pQueue, const Kernel& f, void* kernel)
-{
-  Kalmar::BufferArgumentsAppender vis(pQueue, kernel);
-  Kalmar::Serialize s(&vis);
-  f.__cxxamp_serialize(s);
-}
-
-template <typename Kernel>
-static inline std::shared_ptr<KalmarQueue> get_availabe_que(const Kernel& f)
-{
-    Kalmar::QueueSearcher ser;
-    Kalmar::Serialize s(&ser);
-    f.__cxxamp_serialize(s);
-    if (ser.get_que())
-        return ser.get_que();
-    else
-        return getContext()->auto_select();
-}
-
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wunused-variable"
-template<typename Kernel, int dim_ext>
-inline std::shared_ptr<KalmarAsyncOp>
-mcw_cxxamp_launch_kernel_async(const std::shared_ptr<KalmarQueue>& pQueue, size_t *ext,
-  size_t *local_size, const Kernel& f) restrict(cpu,amp) {
-#if __KALMAR_ACCELERATOR__ != 1
-  //Invoke Kernel::__cxxamp_trampoline as an kernel
-  //to ensure functor has right operator() defined
-  //this triggers the trampoline code being emitted
-  // FIXME: implicitly casting to avoid pointer to int error
-  int* foo = reinterpret_cast<int*>(&Kernel::__cxxamp_trampoline);
-  void *kernel = NULL;
-  {
-      std::string kernel_name(f.__cxxamp_trampoline_name());
-      kernel = CLAMP::CreateKernel(kernel_name, pQueue.get());
-  }
-  append_kernel(pQueue, f, kernel);
-  return pQueue->LaunchKernelAsync(kernel, dim_ext, ext, local_size);
-#endif
-}
-#pragma clang diagnostic pop
-
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wunused-variable"
-template<typename Kernel, int dim_ext>
-inline
-void mcw_cxxamp_launch_kernel(const std::shared_ptr<KalmarQueue>& pQueue, size_t *ext,
-                              size_t *local_size, const Kernel& f) restrict(cpu,amp) {
-#if __KALMAR_ACCELERATOR__ != 1
-  //Invoke Kernel::__cxxamp_trampoline as an kernel
-  //to ensure functor has right operator() defined
-  //this triggers the trampoline code being emitted
-  // FIXME: implicitly casting to avoid pointer to int error
-  int* foo = reinterpret_cast<int*>(&Kernel::__cxxamp_trampoline);
-  void *kernel = NULL;
-  {
-      std::string kernel_name(f.__cxxamp_trampoline_name());
-      kernel = CLAMP::CreateKernel(kernel_name, pQueue.get());
-  }
-  append_kernel(pQueue, f, kernel);
-  pQueue->LaunchKernel(kernel, dim_ext, ext, local_size);
-#endif // __KALMAR_ACCELERATOR__
-}
-#pragma clang diagnostic pop
-
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wunused-variable"
-template<typename Kernel>
-inline void* mcw_cxxamp_get_kernel(const std::shared_ptr<KalmarQueue>& pQueue, const Kernel& f) restrict(cpu,amp) {
-#if __KALMAR_ACCELERATOR__ != 1
-  //Invoke Kernel::__cxxamp_trampoline as an kernel
-  //to ensure functor has right operator() defined
-  //this triggers the trampoline code being emitted
-  // FIXME: implicitly casting to avoid pointer to int error
-  int* foo = reinterpret_cast<int*>(&Kernel::__cxxamp_trampoline);
-  void *kernel = NULL;
-  std::string kernel_name (f.__cxxamp_trampoline_name());
-  kernel = CLAMP::CreateKernel(kernel_name, pQueue.get());
-  return kernel;
-#else
-  return NULL;
-#endif
-}
-#pragma clang diagnostic pop
-
-template<typename Kernel, int dim_ext>
-inline
-void mcw_cxxamp_execute_kernel_with_dynamic_group_memory(
-  const std::shared_ptr<KalmarQueue>& pQueue, size_t *ext, size_t *local_size,
-  const Kernel& f, void *kernel, size_t dynamic_group_memory_size) restrict(cpu,amp) {
-#if __KALMAR_ACCELERATOR__ != 1
-  append_kernel(pQueue, f, kernel);
-  pQueue->LaunchKernelWithDynamicGroupMemory(kernel, dim_ext, ext, local_size, dynamic_group_memory_size);
-#endif // __KALMAR_ACCELERATOR__
-}
-
-template<typename Kernel, int dim_ext>
-inline std::shared_ptr<KalmarAsyncOp>
-mcw_cxxamp_execute_kernel_with_dynamic_group_memory_async(
-  const std::shared_ptr<KalmarQueue>& pQueue, size_t *ext, size_t *local_size,
-  const Kernel& f, void *kernel, size_t dynamic_group_memory_size) restrict(cpu,amp) {
-#if __KALMAR_ACCELERATOR__ != 1
-  append_kernel(pQueue, f, kernel);
-  return pQueue->LaunchKernelWithDynamicGroupMemoryAsync(kernel, dim_ext, ext, local_size, dynamic_group_memory_size);
-#endif // __KALMAR_ACCELERATOR__
-}
-
-} // namespace Kalmar
-/** \endcond */
diff --git a/include/kalmar_math.h b/include/kalmar_math.h
deleted file mode 100644
index 21fc8729239..00000000000
--- a/include/kalmar_math.h
+++ /dev/null
@@ -1,1663 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#pragma once
-
-#include <cmath>
-#include <stdexcept>
-
-extern "C" _Float16 __ocml_acos_f16(_Float16 x) [[hc]];
-extern "C" float __ocml_acos_f32(float x) [[hc]];
-extern "C" double __ocml_acos_f64(double x) [[hc]];
-
-extern "C" _Float16 __ocml_acosh_f16(_Float16 x) [[hc]];
-extern "C" float __ocml_acosh_f32(float x) [[hc]];
-extern "C" double __ocml_acosh_f64(double x) [[hc]];
-
-extern "C" _Float16 __ocml_asin_f16(_Float16 x) [[hc]];
-extern "C" float __ocml_asin_f32(float x) [[hc]];
-extern "C" double __ocml_asin_f64(double x) [[hc]];
-
-extern "C" _Float16 __ocml_asinh_f16(_Float16 x) [[hc]];
-extern "C" float __ocml_asinh_f32(float x) [[hc]];
-extern "C" double __ocml_asinh_f64(double x) [[hc]];
-
-extern "C" _Float16 __ocml_atan_f16(_Float16 x) [[hc]];
-extern "C" float __ocml_atan_f32(float x) [[hc]];
-extern "C" double __ocml_atan_f64(double x) [[hc]];
-
-extern "C" _Float16 __ocml_atanh_f16(_Float16 x) [[hc]];
-extern "C" float __ocml_atanh_f32(float x) [[hc]];
-extern "C" double __ocml_atanh_f64(double x) [[hc]];
-
-extern "C" _Float16 __ocml_atan2_f16(_Float16 y, _Float16 x) [[hc]];
-extern "C" float __ocml_atan2_f32(float y, float x) [[hc]];
-extern "C" double __ocml_atan2_f64(double y, double x) [[hc]];
-
-extern "C" _Float16 __ocml_cbrt_f16(_Float16 x) [[hc]];
-extern "C" float __ocml_cbrt_f32(float x) [[hc]];
-extern "C" double __ocml_cbrt_f64(double x) [[hc]];
-
-extern "C" _Float16 __ocml_ceil_f16(_Float16 x) [[hc]];
-extern "C" float __ocml_ceil_f32(float x) [[hc]];
-extern "C" double __ocml_ceil_f64(double x) [[hc]];
-
-extern "C" _Float16 __ocml_copysign_f16(_Float16 x, _Float16 y) [[hc]];
-extern "C" float __ocml_copysign_f32(float x, float y) [[hc]];
-extern "C" double __ocml_copysign_f64(double x, double y) [[hc]];
-
-extern "C" _Float16 __ocml_cos_f16(_Float16 x) [[hc]];
-extern "C" _Float16 __ocml_native_cos_f16(_Float16 x) [[hc]];
-extern "C" float __ocml_cos_f32(float x) [[hc]];
-extern "C" float __ocml_native_cos_f32(float x) [[hc]];
-extern "C" double __ocml_cos_f64(double x) [[hc]];
-
-extern "C" _Float16 __ocml_cosh_f16(_Float16 x) [[hc]];
-extern "C" float __ocml_cosh_f32(float x) [[hc]];
-extern "C" double __ocml_cosh_f64(double x) [[hc]];
-
-extern "C" _Float16 __ocml_cospi_f16(_Float16 x) [[hc]];
-extern "C" float __ocml_cospi_f32(float x) [[hc]];
-extern "C" double __ocml_cospi_f64(double x) [[hc]];
-
-extern "C" _Float16 __ocml_erf_f16(_Float16 x) [[hc]];
-extern "C" float __ocml_erf_f32(float x) [[hc]];
-extern "C" double __ocml_erf_f64(double x) [[hc]];
-
-extern "C" _Float16 __ocml_erfc_f16(_Float16 x) [[hc]];
-extern "C" float __ocml_erfc_f32(float x) [[hc]];
-extern "C" double __ocml_erfc_f64(double x) [[hc]];
-
-extern "C" _Float16 __ocml_erfcinv_f16(_Float16 x) [[hc]];
-extern "C" float __ocml_erfcinv_f32(float x) [[hc]];
-extern "C" double __ocml_erfcinv_f64(double x) [[hc]];
-
-extern "C" _Float16 __ocml_erfinv_f16(_Float16 x) [[hc]];
-extern "C" float __ocml_erfinv_f32(float x) [[hc]];
-extern "C" double __ocml_erfinv_f64(double x) [[hc]];
-
-extern "C" _Float16 __ocml_exp_f16(_Float16 x) [[hc]];
-extern "C" float __ocml_exp_f32(float x) [[hc]];
-extern "C" double __ocml_exp_f64(double x) [[hc]];
-
-extern "C" _Float16 __ocml_exp10_f16(_Float16 x) [[hc]];
-extern "C" float __ocml_exp10_f32(float x) [[hc]];
-extern "C" double __ocml_exp10_f64(double x) [[hc]];
-
-extern "C" _Float16 __ocml_native_exp2_f16(_Float16 x) [[hc]];
-extern "C" _Float16 __ocml_exp2_f16(_Float16 x) [[hc]];
-extern "C" float __ocml_exp2_f32(float x) [[hc]];
-extern "C" float __ocml_native_exp2_f32(float x) [[hc]];
-extern "C" double __ocml_exp2_f64(double x) [[hc]];
-
-extern "C" _Float16 __ocml_expm1_f16(_Float16 x) [[hc]];
-extern "C" float __ocml_expm1_f32(float x) [[hc]];
-extern "C" double __ocml_expm1_f64(double x) [[hc]];
-
-extern "C" _Float16 __ocml_fabs_f16(_Float16 x) [[hc]];
-extern "C" float __ocml_fabs_f32(float x) [[hc]];
-extern "C" double __ocml_fabs_f64(double x) [[hc]];
-
-extern "C" _Float16 __ocml_fdim_f16(_Float16 x, _Float16 y) [[hc]];
-extern "C" float __ocml_fdim_f32(float x, float y) [[hc]];
-extern "C" double __ocml_fdim_f64(double x, double y) [[hc]];
-
-extern "C" _Float16 __ocml_floor_f16(_Float16 x) [[hc]];
-extern "C" float __ocml_floor_f32(float x) [[hc]];
-extern "C" double __ocml_floor_f64(double x) [[hc]];
-
-extern "C" _Float16 __ocml_fma_f16(_Float16 x, _Float16 y, _Float16 z) [[hc]];
-extern "C" float __ocml_fma_f32(float x, float y, float z) [[hc]];
-extern "C" double __ocml_fma_f64(double x, double y, double z) [[hc]];
-
-extern "C" _Float16 __ocml_fmax_f16(_Float16 x, _Float16 y) [[hc]];
-extern "C" float __ocml_fmax_f32(float x, float y) [[hc]];
-extern "C" double __ocml_fmax_f64(double x, double y) [[hc]];
-
-extern "C" _Float16 __ocml_fmin_f16(_Float16 x, _Float16 y) [[hc]];
-extern "C" float __ocml_fmin_f32(float x, float y) [[hc]];
-extern "C" double __ocml_fmin_f64(double x, double y) [[hc]];
-
-extern "C" _Float16 __ocml_fmod_f16(_Float16 x, _Float16 y) [[hc]];
-extern "C" float __ocml_fmod_f32(float x, float y) [[hc]];
-extern "C" double __ocml_fmod_f64(double x, double y) [[hc]];
-
-extern "C" int __ocml_fpclassify_f16(_Float16 x) [[hc]];
-extern "C" int __ocml_fpclassify_f32(float x) [[hc]];
-extern "C" int __ocml_fpclassify_f64(double x) [[hc]];
-
-extern "C" _Float16 __ocml_frexp_f16(_Float16 x, __attribute__((address_space(5))) int *exp) [[hc]];
-extern "C" float __ocml_frexp_f32(float x, __attribute__((address_space(5))) int *exp) [[hc]];
-extern "C" double __ocml_frexp_f64(double x, __attribute__((address_space(5))) int *exp) [[hc]];
-
-extern "C" _Float16 __ocml_hypot_f16(_Float16 x, _Float16 y) [[hc]];
-extern "C" float __ocml_hypot_f32(float x, float y) [[hc]];
-extern "C" double __ocml_hypot_f64(double x, double y) [[hc]];
-
-extern "C" int __ocml_ilogb_f16(_Float16 x) [[hc]];
-extern "C" int __ocml_ilogb_f32(float x) [[hc]];
-extern "C" int __ocml_ilogb_f64(double x) [[hc]];
-
-extern "C" int __ocml_isfinite_f16(_Float16 x) [[hc]];
-extern "C" int __ocml_isfinite_f32(float x) [[hc]];
-extern "C" int __ocml_isfinite_f64(double x) [[hc]];
-
-extern "C" int __ocml_isinf_f16(_Float16 x) [[hc]];
-extern "C" int __ocml_isinf_f32(float x) [[hc]];
-extern "C" int __ocml_isinf_f64(double x) [[hc]];
-
-extern "C" int __ocml_isnan_f16(_Float16 x) [[hc]];
-extern "C" int __ocml_isnan_f32(float x) [[hc]];
-extern "C" int __ocml_isnan_f64(double x) [[hc]];
-
-extern "C" int __ocml_isnormal_f16(_Float16 x) [[hc]];
-extern "C" int __ocml_isnormal_f32(float x) [[hc]];
-extern "C" int __ocml_isnormal_f64(double x) [[hc]];
-
-extern "C" _Float16 __ocml_ldexp_f16(_Float16 x, std::int16_t exp) [[hc]];
-extern "C" float __ocml_ldexp_f32(float x, int exp) [[hc]];
-extern "C" double __ocml_ldexp_f64(double x, int exp) [[hc]];
-
-extern "C" _Float16 __ocml_lgamma_f16(_Float16 x) [[hc]];
-extern "C" float __ocml_lgamma_f32(float x) [[hc]];
-extern "C" double __ocml_lgamma_f64(double x) [[hc]];
-
-extern "C" _Float16 __ocml_log_f16(_Float16 x) [[hc]];
-extern "C" float __ocml_log_f32(float x) [[hc]];
-extern "C" double __ocml_log_f64(double x) [[hc]];
-
-extern "C" _Float16 __ocml_log10_f16(_Float16 x) [[hc]];
-extern "C" float __ocml_log10_f32(float x) [[hc]];
-extern "C" double __ocml_log10_f64(double x) [[hc]];
-
-extern "C" _Float16 __ocml_log2_f16(_Float16 x) [[hc]];
-extern "C" _Float16 __ocml_native_log2_f16(_Float16 x) [[hc]];
-extern "C" float __ocml_log2_f32(float x) [[hc]];
-extern "C" float __ocml_native_log2_f32(float x) [[hc]];
-extern "C" double __ocml_log2_f64(double x) [[hc]];
-
-extern "C" _Float16 __ocml_log1p_f16(_Float16 x) [[hc]];
-extern "C" float __ocml_log1p_f32(float x) [[hc]];
-extern "C" double __ocml_log1p_f64(double x) [[hc]];
-
-extern "C" _Float16 __ocml_logb_f16(_Float16 x) [[hc]];
-extern "C" float __ocml_logb_f32(float x) [[hc]];
-extern "C" double __ocml_logb_f64(double x) [[hc]];
-
-extern "C" _Float16 __ocml_modf_f16(_Float16 x, __attribute__((address_space(5))) _Float16 *iptr) [[hc]];
-extern "C" float __ocml_modf_f32(float x, __attribute__((address_space(5))) float *iptr) [[hc]];
-extern "C" double __ocml_modf_f64(double x, __attribute__((address_space(5))) double *iptr) [[hc]];
-
-extern "C" _Float16 __ocml_nan_f16(int tagp) [[hc]];
-extern "C" float __ocml_nan_f32(int tagp) [[hc]];
-extern "C" double __ocml_nan_f64(unsigned long tagp) [[hc]];
-
-extern "C" _Float16 __ocml_nearbyint_f16(_Float16 x) [[hc]];
-extern "C" float __ocml_nearbyint_f32(float x) [[hc]];
-extern "C" double __ocml_nearbyint_f64(double x) [[hc]];
-
-extern "C" _Float16 __ocml_nextafter_f16(_Float16 x, _Float16 y) [[hc]];
-extern "C" float __ocml_nextafter_f32(float x, float y) [[hc]];
-extern "C" double __ocml_nextafter_f64(double x, double y) [[hc]];
-
-extern "C" _Float16 __ocml_pow_f16(_Float16 x, _Float16 y) [[hc]];
-extern "C" float __ocml_pow_f32(float x, float y) [[hc]];
-extern "C" double __ocml_pow_f64(double x, double y) [[hc]];
-
-extern "C" _Float16 __ocml_rcbrt_f16(_Float16 x) [[hc]];
-extern "C" float __ocml_rcbrt_f32(float x) [[hc]];
-extern "C" double __ocml_rcbrt_f64(double x) [[hc]];
-
-// TODO: rcp is implementation only, it does not have a public interface.
-extern "C" _Float16 __hc_rcp_native_f16(_Float16 x) [[hc]];
-extern "C" float __hc_rcp_native(float x) [[hc]];
-
-extern "C" _Float16 __ocml_remainder_f16(_Float16 x, _Float16 y) [[hc]];
-extern "C" float __ocml_remainder_f32(float x, float y) [[hc]];
-extern "C" double __ocml_remainder_f64(double x, double y) [[hc]];
-
-extern "C" _Float16 __ocml_remquo_f16(_Float16 x, _Float16 y, __attribute__((address_space(5))) int *quo) [[hc]];
-extern "C" float __ocml_remquo_f32(float x, float y, __attribute__((address_space(5))) int *quo) [[hc]];
-extern "C" double __ocml_remquo_f64(double x, double y, __attribute__((address_space(5))) int *quo) [[hc]];
-
-extern "C" _Float16 __ocml_round_f16(_Float16 x) [[hc]];
-extern "C" float __ocml_round_f32(float x) [[hc]];
-extern "C" double __ocml_round_f64(double x) [[hc]];
-
-extern "C" _Float16 __ocml_rsqrt_f16(_Float16 x) [[hc]];
-extern "C" _Float16 __ocml_native_rsqrt_f16(_Float16 x) [[hc]];
-extern "C" float __ocml_rsqrt_f32(float x) [[hc]];
-extern "C" float __ocml_native_rsqrt_f32(float x) [[hc]];
-extern "C" double __ocml_rsqrt_f64(double x) [[hc]];
-
-extern "C" _Float16 __ocml_scalb_f16(_Float16 x, _Float16 exp) [[hc]];
-extern "C" float __ocml_scalb_f32(float x, float exp) [[hc]];
-extern "C" double __ocml_scalb_f64(double x, double exp) [[hc]];
-
-extern "C" _Float16 __ocml_scalbn_f16(_Float16 x, int exp) [[hc]];
-extern "C" float __ocml_scalbn_f32(float x, int exp) [[hc]];
-extern "C" double __ocml_scalbn_f64(double x, int exp) [[hc]];
-
-extern "C" _Float16 __ocml_sinpi_f16(_Float16 x) [[hc]];
-extern "C" float __ocml_sinpi_f32(float x) [[hc]];
-extern "C" double __ocml_sinpi_f64(double x) [[hc]];
-
-extern "C" int __ocml_signbit_f16(_Float16 x) [[hc]];
-extern "C" int __ocml_signbit_f32(float x) [[hc]];
-extern "C" int __ocml_signbit_f64(double x) [[hc]];
-
-extern "C" _Float16 __ocml_sin_f16(_Float16 x) [[hc]];
-extern "C" _Float16 __ocml_native_sin_f16(_Float16 x) [[hc]];
-extern "C" float __ocml_sin_f32(float x) [[hc]];
-extern "C" float __ocml_native_sin_f32(float x) [[hc]];
-extern "C" double __ocml_sin_f64(double x) [[hc]];
-
-extern "C" _Float16 __ocml_sincos_f16(_Float16 x, __attribute__((address_space(5))) _Float16 *c) [[hc]];
-extern "C" float __ocml_sincos_f32(float x, __attribute__((address_space(5))) float *c) [[hc]];
-extern "C" double __ocml_sincos_f64(double x, __attribute__((address_space(5))) double *c) [[hc]];
-
-extern "C" _Float16 __ocml_sinh_f16(_Float16 x) [[hc]];
-extern "C" float __ocml_sinh_f32(float x) [[hc]];
-extern "C" double __ocml_sinh_f64(double x) [[hc]];
-
-extern "C" _Float16 __ocml_sqrt_f16(_Float16 x) [[hc]];
-extern "C" _Float16 __ocml_native_sqrt_f16(_Float16 x) [[hc]];
-extern "C" float __ocml_sqrt_f32(float x) [[hc]];
-extern "C" float __ocml_native_sqrt_f32(float x) [[hc]];
-extern "C" double __ocml_sqrt_f64(double x) [[hc]];
-
-extern "C" _Float16 __ocml_tgamma_f16(_Float16 x) [[hc]];
-extern "C" float __ocml_tgamma_f32(float x) [[hc]];
-extern "C" double __ocml_tgamma_f64(double x) [[hc]];
-
-extern "C" _Float16 __ocml_tan_f16(_Float16 x) [[hc]];
-extern "C" float __ocml_tan_f32(float x) [[hc]];
-extern "C" double __ocml_tan_f64(double x) [[hc]];
-
-extern "C" _Float16 __ocml_tanh_f16(_Float16 x) [[hc]];
-extern "C" float __ocml_tanh_f32(float x) [[hc]];
-extern "C" double __ocml_tanh_f64(double x) [[hc]];
-
-extern "C" _Float16 __ocml_tanpi_f16(_Float16 x) [[hc]];
-extern "C" float __ocml_tanpi_f32(float x) [[hc]];
-extern "C" double __ocml_tanpi_f64(double x) [[hc]];
-
-extern "C" _Float16 __ocml_trunc_f16(_Float16 x) [[hc]];
-extern "C" float __ocml_trunc_f32(float x) [[hc]];
-extern "C" double __ocml_trunc_f64(double x) [[hc]];
-
-#define HCC_MATH_LIB_FN inline __attribute__((used, hc))
-namespace Kalmar
-{
-    namespace fast_math
-    {
-        using std::acos;
-        using ::acosf;
-        using std::asin;
-        using ::asinf;
-        using std::atan;
-        using ::atanf;
-        using std::atan2;
-        using ::atan2f;
-        using std::ceil;
-        using ::ceilf;
-        using std::cos;
-        using ::cosf;
-        using std::cosh;
-        using ::coshf;
-        using std::exp;
-        using ::exp10;
-        using std::exp2;
-        using ::exp10f;
-        using ::exp2f;
-        using ::expf;
-        using std::fabs;
-        using ::fabsf;
-        using std::floor;
-        using ::floorf;
-        using std::fmax;
-        using ::fmaxf;
-        using std::fmin;
-        using ::fminf;
-        using std::fmod;
-        using ::fmodf;
-        using std::frexp;
-        using ::frexpf;
-        using std::isfinite;
-        using std::isinf;
-        using std::isnan;
-        using std::isnormal;
-        using std::ldexp;
-        using ::ldexpf;
-        using std::log;
-        using ::logf;
-        using std::log10;
-        using ::log10f;
-        using std::log2;
-        using ::log2f;
-        using std::modf;
-        using ::modff;
-        using std::pow;
-        using ::powf;
-        using std::round;
-        using ::roundf;
-        using std::signbit;
-        using std::sin;
-        using ::sinf;
-        using std::sinh;
-        using ::sinhf;
-        using std::sqrt;
-        using ::sqrtf;
-        using std::tan;
-        using ::tanf;
-        using std::tanh;
-        using ::tanhf;
-        using std::trunc;
-        using ::truncf;
-
-        HCC_MATH_LIB_FN
-        float acosf(float x) { return __ocml_acos_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 acos(_Float16 x) { return __ocml_acos_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float acos(float x) { return fast_math::acosf(x); }
-
-        HCC_MATH_LIB_FN
-        float asinf(float x) { return __ocml_asin_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 asin(_Float16 x) { return __ocml_asin_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float asin(float x) { return fast_math::asinf(x); }
-
-        HCC_MATH_LIB_FN
-        float atanf(float x) { return __ocml_atan_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 atan(_Float16 x) { return __ocml_atan_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float atan(float x) { return fast_math::atanf(x); }
-
-        HCC_MATH_LIB_FN
-        float atan2f(float y, float x) { return __ocml_atan2_f32(y, x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 atan2(_Float16 y, _Float16 x) { return __ocml_atan2_f16(y, x); }
-
-        HCC_MATH_LIB_FN
-        float atan2(float y, float x) { return fast_math::atan2f(y, x); }
-
-        HCC_MATH_LIB_FN
-        float ceilf(float x) { return __ocml_ceil_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 ceil(_Float16 x) { return __ocml_ceil_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float ceil(float x) { return fast_math::ceilf(x); }
-
-        HCC_MATH_LIB_FN
-        float cosf(float x) { return __ocml_native_cos_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 cos(_Float16 x) { return __ocml_native_cos_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float cos(float x) { return fast_math::cosf(x); }
-
-        HCC_MATH_LIB_FN
-        float coshf(float x) { return __ocml_cosh_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 cosh(_Float16 x) { return __ocml_cosh_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float cosh(float x) { return fast_math::coshf(x); }
-
-        HCC_MATH_LIB_FN
-        float expf(float x) { return __ocml_native_exp2_f32(M_LOG2E * x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 exp(_Float16 x) { return __ocml_native_exp2_f16(M_LOG2E * x); }
-
-        HCC_MATH_LIB_FN
-        float exp(float x) { return fast_math::expf(x); }
-
-        HCC_MATH_LIB_FN
-        float exp2f(float x) { return __ocml_native_exp2_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 exp2(_Float16 x) { return __ocml_native_exp2_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float exp2(float x) { return fast_math::exp2f(x); }
-
-        HCC_MATH_LIB_FN
-        float fabsf(float x) { return __ocml_fabs_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 fabs(_Float16 x) { return __ocml_fabs_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float fabs(float x) { return fast_math::fabsf(x); }
-
-        HCC_MATH_LIB_FN
-        float floorf(float x) { return __ocml_floor_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 floor(_Float16 x) { return __ocml_floor_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float floor(float x) { return fast_math::floorf(x); }
-
-        HCC_MATH_LIB_FN
-        float fmaxf(float x, float y) { return __ocml_fmax_f32(x, y); }
-
-        HCC_MATH_LIB_FN
-        _Float16 fmax(_Float16 x, _Float16 y) { return __ocml_fmax_f16(x, y); }
-
-        HCC_MATH_LIB_FN
-        float fmax(float x, float y) { return fast_math::fmaxf(x, y); }
-
-        HCC_MATH_LIB_FN
-        float fminf(float x, float y) { return __ocml_fmin_f32(x, y); }
-
-        HCC_MATH_LIB_FN
-        _Float16 fmin(_Float16 x, _Float16 y) { return __ocml_fmin_f16(x, y); }
-
-        HCC_MATH_LIB_FN
-        float fmin(float x, float y) { return fast_math::fminf(x, y); }
-
-        HCC_MATH_LIB_FN
-        float fmodf(float x, float y) { return __ocml_fmod_f32(x, y); }
-
-        HCC_MATH_LIB_FN
-        _Float16 fmod(_Float16 x, _Float16 y) { return __ocml_fmod_f16(x, y); }
-
-        HCC_MATH_LIB_FN
-        float fmod(float x, float y) { return fast_math::fmodf(x, y); }
-
-        HCC_MATH_LIB_FN
-        float frexpf(float x, int *exp) {
-        	int e;
-        	float ret = __ocml_frexp_f32(x, (__attribute__((address_space(5))) int*) &e);
-        	*exp = e; return ret;
-        }
-
-        HCC_MATH_LIB_FN
-        _Float16 frexp(_Float16 x, int *exp) {
-        	int e;
-        	_Float16 ret = __ocml_frexp_f16(x, (__attribute__((address_space(5))) int*) &e);
-            *exp = e; return ret;
-        }
-
-        HCC_MATH_LIB_FN
-        float frexp(float x, int *exp) { return fast_math::frexpf(x, exp); }
-
-        HCC_MATH_LIB_FN
-        int isfinite(_Float16 x) { return __ocml_isfinite_f16(x); }
-
-        HCC_MATH_LIB_FN
-        int isfinite(float x) { return __ocml_isfinite_f32(x); }
-
-        HCC_MATH_LIB_FN
-        int isinf(_Float16 x) { return __ocml_isinf_f16(x); }
-
-        HCC_MATH_LIB_FN
-        int isinf(float x) { return __ocml_isinf_f32(x); }
-
-        HCC_MATH_LIB_FN
-        int isnan(_Float16 x) { return __ocml_isnan_f16(x); }
-
-        HCC_MATH_LIB_FN
-        int isnan(float x) { return __ocml_isnan_f32(x); }
-
-        HCC_MATH_LIB_FN
-        float ldexpf(float x, int exp) { return __ocml_ldexp_f32(x,exp); }
-
-        HCC_MATH_LIB_FN
-        _Float16 ldexp(_Float16 x, std::uint16_t exp)
-        {
-            return __ocml_ldexp_f16(x, exp);
-        }
-
-        HCC_MATH_LIB_FN
-        float ldexp(float x, int exp) { return fast_math::ldexpf(x, exp); }
-
-        namespace
-        {   // TODO: this is temporary, lifted straight out of irif.h.
-            // Namespace is merely for documentation.
-            #define M_LOG2_10_F 0x1.a934f0p+1f
-            // Value of 1 / log2(10)
-            #define M_RLOG2_10_F 0x1.344136p-2f
-            // Value of 1 / M_LOG2E_F = 1 / log2(e)
-            #define M_RLOG2_E_F 0x1.62e430p-1f
-        }
-
-        HCC_MATH_LIB_FN
-        float logf(float x) { return __ocml_native_log2_f32(x) * M_RLOG2_E_F; }
-
-        HCC_MATH_LIB_FN
-        _Float16 log(_Float16 x)
-        {
-            return __ocml_native_log2_f16(x) * static_cast<_Float16>(M_RLOG2_E_F);
-        }
-
-        HCC_MATH_LIB_FN
-        float log(float x) { return fast_math::logf(x); }
-
-        HCC_MATH_LIB_FN
-        float log10f(float x) { return __ocml_native_log2_f32(x) * M_RLOG2_10_F; }
-
-        HCC_MATH_LIB_FN
-        _Float16 log10(_Float16 x)
-        {
-            return __ocml_native_log2_f16(x) * static_cast<_Float16>(M_RLOG2_10_F);
-        }
-
-        HCC_MATH_LIB_FN
-        float log10(float x) { return fast_math::log10f(x); }
-
-        HCC_MATH_LIB_FN
-        float log2f(float x) { return __ocml_native_log2_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 log2(_Float16 x) { return __ocml_native_log2_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float log2(float x) { return fast_math::log2f(x); }
-
-        HCC_MATH_LIB_FN
-        float modff(float x, float *iptr) {
-        	float i;  float ret = __ocml_modf_f32(x, (__attribute__((address_space(5))) float*)&i);
-        	*iptr = i; return ret;
-        }
-
-        HCC_MATH_LIB_FN
-        _Float16 modf(_Float16 x, _Float16 *iptr) {
-        	_Float16 i; _Float16 ret = __ocml_modf_f16(x, (__attribute__((address_space(5))) _Float16*) &i);
-        	*iptr = i; return ret;
-        }
-
-        HCC_MATH_LIB_FN
-        float modf(float x, float *iptr) { return fast_math::modff(x, iptr); }
-
-        HCC_MATH_LIB_FN
-        float powf(float x, float y) { return __ocml_pow_f32(x, y); }
-
-        HCC_MATH_LIB_FN
-        _Float16 pow(_Float16 x, _Float16 y) { return __ocml_pow_f16(x, y); }
-
-        HCC_MATH_LIB_FN
-        float pow(float x, float y) { return fast_math::powf(x, y); }
-
-        HCC_MATH_LIB_FN
-        float roundf(float x) { return __ocml_round_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 round(_Float16 x) { return __ocml_round_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float round(float x) { return fast_math::roundf(x); }
-
-        HCC_MATH_LIB_FN
-        float rsqrtf(float x) { return __ocml_native_rsqrt_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 rsqrt(_Float16 x) { return __ocml_native_rsqrt_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float rsqrt(float x) { return fast_math::rsqrtf(x); }
-
-        HCC_MATH_LIB_FN
-        int signbitf(float x) { return __ocml_signbit_f32(x); }
-
-        HCC_MATH_LIB_FN
-        int signbit(_Float16 x) { return __ocml_signbit_f16(x); }
-
-        HCC_MATH_LIB_FN
-        int signbit(float x) { return fast_math::signbitf(x); }
-
-        HCC_MATH_LIB_FN
-        float sinf(float x) { return __ocml_native_sin_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 sin(_Float16 x) { return __ocml_native_sin_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float sin(float x) { return fast_math::sinf(x); }
-
-        HCC_MATH_LIB_FN
-        void sincosf(float x, float *s, float *c) {
-        	float lc;
-        	*s = __ocml_sincos_f32(x, (__attribute__((address_space(5))) float*)&lc);
-        	*c=lc;
-        }
-
-        HCC_MATH_LIB_FN
-        void sincos(_Float16 x, _Float16 *s, _Float16 *c)
-        {
-        	_Float16 lc;
-            *s = __ocml_sincos_f16(x, (__attribute__((address_space(5))) _Float16*) &lc);
-            *c = lc;
-        }
-
-        HCC_MATH_LIB_FN
-        void sincos(float x, float *s, float *c)
-        {
-            fast_math::sincosf(x, s, c);
-        }
-
-        HCC_MATH_LIB_FN
-        float sinhf(float x) { return __ocml_sinh_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 sinh(_Float16 x) { return __ocml_sinh_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float sinh(float x) { return fast_math::sinhf(x); }
-
-        HCC_MATH_LIB_FN
-        float sqrtf(float x) { return __ocml_native_sqrt_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 sqrt(_Float16 x) { return __ocml_native_sqrt_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float sqrt(float x) { return fast_math::sqrtf(x); }
-
-        HCC_MATH_LIB_FN
-        float tanf(float x) { return __ocml_tan_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 tan(_Float16 x)
-        {
-            return __ocml_native_sin_f16(x) *
-                __hc_rcp_native_f16(__ocml_native_cos_f16(x));
-        }
-
-        HCC_MATH_LIB_FN
-        float tan(float x) { return fast_math::tanf(x); }
-
-        HCC_MATH_LIB_FN
-        float tanhf(float x) { return __ocml_tanh_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 tanh(_Float16 x) { return __ocml_tanh_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float tanh(float x) { return fast_math::tanhf(x); }
-
-        HCC_MATH_LIB_FN
-        float truncf(float x) { return __ocml_trunc_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 trunc(_Float16 x) { return __ocml_trunc_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float trunc(float x) { return fast_math::truncf(x); }
-    } // namespace fast_math
-
-    namespace precise_math
-    {
-        using std::acos;
-        using std::acosh;
-        using ::acoshf;
-        using ::acosf;
-        using std::asin;
-        using std::asinh;
-        using ::asinhf;
-        using ::asinf;
-        using std::atan;
-        using std::atan2;
-        using ::atan2f;
-        using std::atanh;
-        using ::atanhf;
-        using ::atanf;
-        using std::cbrt;
-        using ::cbrtf;
-        using std::ceil;
-        using ::ceilf;
-        using std::copysign;
-        using ::copysignf;
-        using std::cos;
-        using std::cosh;
-        using ::coshf;
-        using ::cosf;
-        using std::erf;
-        using std::erfc;
-        using ::erfcf;
-        using ::erff;
-        using std::exp;
-        using ::exp10;
-        using ::exp10f;
-        using std::exp2;
-        using ::exp2f;
-        using ::expf;
-        using std::expm1;
-        using ::expm1f;
-        using std::fabs;
-        using ::fabsf;
-        using std::fdim;
-        using ::fdimf;
-        using std::floor;
-        using ::floorf;
-        using std::fma;
-        using ::fmaf;
-        using std::fmax;
-        using ::fmaxf;
-        using std::fmin;
-        using ::fminf;
-        using std::fmod;
-        using ::fmodf;
-        using std::frexp;
-        using ::frexpf;
-        using std::hypot;
-        using ::hypotf;
-        using std::ilogb;
-        using ::ilogbf;
-        using std::isfinite;
-        using std::isinf;
-        using std::isnan;
-        using std::isnormal;
-        using std::ldexp;
-        using ::ldexpf;
-        using std::log;
-        using std::log10;
-        using std::log1p;
-        using std::log2;
-        using std::logb;
-        using ::log10f;
-        using ::log1pf;
-        using ::log2f;
-        using ::logbf;
-        using ::logf;
-        using std::modf;
-        using ::modff;
-        using std::nearbyint;
-        using ::nearbyintf;
-        using std::nextafter;
-        using ::nextafterf;
-        using std::pow;
-        using ::powf;
-        using std::remainder;
-        using ::remainderf;
-        using std::remquo;
-        using ::remquof;
-        using std::round;
-        using ::roundf;
-        using std::scalbn;
-        using ::scalbnf;
-        using std::signbit;
-        using std::sin;
-        using std::sinh;
-        using ::sinhf;
-        using ::sinf;
-        using std::sqrt;
-        using ::sqrtf;
-        using std::tan;
-        using std::tanh;
-        using ::tanhf;
-        using ::tanf;
-        using std::tgamma;
-        using ::tgammaf;
-        using std::trunc;
-        using ::truncf;
-
-        HCC_MATH_LIB_FN
-        float acosf(float x) { return __ocml_acos_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 acos(_Float16 x) { return __ocml_acos_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float acos(float x) { return __ocml_acos_f32(x); }
-
-        HCC_MATH_LIB_FN
-        double acos(double x) { return __ocml_acos_f64(x); }
-
-        HCC_MATH_LIB_FN
-        float acoshf(float x) { return __ocml_acosh_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 acosh(_Float16 x) { return __ocml_acosh_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float acosh(float x) { return __ocml_acosh_f32(x); }
-
-        HCC_MATH_LIB_FN
-        double acosh(double x) { return __ocml_acosh_f64(x); }
-
-        HCC_MATH_LIB_FN
-        float asinf(float x) { return __ocml_asin_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 asin(_Float16 x) { return __ocml_asin_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float asin(float x) { return __ocml_asin_f32(x); }
-
-        HCC_MATH_LIB_FN
-        double asin(double x) { return __ocml_asin_f64(x); }
-
-        HCC_MATH_LIB_FN
-        float asinhf(float x) { return __ocml_asinh_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 asinh(_Float16 x) { return __ocml_asinh_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float asinh(float x) { return __ocml_asinh_f32(x); }
-
-        HCC_MATH_LIB_FN
-        double asinh(double x) { return __ocml_asinh_f64(x); }
-
-        HCC_MATH_LIB_FN
-        float atanf(float x) { return __ocml_atan_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 atan(_Float16 x) { return __ocml_atan_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float atan(float x) { return __ocml_atan_f32(x); }
-
-        HCC_MATH_LIB_FN
-        double atan(double x) { return __ocml_atan_f64(x); }
-
-        HCC_MATH_LIB_FN
-        float atanhf(float x) { return __ocml_atanh_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 atanh(_Float16 x) { return __ocml_atanh_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float atanh(float x) { return __ocml_atanh_f32(x); }
-
-        HCC_MATH_LIB_FN
-        double atanh(double x) { return __ocml_atanh_f64(x); }
-
-        HCC_MATH_LIB_FN
-        float atan2f(float y, float x) { return __ocml_atan2_f32(y, x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 atan2(_Float16 x, _Float16 y) { return __ocml_atan2_f16(x, y); }
-
-        HCC_MATH_LIB_FN
-        float atan2(float y, float x) { return __ocml_atan2_f32(y, x); }
-
-        HCC_MATH_LIB_FN
-        double atan2(double y, double x) { return __ocml_atan2_f64(y, x); }
-
-        HCC_MATH_LIB_FN
-        float cbrtf(float x) { return __ocml_cbrt_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 cbrt(_Float16 x) { return __ocml_cbrt_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float cbrt(float x) { return __ocml_cbrt_f32(x); }
-
-        HCC_MATH_LIB_FN
-        double cbrt(double x) { return __ocml_cbrt_f64(x); }
-
-        HCC_MATH_LIB_FN
-        float ceilf(float x) { return __ocml_ceil_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 ceil(_Float16 x) { return __ocml_ceil_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float ceil(float x) { return __ocml_ceil_f32(x); }
-
-        HCC_MATH_LIB_FN
-        double ceil(double x) { return __ocml_ceil_f64(x); }
-
-        HCC_MATH_LIB_FN
-        float copysignf(float x, float y) { return __ocml_copysign_f32(x, y); }
-
-        HCC_MATH_LIB_FN
-        _Float16 copysign(_Float16 x, _Float16 y) { return __ocml_copysign_f16(x, y); }
-
-        HCC_MATH_LIB_FN
-        float copysign(float x, float y)
-        {
-            return __ocml_copysign_f32(x, y);
-        }
-
-        HCC_MATH_LIB_FN
-        double copysign(double x, double y)
-        {
-            return __ocml_copysign_f64(x, y);
-        }
-
-        HCC_MATH_LIB_FN
-        float cosf(float x) { return __ocml_cos_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 cos(_Float16 x) { return __ocml_cos_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float cos(float x) { return __ocml_cos_f32(x); }
-
-        HCC_MATH_LIB_FN
-        double cos(double x) { return __ocml_cos_f64(x); }
-
-        HCC_MATH_LIB_FN
-        float coshf(float x) { return __ocml_cosh_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 cosh(_Float16 x) { return __ocml_cosh_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float cosh(float x) { return __ocml_cosh_f32(x); }
-
-        HCC_MATH_LIB_FN
-        double cosh(double x) { return __ocml_cosh_f64(x); }
-
-        HCC_MATH_LIB_FN
-        float cospif(float x) { return __ocml_cospi_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 cospi(_Float16 x) { return __ocml_cospi_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float cospi(float x) { return __ocml_cospi_f32(x); }
-
-        HCC_MATH_LIB_FN
-        double cospi(double x) { return __ocml_cospi_f64(x); }
-
-        HCC_MATH_LIB_FN
-        float erff(float x) { return __ocml_erf_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 erf(_Float16 x) { return __ocml_erf_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float erf(float x) { return __ocml_erf_f32(x); }
-
-        HCC_MATH_LIB_FN
-        double erf(double x) { return __ocml_erf_f64(x); }
-
-        HCC_MATH_LIB_FN
-        float erfcf(float x) { return __ocml_erfc_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 erfc(_Float16 x) { return __ocml_erfc_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float erfc(float x) { return __ocml_erfc_f32(x); }
-
-        HCC_MATH_LIB_FN
-        double erfc(double x) { return __ocml_erfc_f64(x); }
-
-        HCC_MATH_LIB_FN
-        float erfcinvf(float x) { return __ocml_erfcinv_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 erfcinv(_Float16 x) { return __ocml_erfcinv_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float erfcinv(float x) { return __ocml_erfcinv_f32(x); }
-
-        HCC_MATH_LIB_FN
-        double erfcinv(double x) { return __ocml_erfcinv_f64(x); }
-
-        HCC_MATH_LIB_FN
-        float erfinvf(float x) { return __ocml_erfinv_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 erfinv(_Float16 x) { return __ocml_erfinv_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float erfinv(float x) { return __ocml_erfinv_f32(x); }
-
-        HCC_MATH_LIB_FN
-        double erfinv(double x) { return __ocml_erfinv_f64(x); }
-
-        HCC_MATH_LIB_FN
-        float expf(float x) { return __ocml_exp_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 exp(_Float16 x) { return __ocml_exp_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float exp(float x) { return __ocml_exp_f32(x); }
-
-        HCC_MATH_LIB_FN
-        double exp(double x) { return __ocml_exp_f64(x); }
-
-        HCC_MATH_LIB_FN
-        float exp2f(float x) { return __ocml_exp2_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 exp2(_Float16 x) { return __ocml_exp2_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float exp2(float x) { return __ocml_exp2_f32(x); }
-
-        HCC_MATH_LIB_FN
-        double exp2(double x) { return __ocml_exp2_f64(x); }
-
-        HCC_MATH_LIB_FN
-        float exp10f(float x) { return __ocml_exp10_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 exp10(_Float16 x) { return __ocml_exp10_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float exp10(float x) { return __ocml_exp10_f32(x); }
-
-        HCC_MATH_LIB_FN
-        double exp10(double x) { return __ocml_exp10_f64(x); }
-
-        HCC_MATH_LIB_FN
-        float expm1f(float x) { return __ocml_expm1_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 expm1(_Float16 x) { return __ocml_expm1_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float expm1(float x) { return __ocml_expm1_f32(x); }
-
-        HCC_MATH_LIB_FN
-        double expm1(double x) { return __ocml_expm1_f64(x); }
-
-        HCC_MATH_LIB_FN
-        float fabsf(float x) { return __ocml_fabs_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 fabs(_Float16 x) { return __ocml_fabs_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float fabs(float x) { return __ocml_fabs_f32(x); }
-
-        HCC_MATH_LIB_FN
-        double fabs(double x) { return __ocml_fabs_f64(x); }
-
-        HCC_MATH_LIB_FN
-        float fdimf(float x, float y) { return __ocml_fdim_f32(x, y); }
-
-        HCC_MATH_LIB_FN
-        _Float16 fdim(_Float16 x, _Float16 y) { return __ocml_fdim_f16(x, y); }
-
-        HCC_MATH_LIB_FN
-        float fdim(float x, float y) { return __ocml_fdim_f32(x, y); }
-
-        HCC_MATH_LIB_FN
-        double fdim(double x, double y) { return __ocml_fdim_f64(x, y); }
-
-        HCC_MATH_LIB_FN
-        float floorf(float x) { return __ocml_floor_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 floor(_Float16 x) { return __ocml_floor_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float floor(float x) { return __ocml_floor_f32(x); }
-
-        HCC_MATH_LIB_FN
-        double floor(double x) { return __ocml_floor_f64(x); }
-
-        HCC_MATH_LIB_FN
-        float fmaf(float x, float y, float z) { return __ocml_fma_f32(x, y, z); }
-
-        HCC_MATH_LIB_FN
-        _Float16 fma(_Float16 x, _Float16 y, _Float16 z)
-        {
-            return __ocml_fma_f16(x, y, z);
-        }
-
-        HCC_MATH_LIB_FN
-        float fma(float x, float y, float z)
-        {
-            return __ocml_fma_f32(x, y, z);
-        }
-
-        HCC_MATH_LIB_FN
-        double fma(double x, double y, double z)
-        {
-            return __ocml_fma_f64(x, y, z);
-        }
-
-        HCC_MATH_LIB_FN
-        float fmaxf(float x, float y) { return __ocml_fmax_f32(x, y); }
-
-        HCC_MATH_LIB_FN
-        _Float16 fmax(_Float16 x, _Float16 y) { return __ocml_fmax_f16(x, y); }
-
-        HCC_MATH_LIB_FN
-        float fmax(float x, float y) { return __ocml_fmax_f32(x, y); }
-
-        HCC_MATH_LIB_FN
-        double fmax(double x, double y) { return __ocml_fmax_f64(x, y); }
-
-        HCC_MATH_LIB_FN
-        float fminf(float x, float y) { return __ocml_fmin_f32(x, y); }
-
-        HCC_MATH_LIB_FN
-        _Float16 fmin(_Float16 x, _Float16 y) { return __ocml_fmin_f16(x, y); }
-
-        HCC_MATH_LIB_FN
-        float fmin(float x, float y) { return __ocml_fmin_f32(x, y); }
-
-        HCC_MATH_LIB_FN
-        double fmin(double x, double y) { return __ocml_fmin_f64(x, y); }
-
-        HCC_MATH_LIB_FN
-        float fmodf(float x, float y) { return __ocml_fmod_f32(x, y); }
-
-        HCC_MATH_LIB_FN
-        _Float16 fmod(_Float16 x, _Float16 y) { return __ocml_fmod_f16(x, y); }
-
-        HCC_MATH_LIB_FN
-        float fmod(float x, float y) { return __ocml_fmod_f32(x, y); }
-
-        HCC_MATH_LIB_FN
-        double fmod(double x, double y) { return __ocml_fmod_f64(x, y); }
-
-        HCC_MATH_LIB_FN
-        int fpclassify(_Float16 x) { return __ocml_fpclassify_f16(x); }
-
-        HCC_MATH_LIB_FN
-        int fpclassify(float x) { return __ocml_fpclassify_f32(x); }
-
-        HCC_MATH_LIB_FN
-        int fpclassify(double x) { return __ocml_fpclassify_f64(x); }
-
-        HCC_MATH_LIB_FN
-        float frexpf(float x, int *exp) {
-        	int e; float ret =__ocml_frexp_f32(x, (__attribute__((address_space(5))) int*) &e);
-            *exp = e;  return ret;
-        }
-
-        HCC_MATH_LIB_FN
-        _Float16 frexp(_Float16 x, int* exp) {
-        	int e; _Float16 ret = __ocml_frexp_f16(x, (__attribute__((address_space(5))) int*) &e);
-        	*exp = e; return ret;
-        }
-
-        HCC_MATH_LIB_FN
-        float frexp(float x, int *exp) { return precise_math::frexpf(x, exp); }
-
-        HCC_MATH_LIB_FN
-        double frexp(double x, int *exp) { return precise_math::frexpf(x, exp); }
-
-        HCC_MATH_LIB_FN
-        float hypotf(float x, float y) { return __ocml_hypot_f32(x, y); }
-
-        HCC_MATH_LIB_FN
-        _Float16 hypot(_Float16 x, _Float16 y) { return __ocml_hypot_f16(x, y); }
-
-        HCC_MATH_LIB_FN
-        float hypot(float x, float y) { return __ocml_hypot_f32(x, y); }
-
-        HCC_MATH_LIB_FN
-        double hypot(double x, double y) { return __ocml_hypot_f64(x, y); }
-
-        HCC_MATH_LIB_FN
-        int ilogbf(float x) { return __ocml_ilogb_f32(x); }
-
-        HCC_MATH_LIB_FN
-        int ilogb(_Float16 x) { return __ocml_ilogb_f16(x); }
-
-        HCC_MATH_LIB_FN
-        int ilogb(float x) { return __ocml_ilogb_f32(x); }
-
-        HCC_MATH_LIB_FN
-        int ilogb(double x) { return __ocml_ilogb_f64(x); }
-
-        HCC_MATH_LIB_FN
-        int isfinite(_Float16 x) { return __ocml_isfinite_f16(x); }
-
-        HCC_MATH_LIB_FN
-        int isfinite(float x) { return __ocml_isfinite_f32(x); }
-
-        HCC_MATH_LIB_FN
-        int isfinite(double x) { return __ocml_isfinite_f64(x); }
-
-        HCC_MATH_LIB_FN
-        int isinf(_Float16 x) { return __ocml_isinf_f16(x); }
-
-        HCC_MATH_LIB_FN
-        int isinf(float x) { return __ocml_isinf_f32(x); }
-
-        HCC_MATH_LIB_FN
-        int isinf(double x) { return __ocml_isinf_f64(x); }
-
-        HCC_MATH_LIB_FN
-        int isnan(_Float16 x) { return __ocml_isnan_f16(x); }
-
-        HCC_MATH_LIB_FN
-        int isnan(float x) { return __ocml_isnan_f32(x); }
-
-        HCC_MATH_LIB_FN
-        int isnan(double x) { return __ocml_isnan_f64(x); }
-
-        HCC_MATH_LIB_FN
-        int isnormal(_Float16 x) { return __ocml_isnormal_f16(x); }
-
-        HCC_MATH_LIB_FN
-        int isnormal(float x) { return __ocml_isnormal_f32(x); }
-
-        HCC_MATH_LIB_FN
-        int isnormal(double x) { return __ocml_isnormal_f64(x); }
-
-        HCC_MATH_LIB_FN
-        float ldexpf(float x, int exp) { return __ocml_ldexp_f32(x, exp); }
-
-        HCC_MATH_LIB_FN
-        _Float16 ldexp(_Float16 x, std::int16_t e) { return __ocml_ldexp_f16(x, e); }
-
-        HCC_MATH_LIB_FN
-        float ldexp(float x, int exp) { return __ocml_ldexp_f32(x, exp); }
-
-        HCC_MATH_LIB_FN
-        double ldexp(double x, int exp) { return __ocml_ldexp_f64(x,exp); }
-
-        HCC_MATH_LIB_FN
-        float lgammaf(float x) { return __ocml_lgamma_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 lgamma(_Float16 x) { return __ocml_lgamma_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float lgamma(float x) { return __ocml_lgamma_f32(x); }
-
-        HCC_MATH_LIB_FN
-        double lgamma(double x) { return __ocml_lgamma_f64(x); }
-
-        HCC_MATH_LIB_FN
-        float logf(float x) { return __ocml_log_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 log(_Float16 x) { return __ocml_log_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float log(float x) { return __ocml_log_f32(x); }
-
-        HCC_MATH_LIB_FN
-        double log(double x) { return __ocml_log_f64(x); }
-
-        HCC_MATH_LIB_FN
-        float log10f(float x) { return __ocml_log10_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 log10(_Float16 x) { return __ocml_log10_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float log10(float x) { return __ocml_log10_f32(x); }
-
-        HCC_MATH_LIB_FN
-        double log10(double x) { return __ocml_log10_f64(x); }
-
-        HCC_MATH_LIB_FN
-        float log2f(float x) { return __ocml_log2_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 log2(_Float16 x) { return __ocml_log2_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float log2(float x) { return __ocml_log2_f32(x); }
-
-        HCC_MATH_LIB_FN
-        double log2(double x) { return __ocml_log2_f64(x); }
-
-        HCC_MATH_LIB_FN
-        float log1pf(float x) { return __ocml_log1p_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 log1p(_Float16 x) { return __ocml_log1p_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float log1p(float x) { return __ocml_log1p_f32(x); }
-
-        HCC_MATH_LIB_FN
-        double log1p(double x) { return __ocml_log1p_f32(x); }
-
-        HCC_MATH_LIB_FN
-        float logbf(float x) { return __ocml_logb_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 logb(_Float16 x) { return __ocml_logb_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float logb(float x) { return __ocml_logb_f32(x); }
-
-        HCC_MATH_LIB_FN
-        double logb(double x) { return __ocml_logb_f64(x); }
-
-        HCC_MATH_LIB_FN
-        float modff(float x, float *iptr) {
-        	float i;  float ret = __ocml_modf_f32(x, (__attribute__((address_space(5))) float*)&i);
-        	*iptr = i; return ret;
-        }
-
-        HCC_MATH_LIB_FN
-        _Float16 modf(_Float16 x, _Float16* p) {
-        	_Float16 lp; _Float16 ret = __ocml_modf_f16(x, (__attribute__((address_space(5))) _Float16*) &lp);
-        	*p = lp; return ret;
-        }
-
-        HCC_MATH_LIB_FN
-        float modf(float x, float* p) { return precise_math::modff(x, p); }
-
-        HCC_MATH_LIB_FN
-        double modf(double x, double* p) {
-        	double lp; double ret = __ocml_modf_f64(x, (__attribute__((address_space(5))) double*) &lp);
-        	*p = lp; return ret;
-        }
-
-        HCC_MATH_LIB_FN
-        _Float16 nanh(int x) { return __ocml_nan_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float nanf(int tagp) { return __ocml_nan_f32(tagp); }
-
-        HCC_MATH_LIB_FN
-        double nan(int tagp)
-        {
-            return __ocml_nan_f64(static_cast<unsigned long>(tagp));
-        }
-
-        HCC_MATH_LIB_FN
-        float nearbyintf(float x) { return __ocml_nearbyint_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 nearbyint(_Float16 x) { return __ocml_nearbyint_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float nearbyint(float x) { return __ocml_nearbyint_f32(x); }
-
-        HCC_MATH_LIB_FN
-        double nearbyint(double x) { return __ocml_nearbyint_f64(x); }
-
-        HCC_MATH_LIB_FN
-        float nextafterf(float x, float y) { return __ocml_nextafter_f32(x, y); }
-
-        HCC_MATH_LIB_FN
-        _Float16 nextafter(_Float16 x, _Float16 y)
-        {
-            return __ocml_nextafter_f16(x, y);
-        }
-
-        HCC_MATH_LIB_FN
-        float nextafter(float x, float y)
-        {
-            return __ocml_nextafter_f32(x, y);
-        }
-
-        HCC_MATH_LIB_FN
-        double nextafter(double x, double y)
-        {
-            return __ocml_nextafter_f64(x, y);
-        }
-
-        HCC_MATH_LIB_FN
-        float powf(float x, float y) { return __ocml_pow_f32(x, y); }
-
-        HCC_MATH_LIB_FN
-        _Float16 pow(_Float16 x, _Float16 y) { return __ocml_pow_f16(x, y); }
-
-        HCC_MATH_LIB_FN
-        float pow(float x, float y) { return __ocml_pow_f32(x, y); }
-
-        HCC_MATH_LIB_FN
-        double pow(double x, double y) { return __ocml_pow_f64(x, y); }
-
-        HCC_MATH_LIB_FN
-        float rcbrtf(float x) { return __ocml_rcbrt_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 rcbrt(_Float16 x) { return __ocml_rcbrt_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float rcbrt(float x) { return __ocml_rcbrt_f32(x); }
-
-        HCC_MATH_LIB_FN
-        double rcbrt(double x) { return __ocml_rcbrt_f64(x); }
-
-        HCC_MATH_LIB_FN
-        float remainderf(float x, float y) { return __ocml_remainder_f32(x, y); }
-
-        HCC_MATH_LIB_FN
-        _Float16 remainder(_Float16 x, _Float16 y)
-        {
-            return __ocml_remainder_f16(x, y);
-        }
-
-        HCC_MATH_LIB_FN
-        float remainder(float x, float y)
-        {
-            return __ocml_remainder_f32(x, y);
-        }
-
-        HCC_MATH_LIB_FN
-        double remainder(double x, double y)
-        {
-            return __ocml_remainder_f64(x, y);
-        }
-
-        HCC_MATH_LIB_FN
-        float remquof(float x, float y, int *quo)
-        {
-        	int lq; float ret = __ocml_remquo_f32(x, y, (__attribute__((address_space(5))) int*) &lq);
-        	*quo = lq; return ret;
-        }
-
-        HCC_MATH_LIB_FN
-        _Float16 remquo(_Float16 x, _Float16 y, int* q)
-        {
-        	int lq; _Float16 ret = __ocml_remquo_f16(x, y, (__attribute__((address_space(5))) int*) &lq);
-        	*q = lq; return ret;
-        }
-
-        HCC_MATH_LIB_FN
-        float remquo(float x, float y, int *quo) { return precise_math::remquof(x, y, quo); }
-
-        HCC_MATH_LIB_FN
-        double remquo(double x, double y, int *quo)
-        {
-        	int lq; double ret = __ocml_remquo_f64(x, y, (__attribute__((address_space(5))) int*) &lq);
-        	*quo = lq; return ret;
-        }
-
-        HCC_MATH_LIB_FN
-        float roundf(float x) { return __ocml_round_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 round(_Float16 x) { return __ocml_round_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float round(float x) { return __ocml_round_f32(x); }
-
-        HCC_MATH_LIB_FN
-        double round(double x) { return __ocml_round_f64(x); }
-
-        HCC_MATH_LIB_FN
-        float rsqrtf(float x) { return __ocml_rsqrt_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 rsqrt(_Float16 x) { return __ocml_rsqrt_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float rsqrt(float x) { return __ocml_rsqrt_f32(x); }
-
-        HCC_MATH_LIB_FN
-        double rsqrt(double x) { return __ocml_rsqrt_f64(x); }
-
-        HCC_MATH_LIB_FN
-        float sinpif(float x) { return __ocml_sinpi_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 sinpi(_Float16 x) { return __ocml_sinpi_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float sinpi(float x) { return __ocml_sinpi_f32(x); }
-
-        HCC_MATH_LIB_FN
-        double sinpi(double x) { return __ocml_sinpi_f64(x); }
-
-        HCC_MATH_LIB_FN
-        float scalbf(float x, float exp) { return __ocml_scalb_f32(x, exp); }
-
-        HCC_MATH_LIB_FN
-        _Float16 scalb(_Float16 x, _Float16 y) { return __ocml_scalb_f16(x, y); }
-
-        HCC_MATH_LIB_FN
-        float scalb(float x, float exp) { return __ocml_scalb_f32(x, exp); }
-
-        HCC_MATH_LIB_FN
-        double scalb(double x, double exp) { return __ocml_scalb_f64(x, exp); }
-
-        HCC_MATH_LIB_FN
-        float scalbnf(float x, int exp) { return __ocml_scalbn_f32(x, exp); }
-
-        HCC_MATH_LIB_FN
-        _Float16 scalbn(_Float16 x, int e) { return __ocml_scalbn_f16(x, e); }
-
-        HCC_MATH_LIB_FN
-        float scalbn(float x, int exp) { return __ocml_scalbn_f32(x, exp); }
-
-        HCC_MATH_LIB_FN
-        double scalbn(double x, int exp) { return __ocml_scalbn_f64(x, exp); }
-
-        HCC_MATH_LIB_FN
-        int signbitf(float x) { return __ocml_signbit_f32(x); }
-
-        HCC_MATH_LIB_FN
-        int signbit(_Float16 x) { return __ocml_signbit_f16(x); }
-
-        HCC_MATH_LIB_FN
-        int signbit(float x) { return __ocml_signbit_f32(x); }
-
-        HCC_MATH_LIB_FN
-        int signbit(double x) { return __ocml_signbit_f64(x); }
-
-        HCC_MATH_LIB_FN
-        float sinf(float x) { return __ocml_sin_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 sin(_Float16 x) { return __ocml_sin_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float sin(float x) { return __ocml_sin_f32(x); }
-
-        HCC_MATH_LIB_FN
-        double sin(double x) { return __ocml_sin_f64(x); }
-
-        HCC_MATH_LIB_FN
-        void sincosf(float x, float *s, float *c) {
-        	float lc; *s = __ocml_sincos_f32(x, (__attribute__((address_space(5))) float*) &lc);
-        	*c = lc;
-        }
-
-        HCC_MATH_LIB_FN
-        void sincos(_Float16 x, _Float16* s, _Float16* c)
-        {
-            _Float16 lc; *s = __ocml_sincos_f16(x, (__attribute__((address_space(5))) _Float16*) &lc);
-            *c = lc;
-        }
-
-        HCC_MATH_LIB_FN
-        void sincos(float x, float *s, float *c) { precise_math::sincosf(x, s, c); }
-       
-        HCC_MATH_LIB_FN
-        void sincos(double x, double *s, double *c)
-        {
-        	double lc; *s = __ocml_sincos_f64(x, (__attribute__((address_space(5))) double*) &lc);
-        	*c = lc;
-        }
-
-        HCC_MATH_LIB_FN
-        float sinhf(float x) { return __ocml_sinh_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 sinh(_Float16 x) { return __ocml_sinh_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float sinh(float x) { return __ocml_sinh_f32(x); }
-
-        HCC_MATH_LIB_FN
-        double sinh(double x) { return __ocml_sinh_f64(x); }
-
-        HCC_MATH_LIB_FN
-        float sqrtf(float x) { return __ocml_sqrt_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 sqrt(_Float16 x) { return __ocml_sqrt_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float sqrt(float x) { return __ocml_sqrt_f32(x); }
-
-        HCC_MATH_LIB_FN
-        double sqrt(double x) { return __ocml_sqrt_f64(x); }
-
-        HCC_MATH_LIB_FN
-        float tgammaf(float x) { return __ocml_tgamma_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 tgamma(_Float16 x) { return __ocml_tgamma_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float tgamma(float x) { return __ocml_tgamma_f32(x); }
-
-        HCC_MATH_LIB_FN
-        double tgamma(double x) { return __ocml_tgamma_f64(x); }
-
-        HCC_MATH_LIB_FN
-        float tanf(float x) { return __ocml_tan_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 tan(_Float16 x) { return __ocml_tan_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float tan(float x) { return __ocml_tan_f32(x); }
-
-        HCC_MATH_LIB_FN
-        double tan(double x) { return __ocml_tan_f64(x); }
-
-        HCC_MATH_LIB_FN
-        float tanhf(float x) { return __ocml_tanh_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 tanh(_Float16 x) { return __ocml_tanh_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float tanh(float x) { return __ocml_tanh_f32(x); }
-
-        HCC_MATH_LIB_FN
-        double tanh(double x) { return __ocml_tanh_f32(x); }
-
-        HCC_MATH_LIB_FN
-        float tanpif(float x) { return __ocml_tanpi_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 tanpi(_Float16 x) { return __ocml_tanpi_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float tanpi(float x) { return __ocml_tanpi_f32(x); }
-
-        HCC_MATH_LIB_FN
-        double tanpi(double x) { return __ocml_tanpi_f64(x); }
-
-        HCC_MATH_LIB_FN
-        float truncf(float x) { return __ocml_trunc_f32(x); }
-
-        HCC_MATH_LIB_FN
-        _Float16 trunc(_Float16 x) { return __ocml_trunc_f16(x); }
-
-        HCC_MATH_LIB_FN
-        float trunc(float x) { return __ocml_trunc_f32(x); }
-
-        HCC_MATH_LIB_FN
-        double trunc(double x) { return __ocml_trunc_f64(x); }
-    } // namespace precise_math
-} // namespace Kalmar
diff --git a/include/kalmar_runtime.h b/include/kalmar_runtime.h
index 7f773b6898a..1ff7682e7f9 100644
--- a/include/kalmar_runtime.h
+++ b/include/kalmar_runtime.h
@@ -1,5 +1,9 @@
 #pragma once
 
+#if !defined(__HIPCC__)
+  #warning "This header is only intended for HIP usage, and not for direct inclusion."
+#endif
+
 #include "hc_defines.h"
 #include "kalmar_aligned_alloc.h"
 
@@ -446,7 +450,7 @@ class KalmarDevice
 
     virtual int get_seqnum() const {return -1;}
 
-    virtual bool has_cpu_accessible_am() {return false;}
+    virtual bool has_cpu_accessible_am() const {return false;}
 
 };
 
@@ -496,7 +500,7 @@ class CPUDevice final : public KalmarDevice
     std::shared_ptr<KalmarQueue> createQueue(execute_order order = execute_in_order, queue_priority priority = priority_normal) override { return std::shared_ptr<KalmarQueue>(new CPUQueue(this)); }
     void* create(size_t count, struct rw_info* /* not used */ ) override { return kalmar_aligned_alloc(0x1000, count); }
     void release(void* ptr, struct rw_info* /* nout used */) override { kalmar_aligned_free(ptr); }
-    void* CreateKernel(const char* fun, KalmarQueue *queue) { return nullptr; }
+    void* CreateKernel(const char* fun, KalmarQueue *queue) override { return nullptr; }
 };
 
 /// KalmarContext
diff --git a/include/kalmar_serialize.h b/include/kalmar_serialize.h
deleted file mode 100644
index 5cc2d932f99..00000000000
--- a/include/kalmar_serialize.h
+++ /dev/null
@@ -1,112 +0,0 @@
-#pragma once
-
-#include <set>
-#include "kalmar_runtime.h"
-#include "kalmar_exception.h"
-
-/** \cond HIDDEN_SYMBOLS */
-namespace Kalmar
-{
-
-/// traverse all the buffers that are going to be used in kernel
-class FunctorBufferWalker {
-public:
-    virtual void Append(size_t sz, const void* s) {}
-    virtual void AppendPtr(size_t sz, const void* s) {}
-    virtual void visit_buffer(struct rw_info* rw, bool modify, bool isArray) = 0;
-};
-
-/// This is used to avoid incorrect compiler error
-class Serialize {
-    FunctorBufferWalker* vis;
-public:
-    Serialize(FunctorBufferWalker* vis) : vis(vis) {}
-    void Append(size_t sz, const void* s) { vis->Append(sz, s); }
-    void AppendPtr(size_t sz, const void* s) { vis->AppendPtr(sz, s); }
-    void visit_buffer(struct rw_info* rw, bool modify, bool isArray) {
-        vis->visit_buffer(rw, modify, isArray);
-    }
-};
-
-/// Change the data pointer with device pointer
-/// before/after kernel launches in cpu path
-class CPUVisitor : public FunctorBufferWalker
-{
-    std::shared_ptr<KalmarQueue> pQueue;
-    std::set<struct rw_info*> bufs;
-public:
-    CPUVisitor(std::shared_ptr<KalmarQueue> pQueue) : pQueue(pQueue) {}
-    void visit_buffer(struct rw_info* rw, bool modify, bool isArray) override {
-        if (isArray) {
-            auto curr = pQueue->getDev()->get_path();
-            auto path = rw->master->getDev()->get_path();
-            if (path == L"cpu") {
-                auto asoc = rw->stage->getDev()->get_path();
-                if (asoc == L"cpu" || path != curr)
-                    throw runtime_exception(__errorMsg_UnsupportedAccelerator, E_FAIL);
-            }
-        }
-        rw->sync(pQueue, modify, false);
-        if (bufs.find(rw) == std::end(bufs)) {
-            void*& device = rw->devs[pQueue->getDev()].data;
-            void*& data = rw->data;
-            bufs.insert(rw);
-            std::swap(device, data);
-        }
-    }
-};
-
-/// Append kernel argument to kernel
-class BufferArgumentsAppender : public FunctorBufferWalker
-{
-    std::shared_ptr<KalmarQueue> pQueue;
-    void* k_;
-    int current_idx_;
-public:
-    BufferArgumentsAppender(std::shared_ptr<KalmarQueue> pQueue, void* k)
-        : pQueue(pQueue), k_(k), current_idx_(0) {}
-    void Append(size_t sz, const void *s) override {
-        CLAMP::PushArg(k_, current_idx_++, sz, s);
-    }
-    void AppendPtr(size_t sz, const void *s) override {
-        CLAMP::PushArgPtr(k_, current_idx_++, sz, s);
-    }
-    void visit_buffer(struct rw_info* rw, bool modify, bool isArray) override {
-        if (isArray) {
-            auto curr = pQueue->getDev()->get_path();
-            auto path = rw->master->getDev()->get_path();
-            if (path == L"cpu") {
-                auto asoc = rw->stage->getDev()->get_path();
-                if (asoc == L"cpu" || path != curr)
-                    throw runtime_exception(__errorMsg_UnsupportedAccelerator, E_FAIL);
-            }
-        }
-        rw->sync(pQueue, modify, false);
-        pQueue->Push(k_, current_idx_++, rw->devs[pQueue->getDev()].data, modify);
-    }
-};
-
-/// In C++AMP Standard V1.2 Line 3014
-/// If pfe is launched without explicitly specified view, the target accelerator
-/// and the view using which work is submitted to the accelerator, is chosen
-/// from the objects of type array<T,N> that were captured in the kernel lambda.
-///
-/// Thise Searcher will visit all the array<T, N> and find a view to launch kernel
-class QueueSearcher : public FunctorBufferWalker
-{
-    std::shared_ptr<KalmarQueue> pQueue;
-public:
-    QueueSearcher() = default;
-    void visit_buffer(struct rw_info* rw, bool modify, bool isArray) override {
-        if (isArray && !pQueue) {
-            if (rw->master->getDev()->get_path() != L"cpu")
-                pQueue = rw->master;
-            else if (rw->stage->getDev()->get_path() != L"cpu")
-                pQueue = rw->stage;
-        }
-    }
-    std::shared_ptr<KalmarQueue> get_que() const { return pQueue; }
-};
-
-} // namespace Kalmar
-/** \endcond */
diff --git a/include/kalmar_short_vectors.inl b/include/kalmar_short_vectors.inl
deleted file mode 100644
index 9938e581915..00000000000
--- a/include/kalmar_short_vectors.inl
+++ /dev/null
@@ -1,4718 +0,0 @@
-#ifndef _KALMAR_SHORT_VECTORS_H
-#define _KALMAR_SHORT_VECTORS_H
-
-class norm;
-class unorm;
-
-// Do not rely on macro rescanning and further replacement 
-
-// FIXME: The explicit keyword doesn't work if we define constructor outside
-//        the class definition (bug of AST inlining?)
-
-#define NORM_COMMON_PRIVATE_MEMBER(F) \
-friend class F; \
-float Value; 
-
-// FIXME: C() __CPU_GPU__'s behavior is not specified in Specification
-/// C& operator=(const C& other) __CPU_GPU__ do not need to check self-
-/// assignment for accerlation on modern CPU
-#define NORM_COMMON_PUBLIC_MEMBER(C) \
-C() __CPU_GPU__ { set(Value); } \
-\
-~C() __CPU_GPU__ {} \
-\
-C(const C& other) __CPU_GPU__ { Value = other.Value; } \
-\
-C& operator=(const C& other) __CPU_GPU__ \
-{ \
-  Value = other.Value; \
-  return *this; \
-} \
-\
-operator float(void) const __CPU_GPU__ { return Value; } \
-\
-C& operator+=(const C& other) __CPU_GPU__ \
-{ \
-  float Res = Value; \
-  Res += other.Value; \
-  set(Res); \
-  return *this; \
-} \
-\
-C& operator-=(const C& other) __CPU_GPU__ \
-{ \
-  float Res = Value; \
-  Res -= other.Value; \
-  set(Res); \
-  return *this; \
-} \
-\
-C& operator*=(const C& other) __CPU_GPU__ \
-{ \
-  float Res = Value; \
-  Res *= other.Value; \
-  set(Res); \
-  return *this; \
-} \
-\
-C& operator/=(const C& other) __CPU_GPU__ \
-{ \
-  float Res = Value; \
-  Res /= other.Value; \
-  set(Res); \
-  return *this; \
-} \
-\
-C& operator++() __CPU_GPU__ \
-{ \
-  float Res = Value; \
-  ++Res; \
-  set(Res); \
-  return *this; \
-} \
-\
-C operator++(int) __CPU_GPU__ \
-{ \
-  C Ret(*this); \
-  operator++(); \
-  return Ret; \
-} \
-\
-C& operator--() __CPU_GPU__ \
-{ \
-  float Res = Value; \
-  --Res; \
-  set(Res); \
-  return *this; \
-} \
-\
-C operator--(int) __CPU_GPU__ \
-{ \
-  C Ret(*this); \
-  operator--(); \
-  return Ret; \
-}
-
-#if !__HCC_AMP__
-
-#define NORM_CONVERSION_CTOR(C) \
-\
-explicit C(float v) __CPU_GPU__ { set(v); } \
-\
-explicit C(unsigned int v) __CPU_GPU__ { set(static_cast<float>(v)); } \
-\
-explicit C(int v) __CPU_GPU__ { set(static_cast<float>(v)); } \
-\
-explicit C(double v) __CPU_GPU__ { set(static_cast<float>(v)); } \
-\
-explicit C(char v) __CPU_GPU__ { set(static_cast<float>(v)); } \
-\
-explicit C(short v) __CPU_GPU__ { set(static_cast<float>(v)); } \
-\
-explicit C(long v) __CPU_GPU__ { set(static_cast<float>(v)); } \
-\
-explicit C(long long int v) __CPU_GPU__ { set(static_cast<float>(v)); } \
-\
-explicit C(unsigned char v) __CPU_GPU__ { set(static_cast<float>(v)); } \
-\
-explicit C(unsigned short v) __CPU_GPU__ { set(static_cast<float>(v)); } \
-\
-explicit C(unsigned long v) __CPU_GPU__ { set(static_cast<float>(v)); } \
-\
-explicit C(unsigned long long int v) __CPU_GPU__ { set(static_cast<float>(v)); } \
-
-#else
-
-#define NORM_CONVERSION_CTOR(C) \
-\
-explicit C(float v) __CPU_GPU__ { set(v); } \
-\
-explicit C(unsigned int v) __CPU_GPU__ { set(static_cast<float>(v)); } \
-\
-explicit C(int v) __CPU_GPU__ { set(static_cast<float>(v)); } \
-\
-explicit C(double v) __CPU_GPU__ { set(static_cast<float>(v)); } \
-
-#endif
-
-// C++ AMP Specification 10.7 norm
-class norm
-{
-private:
-  void set(float v) __CPU_GPU__
-  {
-    v = v < -1.0f ? -1.0f : v;
-    v = v > 1.0f ? 1.0f : v;
-    Value = v;
-  }
-
-public:
-  NORM_COMMON_PRIVATE_MEMBER(unorm)
-
-public:
-  norm(const unorm& other) __CPU_GPU__;
-
-  norm operator-() __CPU_GPU__
-  {
-    norm Ret;
-    Ret.Value = -Value;
-    return Ret;
-  }
-
-  NORM_COMMON_PUBLIC_MEMBER(norm)
-
-  NORM_CONVERSION_CTOR(norm)
-};
-
-// C++ AMP Specification 10.7 unorm
-class unorm
-{
-private:
-  void set(float v) __CPU_GPU__
-  {
-    v = v < 0.0f ? 0.0f : v;
-    v = v > 1.0f ? 1.0f : v;
-    Value = v;
-  }
-public:
-  NORM_COMMON_PRIVATE_MEMBER(norm)
-
-public:
-  explicit unorm(const norm& other) __CPU_GPU__ { set(other.Value); }
-
-  NORM_COMMON_PUBLIC_MEMBER(unorm)
-
-  NORM_CONVERSION_CTOR(unorm)
-};
-
-inline norm::norm(const unorm& other) __CPU_GPU__
-{
-  set(other.Value);
-}
-
-#undef NORM_COMMON_PRIVATE_MEMBER
-#undef NORM_COMMON_PUBLIC_MEMBER
-
-#define NORM_OPERATOR(C) \
-inline C operator+(const C& lhs, const C& rhs) __CPU_GPU__ \
-{ \
-  return C(static_cast<float>(lhs) + static_cast<float>(rhs)); \
-} \
-\
-inline C operator-(const C& lhs, const C& rhs) __CPU_GPU__ \
-{ \
-  return C(static_cast<float>(lhs) - static_cast<float>(rhs)); \
-} \
-\
-inline C operator*(const C& lhs, const C& rhs) __CPU_GPU__ \
-{ \
-  return C(static_cast<float>(lhs) * static_cast<float>(rhs)); \
-} \
-\
-inline C operator/(const C& lhs, const C& rhs) __CPU_GPU__ \
-{ \
-  return C(static_cast<float>(lhs) / static_cast<float>(rhs)); \
-} \
-\
-inline bool operator==(const C& lhs, const C& rhs) __CPU_GPU__ \
-{ \
-  return static_cast<float>(lhs) == static_cast<float>(rhs); \
-} \
-\
-inline bool operator!=(const C& lhs, const C& rhs) __CPU_GPU__ \
-{ \
-  return static_cast<float>(lhs) != static_cast<float>(rhs); \
-} \
-\
-inline bool operator>(const C& lhs, const C& rhs) __CPU_GPU__ \
-{ \
-  return static_cast<float>(lhs) > static_cast<float>(rhs); \
-} \
-\
-inline bool operator<(const C& lhs, const C& rhs) __CPU_GPU__ \
-{ \
-  return static_cast<float>(lhs) < static_cast<float>(rhs); \
-} \
-\
-inline bool operator>=(const C& lhs, const C& rhs) __CPU_GPU__ \
-{ \
-  return static_cast<float>(lhs) >= static_cast<float>(rhs); \
-} \
-\
-inline bool operator<=(const C& lhs, const C& rhs) __CPU_GPU__ \
-{ \
-  return static_cast<float>(lhs) <= static_cast<float>(rhs); \
-}
-
-NORM_OPERATOR(unorm)
-
-NORM_OPERATOR(norm)
-
-#undef NORM_OPERATOR
-
-#define UNORM_MIN ((unorm)0.0f)
-#define UNORM_MAX ((unorm)1.0f)
-#define UNORM_ZERO ((norm)0.0f)
-#define NORM_ZERO ((norm)0.0f)
-#define NORM_MIN ((norm)-1.0f)
-#define NORM_MAX ((norm)1.0f)
-
-// C++ AMP Specification 10.8 short vector types
-
-// How to Define short vector types (Layout):
-//   Class Declaration (10.8.1 Synopsis)
-//   Explicit Conversion Constructor Definitions (10.8.2.2)
-//   Operators between Two References (10.8.1 Synopsis)
-//
-// Class Declaration:
-//   class scalartype_N
-//   {
-//   private:
-//     SCALARTYPE_N_COMMON_PRIVATE_MEMBER
-//
-//   public:
-//     SCALARTYPE_N_COMMON_PUBLIC_MEMBER
-//     /* scalartype_N specific public member */
-//     SINGLE_COMPONENT_ACCESS
-//     SCALARTYPE_N_REFERENCE_SINGLE_COMPONENT_ACCESS
-//     TWO_COMPONENT_ACCESS
-//     THREE_COMPONENT_ACCESS
-//     FOUR_COMPONENT_ACCESS
-//   };
-//
-// Operators between Two References:
-//   SCALARTYPE_N_OPERATOR
-//   /* scalartype_N specific operator */
-
-class int_2;
-class int_3;
-class int_4;
-class uint_2;
-class uint_3;
-class uint_4;
-class float_2;
-class float_3;
-class float_4;
-class double_2;
-class double_3;
-class double_4;
-class norm_2;
-class norm_3;
-class norm_4;
-class unorm_2;
-class unorm_3;
-class unorm_4;
-
-#if !__HCC_AMP__
-// additional short vector types not specified in C++AMP
-class int_1;
-class uint_1;
-class float_1;
-class double_1;
-class char_1;
-class char_2;
-class char_3;
-class char_4;
-class uchar_1;
-class uchar_2;
-class uchar_3;
-class uchar_4;
-class short_1;
-class short_2;
-class short_3;
-class short_4;
-class ushort_1;
-class ushort_2;
-class ushort_3;
-class ushort_4;
-class long_1;
-class long_2;
-class long_3;
-class long_4;
-class ulong_1;
-class ulong_2;
-class ulong_3;
-class ulong_4;
-class longlong_1;
-class longlong_2;
-class longlong_3;
-class longlong_4;
-class ulonglong_1;
-class ulonglong_2;
-class ulonglong_3;
-class ulonglong_4;
-#endif
-
-typedef int_2 int2;
-typedef int_3 int3;
-typedef int_4 int4;
-typedef uint_2 uint2;
-typedef uint_3 uint3;
-typedef uint_4 uint4;
-typedef float_2 float2;
-typedef float_3 float3;
-typedef float_4 float4;
-typedef double_2 double2;
-typedef double_3 double3;
-typedef double_4 double4;
-typedef norm_2 norm2;
-typedef norm_3 norm3;
-typedef norm_4 norm4;
-typedef unorm_2 unorm2;
-typedef unorm_3 unorm3;
-typedef unorm_4 unorm4;
-
-#if !__HCC_AMP__
-// additional short vector types not specified in C++AMP
-typedef int_1 int1;
-typedef uint_1 uint1;
-typedef float_1 float1;
-typedef double_1 double1;
-typedef char_1 char1;
-typedef char_2 char2;
-typedef char_3 char3;
-typedef char_4 char4;
-typedef uchar_1 uchar1;
-typedef uchar_2 uchar2;
-typedef uchar_3 uchar3;
-typedef uchar_4 uchar4;
-typedef short_1 short1;
-typedef short_2 short2;
-typedef short_3 short3;
-typedef short_4 short4;
-typedef ushort_1 ushort1;
-typedef ushort_2 ushort2;
-typedef ushort_3 ushort3;
-typedef ushort_4 ushort4;
-typedef long_1 long1;
-typedef long_2 long2;
-typedef long_3 long3;
-typedef long_4 long4;
-typedef ulong_1 ulong1;
-typedef ulong_2 ulong2;
-typedef ulong_3 ulong3;
-typedef ulong_4 ulong4;
-typedef longlong_1 longlong1;
-typedef longlong_2 longlong2;
-typedef longlong_3 longlong3;
-typedef longlong_4 longlong4;
-typedef ulonglong_1 ulonglong1;
-typedef ulonglong_2 ulonglong2;
-typedef ulonglong_3 ulonglong3;
-typedef ulonglong_4 ulonglong4;
-#endif
-
-//   Class Declaration (10.8.1 Synopsis)
-
-#define SINGLE_COMPONENT_ACCESS(ST, Dim) \
-ST get ## _ ## Dim() const __CPU_GPU__ { return Dim; } \
-\
-void set ## _ ## Dim(ST v) __CPU_GPU__ { Dim = v; }
-
-#define TWO_COMPONENT_ACCESS(ST_2, Dim1, Dim2) \
-ST_2 get_ ## Dim1 ## Dim2() const __CPU_GPU__ \
-{ \
-  return ST_2(Dim1, Dim2); \
-} \
-\
-ST_2 get_ ## Dim2 ## Dim1() const __CPU_GPU__ \
-{ \
-  return ST_2(Dim2, Dim1); \
-} \
-\
-void set_ ## Dim1 ## Dim2(ST_2 v) __CPU_GPU__ \
-{ \
-  Dim1 = v.get_x(); \
-  Dim2 = v.get_y(); \
-} \
-void set_ ## Dim2 ## Dim1(ST_2 v) __CPU_GPU__ \
-{ \
-  Dim2 = v.get_x(); \
-  Dim1 = v.get_y(); \
-}
-
-#define THREE_COMPONENT_ACCESS(ST_3, Dim1, Dim2, Dim3) \
-ST_3 get_ ## Dim1 ## Dim2 ## Dim3() const __CPU_GPU__ \
-{ \
-  return ST_3(Dim1, Dim2, Dim3); \
-} \
-\
-ST_3 get_ ## Dim1 ## Dim3 ## Dim2() const __CPU_GPU__ \
-{ \
-  return ST_3(Dim1, Dim3, Dim2); \
-} \
-\
-ST_3 get_ ## Dim2 ## Dim1 ## Dim3() const __CPU_GPU__ \
-{ \
-  return ST_3(Dim2, Dim1, Dim3); \
-} \
-\
-ST_3 get_ ## Dim2 ## Dim3 ## Dim1() const __CPU_GPU__ \
-{ \
-  return ST_3(Dim2, Dim3, Dim1); \
-} \
-\
-ST_3 get_ ## Dim3 ## Dim1 ## Dim2() const __CPU_GPU__ \
-{ \
-  return ST_3(Dim3, Dim1, Dim2); \
-} \
-\
-ST_3 get_ ## Dim3 ## Dim2 ## Dim1() const __CPU_GPU__ \
-{ \
-  return ST_3(Dim3, Dim2, Dim1); \
-} \
-\
-void set_ ## Dim1 ## Dim2 ## Dim3(ST_3 v) __CPU_GPU__ \
-{ \
-  Dim1 = v.get_x(); \
-  Dim2 = v.get_y(); \
-  Dim3 = v.get_z(); \
-} \
-\
-void set_ ## Dim1 ## Dim3 ## Dim2(ST_3 v) __CPU_GPU__ \
-{ \
-  Dim1 = v.get_x(); \
-  Dim3 = v.get_y(); \
-  Dim2 = v.get_z(); \
-} \
-\
-void set_ ## Dim2 ## Dim1 ## Dim3(ST_3 v) __CPU_GPU__ \
-{ \
-  Dim2 = v.get_x(); \
-  Dim1 = v.get_y(); \
-  Dim3 = v.get_z(); \
-} \
-\
-void set_ ## Dim2 ## Dim3 ## Dim1(ST_3 v) __CPU_GPU__ \
-{ \
-  Dim2 = v.get_x(); \
-  Dim3 = v.get_y(); \
-  Dim1 = v.get_z(); \
-} \
-\
-void set_ ## Dim3 ## Dim1 ## Dim2(ST_3 v) __CPU_GPU__ \
-{ \
-  Dim3 = v.get_x(); \
-  Dim1 = v.get_y(); \
-  Dim2 = v.get_z(); \
-} \
-\
-void set_ ## Dim3 ## Dim2 ## Dim1(ST_3 v) __CPU_GPU__ \
-{ \
-  Dim3 = v.get_x(); \
-  Dim2 = v.get_y(); \
-  Dim1 = v.get_z(); \
-}
-
-#define FOUR_COMPONENT_ACCESS(ST_4, Dim1, Dim2, Dim3, Dim4) \
-ST_4 get_ ## Dim1 ## Dim2 ## Dim3 ## Dim4() const __CPU_GPU__ \
-{ \
-  return ST_4(Dim1, Dim2, Dim3, Dim4); \
-} \
-\
-ST_4 get_ ## Dim1 ## Dim2 ## Dim4 ## Dim3() const __CPU_GPU__ \
-{ \
-  return ST_4(Dim1, Dim2, Dim4, Dim3); \
-} \
-\
-ST_4 get_ ## Dim1 ## Dim3 ## Dim2 ## Dim4() const __CPU_GPU__ \
-{ \
-  return ST_4(Dim1, Dim3, Dim2, Dim4); \
-} \
-\
-ST_4 get_ ## Dim1 ## Dim3 ## Dim4 ## Dim2() const __CPU_GPU__ \
-{ \
-  return ST_4(Dim1, Dim3, Dim4, Dim2); \
-} \
-\
-ST_4 get_ ## Dim1 ## Dim4 ## Dim2 ## Dim3() const __CPU_GPU__ \
-{ \
-  return ST_4(Dim1, Dim4, Dim2, Dim3); \
-} \
-\
-ST_4 get_ ## Dim1 ## Dim4 ## Dim3 ## Dim2() const __CPU_GPU__ \
-{ \
-  return ST_4(Dim1, Dim4, Dim3, Dim2); \
-} \
-\
-ST_4 get_ ## Dim2 ## Dim1 ## Dim3 ## Dim4() const __CPU_GPU__ \
-{ \
-  return ST_4(Dim2, Dim1, Dim3, Dim4); \
-} \
-\
-ST_4 get_ ## Dim2 ## Dim1 ## Dim4 ## Dim3() const __CPU_GPU__ \
-{ \
-  return ST_4(Dim2, Dim1, Dim4, Dim3); \
-} \
-\
-ST_4 get_ ## Dim2 ## Dim3 ## Dim1 ## Dim4() const __CPU_GPU__ \
-{ \
-  return ST_4(Dim2, Dim3, Dim1, Dim4); \
-} \
-\
-ST_4 get_ ## Dim2 ## Dim3 ## Dim4 ## Dim1() const __CPU_GPU__ \
-{ \
-  return ST_4(Dim2, Dim3, Dim4, Dim1); \
-} \
-\
-ST_4 get_ ## Dim2 ## Dim4 ## Dim1 ## Dim3() const __CPU_GPU__ \
-{ \
-  return ST_4(Dim2, Dim4, Dim1, Dim3); \
-} \
-\
-ST_4 get_ ## Dim2 ## Dim4 ## Dim3 ## Dim1() const __CPU_GPU__ \
-{ \
-  return ST_4(Dim2, Dim4, Dim3, Dim1); \
-} \
-\
-ST_4 get_ ## Dim3 ## Dim1 ## Dim2 ## Dim4() const __CPU_GPU__ \
-{ \
-  return ST_4(Dim3, Dim1, Dim2, Dim4); \
-} \
-\
-ST_4 get_ ## Dim3 ## Dim1 ## Dim4 ## Dim2() const __CPU_GPU__ \
-{ \
-  return ST_4(Dim3, Dim1, Dim4, Dim2); \
-} \
-\
-ST_4 get_ ## Dim3 ## Dim2 ## Dim1 ## Dim4() const __CPU_GPU__ \
-{ \
-  return ST_4(Dim3, Dim2, Dim1, Dim4); \
-} \
-\
-ST_4 get_ ## Dim3 ## Dim2 ## Dim4 ## Dim1() const __CPU_GPU__ \
-{ \
-  return ST_4(Dim3, Dim2, Dim4, Dim1); \
-} \
-\
-ST_4 get_ ## Dim3 ## Dim4 ## Dim1 ## Dim2() const __CPU_GPU__ \
-{ \
-  return ST_4(Dim3, Dim4, Dim1, Dim2); \
-} \
-\
-ST_4 get_ ## Dim3 ## Dim4 ## Dim2 ## Dim1() const __CPU_GPU__ \
-{ \
-  return ST_4(Dim3, Dim4, Dim2, Dim1); \
-} \
-\
-ST_4 get_ ## Dim4 ## Dim1 ## Dim2 ## Dim3() const __CPU_GPU__ \
-{ \
-  return ST_4(Dim4, Dim1, Dim2, Dim3); \
-} \
-\
-ST_4 get_ ## Dim4 ## Dim1 ## Dim3 ## Dim2() const __CPU_GPU__ \
-{ \
-  return ST_4(Dim4, Dim1, Dim3, Dim2); \
-} \
-\
-ST_4 get_ ## Dim4 ## Dim2 ## Dim1 ## Dim3() const __CPU_GPU__ \
-{ \
-  return ST_4(Dim4, Dim2, Dim1, Dim3); \
-} \
-\
-ST_4 get_ ## Dim4 ## Dim2 ## Dim3 ## Dim1() const __CPU_GPU__ \
-{ \
-  return ST_4(Dim4, Dim2, Dim3, Dim1); \
-} \
-\
-ST_4 get_ ## Dim4 ## Dim3 ## Dim1 ## Dim2() const __CPU_GPU__ \
-{ \
-  return ST_4(Dim4, Dim3, Dim1, Dim2); \
-} \
-\
-ST_4 get_ ## Dim4 ## Dim3 ## Dim2 ## Dim1() const __CPU_GPU__ \
-{ \
-  return ST_4(Dim4, Dim3, Dim2, Dim1); \
-} \
-\
-void set_ ## Dim1 ## Dim2 ## Dim3 ## Dim4(ST_4 v) __CPU_GPU__ \
-{ \
-  Dim1 = v.get_x(); \
-  Dim2 = v.get_y(); \
-  Dim3 = v.get_z(); \
-  Dim4 = v.get_w(); \
-} \
-\
-void set_ ## Dim1 ## Dim2 ## Dim4 ## Dim3(ST_4 v) __CPU_GPU__ \
-{ \
-  Dim1 = v.get_x(); \
-  Dim2 = v.get_y(); \
-  Dim4 = v.get_z(); \
-  Dim3 = v.get_w(); \
-} \
-\
-void set_ ## Dim1 ## Dim3 ## Dim2 ## Dim4(ST_4 v) __CPU_GPU__ \
-{ \
-  Dim1 = v.get_x(); \
-  Dim3 = v.get_y(); \
-  Dim2 = v.get_z(); \
-  Dim4 = v.get_w(); \
-} \
-\
-void set_ ## Dim1 ## Dim3 ## Dim4 ## Dim2(ST_4 v) __CPU_GPU__ \
-{ \
-  Dim1 = v.get_x(); \
-  Dim3 = v.get_y(); \
-  Dim4 = v.get_z(); \
-  Dim2 = v.get_w(); \
-} \
-\
-void set_ ## Dim1 ## Dim4 ## Dim2 ## Dim3(ST_4 v) __CPU_GPU__ \
-{ \
-  Dim1 = v.get_x(); \
-  Dim4 = v.get_y(); \
-  Dim2 = v.get_z(); \
-  Dim3 = v.get_w(); \
-} \
-\
-void set_ ## Dim1 ## Dim4 ## Dim3 ## Dim2(ST_4 v) __CPU_GPU__ \
-{ \
-  Dim1 = v.get_x(); \
-  Dim4 = v.get_y(); \
-  Dim3 = v.get_z(); \
-  Dim2 = v.get_w(); \
-} \
-\
-void set_ ## Dim2 ## Dim1 ## Dim3 ## Dim4(ST_4 v) __CPU_GPU__ \
-{ \
-  Dim2 = v.get_x(); \
-  Dim1 = v.get_y(); \
-  Dim3 = v.get_z(); \
-  Dim4 = v.get_w(); \
-} \
-\
-void set_ ## Dim2 ## Dim1 ## Dim4 ## Dim3(ST_4 v) __CPU_GPU__ \
-{ \
-  Dim2 = v.get_x(); \
-  Dim1 = v.get_y(); \
-  Dim4 = v.get_z(); \
-  Dim3 = v.get_w(); \
-} \
-\
-void set_ ## Dim2 ## Dim3 ## Dim1 ## Dim4(ST_4 v) __CPU_GPU__ \
-{ \
-  Dim2 = v.get_x(); \
-  Dim3 = v.get_y(); \
-  Dim1 = v.get_z(); \
-  Dim4 = v.get_w(); \
-} \
-\
-void set_ ## Dim2 ## Dim3 ## Dim4 ## Dim1(ST_4 v) __CPU_GPU__ \
-{ \
-  Dim2 = v.get_x(); \
-  Dim3 = v.get_y(); \
-  Dim4 = v.get_z(); \
-  Dim1 = v.get_w(); \
-} \
-\
-void set_ ## Dim2 ## Dim4 ## Dim1 ## Dim3(ST_4 v) __CPU_GPU__ \
-{ \
-  Dim2 = v.get_x(); \
-  Dim4 = v.get_y(); \
-  Dim1 = v.get_z(); \
-  Dim3 = v.get_w(); \
-} \
-\
-void set_ ## Dim2 ## Dim4 ## Dim3 ## Dim1(ST_4 v) __CPU_GPU__ \
-{ \
-  Dim2 = v.get_x(); \
-  Dim4 = v.get_y(); \
-  Dim3 = v.get_z(); \
-  Dim1 = v.get_w(); \
-} \
-\
-void set_ ## Dim3 ## Dim1 ## Dim2 ## Dim4(ST_4 v) __CPU_GPU__ \
-{ \
-  Dim3 = v.get_x(); \
-  Dim1 = v.get_y(); \
-  Dim2 = v.get_z(); \
-  Dim4 = v.get_w(); \
-} \
-\
-void set_ ## Dim3 ## Dim1 ## Dim4 ## Dim2(ST_4 v) __CPU_GPU__ \
-{ \
-  Dim3 = v.get_x(); \
-  Dim1 = v.get_y(); \
-  Dim4 = v.get_z(); \
-  Dim2 = v.get_w(); \
-} \
-\
-void set_ ## Dim3 ## Dim2 ## Dim1 ## Dim4(ST_4 v) __CPU_GPU__ \
-{ \
-  Dim3 = v.get_x(); \
-  Dim2 = v.get_y(); \
-  Dim1 = v.get_z(); \
-  Dim4 = v.get_w(); \
-} \
-\
-void set_ ## Dim3 ## Dim2 ## Dim4 ## Dim1(ST_4 v) __CPU_GPU__ \
-{ \
-  Dim3 = v.get_x(); \
-  Dim2 = v.get_y(); \
-  Dim4 = v.get_z(); \
-  Dim1 = v.get_w(); \
-} \
-\
-void set_ ## Dim3 ## Dim4 ## Dim1 ## Dim2(ST_4 v) __CPU_GPU__ \
-{ \
-  Dim3 = v.get_x(); \
-  Dim4 = v.get_y(); \
-  Dim1 = v.get_z(); \
-  Dim2 = v.get_w(); \
-} \
-\
-void set_ ## Dim3 ## Dim4 ## Dim2 ## Dim1(ST_4 v) __CPU_GPU__ \
-{ \
-  Dim3 = v.get_x(); \
-  Dim4 = v.get_y(); \
-  Dim2 = v.get_z(); \
-  Dim1 = v.get_w(); \
-} \
-\
-void set_ ## Dim4 ## Dim1 ## Dim2 ## Dim3(ST_4 v) __CPU_GPU__ \
-{ \
-  Dim4 = v.get_x(); \
-  Dim1 = v.get_y(); \
-  Dim2 = v.get_z(); \
-  Dim3 = v.get_w(); \
-} \
-\
-void set_ ## Dim4 ## Dim1 ## Dim3 ## Dim2(ST_4 v) __CPU_GPU__ \
-{ \
-  Dim4 = v.get_x(); \
-  Dim1 = v.get_y(); \
-  Dim3 = v.get_z(); \
-  Dim2 = v.get_w(); \
-} \
-\
-void set_ ## Dim4 ## Dim2 ## Dim1 ## Dim3(ST_4 v) __CPU_GPU__ \
-{ \
-  Dim4 = v.get_x(); \
-  Dim2 = v.get_y(); \
-  Dim1 = v.get_z(); \
-  Dim3 = v.get_w(); \
-} \
-\
-void set_ ## Dim4 ## Dim2 ## Dim3 ## Dim1(ST_4 v) __CPU_GPU__ \
-{ \
-  Dim4 = v.get_x(); \
-  Dim2 = v.get_y(); \
-  Dim3 = v.get_z(); \
-  Dim1 = v.get_w(); \
-} \
-\
-void set_ ## Dim4 ## Dim3 ## Dim1 ## Dim2(ST_4 v) __CPU_GPU__ \
-{ \
-  Dim4 = v.get_x(); \
-  Dim3 = v.get_y(); \
-  Dim1 = v.get_z(); \
-  Dim2 = v.get_w(); \
-} \
-\
-void set_ ## Dim4 ## Dim3 ## Dim2 ## Dim1(ST_4 v) __CPU_GPU__ \
-{ \
-  Dim4 = v.get_x(); \
-  Dim3 = v.get_y(); \
-  Dim2 = v.get_z(); \
-  Dim1 = v.get_w(); \
-}
-
-#define SCALARTYPE_1_REFERENCE_SINGLE_COMPONENT_ACCESS(ST) \
-ST& ref_x() __CPU_GPU__ { return x; } \
-\
-ST& ref_r() __CPU_GPU__ { return x; }
-
-#define SCALARTYPE_1_COMMON_PUBLIC_MEMBER(ST, ST_1) \
-ST x; \
-typedef ST value_type; \
-static const int size = 1; \
-\
-ST_1() __CPU_GPU__ {} \
-\
-~ST_1() __CPU_GPU__ {} \
-\
-ST_1(ST value) __CPU_GPU__ \
-{ \
-  x = value; \
-} \
-\
-ST_1(const ST_1&  other) __CPU_GPU__ \
-{ \
-  x = other.x; \
-} \
-\
-ST_1& operator=(const ST_1& other) __CPU_GPU__ \
-{ \
-  x = other.x; \
-  return *this; \
-} \
-\
-ST_1& operator++() __CPU_GPU__ \
-{ \
-  ++x; \
-  return *this; \
-} \
-\
-ST_1 operator++(int) __CPU_GPU__ \
-{ \
-  ST_1 Ret(*this); \
-  operator++(); \
-  return Ret; \
-} \
-\
-ST_1& operator--() __CPU_GPU__ \
-{ \
-  --x; \
-  return *this; \
-} \
-\
-ST_1 operator--(int) __CPU_GPU__ \
-{ \
-  ST_1 Ret(*this); \
-  operator--(); \
-  return Ret; \
-} \
-\
-ST_1& operator+=(const ST_1& rhs) __CPU_GPU__ \
-{ \
-  x += rhs.x; \
-  return *this; \
-} \
-\
-ST_1& operator-=(const ST_1& rhs) __CPU_GPU__ \
-{ \
-  x -= rhs.x; \
-  return *this; \
-} \
-\
-ST_1& operator*=(const ST_1& rhs) __CPU_GPU__ \
-{ \
-  x *= rhs.x; \
-  return *this; \
-} \
-\
-ST_1& operator/=(const ST_1& rhs) __CPU_GPU__ \
-{ \
-  x /= rhs.x; \
-  return *this; \
-}
-
-#if !__HCC_AMP__
-
-#define SCALARTYPE_1_CONVERSION_CTOR(ST_1, \
-ST_1_o1, ST_1_o2, ST_1_o3, ST_1_o4, ST_1_o5, \
-ST_1_o6, ST_1_o7, ST_1_o8, ST_1_o9, ST_1_o10, ST_1_o11) \
-\
-explicit ST_1(const ST_1_o1& other) __CPU_GPU__; \
-\
-explicit ST_1(const ST_1_o2& other) __CPU_GPU__; \
-\
-explicit ST_1(const ST_1_o3& other) __CPU_GPU__; \
-\
-explicit ST_1(const ST_1_o4& other) __CPU_GPU__; \
-\
-explicit ST_1(const ST_1_o5& other) __CPU_GPU__; \
-\
-explicit ST_1(const ST_1_o6& other) __CPU_GPU__; \
-\
-explicit ST_1(const ST_1_o7& other) __CPU_GPU__; \
-\
-explicit ST_1(const ST_1_o8& other) __CPU_GPU__; \
-\
-explicit ST_1(const ST_1_o9& other) __CPU_GPU__; \
-\
-explicit ST_1(const ST_1_o10& other) __CPU_GPU__; \
-\
-explicit ST_1(const ST_1_o11& other) __CPU_GPU__;
-
-#endif // if !__HCC_AMP__
-
-#if !__HCC_AMP__
-class int_1
-{
-public:
-  SCALARTYPE_1_COMMON_PUBLIC_MEMBER(int, int_1)
-
-  SCALARTYPE_1_CONVERSION_CTOR(int_1,
-    uint_1, float_1, double_1,
-    char_1, uchar_1, short_1, ushort_1, long_1, ulong_1, longlong_1, ulonglong_1)
-
-  int_1 operator-() const __CPU_GPU__ { return int_1(-x); }
-
-  int_1 operator~() const __CPU_GPU__ { return int_1(~x); }
-
-  int_1& operator%=(const int_1& rhs) __CPU_GPU__
-  {
-    x %= rhs.x;
-    return *this;
-  }
-
-  int_1& operator^=(const int_1& rhs) __CPU_GPU__
-  {
-    x ^= rhs.x;
-    return *this;
-  }
-
-  int_1& operator|=(const int_1& rhs) __CPU_GPU__
-  {
-    x |= rhs.x;
-    return *this;
-  }
-
-  int_1& operator&=(const int_1& rhs) __CPU_GPU__
-  {
-    x &= rhs.x;
-    return *this;
-  }
-
-  int_1& operator>>=(const int_1& rhs) __CPU_GPU__
-  {
-    x >>= rhs.x;
-    return *this;
-  }
-
-  int_1& operator<<=(const int_1& rhs) __CPU_GPU__
-  {
-    x <<= rhs.x;
-    return *this;
-  }
-  
-  SINGLE_COMPONENT_ACCESS(int, x)
-
-  SCALARTYPE_1_REFERENCE_SINGLE_COMPONENT_ACCESS(int)
-
-};
-
-class uint_1
-{
-public:
-  SCALARTYPE_1_COMMON_PUBLIC_MEMBER(unsigned int, uint_1)
-
-  SCALARTYPE_1_CONVERSION_CTOR(uint_1,
-    int_1, float_1, double_1,
-    char_1, uchar_1, short_1, ushort_1, long_1, ulong_1, longlong_1, ulonglong_1)
-
-  uint_1 operator~() const __CPU_GPU__ { return uint_1(~x); }
-
-  uint_1& operator%=(const uint_1& rhs) __CPU_GPU__
-  {
-    x %= rhs.x;
-    return *this;
-  }
-
-  uint_1& operator^=(const uint_1& rhs) __CPU_GPU__
-  {
-    x ^= rhs.x;
-    return *this;
-  }
-
-  uint_1& operator|=(const uint_1& rhs) __CPU_GPU__
-  {
-    x |= rhs.x;
-    return *this;
-  }
-
-  uint_1& operator&=(const uint_1& rhs) __CPU_GPU__
-  {
-    x &= rhs.x;
-    return *this;
-  }
-
-  uint_1& operator>>=(const uint_1& rhs) __CPU_GPU__
-  {
-    x >>= rhs.x;
-    return *this;
-  }
-
-  uint_1& operator<<=(const uint_1& rhs) __CPU_GPU__
-  {
-    x <<= rhs.x;
-    return *this;
-  }
- 
-  SINGLE_COMPONENT_ACCESS(unsigned int, x)
-
-  SCALARTYPE_1_REFERENCE_SINGLE_COMPONENT_ACCESS(unsigned int)
-
-};
-
-class float_1
-{
-public:
-  SCALARTYPE_1_COMMON_PUBLIC_MEMBER(float, float_1)
-
-  SCALARTYPE_1_CONVERSION_CTOR(float_1,
-    int_1, uint_1, double_1,
-    char_1, uchar_1, short_1, ushort_1, long_1, ulong_1, longlong_1, ulonglong_1)
-
-  float_1 operator-() const __CPU_GPU__ { return float_1(-x); }
-
-  SINGLE_COMPONENT_ACCESS(float, x)
-
-  SCALARTYPE_1_REFERENCE_SINGLE_COMPONENT_ACCESS(float)
-
-};
-
-class double_1
-{
-public:
-  SCALARTYPE_1_COMMON_PUBLIC_MEMBER(double, double_1)
-
-  SCALARTYPE_1_CONVERSION_CTOR(double_1,
-    int_1, uint_1, float_1,
-    char_1, uchar_1, short_1, ushort_1, long_1, ulong_1, longlong_1, ulonglong_1)
-
-  double_1 operator-() const __CPU_GPU__ { return double_1(-x); }
-
-  SINGLE_COMPONENT_ACCESS(double, x)
-
-  SCALARTYPE_1_REFERENCE_SINGLE_COMPONENT_ACCESS(double)
-};
-
-class char_1
-{
-public:
-  SCALARTYPE_1_COMMON_PUBLIC_MEMBER(char, char_1)
-
-  SCALARTYPE_1_CONVERSION_CTOR(char_1,
-    int_1, uint_1, float_1,
-    double_1, uchar_1, short_1, ushort_1, long_1, ulong_1, longlong_1, ulonglong_1)
-
-  char_1 operator-() const __CPU_GPU__ { return char_1(-x); }
-
-  SINGLE_COMPONENT_ACCESS(char, x)
-
-  SCALARTYPE_1_REFERENCE_SINGLE_COMPONENT_ACCESS(char)
-};
-
-class uchar_1
-{
-public:
-  SCALARTYPE_1_COMMON_PUBLIC_MEMBER(unsigned char, uchar_1)
-
-  SCALARTYPE_1_CONVERSION_CTOR(uchar_1,
-    int_1, uint_1, float_1,
-    double_1, char_1, short_1, ushort_1, long_1, ulong_1, longlong_1, ulonglong_1)
-
-  SINGLE_COMPONENT_ACCESS(unsigned char, x)
-
-  SCALARTYPE_1_REFERENCE_SINGLE_COMPONENT_ACCESS(unsigned char)
-};
-
-class short_1
-{
-public:
-  SCALARTYPE_1_COMMON_PUBLIC_MEMBER(short, short_1)
-
-  SCALARTYPE_1_CONVERSION_CTOR(short_1,
-    int_1, uint_1, float_1,
-    double_1, char_1, uchar_1, ushort_1, long_1, ulong_1, longlong_1, ulonglong_1)
-
-  short_1 operator-() const __CPU_GPU__ { return short_1(-x); }
-
-  SINGLE_COMPONENT_ACCESS(short, x)
-
-  SCALARTYPE_1_REFERENCE_SINGLE_COMPONENT_ACCESS(short)
-};
-
-class ushort_1
-{
-public:
-  SCALARTYPE_1_COMMON_PUBLIC_MEMBER(unsigned short, ushort_1)
-
-  SCALARTYPE_1_CONVERSION_CTOR(ushort_1,
-    int_1, uint_1, float_1,
-    double_1, char_1, uchar_1, short_1, long_1, ulong_1, longlong_1, ulonglong_1)
-    
-  SINGLE_COMPONENT_ACCESS(unsigned short, x)
-
-  SCALARTYPE_1_REFERENCE_SINGLE_COMPONENT_ACCESS(unsigned short)
-};
-
-class long_1
-{
-public:
-  SCALARTYPE_1_COMMON_PUBLIC_MEMBER(long, long_1)
-
-  SCALARTYPE_1_CONVERSION_CTOR(long_1,
-    int_1, uint_1, float_1,
-    double_1, char_1, uchar_1, short_1, ushort_1, ulong_1, longlong_1, ulonglong_1)
-    
-  long_1 operator-() const __CPU_GPU__ { return long_1(-x); }
-
-  SINGLE_COMPONENT_ACCESS(long, x)
-
-  SCALARTYPE_1_REFERENCE_SINGLE_COMPONENT_ACCESS(long)
-};
-
-class ulong_1
-{
-public:
-  SCALARTYPE_1_COMMON_PUBLIC_MEMBER(unsigned long, ulong_1)
-
-  SCALARTYPE_1_CONVERSION_CTOR(ulong_1,
-    int_1, uint_1, float_1,
-    double_1, char_1, uchar_1, short_1, ushort_1, long_1, longlong_1, ulonglong_1)
-    
-  SINGLE_COMPONENT_ACCESS(unsigned long, x)
-
-  SCALARTYPE_1_REFERENCE_SINGLE_COMPONENT_ACCESS(unsigned long)
-};
-
-class longlong_1
-{
-public:
-  SCALARTYPE_1_COMMON_PUBLIC_MEMBER(long long int, longlong_1)
-
-  SCALARTYPE_1_CONVERSION_CTOR(longlong_1,
-    int_1, uint_1, float_1,
-    double_1, char_1, uchar_1, short_1, ushort_1, long_1, ulong_1, ulonglong_1)
-    
-  longlong_1 operator-() const __CPU_GPU__ { return longlong_1(-x); }
-
-  SINGLE_COMPONENT_ACCESS(long long int, x)
-
-  SCALARTYPE_1_REFERENCE_SINGLE_COMPONENT_ACCESS(long long int)
-};
-
-class ulonglong_1
-{
-public:
-  SCALARTYPE_1_COMMON_PUBLIC_MEMBER(unsigned long long int, ulonglong_1)
-
-  SCALARTYPE_1_CONVERSION_CTOR(ulonglong_1,
-    int_1, uint_1, float_1,
-    double_1, char_1, uchar_1, short_1, ushort_1, long_1, ulong_1, longlong_1)
-    
-  SINGLE_COMPONENT_ACCESS(unsigned long long int, x)
-
-  SCALARTYPE_1_REFERENCE_SINGLE_COMPONENT_ACCESS(unsigned long long int)
-};
-
-#endif // if !__HCC_AMP__
-
-#undef SCALARTYPE_1_REFERENCE_SINGLE_COMPONENT_ACCESS
-#undef SCALARTYPE_1_COMMON_PUBLIC_MEMBER
-
-#define SCALARTYPE_2_REFERENCE_SINGLE_COMPONENT_ACCESS(ST) \
-ST& ref_x() __CPU_GPU__ { return x; } \
-\
-ST& ref_y() __CPU_GPU__ { return y; } \
-\
-ST& ref_r() __CPU_GPU__ { return x; } \
-\
-ST& ref_g() __CPU_GPU__ { return y; }
-
-#define SCALARTYPE_2_COMMON_PUBLIC_MEMBER(ST, ST_2) \
-ST x; \
-ST y; \
-typedef ST value_type; \
-static const int size = 2; \
-\
-ST_2() __CPU_GPU__ {} \
-\
-~ST_2() __CPU_GPU__ {} \
-\
-ST_2(ST value) __CPU_GPU__ \
-{ \
-  x = value; \
-  y = value; \
-} \
-\
-ST_2(const ST_2&  other) __CPU_GPU__ \
-{ \
-  x = other.x; \
-  y = other.y; \
-} \
-\
-ST_2(ST v1, ST v2) __CPU_GPU__ \
-{ \
-  x = v1; \
-  y = v2; \
-} \
-\
-ST_2& operator=(const ST_2& other) __CPU_GPU__ \
-{ \
-  x = other.x; \
-  y = other.y; \
-  return *this; \
-} \
-\
-ST_2& operator++() __CPU_GPU__ \
-{ \
-  ++x; \
-  ++y; \
-  return *this; \
-} \
-\
-ST_2 operator++(int) __CPU_GPU__ \
-{ \
-  ST_2 Ret(*this); \
-  operator++(); \
-  return Ret; \
-} \
-\
-ST_2& operator--() __CPU_GPU__ \
-{ \
-  --x; \
-  --y; \
-  return *this; \
-} \
-\
-ST_2 operator--(int) __CPU_GPU__ \
-{ \
-  ST_2 Ret(*this); \
-  operator--(); \
-  return Ret; \
-} \
-\
-ST_2& operator+=(const ST_2& rhs) __CPU_GPU__ \
-{ \
-  x += rhs.x; \
-  y += rhs.y; \
-  return *this; \
-} \
-\
-ST_2& operator-=(const ST_2& rhs) __CPU_GPU__ \
-{ \
-  x -= rhs.x; \
-  y -= rhs.y; \
-  return *this; \
-} \
-\
-ST_2& operator*=(const ST_2& rhs) __CPU_GPU__ \
-{ \
-  x *= rhs.x; \
-  y *= rhs.y; \
-  return *this; \
-} \
-\
-ST_2& operator/=(const ST_2& rhs) __CPU_GPU__ \
-{ \
-  x /= rhs.x; \
-  y /= rhs.y; \
-  return *this; \
-}
-
-#if !__HCC_AMP__
-
-#define SCALARTYPE_2_CONVERSION_CTOR(ST_2, \
-ST_2_o1, ST_2_o2, ST_2_o3, ST_2_o4, ST_2_o5, \
-ST_2_o6, ST_2_o7, ST_2_o8, ST_2_o9, ST_2_o10, ST_2_o11, ST_2_o12, ST_2_o13) \
-\
-explicit ST_2(const ST_2_o1& other) __CPU_GPU__; \
-\
-explicit ST_2(const ST_2_o2& other) __CPU_GPU__; \
-\
-explicit ST_2(const ST_2_o3& other) __CPU_GPU__; \
-\
-explicit ST_2(const ST_2_o4& other) __CPU_GPU__; \
-\
-explicit ST_2(const ST_2_o5& other) __CPU_GPU__; \
-\
-explicit ST_2(const ST_2_o6& other) __CPU_GPU__; \
-\
-explicit ST_2(const ST_2_o7& other) __CPU_GPU__; \
-\
-explicit ST_2(const ST_2_o8& other) __CPU_GPU__; \
-\
-explicit ST_2(const ST_2_o9& other) __CPU_GPU__; \
-\
-explicit ST_2(const ST_2_o10& other) __CPU_GPU__; \
-\
-explicit ST_2(const ST_2_o11& other) __CPU_GPU__; \
-\
-explicit ST_2(const ST_2_o12& other) __CPU_GPU__; \
-\
-explicit ST_2(const ST_2_o13& other) __CPU_GPU__;
-
-#else
-
-#define SCALARTYPE_2_CONVERSION_CTOR(ST_2, \
-ST_2_o1, ST_2_o2, ST_2_o3, ST_2_o4, ST_2_o5) \
-\
-explicit ST_2(const ST_2_o1& other) __CPU_GPU__; \
-\
-explicit ST_2(const ST_2_o2& other) __CPU_GPU__; \
-\
-explicit ST_2(const ST_2_o3& other) __CPU_GPU__; \
-\
-explicit ST_2(const ST_2_o4& other) __CPU_GPU__; \
-\
-explicit ST_2(const ST_2_o5& other) __CPU_GPU__;
-
-#endif // if !__HCC_AMP__
-
-class int_2
-{
-public:
-  SCALARTYPE_2_COMMON_PUBLIC_MEMBER(int, int_2)
-
-#if !__HCC_AMP__
-  SCALARTYPE_2_CONVERSION_CTOR(int_2,
-    uint_2, float_2, double_2, norm_2, unorm_2,
-    char_2, uchar_2, short_2, ushort_2, long_2, ulong_2, longlong_2, ulonglong_2)
-#else
-  SCALARTYPE_2_CONVERSION_CTOR(int_2,
-    uint_2, float_2, double_2, norm_2, unorm_2)
-#endif
-
-  int_2 operator-() const __CPU_GPU__ { return int_2(-x, -y); }
-
-  int_2 operator~() const __CPU_GPU__ { return int_2(~x, ~y); }
-
-  int_2& operator%=(const int_2& rhs) __CPU_GPU__
-  {
-    x %= rhs.x;
-    y %= rhs.y;
-    return *this;
-  }
-
-  int_2& operator^=(const int_2& rhs) __CPU_GPU__
-  {
-    x ^= rhs.x;
-    y ^= rhs.y;
-    return *this;
-  }
-
-  int_2& operator|=(const int_2& rhs) __CPU_GPU__
-  {
-    x |= rhs.x;
-    y |= rhs.y;
-    return *this;
-  }
-
-  int_2& operator&=(const int_2& rhs) __CPU_GPU__
-  {
-    x &= rhs.x;
-    y &= rhs.y;
-    return *this;
-  }
-
-  int_2& operator>>=(const int_2& rhs) __CPU_GPU__
-  {
-    x >>= rhs.x;
-    y >>= rhs.y;
-    return *this;
-  }
-
-  int_2& operator<<=(const int_2& rhs) __CPU_GPU__
-  {
-    x <<= rhs.x;
-    y <<= rhs.y;
-    return *this;
-  }
-  
-  SINGLE_COMPONENT_ACCESS(int, x)
-  SINGLE_COMPONENT_ACCESS(int, y)
-
-  SCALARTYPE_2_REFERENCE_SINGLE_COMPONENT_ACCESS(int)
-
-  TWO_COMPONENT_ACCESS(int_2, x, y)
-};
-
-class uint_2
-{
-public:
-  SCALARTYPE_2_COMMON_PUBLIC_MEMBER(unsigned int, uint_2)
-
-#if !__HCC_AMP__
-  SCALARTYPE_2_CONVERSION_CTOR(uint_2,
-    int_2, float_2, double_2, norm_2, unorm_2,
-    char_2, uchar_2, short_2, ushort_2, long_2, ulong_2, longlong_2, ulonglong_2)
-#else
-  SCALARTYPE_2_CONVERSION_CTOR(uint_2,
-    int_2, float_2, double_2, norm_2, unorm_2)
-#endif
- 
-  uint_2 operator~() const __CPU_GPU__ { return uint_2(~x, ~y); }
-
-  uint_2& operator%=(const uint_2& rhs) __CPU_GPU__
-  {
-    x %= rhs.x;
-    y %= rhs.y;
-    return *this;
-  }
-
-  uint_2& operator^=(const uint_2& rhs) __CPU_GPU__
-  {
-    x ^= rhs.x;
-    y ^= rhs.y;
-    return *this;
-  }
-
-  uint_2& operator|=(const uint_2& rhs) __CPU_GPU__
-  {
-    x |= rhs.x;
-    y |= rhs.y;
-    return *this;
-  }
-
-  uint_2& operator&=(const uint_2& rhs) __CPU_GPU__
-  {
-    x &= rhs.x;
-    y &= rhs.y;
-    return *this;
-  }
-
-  uint_2& operator>>=(const uint_2& rhs) __CPU_GPU__
-  {
-    x >>= rhs.x;
-    y >>= rhs.y;
-    return *this;
-  }
-
-  uint_2& operator<<=(const uint_2& rhs) __CPU_GPU__
-  {
-    x <<= rhs.x;
-    y <<= rhs.y;
-    return *this;
-  }
- 
-  SINGLE_COMPONENT_ACCESS(unsigned int, x)
-  SINGLE_COMPONENT_ACCESS(unsigned int, y)
-
-  SCALARTYPE_2_REFERENCE_SINGLE_COMPONENT_ACCESS(unsigned int)
-
-  TWO_COMPONENT_ACCESS(uint_2, x, y)
-};
-
-class float_2
-{
-public:
-  SCALARTYPE_2_COMMON_PUBLIC_MEMBER(float, float_2)
-
-#if !__HCC_AMP__
-  SCALARTYPE_2_CONVERSION_CTOR(float_2,
-    int_2, uint_2, double_2, norm_2, unorm_2,
-    char_2, uchar_2, short_2, ushort_2, long_2, ulong_2, longlong_2, ulonglong_2)
-#else
-  SCALARTYPE_2_CONVERSION_CTOR(float_2,
-    int_2, uint_2, double_2, norm_2, unorm_2)
-#endif
-  
-  float_2 operator-() const __CPU_GPU__ { return float_2(-x, -y); }
-
-  SINGLE_COMPONENT_ACCESS(float, x)
-  SINGLE_COMPONENT_ACCESS(float, y)
-
-  SCALARTYPE_2_REFERENCE_SINGLE_COMPONENT_ACCESS(float)
-
-  TWO_COMPONENT_ACCESS(float_2, x, y)
-};
-
-class double_2
-{
-public:
-  SCALARTYPE_2_COMMON_PUBLIC_MEMBER(double, double_2)
-
-#if !__HCC_AMP__
-  SCALARTYPE_2_CONVERSION_CTOR(double_2,
-    int_2, uint_2, float_2, norm_2, unorm_2,
-    char_2, uchar_2, short_2, ushort_2, long_2, ulong_2, longlong_2, ulonglong_2)
-#else
-  SCALARTYPE_2_CONVERSION_CTOR(double_2,
-    int_2, uint_2, float_2, norm_2, unorm_2)
-#endif
-  
-  double_2 operator-() const __CPU_GPU__ { return double_2(-x, -y); }
-
-  SINGLE_COMPONENT_ACCESS(double, x)
-  SINGLE_COMPONENT_ACCESS(double, y)
-
-  SCALARTYPE_2_REFERENCE_SINGLE_COMPONENT_ACCESS(double)
-
-  TWO_COMPONENT_ACCESS(double_2, x, y)
-};
-
-class norm_2
-{
-public:
-  SCALARTYPE_2_COMMON_PUBLIC_MEMBER(norm, norm_2)
-
-#if !__HCC_AMP__
-  SCALARTYPE_2_CONVERSION_CTOR(norm_2,
-    int_2, uint_2, float_2, double_2, unorm_2,
-    char_2, uchar_2, short_2, ushort_2, long_2, ulong_2, longlong_2, ulonglong_2)
-#else
-  SCALARTYPE_2_CONVERSION_CTOR(norm_2,
-    int_2, uint_2, float_2, double_2, unorm_2)
-#endif
-
-#if __GNUG__
-  // for some reason g++ will mistakenly treat x, y as type float
-  // so we need to explicitly cast them to norm type here
-  norm_2 operator-() const __CPU_GPU__ { return norm2(-(norm)x, -(norm)y); }
-#else
-  norm_2 operator-() const __CPU_GPU__ { return norm_2(-x, -y); }
-#endif
-  
-  SINGLE_COMPONENT_ACCESS(norm, x)
-  SINGLE_COMPONENT_ACCESS(norm, y)
-
-  SCALARTYPE_2_REFERENCE_SINGLE_COMPONENT_ACCESS(norm)
-
-  TWO_COMPONENT_ACCESS(norm_2, x, y)
-};
-
-class unorm_2
-{
-public:
-  SCALARTYPE_2_COMMON_PUBLIC_MEMBER(unorm, unorm_2)
-
-#if !__HCC_AMP__
-  SCALARTYPE_2_CONVERSION_CTOR(unorm_2,
-    int_2, uint_2, float_2, double_2, norm_2,
-    char_2, uchar_2, short_2, ushort_2, long_2, ulong_2, longlong_2, ulonglong_2)
-#else
-  SCALARTYPE_2_CONVERSION_CTOR(unorm_2,
-    int_2, uint_2, float_2, double_2, norm_2)
-#endif
-
-  SINGLE_COMPONENT_ACCESS(unorm, x)
-  SINGLE_COMPONENT_ACCESS(unorm, y)
-
-  SCALARTYPE_2_REFERENCE_SINGLE_COMPONENT_ACCESS(unorm)
-
-  TWO_COMPONENT_ACCESS(unorm_2, x, y)
-};
-
-// additional types not specified in C++AMP
-#if !__HCC_AMP__
-class char_2
-{
-public:
-  SCALARTYPE_2_COMMON_PUBLIC_MEMBER(char, char_2)
-
-  SCALARTYPE_2_CONVERSION_CTOR(char_2,
-    int_2, uint_2, float_2, double_2, norm_2,
-    unorm_2, uchar_2, short_2, ushort_2, long_2, ulong_2, longlong_2, ulonglong_2)
-    
-  char_2 operator-() const __CPU_GPU__ { return char_2(-x, -y); }
-
-  SINGLE_COMPONENT_ACCESS(char, x)
-  SINGLE_COMPONENT_ACCESS(char, y)
-
-  SCALARTYPE_2_REFERENCE_SINGLE_COMPONENT_ACCESS(char)
-
-  TWO_COMPONENT_ACCESS(char_2, x, y)
-};
-
-class uchar_2
-{
-public:
-  SCALARTYPE_2_COMMON_PUBLIC_MEMBER(unsigned char, uchar_2)
-
-  SCALARTYPE_2_CONVERSION_CTOR(uchar_2,
-    int_2, uint_2, float_2, double_2, norm_2,
-    unorm_2, char_2, short_2, ushort_2, long_2, ulong_2, longlong_2, ulonglong_2)
-    
-  SINGLE_COMPONENT_ACCESS(unsigned char, x)
-  SINGLE_COMPONENT_ACCESS(unsigned char, y)
-
-  SCALARTYPE_2_REFERENCE_SINGLE_COMPONENT_ACCESS(unsigned char)
-
-  TWO_COMPONENT_ACCESS(uchar_2, x, y)
-};
-
-class short_2
-{
-public:
-  SCALARTYPE_2_COMMON_PUBLIC_MEMBER(short, short_2)
-
-  SCALARTYPE_2_CONVERSION_CTOR(short_2,
-    int_2, uint_2, float_2, double_2, norm_2,
-    unorm_2, uchar_2, char_2, ushort_2, long_2, ulong_2, longlong_2, ulonglong_2)
-    
-  short_2 operator-() const __CPU_GPU__ { return short_2(-x, -y); }
-
-  SINGLE_COMPONENT_ACCESS(short, x)
-  SINGLE_COMPONENT_ACCESS(short, y)
-
-  SCALARTYPE_2_REFERENCE_SINGLE_COMPONENT_ACCESS(short)
-
-  TWO_COMPONENT_ACCESS(short_2, x, y)
-};
-
-class ushort_2
-{
-public:
-  SCALARTYPE_2_COMMON_PUBLIC_MEMBER(unsigned short, ushort_2)
-
-  SCALARTYPE_2_CONVERSION_CTOR(ushort_2,
-    int_2, uint_2, float_2, double_2, norm_2,
-    unorm_2, char_2, short_2, uchar_2, long_2, ulong_2, longlong_2, ulonglong_2)
-    
-  SINGLE_COMPONENT_ACCESS(unsigned short, x)
-  SINGLE_COMPONENT_ACCESS(unsigned short, y)
-
-  SCALARTYPE_2_REFERENCE_SINGLE_COMPONENT_ACCESS(unsigned short)
-
-  TWO_COMPONENT_ACCESS(ushort_2, x, y)
-};
-
-class long_2
-{
-public:
-  SCALARTYPE_2_COMMON_PUBLIC_MEMBER(long, long_2)
-
-  SCALARTYPE_2_CONVERSION_CTOR(long_2,
-    int_2, uint_2, float_2, double_2, norm_2,
-    unorm_2, uchar_2, char_2, ushort_2, short_2, ulong_2, longlong_2, ulonglong_2)
-    
-  long_2 operator-() const __CPU_GPU__ { return long_2(-x, -y); }
-
-  SINGLE_COMPONENT_ACCESS(long, x)
-  SINGLE_COMPONENT_ACCESS(long, y)
-
-  SCALARTYPE_2_REFERENCE_SINGLE_COMPONENT_ACCESS(long)
-
-  TWO_COMPONENT_ACCESS(long_2, x, y)
-};
-
-class ulong_2
-{
-public:
-  SCALARTYPE_2_COMMON_PUBLIC_MEMBER(unsigned long, ulong_2)
-
-  SCALARTYPE_2_CONVERSION_CTOR(ulong_2,
-    int_2, uint_2, float_2, double_2, norm_2,
-    unorm_2, char_2, short_2, uchar_2, long_2, ushort_2, longlong_2, ulonglong_2)
-    
-  SINGLE_COMPONENT_ACCESS(unsigned long, x)
-  SINGLE_COMPONENT_ACCESS(unsigned long, y)
-
-  SCALARTYPE_2_REFERENCE_SINGLE_COMPONENT_ACCESS(unsigned long)
-
-  TWO_COMPONENT_ACCESS(ulong_2, x, y)
-};
-
-class longlong_2
-{
-public:
-  SCALARTYPE_2_COMMON_PUBLIC_MEMBER(long long int, longlong_2)
-
-  SCALARTYPE_2_CONVERSION_CTOR(longlong_2,
-    int_2, uint_2, float_2, double_2, norm_2,
-    unorm_2, uchar_2, char_2, ushort_2, short_2, ulong_2, long_2, ulonglong_2)
-    
-  longlong_2 operator-() const __CPU_GPU__ { return longlong_2(-x, -y); }
-
-  SINGLE_COMPONENT_ACCESS(long long int, x)
-  SINGLE_COMPONENT_ACCESS(long long int, y)
-
-  SCALARTYPE_2_REFERENCE_SINGLE_COMPONENT_ACCESS(long long int)
-
-  TWO_COMPONENT_ACCESS(longlong_2, x, y)
-};
-
-class ulonglong_2
-{
-public:
-  SCALARTYPE_2_COMMON_PUBLIC_MEMBER(unsigned long long int, ulonglong_2)
-
-  SCALARTYPE_2_CONVERSION_CTOR(ulonglong_2,
-    int_2, uint_2, float_2, double_2, norm_2,
-    unorm_2, char_2, short_2, uchar_2, long_2, ushort_2, longlong_2, ulong_2)
-    
-  SINGLE_COMPONENT_ACCESS(unsigned long long int, x)
-  SINGLE_COMPONENT_ACCESS(unsigned long long int, y)
-
-  SCALARTYPE_2_REFERENCE_SINGLE_COMPONENT_ACCESS(unsigned long long int)
-
-  TWO_COMPONENT_ACCESS(ulonglong_2, x, y)
-};
-
-#endif // if !__HCC_AMP__
-
-#undef SCALARTYPE_2_REFERENCE_SINGLE_COMPONENT_ACCESS
-#undef SCALARTYPE_2_COMMON_PUBLIC_MEMBER
-
-#define SCALARTYPE_3_REFERENCE_SINGLE_COMPONENT_ACCESS(ST) \
-ST& ref_x() __CPU_GPU__ { return x; } \
-\
-ST& ref_y() __CPU_GPU__ { return y; } \
-\
-ST& ref_z() __CPU_GPU__ { return z; } \
-\
-ST& ref_r() __CPU_GPU__ { return x; } \
-\
-ST& ref_g() __CPU_GPU__ { return y; } \
-\
-ST& ref_b() __CPU_GPU__ { return z; }
-
-#define SCALARTYPE_3_COMMON_PUBLIC_MEMBER(ST, ST_3) \
-ST x; \
-ST y; \
-ST z; \
-typedef ST value_type; \
-static const int size = 3; \
-\
-ST_3() __CPU_GPU__ {} \
-\
-~ST_3() __CPU_GPU__ {} \
-\
-ST_3(ST value) __CPU_GPU__ \
-{ \
-  x = value; \
-  y = value; \
-  z = value; \
-} \
-\
-ST_3(const ST_3&  other) __CPU_GPU__ \
-{ \
-  x = other.x; \
-  y = other.y; \
-  z = other.z; \
-} \
-\
-ST_3(ST v1, ST v2, ST v3) __CPU_GPU__ \
-{ \
-  x = v1; \
-  y = v2; \
-  z = v3; \
-} \
-\
-ST_3& operator=(const ST_3& other) __CPU_GPU__ \
-{ \
-  x = other.x; \
-  y = other.y; \
-  z = other.z; \
-  return *this; \
-} \
-\
-ST_3& operator++() __CPU_GPU__ \
-{ \
-  ++x; \
-  ++y; \
-  ++z; \
-  return *this; \
-} \
-\
-ST_3 operator++(int) __CPU_GPU__ \
-{ \
-  ST_3 Ret(*this); \
-  operator++(); \
-  return Ret; \
-} \
-\
-ST_3& operator--() __CPU_GPU__ \
-{ \
-  --x; \
-  --y; \
-  --z; \
-  return *this; \
-} \
-\
-ST_3 operator--(int) __CPU_GPU__ \
-{ \
-  ST_3 Ret(*this); \
-  operator--(); \
-  return Ret; \
-} \
-\
-ST_3& operator+=(const ST_3& rhs) __CPU_GPU__ \
-{ \
-  x += rhs.x; \
-  y += rhs.y; \
-  z += rhs.z; \
-  return *this; \
-} \
-\
-ST_3& operator-=(const ST_3& rhs) __CPU_GPU__ \
-{ \
-  x -= rhs.x; \
-  y -= rhs.y; \
-  z -= rhs.z; \
-  return *this; \
-} \
-\
-ST_3& operator*=(const ST_3& rhs) __CPU_GPU__ \
-{ \
-  x *= rhs.x; \
-  y *= rhs.y; \
-  z *= rhs.z; \
-  return *this; \
-} \
-\
-ST_3& operator/=(const ST_3& rhs) __CPU_GPU__ \
-{ \
-  x /= rhs.x; \
-  y /= rhs.y; \
-  z /= rhs.z; \
-  return *this; \
-}
-
-#if !__HCC_AMP__
-
-#define SCALARTYPE_3_CONVERSION_CTOR(ST_3, \
-ST_3_o1, ST_3_o2, ST_3_o3, ST_3_o4, ST_3_o5, \
-ST_3_o6, ST_3_o7, ST_3_o8, ST_3_o9, ST_3_o10, ST_3_o11, ST_3_o12, ST_3_o13) \
-\
-explicit ST_3(const ST_3_o1& other) __CPU_GPU__; \
-\
-explicit ST_3(const ST_3_o2& other) __CPU_GPU__; \
-\
-explicit ST_3(const ST_3_o3& other) __CPU_GPU__; \
-\
-explicit ST_3(const ST_3_o4& other) __CPU_GPU__; \
-\
-explicit ST_3(const ST_3_o5& other) __CPU_GPU__; \
-\
-explicit ST_3(const ST_3_o6& other) __CPU_GPU__; \
-\
-explicit ST_3(const ST_3_o7& other) __CPU_GPU__; \
-\
-explicit ST_3(const ST_3_o8& other) __CPU_GPU__; \
-\
-explicit ST_3(const ST_3_o9& other) __CPU_GPU__; \
-\
-explicit ST_3(const ST_3_o10& other) __CPU_GPU__; \
-\
-explicit ST_3(const ST_3_o11& other) __CPU_GPU__; \
-\
-explicit ST_3(const ST_3_o12& other) __CPU_GPU__; \
-\
-explicit ST_3(const ST_3_o13& other) __CPU_GPU__;
-
-#else
-
-#define SCALARTYPE_3_CONVERSION_CTOR(ST_3, \
-ST_3_o1, ST_3_o2, ST_3_o3, ST_3_o4, ST_3_o5) \
-\
-explicit ST_3(const ST_3_o1& other) __CPU_GPU__; \
-\
-explicit ST_3(const ST_3_o2& other) __CPU_GPU__; \
-\
-explicit ST_3(const ST_3_o3& other) __CPU_GPU__; \
-\
-explicit ST_3(const ST_3_o4& other) __CPU_GPU__; \
-\
-explicit ST_3(const ST_3_o5& other) __CPU_GPU__;
-
-#endif // if !__HCC_AMP__
-
-
-class int_3
-{
-public:
-  SCALARTYPE_3_COMMON_PUBLIC_MEMBER(int, int_3)
-
-#if !__HCC_AMP__
-  SCALARTYPE_3_CONVERSION_CTOR(int_3,
-    uint_3, float_3, double_3, norm_3, unorm_3,
-    char_3, uchar_3, short_3, ushort_3, long_3, ulong_3, longlong_3, ulonglong_3)
-#else
-  SCALARTYPE_3_CONVERSION_CTOR(int_3,
-    uint_3, float_3, double_3, norm_3, unorm_3)
-#endif
-
-  int_3 operator-() const __CPU_GPU__ { return int_3(-x, -y, -z); }
-
-  int_3 operator~() const __CPU_GPU__ { return int_3(~x, ~y, -z); }
-
-  int_3& operator%=(const int_3& rhs) __CPU_GPU__
-  {
-    x %= rhs.x;
-    y %= rhs.y;
-    z %= rhs.z;
-    return *this;
-  }
-
-  int_3& operator^=(const int_3& rhs) __CPU_GPU__
-  {
-    x ^= rhs.x;
-    y ^= rhs.y;
-    z ^= rhs.z;
-    return *this;
-  }
-
-  int_3& operator|=(const int_3& rhs) __CPU_GPU__
-  {
-    x |= rhs.x;
-    y |= rhs.y;
-    z |= rhs.z;
-    return *this;
-  }
-
-  int_3& operator&=(const int_3& rhs) __CPU_GPU__
-  {
-    x &= rhs.x;
-    y &= rhs.y;
-    z &= rhs.z;
-    return *this;
-  }
-
-  int_3& operator>>=(const int_3& rhs) __CPU_GPU__
-  {
-    x >>= rhs.x;
-    y >>= rhs.y;
-    z >>= rhs.z;
-    return *this;
-  }
-
-  int_3& operator<<=(const int_3& rhs) __CPU_GPU__
-  {
-    x <<= rhs.x;
-    y <<= rhs.y;
-    z <<= rhs.z;
-    return *this;
-  }
-  
-  SINGLE_COMPONENT_ACCESS(int, x)
-  SINGLE_COMPONENT_ACCESS(int, y)
-  SINGLE_COMPONENT_ACCESS(int, z)
-
-  SCALARTYPE_3_REFERENCE_SINGLE_COMPONENT_ACCESS(int)
-
-  TWO_COMPONENT_ACCESS(int_2, x, y)
-  TWO_COMPONENT_ACCESS(int_2, x, z)
-  TWO_COMPONENT_ACCESS(int_2, y, z)
-
-  THREE_COMPONENT_ACCESS(int_3, x, y, z)
-};
-
-class uint_3
-{
-public:
-  SCALARTYPE_3_COMMON_PUBLIC_MEMBER(unsigned int, uint_3)
-
-#if !__HCC_AMP__
-  SCALARTYPE_3_CONVERSION_CTOR(uint_3,
-    int_3, float_3, double_3, norm_3, unorm_3,
-    char_3, uchar_3, short_3, ushort_3, long_3, ulong_3, longlong_3, ulonglong_3)
-#else
-  SCALARTYPE_3_CONVERSION_CTOR(uint_3,
-    int_3, float_3, double_3, norm_3, unorm_3)
-#endif
- 
-  uint_3 operator~() const __CPU_GPU__ { return uint_3(~x, ~y, ~z); }
-
-  uint_3& operator%=(const uint_3& rhs) __CPU_GPU__
-  {
-    x %= rhs.x;
-    y %= rhs.y;
-    z %= rhs.z;
-    return *this;
-  }
-
-  uint_3& operator^=(const uint_3& rhs) __CPU_GPU__
-  {
-    x ^= rhs.x;
-    y ^= rhs.y;
-    z ^= rhs.z;
-    return *this;
-  }
-
-  uint_3& operator|=(const uint_3& rhs) __CPU_GPU__
-  {
-    x |= rhs.x;
-    y |= rhs.y;
-    z |= rhs.z;
-    return *this;
-  }
-
-  uint_3& operator&=(const uint_3& rhs) __CPU_GPU__
-  {
-    x &= rhs.x;
-    y &= rhs.y;
-    z &= rhs.z;
-    return *this;
-  }
-
-  uint_3& operator>>=(const uint_3& rhs) __CPU_GPU__
-  {
-    x >>= rhs.x;
-    y >>= rhs.y;
-    z >>= rhs.z;
-    return *this;
-  }
-
-  uint_3& operator<<=(const uint_3& rhs) __CPU_GPU__
-  {
-    x <<= rhs.x;
-    y <<= rhs.y;
-    z <<= rhs.z;
-    return *this;
-  }
- 
-  SINGLE_COMPONENT_ACCESS(unsigned int, x)
-  SINGLE_COMPONENT_ACCESS(unsigned int, y)
-  SINGLE_COMPONENT_ACCESS(unsigned int, z)
-
-  SCALARTYPE_3_REFERENCE_SINGLE_COMPONENT_ACCESS(unsigned int)
-
-  TWO_COMPONENT_ACCESS(uint_2, x, y)
-  TWO_COMPONENT_ACCESS(uint_2, x, z)
-  TWO_COMPONENT_ACCESS(uint_2, y, z)
-
-  THREE_COMPONENT_ACCESS(uint_3, x, y, z)
-};
-
-class float_3
-{
-public:
-  SCALARTYPE_3_COMMON_PUBLIC_MEMBER(float, float_3)
-
-#if !__HCC_AMP__
-  SCALARTYPE_3_CONVERSION_CTOR(float_3,
-    int_3, uint_3, double_3, norm_3, unorm_3,
-    char_3, uchar_3, short_3, ushort_3, long_3, ulong_3, longlong_3, ulonglong_3)
-#else
-  SCALARTYPE_3_CONVERSION_CTOR(float_3,
-    int_3, uint_3, double_3, norm_3, unorm_3)
-#endif
-  
-  float_3 operator-() const __CPU_GPU__ { return float_3(-x, -y, -z); }
-
-  SINGLE_COMPONENT_ACCESS(float, x)
-  SINGLE_COMPONENT_ACCESS(float, y)
-  SINGLE_COMPONENT_ACCESS(float, z)
-
-  SCALARTYPE_3_REFERENCE_SINGLE_COMPONENT_ACCESS(float)
-
-  TWO_COMPONENT_ACCESS(float_2, x, y)
-  TWO_COMPONENT_ACCESS(float_2, x, z)
-  TWO_COMPONENT_ACCESS(float_2, y, z)
-
-  THREE_COMPONENT_ACCESS(float_3, x, y, z)
-};
-
-class double_3
-{
-public:
-  SCALARTYPE_3_COMMON_PUBLIC_MEMBER(double, double_3)
-
-#if !__HCC_AMP__
-  SCALARTYPE_3_CONVERSION_CTOR(double_3,
-    int_3, uint_3, float_3, norm_3, unorm_3,
-    char_3, uchar_3, short_3, ushort_3, long_3, ulong_3, longlong_3, ulonglong_3)
-#else
-  SCALARTYPE_3_CONVERSION_CTOR(double_3,
-    int_3, uint_3, float_3, norm_3, unorm_3)
-#endif
-  
-  double_3 operator-() const __CPU_GPU__ { return double_3(-x, -y, -z); }
-
-  SINGLE_COMPONENT_ACCESS(double, x)
-  SINGLE_COMPONENT_ACCESS(double, y)
-  SINGLE_COMPONENT_ACCESS(double, z)
-
-  SCALARTYPE_3_REFERENCE_SINGLE_COMPONENT_ACCESS(double)
-
-  TWO_COMPONENT_ACCESS(double_2, x, y)
-  TWO_COMPONENT_ACCESS(double_2, x, z)
-  TWO_COMPONENT_ACCESS(double_2, y, z)
-
-  THREE_COMPONENT_ACCESS(double_3, x, y, z)
-};
-
-class norm_3
-{
-public:
-  SCALARTYPE_3_COMMON_PUBLIC_MEMBER(norm, norm_3)
-
-#if !__HCC_AMP__
-  SCALARTYPE_3_CONVERSION_CTOR(norm_3,
-    int_3, uint_3, float_3, double_3, unorm_3,
-    char_3, uchar_3, short_3, ushort_3, long_3, ulong_3, longlong_3, ulonglong_3)
-#else
-  SCALARTYPE_3_CONVERSION_CTOR(norm_3,
-    int_3, uint_3, float_3, double_3, unorm_3)
-#endif
-
-#if __GNUG__
-  // for some reason g++ will mistakenly treat x, y, z as type float
-  // so we need to explicitly cast them to norm type here
-  norm_3 operator-() const __CPU_GPU__ { return norm_3(-(norm)x, -(norm)y, -(norm)z); }
-#else
-  norm_3 operator-() const __CPU_GPU__ { return norm_3(-x, -y, -z); }
-#endif
-  
-  SINGLE_COMPONENT_ACCESS(norm, x)
-  SINGLE_COMPONENT_ACCESS(norm, y)
-  SINGLE_COMPONENT_ACCESS(norm, z)
-
-  SCALARTYPE_3_REFERENCE_SINGLE_COMPONENT_ACCESS(norm)
-
-  TWO_COMPONENT_ACCESS(norm_2, x, y)
-  TWO_COMPONENT_ACCESS(norm_2, x, z)
-  TWO_COMPONENT_ACCESS(norm_2, y, z)
-
-  THREE_COMPONENT_ACCESS(norm_3, x, y, z)
-};
-
-class unorm_3
-{
-public:
-  SCALARTYPE_3_COMMON_PUBLIC_MEMBER(unorm, unorm_3)
-
-#if !__HCC_AMP__
-  SCALARTYPE_3_CONVERSION_CTOR(unorm_3,
-    int_3, uint_3, float_3, double_3, norm_3,
-    char_3, uchar_3, short_3, ushort_3, long_3, ulong_3, longlong_3, ulonglong_3)
-#else
-  SCALARTYPE_3_CONVERSION_CTOR(unorm_3,
-    int_3, uint_3, float_3, double_3, norm_3)
-#endif
-
-  SINGLE_COMPONENT_ACCESS(unorm, x)
-  SINGLE_COMPONENT_ACCESS(unorm, y)
-  SINGLE_COMPONENT_ACCESS(unorm, z)
-
-  SCALARTYPE_3_REFERENCE_SINGLE_COMPONENT_ACCESS(unorm)
-
-  TWO_COMPONENT_ACCESS(unorm_2, x, y)
-  TWO_COMPONENT_ACCESS(unorm_2, x, z)
-  TWO_COMPONENT_ACCESS(unorm_2, y, z)
-
-  THREE_COMPONENT_ACCESS(unorm_3, x, y, z)
-};
-
-#if !__HCC_AMP__
-
-class char_3
-{
-public:
-  SCALARTYPE_3_COMMON_PUBLIC_MEMBER(char, char_3)
-
-  SCALARTYPE_3_CONVERSION_CTOR(char_3,
-    int_3, uint_3, float_3, double_3, unorm_3,
-    norm_3, uchar_3, short_3, ushort_3, long_3, ulong_3, longlong_3, ulonglong_3)
-
-  char_3 operator-() const __CPU_GPU__ { return char_3(-x, -y, -z); }
-  
-  SINGLE_COMPONENT_ACCESS(char, x)
-  SINGLE_COMPONENT_ACCESS(char, y)
-  SINGLE_COMPONENT_ACCESS(char, z)
-
-  SCALARTYPE_3_REFERENCE_SINGLE_COMPONENT_ACCESS(char)
-
-  TWO_COMPONENT_ACCESS(char_2, x, y)
-  TWO_COMPONENT_ACCESS(char_2, x, z)
-  TWO_COMPONENT_ACCESS(char_2, y, z)
-
-  THREE_COMPONENT_ACCESS(char_3, x, y, z)
-};
-
-class uchar_3
-{
-public:
-  SCALARTYPE_3_COMMON_PUBLIC_MEMBER(unsigned char, uchar_3)
-
-  SCALARTYPE_3_CONVERSION_CTOR(uchar_3,
-    int_3, uint_3, float_3, double_3, norm_3,
-    char_3, unorm_3, short_3, ushort_3, long_3, ulong_3, longlong_3, ulonglong_3)
-
-  SINGLE_COMPONENT_ACCESS(unsigned char, x)
-  SINGLE_COMPONENT_ACCESS(unsigned char, y)
-  SINGLE_COMPONENT_ACCESS(unsigned char, z)
-
-  SCALARTYPE_3_REFERENCE_SINGLE_COMPONENT_ACCESS(unsigned char)
-
-  TWO_COMPONENT_ACCESS(uchar_2, x, y)
-  TWO_COMPONENT_ACCESS(uchar_2, x, z)
-  TWO_COMPONENT_ACCESS(uchar_2, y, z)
-
-  THREE_COMPONENT_ACCESS(uchar_3, x, y, z)
-};
-
-class short_3
-{
-public:
-  SCALARTYPE_3_COMMON_PUBLIC_MEMBER(short, short_3)
-
-  SCALARTYPE_3_CONVERSION_CTOR(short_3,
-    int_3, uint_3, float_3, double_3, unorm_3,
-    norm_3, uchar_3, char_3, ushort_3, long_3, ulong_3, longlong_3, ulonglong_3)
-
-  short_3 operator-() const __CPU_GPU__ { return short_3(-x, -y, -z); }
-  
-  SINGLE_COMPONENT_ACCESS(short, x)
-  SINGLE_COMPONENT_ACCESS(short, y)
-  SINGLE_COMPONENT_ACCESS(short, z)
-
-  SCALARTYPE_3_REFERENCE_SINGLE_COMPONENT_ACCESS(short)
-
-  TWO_COMPONENT_ACCESS(short_2, x, y)
-  TWO_COMPONENT_ACCESS(short_2, x, z)
-  TWO_COMPONENT_ACCESS(short_2, y, z)
-
-  THREE_COMPONENT_ACCESS(short_3, x, y, z)
-};
-
-class ushort_3
-{
-public:
-  SCALARTYPE_3_COMMON_PUBLIC_MEMBER(unsigned short, ushort_3)
-
-  SCALARTYPE_3_CONVERSION_CTOR(ushort_3,
-    int_3, uint_3, float_3, double_3, norm_3,
-    char_3, unorm_3, short_3, uchar_3, long_3, ulong_3, longlong_3, ulonglong_3)
-
-  SINGLE_COMPONENT_ACCESS(unsigned short, x)
-  SINGLE_COMPONENT_ACCESS(unsigned short, y)
-  SINGLE_COMPONENT_ACCESS(unsigned short, z)
-
-  SCALARTYPE_3_REFERENCE_SINGLE_COMPONENT_ACCESS(unsigned short)
-
-  TWO_COMPONENT_ACCESS(ushort_2, x, y)
-  TWO_COMPONENT_ACCESS(ushort_2, x, z)
-  TWO_COMPONENT_ACCESS(ushort_2, y, z)
-
-  THREE_COMPONENT_ACCESS(ushort_3, x, y, z)
-};
-
-class long_3
-{
-public:
-  SCALARTYPE_3_COMMON_PUBLIC_MEMBER(long, long_3)
-
-  SCALARTYPE_3_CONVERSION_CTOR(long_3,
-    int_3, uint_3, float_3, double_3, unorm_3,
-    norm_3, uchar_3, short_3, ushort_3, char_3, ulong_3, longlong_3, ulonglong_3)
-
-  long_3 operator-() const __CPU_GPU__ { return long_3(-x, -y, -z); }
-  
-  SINGLE_COMPONENT_ACCESS(long, x)
-  SINGLE_COMPONENT_ACCESS(long, y)
-  SINGLE_COMPONENT_ACCESS(long, z)
-
-  SCALARTYPE_3_REFERENCE_SINGLE_COMPONENT_ACCESS(long)
-
-  TWO_COMPONENT_ACCESS(long_2, x, y)
-  TWO_COMPONENT_ACCESS(long_2, x, z)
-  TWO_COMPONENT_ACCESS(long_2, y, z)
-
-  THREE_COMPONENT_ACCESS(long_3, x, y, z)
-};
-
-class ulong_3
-{
-public:
-  SCALARTYPE_3_COMMON_PUBLIC_MEMBER(unsigned long, ulong_3)
-
-  SCALARTYPE_3_CONVERSION_CTOR(ulong_3,
-    int_3, uint_3, float_3, double_3, norm_3,
-    char_3, unorm_3, short_3, ushort_3, long_3, uchar_3, longlong_3, ulonglong_3)
-
-  SINGLE_COMPONENT_ACCESS(unsigned long, x)
-  SINGLE_COMPONENT_ACCESS(unsigned long, y)
-  SINGLE_COMPONENT_ACCESS(unsigned long, z)
-
-  SCALARTYPE_3_REFERENCE_SINGLE_COMPONENT_ACCESS(unsigned long)
-
-  TWO_COMPONENT_ACCESS(ulong_2, x, y)
-  TWO_COMPONENT_ACCESS(ulong_2, x, z)
-  TWO_COMPONENT_ACCESS(ulong_2, y, z)
-
-  THREE_COMPONENT_ACCESS(ulong_3, x, y, z)
-};
-
-class longlong_3
-{
-public:
-  SCALARTYPE_3_COMMON_PUBLIC_MEMBER(long long int, longlong_3)
-
-  SCALARTYPE_3_CONVERSION_CTOR(longlong_3,
-    int_3, uint_3, float_3, double_3, unorm_3,
-    norm_3, uchar_3, short_3, ushort_3, char_3, ulong_3, long_3, ulonglong_3)
-
-  longlong_3 operator-() const __CPU_GPU__ { return longlong_3(-x, -y, -z); }
-  
-  SINGLE_COMPONENT_ACCESS(long long int, x)
-  SINGLE_COMPONENT_ACCESS(long long int, y)
-  SINGLE_COMPONENT_ACCESS(long long int, z)
-
-  SCALARTYPE_3_REFERENCE_SINGLE_COMPONENT_ACCESS(long long int)
-
-  TWO_COMPONENT_ACCESS(longlong_2, x, y)
-  TWO_COMPONENT_ACCESS(longlong_2, x, z)
-  TWO_COMPONENT_ACCESS(longlong_2, y, z)
-
-  THREE_COMPONENT_ACCESS(longlong_3, x, y, z)
-};
-
-class ulonglong_3
-{
-public:
-  SCALARTYPE_3_COMMON_PUBLIC_MEMBER(unsigned long long int, ulonglong_3)
-
-  SCALARTYPE_3_CONVERSION_CTOR(ulonglong_3,
-    int_3, uint_3, float_3, double_3, norm_3,
-    char_3, unorm_3, short_3, ushort_3, long_3, uchar_3, longlong_3, ulong_3)
-
-  SINGLE_COMPONENT_ACCESS(unsigned long long int, x)
-  SINGLE_COMPONENT_ACCESS(unsigned long long int, y)
-  SINGLE_COMPONENT_ACCESS(unsigned long long int, z)
-
-  SCALARTYPE_3_REFERENCE_SINGLE_COMPONENT_ACCESS(unsigned long long int)
-
-  TWO_COMPONENT_ACCESS(ulonglong_2, x, y)
-  TWO_COMPONENT_ACCESS(ulonglong_2, x, z)
-  TWO_COMPONENT_ACCESS(ulonglong_2, y, z)
-
-  THREE_COMPONENT_ACCESS(ulonglong_3, x, y, z)
-};
-
-#endif // if !__HCC_AMP__
-
-#undef SCALARTYPE_3_REFERENCE_SINGLE_COMPONENT_ACCESS
-#undef SCALARTYPE_3_COMMON_PUBLIC_MEMBER
-
-#define SCALARTYPE_4_REFERENCE_SINGLE_COMPONENT_ACCESS(ST) \
-ST& ref_x() __CPU_GPU__ { return x; } \
-\
-ST& ref_y() __CPU_GPU__ { return y; } \
-\
-ST& ref_z() __CPU_GPU__ { return z; } \
-\
-ST& ref_w() __CPU_GPU__ { return w; } \
-\
-ST& ref_r() __CPU_GPU__ { return x; } \
-\
-ST& ref_g() __CPU_GPU__ { return y; } \
-\
-ST& ref_b() __CPU_GPU__ { return z; } \
-\
-ST& ref_a() __CPU_GPU__ { return w; }
-
-#define SCALARTYPE_4_COMMON_PUBLIC_MEMBER(ST, ST_4) \
-ST x; \
-ST y; \
-ST z; \
-ST w; \
-typedef ST value_type; \
-static const int size = 4; \
-\
-ST_4() __CPU_GPU__ {} \
-\
-~ST_4() __CPU_GPU__ {} \
-\
-ST_4(ST value) __CPU_GPU__ \
-{ \
-  x = value; \
-  y = value; \
-  z = value; \
-  w = value; \
-} \
-\
-ST_4(const ST_4&  other) __CPU_GPU__ \
-{ \
-  x = other.x; \
-  y = other.y; \
-  z = other.z; \
-  w = other.w; \
-} \
-\
-ST_4(ST v1, ST v2, ST v3, ST v4) __CPU_GPU__ \
-{ \
-  x = v1; \
-  y = v2; \
-  z = v3; \
-  w = v4; \
-} \
-\
-ST_4& operator=(const ST_4& other) __CPU_GPU__ \
-{ \
-  x = other.x; \
-  y = other.y; \
-  z = other.z; \
-  w = other.w; \
-  return *this; \
-} \
-\
-ST_4& operator++() __CPU_GPU__ \
-{ \
-  ++x; \
-  ++y; \
-  ++z; \
-  ++w; \
-  return *this; \
-} \
-\
-ST_4 operator++(int) __CPU_GPU__ \
-{ \
-  ST_4 Ret(*this); \
-  operator++(); \
-  return Ret; \
-} \
-\
-ST_4& operator--() __CPU_GPU__ \
-{ \
-  --x; \
-  --y; \
-  --z; \
-  --w; \
-  return *this; \
-} \
-\
-ST_4 operator--(int) __CPU_GPU__ \
-{ \
-  ST_4 Ret(*this); \
-  operator--(); \
-  return Ret; \
-} \
-\
-ST_4& operator+=(const ST_4& rhs) __CPU_GPU__ \
-{ \
-  x += rhs.x; \
-  y += rhs.y; \
-  z += rhs.z; \
-  w += rhs.w; \
-  return *this; \
-} \
-\
-ST_4& operator-=(const ST_4& rhs) __CPU_GPU__ \
-{ \
-  x -= rhs.x; \
-  y -= rhs.y; \
-  z -= rhs.z; \
-  w -= rhs.w; \
-  return *this; \
-} \
-\
-ST_4& operator*=(const ST_4& rhs) __CPU_GPU__ \
-{ \
-  x *= rhs.x; \
-  y *= rhs.y; \
-  z *= rhs.z; \
-  w *= rhs.w; \
-  return *this; \
-} \
-\
-ST_4& operator/=(const ST_4& rhs) __CPU_GPU__ \
-{ \
-  x /= rhs.x; \
-  y /= rhs.y; \
-  z /= rhs.z; \
-  w /= rhs.w; \
-  return *this; \
-}
-
-#if !__HCC_AMP__
-
-#define SCALARTYPE_4_CONVERSION_CTOR(ST_4, \
-ST_4_o1, ST_4_o2, ST_4_o3, ST_4_o4, ST_4_o5, \
-ST_4_o6, ST_4_o7, ST_4_o8, ST_4_o9, ST_4_o10, ST_4_o11, ST_4_o12, ST_4_o13) \
-\
-explicit ST_4(const ST_4_o1& other) __CPU_GPU__; \
-\
-explicit ST_4(const ST_4_o2& other) __CPU_GPU__; \
-\
-explicit ST_4(const ST_4_o3& other) __CPU_GPU__; \
-\
-explicit ST_4(const ST_4_o4& other) __CPU_GPU__; \
-\
-explicit ST_4(const ST_4_o5& other) __CPU_GPU__; \
-\
-explicit ST_4(const ST_4_o6& other) __CPU_GPU__; \
-\
-explicit ST_4(const ST_4_o7& other) __CPU_GPU__; \
-\
-explicit ST_4(const ST_4_o8& other) __CPU_GPU__; \
-\
-explicit ST_4(const ST_4_o9& other) __CPU_GPU__; \
-\
-explicit ST_4(const ST_4_o10& other) __CPU_GPU__; \
-\
-explicit ST_4(const ST_4_o11& other) __CPU_GPU__; \
-\
-explicit ST_4(const ST_4_o12& other) __CPU_GPU__; \
-\
-explicit ST_4(const ST_4_o13& other) __CPU_GPU__;
-
-#else
-
-#define SCALARTYPE_4_CONVERSION_CTOR(ST_4, \
-ST_4_o1, ST_4_o2, ST_4_o3, ST_4_o4, ST_4_o5) \
-\
-explicit ST_4(const ST_4_o1& other) __CPU_GPU__; \
-\
-explicit ST_4(const ST_4_o2& other) __CPU_GPU__; \
-\
-explicit ST_4(const ST_4_o3& other) __CPU_GPU__; \
-\
-explicit ST_4(const ST_4_o4& other) __CPU_GPU__; \
-\
-explicit ST_4(const ST_4_o5& other) __CPU_GPU__; \
-
-#endif // if !__HCC_AMP__
-
-class int_4
-{
-public:
-  SCALARTYPE_4_COMMON_PUBLIC_MEMBER(int, int_4)
-
-#if !__HCC_AMP__
-  SCALARTYPE_4_CONVERSION_CTOR(int_4,
-    uint_4, float_4, double_4, norm_4, unorm_4,
-    char_4, uchar_4, short_4, ushort_4, long_4, ulong_4, longlong_4, ulonglong_4)
-#else
-  SCALARTYPE_4_CONVERSION_CTOR(int_4,
-    uint_4, float_4, double_4, norm_4, unorm_4) 
-#endif
-
-  int_4 operator-() const __CPU_GPU__ { return int_4(-x, -y, -z, -w); }
-
-  int_4 operator~() const __CPU_GPU__ { return int_4(~x, ~y, -z, -w); }
-
-  int_4& operator%=(const int_4& rhs) __CPU_GPU__
-  {
-    x %= rhs.x;
-    y %= rhs.y;
-    z %= rhs.z;
-    w %= rhs.w;
-    return *this;
-  }
-
-  int_4& operator^=(const int_4& rhs) __CPU_GPU__
-  {
-    x ^= rhs.x;
-    y ^= rhs.y;
-    z ^= rhs.z;
-    w ^= rhs.w;
-    return *this;
-  }
-
-  int_4& operator|=(const int_4& rhs) __CPU_GPU__
-  {
-    x |= rhs.x;
-    y |= rhs.y;
-    z |= rhs.z;
-    w |= rhs.w;
-    return *this;
-  }
-
-  int_4& operator&=(const int_4& rhs) __CPU_GPU__
-  {
-    x &= rhs.x;
-    y &= rhs.y;
-    z &= rhs.z;
-    w &= rhs.w;
-    return *this;
-  }
-
-  int_4& operator>>=(const int_4& rhs) __CPU_GPU__
-  {
-    x >>= rhs.x;
-    y >>= rhs.y;
-    z >>= rhs.z;
-    w >>= rhs.w;
-    return *this;
-  }
-
-  int_4& operator<<=(const int_4& rhs) __CPU_GPU__
-  {
-    x <<= rhs.x;
-    y <<= rhs.y;
-    z <<= rhs.z;
-    w <<= rhs.w;
-    return *this;
-  }
-  
-  SINGLE_COMPONENT_ACCESS(int, x)
-  SINGLE_COMPONENT_ACCESS(int, y)
-  SINGLE_COMPONENT_ACCESS(int, z)
-  SINGLE_COMPONENT_ACCESS(int, w)
-
-  SCALARTYPE_4_REFERENCE_SINGLE_COMPONENT_ACCESS(int)
-
-  TWO_COMPONENT_ACCESS(int_2, x, y)
-  TWO_COMPONENT_ACCESS(int_2, x, z)
-  TWO_COMPONENT_ACCESS(int_2, x, w)
-  TWO_COMPONENT_ACCESS(int_2, y, z)
-  TWO_COMPONENT_ACCESS(int_2, y, w)
-  TWO_COMPONENT_ACCESS(int_2, z, w)
-
-  THREE_COMPONENT_ACCESS(int_3, x, y, z)
-  THREE_COMPONENT_ACCESS(int_3, x, y, w)
-  THREE_COMPONENT_ACCESS(int_3, x, z, w)
-  THREE_COMPONENT_ACCESS(int_3, y, z, w)
-
-  FOUR_COMPONENT_ACCESS(int_4, x, y, z, w)
-};
-
-class uint_4
-{
-public:
-  SCALARTYPE_4_COMMON_PUBLIC_MEMBER(unsigned int, uint_4)
-
-#if !__HCC_AMP__
-  SCALARTYPE_4_CONVERSION_CTOR(uint_4,
-    int_4, float_4, double_4, norm_4, unorm_4,
-    char_4, uchar_4, short_4, ushort_4, long_4, ulong_4, longlong_4, ulonglong_4)
-#else
-  SCALARTYPE_4_CONVERSION_CTOR(uint_4,
-    int_4, float_4, double_4, norm_4, unorm_4) 
-#endif
- 
-  uint_4 operator~() const __CPU_GPU__ { return uint_4(~x, ~y, ~z, -w); }
-
-  uint_4& operator%=(const uint_4& rhs) __CPU_GPU__
-  {
-    x %= rhs.x;
-    y %= rhs.y;
-    z %= rhs.z;
-    w %= rhs.w;
-    return *this;
-  }
-
-  uint_4& operator^=(const uint_4& rhs) __CPU_GPU__
-  {
-    x ^= rhs.x;
-    y ^= rhs.y;
-    z ^= rhs.z;
-    w ^= rhs.w;
-    return *this;
-  }
-
-  uint_4& operator|=(const uint_4& rhs) __CPU_GPU__
-  {
-    x |= rhs.x;
-    y |= rhs.y;
-    z |= rhs.z;
-    w |= rhs.w;
-    return *this;
-  }
-
-  uint_4& operator&=(const uint_4& rhs) __CPU_GPU__
-  {
-    x &= rhs.x;
-    y &= rhs.y;
-    z &= rhs.z;
-    w &= rhs.w;
-    return *this;
-  }
-
-  uint_4& operator>>=(const uint_4& rhs) __CPU_GPU__
-  {
-    x >>= rhs.x;
-    y >>= rhs.y;
-    z >>= rhs.z;
-    w >>= rhs.w;
-    return *this;
-  }
-
-  uint_4& operator<<=(const uint_4& rhs) __CPU_GPU__
-  {
-    x <<= rhs.x;
-    y <<= rhs.y;
-    z <<= rhs.z;
-    w <<= rhs.w;
-    return *this;
-  }
- 
-  SINGLE_COMPONENT_ACCESS(unsigned int, x)
-  SINGLE_COMPONENT_ACCESS(unsigned int, y)
-  SINGLE_COMPONENT_ACCESS(unsigned int, z)
-  SINGLE_COMPONENT_ACCESS(unsigned int, w)
-
-  SCALARTYPE_4_REFERENCE_SINGLE_COMPONENT_ACCESS(unsigned int)
-
-  TWO_COMPONENT_ACCESS(uint_2, x, y)
-  TWO_COMPONENT_ACCESS(uint_2, x, z)
-  TWO_COMPONENT_ACCESS(uint_2, x, w)
-  TWO_COMPONENT_ACCESS(uint_2, y, z)
-  TWO_COMPONENT_ACCESS(uint_2, y, w)
-  TWO_COMPONENT_ACCESS(uint_2, z, w)
-
-  THREE_COMPONENT_ACCESS(uint_3, x, y, z)
-  THREE_COMPONENT_ACCESS(uint_3, x, y, w)
-  THREE_COMPONENT_ACCESS(uint_3, x, z, w)
-  THREE_COMPONENT_ACCESS(uint_3, y, z, w)
-
-  FOUR_COMPONENT_ACCESS(uint_4, x, y, z, w)
-};
-
-class float_4
-{
-public:
-  SCALARTYPE_4_COMMON_PUBLIC_MEMBER(float, float_4)
-
-#if !__HCC_AMP__
-  SCALARTYPE_4_CONVERSION_CTOR(float_4,
-    int_4, uint_4, double_4, norm_4, unorm_4,
-    char_4, uchar_4, short_4, ushort_4, long_4, ulong_4, longlong_4, ulonglong_4)
-#else
-  SCALARTYPE_4_CONVERSION_CTOR(float_4,
-    int_4, uint_4, double_4, norm_4, unorm_4) 
-#endif
-  
-  float_4 operator-() const __CPU_GPU__ { return float_4(-x, -y, -z, -w); }
-
-  SINGLE_COMPONENT_ACCESS(float, x)
-  SINGLE_COMPONENT_ACCESS(float, y)
-  SINGLE_COMPONENT_ACCESS(float, z)
-  SINGLE_COMPONENT_ACCESS(float, w)
-
-  SCALARTYPE_4_REFERENCE_SINGLE_COMPONENT_ACCESS(float)
-
-  TWO_COMPONENT_ACCESS(float_2, x, y)
-  TWO_COMPONENT_ACCESS(float_2, x, z)
-  TWO_COMPONENT_ACCESS(float_2, x, w)
-  TWO_COMPONENT_ACCESS(float_2, y, z)
-  TWO_COMPONENT_ACCESS(float_2, y, w)
-  TWO_COMPONENT_ACCESS(float_2, z, w)
-
-  THREE_COMPONENT_ACCESS(float_3, x, y, z)
-  THREE_COMPONENT_ACCESS(float_3, x, y, w)
-  THREE_COMPONENT_ACCESS(float_3, x, z, w)
-  THREE_COMPONENT_ACCESS(float_3, y, z, w)
-
-  FOUR_COMPONENT_ACCESS(float_4, x, y, z, w)
-};
-
-class double_4
-{
-public:
-  SCALARTYPE_4_COMMON_PUBLIC_MEMBER(double, double_4)
-
-#if !__HCC_AMP__
-  SCALARTYPE_4_CONVERSION_CTOR(double_4,
-    int_4, uint_4, float_4, norm_4, unorm_4,
-    char_4, uchar_4, short_4, ushort_4, long_4, ulong_4, longlong_4, ulonglong_4)
-#else
-  SCALARTYPE_4_CONVERSION_CTOR(double_4,
-    int_4, uint_4, float_4, norm_4, unorm_4) 
-#endif
-  
-  double_4 operator-() const __CPU_GPU__ { return double_4(-x, -y, -z, -w); }
-
-  SINGLE_COMPONENT_ACCESS(double, x)
-  SINGLE_COMPONENT_ACCESS(double, y)
-  SINGLE_COMPONENT_ACCESS(double, z)
-  SINGLE_COMPONENT_ACCESS(double, w)
-
-  SCALARTYPE_4_REFERENCE_SINGLE_COMPONENT_ACCESS(double)
-
-  TWO_COMPONENT_ACCESS(double_2, x, y)
-  TWO_COMPONENT_ACCESS(double_2, x, z)
-  TWO_COMPONENT_ACCESS(double_2, x, w)
-  TWO_COMPONENT_ACCESS(double_2, y, z)
-  TWO_COMPONENT_ACCESS(double_2, y, w)
-  TWO_COMPONENT_ACCESS(double_2, z, w)
-
-  THREE_COMPONENT_ACCESS(double_3, x, y, z)
-  THREE_COMPONENT_ACCESS(double_3, x, y, w)
-  THREE_COMPONENT_ACCESS(double_3, x, z, w)
-  THREE_COMPONENT_ACCESS(double_3, y, z, w)
-
-  FOUR_COMPONENT_ACCESS(double_4, x, y, z, w)
-};
-
-class norm_4
-{
-public:
-  SCALARTYPE_4_COMMON_PUBLIC_MEMBER(norm, norm_4)
-
-#if !__HCC_AMP__
-  SCALARTYPE_4_CONVERSION_CTOR(norm_4,
-    int_4, uint_4, float_4, double_4, unorm_4,
-    char_4, uchar_4, short_4, ushort_4, long_4, ulong_4, longlong_4, ulonglong_4)
-#else
-  SCALARTYPE_4_CONVERSION_CTOR(norm_4,
-    int_4, uint_4, float_4, double_4, unorm_4) 
-#endif
-
-#if __GNUG__
-  // for some reason g++ will mistakenly treat x, y, z, w as type float
-  // so we need to explicitly cast them to norm type here
-  norm_4 operator-() const __CPU_GPU__ { return norm_4(-(norm)x, -(norm)y, -(norm)z, -(norm)w); }
-#else
-  norm_4 operator-() const __CPU_GPU__ { return norm_4(-x, -y, -z, -w); }
-#endif
-  
-  SINGLE_COMPONENT_ACCESS(norm, x)
-  SINGLE_COMPONENT_ACCESS(norm, y)
-  SINGLE_COMPONENT_ACCESS(norm, z)
-  SINGLE_COMPONENT_ACCESS(norm, w)
-
-  SCALARTYPE_4_REFERENCE_SINGLE_COMPONENT_ACCESS(norm)
-
-  TWO_COMPONENT_ACCESS(norm_2, x, y)
-  TWO_COMPONENT_ACCESS(norm_2, x, z)
-  TWO_COMPONENT_ACCESS(norm_2, x, w)
-  TWO_COMPONENT_ACCESS(norm_2, y, z)
-  TWO_COMPONENT_ACCESS(norm_2, y, w)
-  TWO_COMPONENT_ACCESS(norm_2, z, w)
-
-  THREE_COMPONENT_ACCESS(norm_3, x, y, z)
-  THREE_COMPONENT_ACCESS(norm_3, x, y, w)
-  THREE_COMPONENT_ACCESS(norm_3, x, z, w)
-  THREE_COMPONENT_ACCESS(norm_3, y, z, w)
-
-  FOUR_COMPONENT_ACCESS(norm_4, x, y, z, w)
-};
-
-class unorm_4
-{
-public:
-  SCALARTYPE_4_COMMON_PUBLIC_MEMBER(unorm, unorm_4)
-
-#if !__HCC_AMP__
-  SCALARTYPE_4_CONVERSION_CTOR(unorm_4,
-    int_4, uint_4, float_4, double_4, norm_4,
-    char_4, uchar_4, short_4, ushort_4, long_4, ulong_4, longlong_4, ulonglong_4)
-#else
-  SCALARTYPE_4_CONVERSION_CTOR(unorm_4,
-    int_4, uint_4, float_4, double_4, norm_4) 
-#endif
-
-  SINGLE_COMPONENT_ACCESS(unorm, x)
-  SINGLE_COMPONENT_ACCESS(unorm, y)
-  SINGLE_COMPONENT_ACCESS(unorm, z)
-  SINGLE_COMPONENT_ACCESS(unorm, w)
-
-  SCALARTYPE_4_REFERENCE_SINGLE_COMPONENT_ACCESS(unorm)
-
-  TWO_COMPONENT_ACCESS(unorm_2, x, y)
-  TWO_COMPONENT_ACCESS(unorm_2, x, z)
-  TWO_COMPONENT_ACCESS(unorm_2, x, w)
-  TWO_COMPONENT_ACCESS(unorm_2, y, z)
-  TWO_COMPONENT_ACCESS(unorm_2, y, w)
-  TWO_COMPONENT_ACCESS(unorm_2, z, w)
-
-  THREE_COMPONENT_ACCESS(unorm_3, x, y, z)
-  THREE_COMPONENT_ACCESS(unorm_3, x, y, w)
-  THREE_COMPONENT_ACCESS(unorm_3, x, z, w)
-  THREE_COMPONENT_ACCESS(unorm_3, y, z, w)
-
-  FOUR_COMPONENT_ACCESS(unorm_4, x, y, z, w)
-};
-
-#if !__HCC_AMP__
-
-class char_4
-{
-public:
-  SCALARTYPE_4_COMMON_PUBLIC_MEMBER(char, char_4)
-
-  SCALARTYPE_4_CONVERSION_CTOR(char_4,
-    int_4, uint_4, float_4, double_4, unorm_4,
-    norm_4, uchar_4, short_4, ushort_4, long_4, ulong_4, longlong_4, ulonglong_4)
-
-  char_4 operator-() const __CPU_GPU__ { return char_4(-x, -y, -z, -w); }
-  
-  SINGLE_COMPONENT_ACCESS(char, x)
-  SINGLE_COMPONENT_ACCESS(char, y)
-  SINGLE_COMPONENT_ACCESS(char, z)
-  SINGLE_COMPONENT_ACCESS(char, w)
-
-  SCALARTYPE_4_REFERENCE_SINGLE_COMPONENT_ACCESS(char)
-
-  TWO_COMPONENT_ACCESS(char_2, x, y)
-  TWO_COMPONENT_ACCESS(char_2, x, z)
-  TWO_COMPONENT_ACCESS(char_2, x, w)
-  TWO_COMPONENT_ACCESS(char_2, y, z)
-  TWO_COMPONENT_ACCESS(char_2, y, w)
-  TWO_COMPONENT_ACCESS(char_2, z, w)
-
-  THREE_COMPONENT_ACCESS(char_3, x, y, z)
-  THREE_COMPONENT_ACCESS(char_3, x, y, w)
-  THREE_COMPONENT_ACCESS(char_3, x, z, w)
-  THREE_COMPONENT_ACCESS(char_3, y, z, w)
-
-  FOUR_COMPONENT_ACCESS(char_4, x, y, z, w)
-};
-
-class uchar_4
-{
-public:
-  SCALARTYPE_4_COMMON_PUBLIC_MEMBER(unsigned char, uchar_4)
-
-  SCALARTYPE_4_CONVERSION_CTOR(uchar_4,
-    int_4, uint_4, float_4, double_4, norm_4,
-    char_4, unorm_4, short_4, ushort_4, long_4, ulong_4, longlong_4, ulonglong_4)
-
-  SINGLE_COMPONENT_ACCESS(unsigned char, x)
-  SINGLE_COMPONENT_ACCESS(unsigned char, y)
-  SINGLE_COMPONENT_ACCESS(unsigned char, z)
-  SINGLE_COMPONENT_ACCESS(unsigned char, w)
-
-  SCALARTYPE_4_REFERENCE_SINGLE_COMPONENT_ACCESS(unsigned char)
-
-  TWO_COMPONENT_ACCESS(uchar_2, x, y)
-  TWO_COMPONENT_ACCESS(uchar_2, x, z)
-  TWO_COMPONENT_ACCESS(uchar_2, x, w)
-  TWO_COMPONENT_ACCESS(uchar_2, y, z)
-  TWO_COMPONENT_ACCESS(uchar_2, y, w)
-  TWO_COMPONENT_ACCESS(uchar_2, z, w)
-
-  THREE_COMPONENT_ACCESS(uchar_3, x, y, z)
-  THREE_COMPONENT_ACCESS(uchar_3, x, y, w)
-  THREE_COMPONENT_ACCESS(uchar_3, x, z, w)
-  THREE_COMPONENT_ACCESS(uchar_3, y, z, w)
-
-  FOUR_COMPONENT_ACCESS(uchar_4, x, y, z, w)
-};
-
-class short_4
-{
-public:
-  SCALARTYPE_4_COMMON_PUBLIC_MEMBER(short, short_4)
-
-  SCALARTYPE_4_CONVERSION_CTOR(short_4,
-    int_4, uint_4, float_4, double_4, unorm_4,
-    norm_4, uchar_4, char_4, ushort_4, long_4, ulong_4, longlong_4, ulonglong_4)
-
-  short_4 operator-() const __CPU_GPU__ { return short_4(-x, -y, -z, -w); }
-  
-  SINGLE_COMPONENT_ACCESS(short, x)
-  SINGLE_COMPONENT_ACCESS(short, y)
-  SINGLE_COMPONENT_ACCESS(short, z)
-  SINGLE_COMPONENT_ACCESS(short, w)
-
-  SCALARTYPE_4_REFERENCE_SINGLE_COMPONENT_ACCESS(short)
-
-  TWO_COMPONENT_ACCESS(short_2, x, y)
-  TWO_COMPONENT_ACCESS(short_2, x, z)
-  TWO_COMPONENT_ACCESS(short_2, x, w)
-  TWO_COMPONENT_ACCESS(short_2, y, z)
-  TWO_COMPONENT_ACCESS(short_2, y, w)
-  TWO_COMPONENT_ACCESS(short_2, z, w)
-
-  THREE_COMPONENT_ACCESS(short_3, x, y, z)
-  THREE_COMPONENT_ACCESS(short_3, x, y, w)
-  THREE_COMPONENT_ACCESS(short_3, x, z, w)
-  THREE_COMPONENT_ACCESS(short_3, y, z, w)
-
-  FOUR_COMPONENT_ACCESS(short_4, x, y, z, w)
-};
-
-class ushort_4
-{
-public:
-  SCALARTYPE_4_COMMON_PUBLIC_MEMBER(unsigned short, ushort_4)
-
-  SCALARTYPE_4_CONVERSION_CTOR(ushort_4,
-    int_4, uint_4, float_4, double_4, norm_4,
-    char_4, unorm_4, short_4, uchar_4, long_4, ulong_4, longlong_4, ulonglong_4)
-
-  SINGLE_COMPONENT_ACCESS(unsigned short, x)
-  SINGLE_COMPONENT_ACCESS(unsigned short, y)
-  SINGLE_COMPONENT_ACCESS(unsigned short, z)
-  SINGLE_COMPONENT_ACCESS(unsigned short, w)
-
-  SCALARTYPE_4_REFERENCE_SINGLE_COMPONENT_ACCESS(unsigned short)
-
-  TWO_COMPONENT_ACCESS(ushort_2, x, y)
-  TWO_COMPONENT_ACCESS(ushort_2, x, z)
-  TWO_COMPONENT_ACCESS(ushort_2, x, w)
-  TWO_COMPONENT_ACCESS(ushort_2, y, z)
-  TWO_COMPONENT_ACCESS(ushort_2, y, w)
-  TWO_COMPONENT_ACCESS(ushort_2, z, w)
-
-  THREE_COMPONENT_ACCESS(ushort_3, x, y, z)
-  THREE_COMPONENT_ACCESS(ushort_3, x, y, w)
-  THREE_COMPONENT_ACCESS(ushort_3, x, z, w)
-  THREE_COMPONENT_ACCESS(ushort_3, y, z, w)
-
-  FOUR_COMPONENT_ACCESS(ushort_4, x, y, z, w)
-};
-
-class long_4
-{
-public:
-  SCALARTYPE_4_COMMON_PUBLIC_MEMBER(long, long_4)
-
-  SCALARTYPE_4_CONVERSION_CTOR(long_4,
-    int_4, uint_4, float_4, double_4, unorm_4,
-    norm_4, uchar_4, short_4, ushort_4, char_4, ulong_4, longlong_4, ulonglong_4)
-
-  long_4 operator-() const __CPU_GPU__ { return long_4(-x, -y, -z, -w); }
-  
-  SINGLE_COMPONENT_ACCESS(long, x)
-  SINGLE_COMPONENT_ACCESS(long, y)
-  SINGLE_COMPONENT_ACCESS(long, z)
-  SINGLE_COMPONENT_ACCESS(long, w)
-
-  SCALARTYPE_4_REFERENCE_SINGLE_COMPONENT_ACCESS(long)
-
-  TWO_COMPONENT_ACCESS(long_2, x, y)
-  TWO_COMPONENT_ACCESS(long_2, x, z)
-  TWO_COMPONENT_ACCESS(long_2, x, w)
-  TWO_COMPONENT_ACCESS(long_2, y, z)
-  TWO_COMPONENT_ACCESS(long_2, y, w)
-  TWO_COMPONENT_ACCESS(long_2, z, w)
-
-  THREE_COMPONENT_ACCESS(long_3, x, y, z)
-  THREE_COMPONENT_ACCESS(long_3, x, y, w)
-  THREE_COMPONENT_ACCESS(long_3, x, z, w)
-  THREE_COMPONENT_ACCESS(long_3, y, z, w)
-
-  FOUR_COMPONENT_ACCESS(long_4, x, y, z, w)
-};
-
-class ulong_4
-{
-public:
-  SCALARTYPE_4_COMMON_PUBLIC_MEMBER(unsigned long, ulong_4)
-
-  SCALARTYPE_4_CONVERSION_CTOR(ulong_4,
-    int_4, uint_4, float_4, double_4, norm_4,
-    char_4, unorm_4, short_4, ushort_4, long_4, uchar_4, longlong_4, ulonglong_4)
-
-  SINGLE_COMPONENT_ACCESS(unsigned long, x)
-  SINGLE_COMPONENT_ACCESS(unsigned long, y)
-  SINGLE_COMPONENT_ACCESS(unsigned long, z)
-  SINGLE_COMPONENT_ACCESS(unsigned long, w)
-
-  SCALARTYPE_4_REFERENCE_SINGLE_COMPONENT_ACCESS(unsigned long)
-
-  TWO_COMPONENT_ACCESS(ulong_2, x, y)
-  TWO_COMPONENT_ACCESS(ulong_2, x, z)
-  TWO_COMPONENT_ACCESS(ulong_2, x, w)
-  TWO_COMPONENT_ACCESS(ulong_2, y, z)
-  TWO_COMPONENT_ACCESS(ulong_2, y, w)
-  TWO_COMPONENT_ACCESS(ulong_2, z, w)
-
-  THREE_COMPONENT_ACCESS(ulong_3, x, y, z)
-  THREE_COMPONENT_ACCESS(ulong_3, x, y, w)
-  THREE_COMPONENT_ACCESS(ulong_3, x, z, w)
-  THREE_COMPONENT_ACCESS(ulong_3, y, z, w)
-
-  FOUR_COMPONENT_ACCESS(ulong_4, x, y, z, w)
-};
-
-class longlong_4
-{
-public:
-  SCALARTYPE_4_COMMON_PUBLIC_MEMBER(long long int, longlong_4)
-
-  SCALARTYPE_4_CONVERSION_CTOR(longlong_4,
-    int_4, uint_4, float_4, double_4, unorm_4,
-    norm_4, uchar_4, short_4, ushort_4, char_4, ulong_4, long_4, ulonglong_4)
-
-  longlong_4 operator-() const __CPU_GPU__ { return longlong_4(-x, -y, -z, -w); }
-  
-  SINGLE_COMPONENT_ACCESS(long long int, x)
-  SINGLE_COMPONENT_ACCESS(long long int, y)
-  SINGLE_COMPONENT_ACCESS(long long int, z)
-  SINGLE_COMPONENT_ACCESS(long long int, w)
-
-  SCALARTYPE_4_REFERENCE_SINGLE_COMPONENT_ACCESS(long long int)
-
-  TWO_COMPONENT_ACCESS(longlong_2, x, y)
-  TWO_COMPONENT_ACCESS(longlong_2, x, z)
-  TWO_COMPONENT_ACCESS(longlong_2, x, w)
-  TWO_COMPONENT_ACCESS(longlong_2, y, z)
-  TWO_COMPONENT_ACCESS(longlong_2, y, w)
-  TWO_COMPONENT_ACCESS(longlong_2, z, w)
-
-  THREE_COMPONENT_ACCESS(longlong_3, x, y, z)
-  THREE_COMPONENT_ACCESS(longlong_3, x, y, w)
-  THREE_COMPONENT_ACCESS(longlong_3, x, z, w)
-  THREE_COMPONENT_ACCESS(longlong_3, y, z, w)
-
-  FOUR_COMPONENT_ACCESS(longlong_4, x, y, z, w)
-};
-
-class ulonglong_4
-{
-public:
-  SCALARTYPE_4_COMMON_PUBLIC_MEMBER(unsigned long long int, ulonglong_4)
-
-  SCALARTYPE_4_CONVERSION_CTOR(ulonglong_4,
-    int_4, uint_4, float_4, double_4, norm_4,
-    char_4, unorm_4, short_4, ushort_4, long_4, uchar_4, longlong_4, ulong_4)
-
-  SINGLE_COMPONENT_ACCESS(unsigned long long int, x)
-  SINGLE_COMPONENT_ACCESS(unsigned long long int, y)
-  SINGLE_COMPONENT_ACCESS(unsigned long long int, z)
-  SINGLE_COMPONENT_ACCESS(unsigned long long int, w)
-
-  SCALARTYPE_4_REFERENCE_SINGLE_COMPONENT_ACCESS(unsigned long long int)
-
-  TWO_COMPONENT_ACCESS(ulonglong_2, x, y)
-  TWO_COMPONENT_ACCESS(ulonglong_2, x, z)
-  TWO_COMPONENT_ACCESS(ulonglong_2, x, w)
-  TWO_COMPONENT_ACCESS(ulonglong_2, y, z)
-  TWO_COMPONENT_ACCESS(ulonglong_2, y, w)
-  TWO_COMPONENT_ACCESS(ulonglong_2, z, w)
-
-  THREE_COMPONENT_ACCESS(ulonglong_3, x, y, z)
-  THREE_COMPONENT_ACCESS(ulonglong_3, x, y, w)
-  THREE_COMPONENT_ACCESS(ulonglong_3, x, z, w)
-  THREE_COMPONENT_ACCESS(ulonglong_3, y, z, w)
-
-  FOUR_COMPONENT_ACCESS(ulonglong_4, x, y, z, w)
-};
-
-#endif // if !__HCC_AMP__
-
-#undef SCALARTYPE_4_REFERENCE_SINGLE_COMPONENT_ACCESS
-#undef SCALARTYPE_4_COMMON_PUBLIC_MEMBER
-
-#undef SINGLE_COMPONENT_ACCESS
-#undef TWO_COMPONENT_ACCESS
-#undef THREE_COMPONENT_ACCESS
-#undef FOUR_COMPONENT_ACCESS
-
-//   Explicit Conversion Constructor Definitions (10.8.2.2)
-
-#if !__HCC_AMP__
-
-#define SCALARTYPE_1_EXPLICIT_CONVERSION_CONSTRUCTORS(ST, ST_1, \
-ST_1_o1, ST_1_o2, ST_1_o3, ST_1_o4, ST_1_o5, \
-ST_1_o6, ST_1_o7, ST_1_o8, ST_1_o9, ST_1_o10, ST_1_o11) \
-inline ST_1::ST_1(const ST_1_o1& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-} \
-\
-inline ST_1::ST_1(const ST_1_o2& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-} \
-\
-inline ST_1::ST_1(const ST_1_o3& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-} \
-\
-inline ST_1::ST_1(const ST_1_o4& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-} \
-\
-inline ST_1::ST_1(const ST_1_o5& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-} \
-inline ST_1::ST_1(const ST_1_o6& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-} \
-inline ST_1::ST_1(const ST_1_o7& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-} \
-inline ST_1::ST_1(const ST_1_o8& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-} \
-inline ST_1::ST_1(const ST_1_o9& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-} \
-inline ST_1::ST_1(const ST_1_o10& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-} \
-inline ST_1::ST_1(const ST_1_o11& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-}
-
-SCALARTYPE_1_EXPLICIT_CONVERSION_CONSTRUCTORS(int, int_1, 
-    uint_1, float_1, double_1,
-    char_1, uchar_1, short_1, ushort_1, long_1, ulong_1, longlong_1, ulonglong_1)
-
-SCALARTYPE_1_EXPLICIT_CONVERSION_CONSTRUCTORS(unsigned int, uint_1, 
-    int_1, float_1, double_1,
-    char_1, uchar_1, short_1, ushort_1, long_1, ulong_1, longlong_1, ulonglong_1)
-
-SCALARTYPE_1_EXPLICIT_CONVERSION_CONSTRUCTORS(float, float_1, 
-    int_1, uint_1, double_1,
-    char_1, uchar_1, short_1, ushort_1, long_1, ulong_1, longlong_1, ulonglong_1)
-
-SCALARTYPE_1_EXPLICIT_CONVERSION_CONSTRUCTORS(double, double_1, 
-    int_1, uint_1, float_1,
-    char_1, uchar_1, short_1, ushort_1, long_1, ulong_1, longlong_1, ulonglong_1)
-
-SCALARTYPE_1_EXPLICIT_CONVERSION_CONSTRUCTORS(char, char_1, 
-    int_1, uint_1, float_1,
-    double_1, uchar_1, short_1, ushort_1, long_1, ulong_1, longlong_1, ulonglong_1)
-
-SCALARTYPE_1_EXPLICIT_CONVERSION_CONSTRUCTORS(unsigned char, uchar_1, 
-    int_1, uint_1, float_1,
-     double_1, char_1, short_1, ushort_1, long_1, ulong_1, longlong_1, ulonglong_1)
-
-SCALARTYPE_1_EXPLICIT_CONVERSION_CONSTRUCTORS(short, short_1, 
-    int_1, uint_1, float_1,
-    double_1, char_1, uchar_1, ushort_1, long_1, ulong_1, longlong_1, ulonglong_1)
-
-SCALARTYPE_1_EXPLICIT_CONVERSION_CONSTRUCTORS(unsigned short, ushort_1, 
-    int_1, uint_1, float_1,
-    double_1, char_1, uchar_1, short_1, long_1, ulong_1, longlong_1, ulonglong_1)
-
-SCALARTYPE_1_EXPLICIT_CONVERSION_CONSTRUCTORS(long, long_1, 
-    int_1, uint_1, float_1,
-    double_1, char_1, uchar_1, short_1, ushort_1, ulong_1, longlong_1, ulonglong_1)
-
-SCALARTYPE_1_EXPLICIT_CONVERSION_CONSTRUCTORS(unsigned long, ulong_1, 
-    int_1, uint_1, float_1,
-    double_1, char_1, uchar_1, short_1, ushort_1, long_1, longlong_1, ulonglong_1)
-
-SCALARTYPE_1_EXPLICIT_CONVERSION_CONSTRUCTORS(long long int, longlong_1, 
-    int_1, uint_1, float_1,
-    double_1, char_1, uchar_1, short_1, ushort_1, long_1, ulong_1, ulonglong_1)
-
-SCALARTYPE_1_EXPLICIT_CONVERSION_CONSTRUCTORS(unsigned long long int, ulonglong_1, 
-    int_1, uint_1, float_1,
-    double_1, char_1, uchar_1, short_1, ushort_1, long_1, ulong_1, longlong_1)
-
-#undef SCALARTYPE_1_EXPLICIT_CONVERSION_CONSTRUCTORS
-
-#endif // if !__HCC_AMP__
-
-#if !__HCC_AMP__
-
-#define SCALARTYPE_2_EXPLICIT_CONVERSION_CONSTRUCTORS(ST, ST_2, \
-ST_2_o1, ST_2_o2, ST_2_o3, ST_2_o4, ST_2_o5, \
-ST_2_o6, ST_2_o7, ST_2_o8, ST_2_o9, ST_2_o10, ST_2_o11, ST_2_o12, ST_2_o13) \
-inline ST_2::ST_2(const ST_2_o1& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-} \
-\
-inline ST_2::ST_2(const ST_2_o2& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-} \
-\
-inline ST_2::ST_2(const ST_2_o3& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-} \
-\
-inline ST_2::ST_2(const ST_2_o4& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-} \
-\
-inline ST_2::ST_2(const ST_2_o5& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-} \
-inline ST_2::ST_2(const ST_2_o6& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-} \
-inline ST_2::ST_2(const ST_2_o7& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-} \
-inline ST_2::ST_2(const ST_2_o8& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-} \
-inline ST_2::ST_2(const ST_2_o9& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-} \
-inline ST_2::ST_2(const ST_2_o10& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-} \
-inline ST_2::ST_2(const ST_2_o11& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-} \
-inline ST_2::ST_2(const ST_2_o12& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-} \
-inline ST_2::ST_2(const ST_2_o13& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-}
-
-SCALARTYPE_2_EXPLICIT_CONVERSION_CONSTRUCTORS(int, int_2, 
-    uint_2, float_2, double_2, norm_2, unorm_2,
-    char_2, uchar_2, short_2, ushort_2, long_2, ulong_2, longlong_2, ulonglong_2)
-
-SCALARTYPE_2_EXPLICIT_CONVERSION_CONSTRUCTORS(unsigned int, uint_2, 
-    int_2, float_2, double_2, norm_2, unorm_2,
-    char_2, uchar_2, short_2, ushort_2, long_2, ulong_2, longlong_2, ulonglong_2)
-
-SCALARTYPE_2_EXPLICIT_CONVERSION_CONSTRUCTORS(float, float_2, 
-    int_2, uint_2, double_2, norm_2, unorm_2,
-    char_2, uchar_2, short_2, ushort_2, long_2, ulong_2, longlong_2, ulonglong_2)
-
-SCALARTYPE_2_EXPLICIT_CONVERSION_CONSTRUCTORS(double, double_2, 
-    int_2, uint_2, float_2, norm_2, unorm_2,
-    char_2, uchar_2, short_2, ushort_2, long_2, ulong_2, longlong_2, ulonglong_2)
-
-SCALARTYPE_2_EXPLICIT_CONVERSION_CONSTRUCTORS(norm, norm_2, 
-    int_2, uint_2, float_2, double_2, unorm_2,
-    char_2, uchar_2, short_2, ushort_2, long_2, ulong_2, longlong_2, ulonglong_2)
-
-SCALARTYPE_2_EXPLICIT_CONVERSION_CONSTRUCTORS(unorm, unorm_2, 
-    int_2, uint_2, float_2, double_2, norm_2,
-    char_2, uchar_2, short_2, ushort_2, long_2, ulong_2, longlong_2, ulonglong_2)
-
-SCALARTYPE_2_EXPLICIT_CONVERSION_CONSTRUCTORS(char, char_2, 
-    int_2, uint_2, float_2, double_2, norm_2,
-    unorm_2, uchar_2, short_2, ushort_2, long_2, ulong_2, longlong_2, ulonglong_2)
-
-SCALARTYPE_2_EXPLICIT_CONVERSION_CONSTRUCTORS(unsigned char, uchar_2, 
-    int_2, uint_2, float_2, double_2, norm_2,
-    char_2, unorm_2, short_2, ushort_2, long_2, ulong_2, longlong_2, ulonglong_2)
-
-SCALARTYPE_2_EXPLICIT_CONVERSION_CONSTRUCTORS(short, short_2, 
-    int_2, uint_2, float_2, double_2, norm_2,
-    char_2, uchar_2, unorm_2, ushort_2, long_2, ulong_2, longlong_2, ulonglong_2)
-
-SCALARTYPE_2_EXPLICIT_CONVERSION_CONSTRUCTORS(unsigned short, ushort_2, 
-    int_2, uint_2, float_2, double_2, norm_2,
-    char_2, uchar_2, short_2, unorm_2, long_2, ulong_2, longlong_2, ulonglong_2)
-
-SCALARTYPE_2_EXPLICIT_CONVERSION_CONSTRUCTORS(long, long_2, 
-    int_2, uint_2, float_2, double_2, norm_2,
-    char_2, uchar_2, short_2, ushort_2, unorm_2, ulong_2, longlong_2, ulonglong_2)
-
-SCALARTYPE_2_EXPLICIT_CONVERSION_CONSTRUCTORS(unsigned long, ulong_2, 
-    int_2, uint_2, float_2, double_2, norm_2,
-    char_2, uchar_2, short_2, ushort_2, long_2, unorm_2, longlong_2, ulonglong_2)
-
-SCALARTYPE_2_EXPLICIT_CONVERSION_CONSTRUCTORS(long long int, longlong_2, 
-    int_2, uint_2, float_2, double_2, norm_2,
-    char_2, uchar_2, short_2, ushort_2, long_2, ulong_2, unorm_2, ulonglong_2)
-
-SCALARTYPE_2_EXPLICIT_CONVERSION_CONSTRUCTORS(unsigned long long int, ulonglong_2, 
-    int_2, uint_2, float_2, double_2, norm_2,
-    char_2, uchar_2, short_2, ushort_2, long_2, ulong_2, longlong_2, unorm_2)
-
-#undef SCALARTYPE_2_EXPLICIT_CONVERSION_CONSTRUCTORS
-
-#else // if !__HCC_AMP__
-
-#define SCALARTYPE_2_EXPLICIT_CONVERSION_CONSTRUCTORS(ST, ST_2, \
-ST_2_o1, ST_2_o2, ST_2_o3, ST_2_o4, ST_2_o5) \
-inline ST_2::ST_2(const ST_2_o1& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-} \
-\
-inline ST_2::ST_2(const ST_2_o2& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-} \
-\
-inline ST_2::ST_2(const ST_2_o3& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-} \
-\
-inline ST_2::ST_2(const ST_2_o4& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-} \
-\
-inline ST_2::ST_2(const ST_2_o5& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-}
-
-SCALARTYPE_2_EXPLICIT_CONVERSION_CONSTRUCTORS(int, int_2, 
-    uint_2, float_2, double_2, norm_2, unorm_2) 
-
-SCALARTYPE_2_EXPLICIT_CONVERSION_CONSTRUCTORS(unsigned int, uint_2, 
-    int_2, float_2, double_2, norm_2, unorm_2) 
-
-SCALARTYPE_2_EXPLICIT_CONVERSION_CONSTRUCTORS(float, float_2, 
-    int_2, uint_2, double_2, norm_2, unorm_2) 
-
-SCALARTYPE_2_EXPLICIT_CONVERSION_CONSTRUCTORS(double, double_2, 
-    int_2, uint_2, float_2, norm_2, unorm_2) 
-
-SCALARTYPE_2_EXPLICIT_CONVERSION_CONSTRUCTORS(norm, norm_2, 
-    int_2, uint_2, float_2, double_2, unorm_2) 
-
-SCALARTYPE_2_EXPLICIT_CONVERSION_CONSTRUCTORS(unorm, unorm_2, 
-    int_2, uint_2, float_2, double_2, norm_2) 
-
-#undef SCALARTYPE_2_EXPLICIT_CONVERSION_CONSTRUCTORS
-
-#endif // if !__HCC_AMP__
-
-#if !__HCC_AMP__
-
-#define SCALARTYPE_3_EXPLICIT_CONVERSION_CONSTRUCTORS(ST, ST_3, \
-ST_3_o1, ST_3_o2, ST_3_o3, ST_3_o4, ST_3_o5, \
-ST_3_o6, ST_3_o7, ST_3_o8, ST_3_o9, ST_3_o10, ST_3_o11, ST_3_o12, ST_3_o13) \
-inline ST_3::ST_3(const ST_3_o1& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-  z = static_cast<ST>(other.get_z()); \
-} \
-\
-inline ST_3::ST_3(const ST_3_o2& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-  z = static_cast<ST>(other.get_z()); \
-} \
-\
-inline ST_3::ST_3(const ST_3_o3& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-  z = static_cast<ST>(other.get_z()); \
-} \
-\
-inline ST_3::ST_3(const ST_3_o4& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-  z = static_cast<ST>(other.get_z()); \
-} \
-\
-inline ST_3::ST_3(const ST_3_o5& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-  z = static_cast<ST>(other.get_z()); \
-} \
-inline ST_3::ST_3(const ST_3_o6& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-  z = static_cast<ST>(other.get_z()); \
-} \
-inline ST_3::ST_3(const ST_3_o7& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-  z = static_cast<ST>(other.get_z()); \
-} \
-inline ST_3::ST_3(const ST_3_o8& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-  z = static_cast<ST>(other.get_z()); \
-} \
-inline ST_3::ST_3(const ST_3_o9& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-  z = static_cast<ST>(other.get_z()); \
-} \
-inline ST_3::ST_3(const ST_3_o10& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-  z = static_cast<ST>(other.get_z()); \
-} \
-inline ST_3::ST_3(const ST_3_o11& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-  z = static_cast<ST>(other.get_z()); \
-} \
-inline ST_3::ST_3(const ST_3_o12& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-  z = static_cast<ST>(other.get_z()); \
-} \
-inline ST_3::ST_3(const ST_3_o13& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-  z = static_cast<ST>(other.get_z()); \
-}
-
-SCALARTYPE_3_EXPLICIT_CONVERSION_CONSTRUCTORS(int, int_3, 
-    uint_3, float_3, double_3, norm_3, unorm_3,
-    char_3, uchar_3, short_3, ushort_3, long_3, ulong_3, longlong_3, ulonglong_3)
-
-SCALARTYPE_3_EXPLICIT_CONVERSION_CONSTRUCTORS(unsigned int, uint_3, 
-    int_3, float_3, double_3, norm_3, unorm_3,
-    char_3, uchar_3, short_3, ushort_3, long_3, ulong_3, longlong_3, ulonglong_3)
-
-SCALARTYPE_3_EXPLICIT_CONVERSION_CONSTRUCTORS(float, float_3, 
-    int_3, uint_3, double_3, norm_3, unorm_3,
-    char_3, uchar_3, short_3, ushort_3, long_3, ulong_3, longlong_3, ulonglong_3)
-
-SCALARTYPE_3_EXPLICIT_CONVERSION_CONSTRUCTORS(double, double_3, 
-    int_3, uint_3, float_3, norm_3, unorm_3,
-    char_3, uchar_3, short_3, ushort_3, long_3, ulong_3, longlong_3, ulonglong_3)
-
-SCALARTYPE_3_EXPLICIT_CONVERSION_CONSTRUCTORS(norm, norm_3, 
-    int_3, uint_3, float_3, double_3, unorm_3,
-    char_3, uchar_3, short_3, ushort_3, long_3, ulong_3, longlong_3, ulonglong_3)
-
-SCALARTYPE_3_EXPLICIT_CONVERSION_CONSTRUCTORS(unorm, unorm_3, 
-    int_3, uint_3, float_3, double_3, norm_3,
-    char_3, uchar_3, short_3, ushort_3, long_3, ulong_3, longlong_3, ulonglong_3)
-
-SCALARTYPE_3_EXPLICIT_CONVERSION_CONSTRUCTORS(char, char_3, 
-    int_3, uint_3, float_3, double_3, norm_3,
-    unorm_3, uchar_3, short_3, ushort_3, long_3, ulong_3, longlong_3, ulonglong_3)
-
-SCALARTYPE_3_EXPLICIT_CONVERSION_CONSTRUCTORS(unsigned char, uchar_3, 
-    int_3, uint_3, float_3, double_3, norm_3,
-    char_3, unorm_3, short_3, ushort_3, long_3, ulong_3, longlong_3, ulonglong_3)
-
-SCALARTYPE_3_EXPLICIT_CONVERSION_CONSTRUCTORS(short, short_3, 
-    int_3, uint_3, float_3, double_3, norm_3,
-    char_3, uchar_3, unorm_3, ushort_3, long_3, ulong_3, longlong_3, ulonglong_3)
-
-SCALARTYPE_3_EXPLICIT_CONVERSION_CONSTRUCTORS(unsigned short, ushort_3, 
-    int_3, uint_3, float_3, double_3, norm_3,
-    char_3, uchar_3, short_3, unorm_3, long_3, ulong_3, longlong_3, ulonglong_3)
-
-SCALARTYPE_3_EXPLICIT_CONVERSION_CONSTRUCTORS(long, long_3, 
-    int_3, uint_3, float_3, double_3, norm_3,
-    char_3, uchar_3, short_3, ushort_3, unorm_3, ulong_3, longlong_3, ulonglong_3)
-
-SCALARTYPE_3_EXPLICIT_CONVERSION_CONSTRUCTORS(unsigned long, ulong_3, 
-    int_3, uint_3, float_3, double_3, norm_3,
-    char_3, uchar_3, short_3, ushort_3, long_3, unorm_3, longlong_3, ulonglong_3)
-
-SCALARTYPE_3_EXPLICIT_CONVERSION_CONSTRUCTORS(long long int, longlong_3, 
-    int_3, uint_3, float_3, double_3, norm_3,
-    char_3, uchar_3, short_3, ushort_3, unorm_3, ulong_3, long_3, ulonglong_3)
-
-SCALARTYPE_3_EXPLICIT_CONVERSION_CONSTRUCTORS(unsigned long long int, ulonglong_3, 
-    int_3, uint_3, float_3, double_3, norm_3,
-    char_3, uchar_3, short_3, ushort_3, long_3, unorm_3, longlong_3, ulong_3)
-
-#undef SCALARTYPE_3_EXPLICIT_CONVERSION_CONSTRUCTORS
-
-#else // if !__HCC_AMP__
-
-#define SCALARTYPE_3_EXPLICIT_CONVERSION_CONSTRUCTORS(ST, ST_3, \
-ST_3_o1, ST_3_o2, ST_3_o3, ST_3_o4, ST_3_o5) \
-inline ST_3::ST_3(const ST_3_o1& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-  z = static_cast<ST>(other.get_z()); \
-} \
-\
-inline ST_3::ST_3(const ST_3_o2& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-  z = static_cast<ST>(other.get_z()); \
-} \
-\
-inline ST_3::ST_3(const ST_3_o3& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-  z = static_cast<ST>(other.get_z()); \
-} \
-\
-inline ST_3::ST_3(const ST_3_o4& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-  z = static_cast<ST>(other.get_z()); \
-} \
-\
-inline ST_3::ST_3(const ST_3_o5& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-  z = static_cast<ST>(other.get_z()); \
-}
-
-SCALARTYPE_3_EXPLICIT_CONVERSION_CONSTRUCTORS(int, int_3, 
-    uint_3, float_3, double_3, norm_3, unorm_3) 
-
-SCALARTYPE_3_EXPLICIT_CONVERSION_CONSTRUCTORS(unsigned int, uint_3, 
-    int_3, float_3, double_3, norm_3, unorm_3) 
-
-SCALARTYPE_3_EXPLICIT_CONVERSION_CONSTRUCTORS(float, float_3, 
-    int_3, uint_3, double_3, norm_3, unorm_3) 
-
-SCALARTYPE_3_EXPLICIT_CONVERSION_CONSTRUCTORS(double, double_3, 
-    int_3, uint_3, float_3, norm_3, unorm_3) 
-
-SCALARTYPE_3_EXPLICIT_CONVERSION_CONSTRUCTORS(norm, norm_3, 
-    int_3, uint_3, float_3, double_3, unorm_3) 
-
-SCALARTYPE_3_EXPLICIT_CONVERSION_CONSTRUCTORS(unorm, unorm_3, 
-    int_3, uint_3, float_3, double_3, norm_3) 
-
-#undef SCALARTYPE_3_EXPLICIT_CONVERSION_CONSTRUCTORS
-
-#endif // if !__HCC_AMP__
-
-#if !__HCC_AMP__
-
-#define SCALARTYPE_4_EXPLICIT_CONVERSION_CONSTRUCTORS(ST, ST_4, \
-ST_4_o1, ST_4_o2, ST_4_o3, ST_4_o4, ST_4_o5, \
-ST_4_o6, ST_4_o7, ST_4_o8, ST_4_o9, ST_4_o10, ST_4_o11, ST_4_o12, ST_4_o13) \
-inline ST_4::ST_4(const ST_4_o1& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-  z = static_cast<ST>(other.get_z()); \
-  w = static_cast<ST>(other.get_w()); \
-} \
-\
-inline ST_4::ST_4(const ST_4_o2& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-  z = static_cast<ST>(other.get_z()); \
-  w = static_cast<ST>(other.get_w()); \
-} \
-\
-inline ST_4::ST_4(const ST_4_o3& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-  z = static_cast<ST>(other.get_z()); \
-  w = static_cast<ST>(other.get_w()); \
-} \
-\
-inline ST_4::ST_4(const ST_4_o4& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-  z = static_cast<ST>(other.get_z()); \
-  w = static_cast<ST>(other.get_w()); \
-} \
-\
-inline ST_4::ST_4(const ST_4_o5& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-  z = static_cast<ST>(other.get_z()); \
-  w = static_cast<ST>(other.get_w()); \
-} \
-inline ST_4::ST_4(const ST_4_o6& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-  z = static_cast<ST>(other.get_z()); \
-  w = static_cast<ST>(other.get_w()); \
-} \
-inline ST_4::ST_4(const ST_4_o7& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-  z = static_cast<ST>(other.get_z()); \
-  w = static_cast<ST>(other.get_w()); \
-} \
-inline ST_4::ST_4(const ST_4_o8& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-  z = static_cast<ST>(other.get_z()); \
-  w = static_cast<ST>(other.get_w()); \
-} \
-inline ST_4::ST_4(const ST_4_o9& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-  z = static_cast<ST>(other.get_z()); \
-  w = static_cast<ST>(other.get_w()); \
-} \
-inline ST_4::ST_4(const ST_4_o10& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-  z = static_cast<ST>(other.get_z()); \
-  w = static_cast<ST>(other.get_w()); \
-} \
-inline ST_4::ST_4(const ST_4_o11& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-  z = static_cast<ST>(other.get_z()); \
-  w = static_cast<ST>(other.get_w()); \
-} \
-inline ST_4::ST_4(const ST_4_o12& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-  z = static_cast<ST>(other.get_z()); \
-  w = static_cast<ST>(other.get_w()); \
-} \
-inline ST_4::ST_4(const ST_4_o13& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-  z = static_cast<ST>(other.get_z()); \
-  w = static_cast<ST>(other.get_w()); \
-}
-
-SCALARTYPE_4_EXPLICIT_CONVERSION_CONSTRUCTORS(int, int_4, 
-    uint_4, float_4, double_4, norm_4, unorm_4,
-    char_4, uchar_4, short_4, ushort_4, long_4, ulong_4, longlong_4, ulonglong_4)
-
-SCALARTYPE_4_EXPLICIT_CONVERSION_CONSTRUCTORS(unsigned int, uint_4, 
-    int_4, float_4, double_4, norm_4, unorm_4,
-    char_4, uchar_4, short_4, ushort_4, long_4, ulong_4, longlong_4, ulonglong_4)
-
-SCALARTYPE_4_EXPLICIT_CONVERSION_CONSTRUCTORS(float, float_4, 
-    int_4, uint_4, double_4, norm_4, unorm_4,
-    char_4, uchar_4, short_4, ushort_4, long_4, ulong_4, longlong_4, ulonglong_4)
-
-SCALARTYPE_4_EXPLICIT_CONVERSION_CONSTRUCTORS(double, double_4, 
-    int_4, uint_4, float_4, norm_4, unorm_4,
-    char_4, uchar_4, short_4, ushort_4, long_4, ulong_4, longlong_4, ulonglong_4)
-
-SCALARTYPE_4_EXPLICIT_CONVERSION_CONSTRUCTORS(norm, norm_4, 
-    int_4, uint_4, float_4, double_4, unorm_4, 
-    char_4, uchar_4, short_4, ushort_4, long_4, ulong_4, longlong_4, ulonglong_4)
-
-SCALARTYPE_4_EXPLICIT_CONVERSION_CONSTRUCTORS(unorm, unorm_4, 
-    int_4, uint_4, float_4, double_4, norm_4,
-    char_4, uchar_4, short_4, ushort_4, long_4, ulong_4, longlong_4, ulonglong_4)
-
-SCALARTYPE_4_EXPLICIT_CONVERSION_CONSTRUCTORS(char, char_4, 
-    int_4, uint_4, float_4, double_4, norm_4,
-    unorm_4, uchar_4, short_4, ushort_4, long_4, ulong_4, longlong_4, ulonglong_4)
-
-SCALARTYPE_4_EXPLICIT_CONVERSION_CONSTRUCTORS(unsigned char, uchar_4, 
-    int_4, uint_4, float_4, double_4, norm_4,
-    char_4, unorm_4, short_4, ushort_4, long_4, ulong_4, longlong_4, ulonglong_4)
-
-SCALARTYPE_4_EXPLICIT_CONVERSION_CONSTRUCTORS(short, short_4, 
-    int_4, uint_4, float_4, double_4, norm_4,
-    char_4, uchar_4, unorm_4, ushort_4, long_4, ulong_4, longlong_4, ulonglong_4)
-
-SCALARTYPE_4_EXPLICIT_CONVERSION_CONSTRUCTORS(unsigned short, ushort_4, 
-    int_4, uint_4, float_4, double_4, norm_4,
-    char_4, uchar_4, short_4, unorm_4, long_4, ulong_4, longlong_4, ulonglong_4)
-
-SCALARTYPE_4_EXPLICIT_CONVERSION_CONSTRUCTORS(long, long_4, 
-    int_4, uint_4, float_4, double_4, norm_4,
-    char_4, uchar_4, short_4, ushort_4, unorm_4, ulong_4, longlong_4, ulonglong_4)
-
-SCALARTYPE_4_EXPLICIT_CONVERSION_CONSTRUCTORS(unsigned long, ulong_4, 
-    int_4, uint_4, float_4, double_4, norm_4,
-    char_4, uchar_4, short_4, ushort_4, long_4, unorm_4, longlong_4, ulonglong_4)
-
-SCALARTYPE_4_EXPLICIT_CONVERSION_CONSTRUCTORS(long long int, longlong_4, 
-    int_4, uint_4, float_4, double_4, norm_4,
-    char_4, uchar_4, short_4, ushort_4, unorm_4, ulong_4, long_4, ulonglong_4)
-
-SCALARTYPE_4_EXPLICIT_CONVERSION_CONSTRUCTORS(unsigned long long int, ulonglong_4, 
-    int_4, uint_4, float_4, double_4, norm_4,
-    char_4, uchar_4, short_4, ushort_4, long_4, unorm_4, longlong_4, ulong_4)
-
-#undef SCALARTYPE_4_EXPLICIT_CONVERSION_CONSTRUCTORS
-
-#else // if !__HCC_AMP__
-
-#define SCALARTYPE_4_EXPLICIT_CONVERSION_CONSTRUCTORS(ST, ST_4, \
-ST_4_o1, ST_4_o2, ST_4_o3, ST_4_o4, ST_4_o5) \
-inline ST_4::ST_4(const ST_4_o1& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-  z = static_cast<ST>(other.get_z()); \
-  w = static_cast<ST>(other.get_w()); \
-} \
-\
-inline ST_4::ST_4(const ST_4_o2& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-  z = static_cast<ST>(other.get_z()); \
-  w = static_cast<ST>(other.get_w()); \
-} \
-\
-inline ST_4::ST_4(const ST_4_o3& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-  z = static_cast<ST>(other.get_z()); \
-  w = static_cast<ST>(other.get_w()); \
-} \
-\
-inline ST_4::ST_4(const ST_4_o4& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-  z = static_cast<ST>(other.get_z()); \
-  w = static_cast<ST>(other.get_w()); \
-} \
-\
-inline ST_4::ST_4(const ST_4_o5& other) __CPU_GPU__ \
-{ \
-  x = static_cast<ST>(other.get_x()); \
-  y = static_cast<ST>(other.get_y()); \
-  z = static_cast<ST>(other.get_z()); \
-  w = static_cast<ST>(other.get_w()); \
-}
-
-SCALARTYPE_4_EXPLICIT_CONVERSION_CONSTRUCTORS(int, int_4, 
-    uint_4, float_4, double_4, norm_4, unorm_4) 
-
-SCALARTYPE_4_EXPLICIT_CONVERSION_CONSTRUCTORS(unsigned int, uint_4, 
-    int_4, float_4, double_4, norm_4, unorm_4) 
-
-SCALARTYPE_4_EXPLICIT_CONVERSION_CONSTRUCTORS(float, float_4, 
-    int_4, uint_4, double_4, norm_4, unorm_4) 
-
-SCALARTYPE_4_EXPLICIT_CONVERSION_CONSTRUCTORS(double, double_4, 
-    int_4, uint_4, float_4, norm_4, unorm_4) 
-
-SCALARTYPE_4_EXPLICIT_CONVERSION_CONSTRUCTORS(norm, norm_4, 
-    int_4, uint_4, float_4, double_4, unorm_4) 
-
-SCALARTYPE_4_EXPLICIT_CONVERSION_CONSTRUCTORS(unorm, unorm_4, 
-    int_4, uint_4, float_4, double_4, norm_4) 
-
-#undef SCALARTYPE_4_EXPLICIT_CONVERSION_CONSTRUCTORS
-
-#endif
-
-//   Operators between Two References (10.8.1 Synopsis)
-
-#if !__HCC_AMP__
-
-#define SCALARTYPE_1_OPERATOR(ST_1) \
-inline ST_1 operator+(const ST_1& lhs, const ST_1& rhs) __CPU_GPU__ \
-{ \
-  return ST_1(lhs.get_x() + rhs.get_x()); \
-} \
-\
-inline ST_1 operator-(const ST_1& lhs, const ST_1& rhs) __CPU_GPU__ \
-{ \
-  return ST_1(lhs.get_x() - rhs.get_x()); \
-} \
-\
-inline ST_1 operator*(const ST_1& lhs, const ST_1& rhs) __CPU_GPU__ \
-{ \
-  return ST_1(lhs.get_x() * rhs.get_x()); \
-} \
-\
-inline ST_1 operator/(const ST_1& lhs, const ST_1& rhs) __CPU_GPU__ \
-{ \
-  return ST_1(lhs.get_x() / rhs.get_x()); \
-} \
-\
-inline bool operator==(const ST_1& lhs, const ST_1& rhs) __CPU_GPU__ \
-{ \
-  return (lhs.get_x() == rhs.get_x()); \
-} \
-\
-inline bool operator!=(const ST_1& lhs, const ST_1& rhs) __CPU_GPU__ \
-{ \
-  return (lhs.get_x() != rhs.get_x()); \
-}
-
-SCALARTYPE_1_OPERATOR(int_1)
-
-SCALARTYPE_1_OPERATOR(uint_1)
-
-SCALARTYPE_1_OPERATOR(float_1)
-
-SCALARTYPE_1_OPERATOR(double_1)
-
-SCALARTYPE_1_OPERATOR(char_1)
-
-SCALARTYPE_1_OPERATOR(uchar_1)
-
-SCALARTYPE_1_OPERATOR(short_1)
-
-SCALARTYPE_1_OPERATOR(ushort_1)
-
-SCALARTYPE_1_OPERATOR(long_1)
-
-SCALARTYPE_1_OPERATOR(ulong_1)
-
-SCALARTYPE_1_OPERATOR(longlong_1)
-
-SCALARTYPE_1_OPERATOR(ulonglong_1)
-
-#undef SCALARTYPE_1_OPERATOR
-
-inline int_1 operator%(const int_1& lhs, const int_1& rhs) __CPU_GPU__
-{
-  return int_1(lhs.get_x() % rhs.get_x());
-}
-
-inline int_1 operator^(const int_1& lhs, const int_1& rhs) __CPU_GPU__
-{
-  return int_1(lhs.get_x() ^ rhs.get_x());
-}
-
-inline int_1 operator|(const int_1& lhs, const int_1& rhs) __CPU_GPU__
-{
-  return int_1(lhs.get_x() | rhs.get_x());
-}
-
-inline int_1 operator&(const int_1& lhs, const int_1& rhs) __CPU_GPU__
-{
-  return int_1(lhs.get_x() & rhs.get_x());
-}
-
-inline int_1 operator<<(const int_1& lhs, const int_1& rhs) __CPU_GPU__
-{
-  return int_1(lhs.get_x() << rhs.get_x());
-}
-
-inline int_1 operator>>(const int_1& lhs, const int_1& rhs) __CPU_GPU__
-{
-  return int_1(lhs.get_x() >> rhs.get_x());
-}
-
-inline uint_1 operator%(const uint_1& lhs, const uint_1& rhs) __CPU_GPU__
-{
-  return uint_1(lhs.get_x() % rhs.get_x());
-}
-
-inline uint_1 operator^(const uint_1& lhs, const uint_1& rhs) __CPU_GPU__
-{
-  return uint_1(lhs.get_x() ^ rhs.get_x());
-}
-
-inline uint_1 operator|(const uint_1& lhs, const uint_1& rhs) __CPU_GPU__
-{
-  return uint_1(lhs.get_x() | rhs.get_x());
-}
-
-inline uint_1 operator&(const uint_1& lhs, const uint_1& rhs) __CPU_GPU__
-{
-  return uint_1(lhs.get_x() & rhs.get_x());
-}
-
-inline uint_1 operator<<(const uint_1& lhs, const uint_1& rhs) __CPU_GPU__
-{
-  return uint_1(lhs.get_x() << rhs.get_x());
-}
-
-inline uint_1 operator>>(const uint_1& lhs, const uint_1& rhs) __CPU_GPU__
-{
-  return uint_1(lhs.get_x() >> rhs.get_x());
-}
-
-#endif // if !__HCC_AMP__
-
-#define SCALARTYPE_2_OPERATOR(ST_2) \
-inline ST_2 operator+(const ST_2& lhs, const ST_2& rhs) __CPU_GPU__ \
-{ \
-  return ST_2(lhs.get_x() + rhs.get_x(), lhs.get_y() + rhs.get_y()); \
-} \
-\
-inline ST_2 operator-(const ST_2& lhs, const ST_2& rhs) __CPU_GPU__ \
-{ \
-  return ST_2(lhs.get_x() - rhs.get_x(), lhs.get_y() - rhs.get_y()); \
-} \
-\
-inline ST_2 operator*(const ST_2& lhs, const ST_2& rhs) __CPU_GPU__ \
-{ \
-  return ST_2(lhs.get_x() * rhs.get_x(), lhs.get_y() * rhs.get_y()); \
-} \
-\
-inline ST_2 operator/(const ST_2& lhs, const ST_2& rhs) __CPU_GPU__ \
-{ \
-  return ST_2(lhs.get_x() / rhs.get_x(), lhs.get_y() / rhs.get_y()); \
-} \
-\
-inline bool operator==(const ST_2& lhs, const ST_2& rhs) __CPU_GPU__ \
-{ \
-  return (lhs.get_x() == rhs.get_x()) && (lhs.get_y() == rhs.get_y()); \
-} \
-\
-inline bool operator!=(const ST_2& lhs, const ST_2& rhs) __CPU_GPU__ \
-{ \
-  return (lhs.get_x() != rhs.get_x()) || (lhs.get_y() != rhs.get_y()); \
-}
-
-SCALARTYPE_2_OPERATOR(int_2)
-
-SCALARTYPE_2_OPERATOR(uint_2)
-
-SCALARTYPE_2_OPERATOR(float_2)
-
-SCALARTYPE_2_OPERATOR(double_2)
-
-SCALARTYPE_2_OPERATOR(norm_2)
-
-SCALARTYPE_2_OPERATOR(unorm_2)
-
-#if !__HCC_AMP__
-
-SCALARTYPE_2_OPERATOR(char_2)
-
-SCALARTYPE_2_OPERATOR(uchar_2)
-
-SCALARTYPE_2_OPERATOR(short_2)
-
-SCALARTYPE_2_OPERATOR(ushort_2)
-
-SCALARTYPE_2_OPERATOR(long_2)
-
-SCALARTYPE_2_OPERATOR(ulong_2)
-
-SCALARTYPE_2_OPERATOR(longlong_2)
-
-SCALARTYPE_2_OPERATOR(ulonglong_2)
-
-#endif // if !__HCC_AMP__
-
-#undef SCALARTYPE_2_OPERATOR
-
-inline int_2 operator%(const int_2& lhs, const int_2& rhs) __CPU_GPU__
-{
-  return int_2(lhs.get_x() % rhs.get_x(), lhs.get_y() % rhs.get_y());
-}
-
-inline int_2 operator^(const int_2& lhs, const int_2& rhs) __CPU_GPU__
-{
-  return int_2(lhs.get_x() ^ rhs.get_x(), lhs.get_y() ^ rhs.get_y());
-}
-
-inline int_2 operator|(const int_2& lhs, const int_2& rhs) __CPU_GPU__
-{
-  return int_2(lhs.get_x() | rhs.get_x(), lhs.get_y() | rhs.get_y());
-}
-
-inline int_2 operator&(const int_2& lhs, const int_2& rhs) __CPU_GPU__
-{
-  return int_2(lhs.get_x() & rhs.get_x(), lhs.get_y() & rhs.get_y());
-}
-
-inline int_2 operator<<(const int_2& lhs, const int_2& rhs) __CPU_GPU__
-{
-  return int_2(lhs.get_x() << rhs.get_x(), lhs.get_y() << rhs.get_y());
-}
-
-inline int_2 operator>>(const int_2& lhs, const int_2& rhs) __CPU_GPU__
-{
-  return int_2(lhs.get_x() >> rhs.get_x(), lhs.get_y() >> rhs.get_y());
-}
-
-inline uint_2 operator%(const uint_2& lhs, const uint_2& rhs) __CPU_GPU__
-{
-  return uint_2(lhs.get_x() % rhs.get_x(), lhs.get_y() % rhs.get_y());
-}
-
-inline uint_2 operator^(const uint_2& lhs, const uint_2& rhs) __CPU_GPU__
-{
-  return uint_2(lhs.get_x() ^ rhs.get_x(), lhs.get_y() ^ rhs.get_y());
-}
-
-inline uint_2 operator|(const uint_2& lhs, const uint_2& rhs) __CPU_GPU__
-{
-  return uint_2(lhs.get_x() | rhs.get_x(), lhs.get_y() | rhs.get_y());
-}
-
-inline uint_2 operator&(const uint_2& lhs, const uint_2& rhs) __CPU_GPU__
-{
-  return uint_2(lhs.get_x() & rhs.get_x(), lhs.get_y() & rhs.get_y());
-}
-
-inline uint_2 operator<<(const uint_2& lhs, const uint_2& rhs) __CPU_GPU__
-{
-  return uint_2(lhs.get_x() << rhs.get_x(), lhs.get_y() << rhs.get_y());
-}
-
-inline uint_2 operator>>(const uint_2& lhs, const uint_2& rhs) __CPU_GPU__
-{
-  return uint_2(lhs.get_x() >> rhs.get_x(), lhs.get_y() >> rhs.get_y());
-}
-
-#define SCALARTYPE_3_OPERATOR(ST_3) \
-inline ST_3 operator+(const ST_3& lhs, const ST_3& rhs) __CPU_GPU__ \
-{ \
-  return ST_3(lhs.get_x() + rhs.get_x(), lhs.get_y() + rhs.get_y(), \
-               lhs.get_z() + rhs.get_z()); \
-} \
-\
-inline ST_3 operator-(const ST_3& lhs, const ST_3& rhs) __CPU_GPU__ \
-{ \
-  return ST_3(lhs.get_x() - rhs.get_x(), lhs.get_y() - rhs.get_y(), \
-               lhs.get_z() - rhs.get_z()); \
-} \
-\
-inline ST_3 operator*(const ST_3& lhs, const ST_3& rhs) __CPU_GPU__ \
-{ \
-  return ST_3(lhs.get_x() * rhs.get_x(), lhs.get_y() * rhs.get_y(), \
-               lhs.get_z() * rhs.get_z()); \
-} \
-\
-inline ST_3 operator/(const ST_3& lhs, const ST_3& rhs) __CPU_GPU__ \
-{ \
-  return ST_3(lhs.get_x() / rhs.get_x(), lhs.get_y() / rhs.get_y(), \
-               lhs.get_z() / rhs.get_z()); \
-} \
-\
-inline bool operator==(const ST_3& lhs, const ST_3& rhs) __CPU_GPU__ \
-{ \
-  return (lhs.get_x() == rhs.get_x()) && (lhs.get_y() == rhs.get_y()) \
-           && (lhs.get_z() == rhs.get_z()); \
-} \
-\
-inline bool operator!=(const ST_3& lhs, const ST_3& rhs) __CPU_GPU__ \
-{ \
-  return (lhs.get_x() != rhs.get_x()) || (lhs.get_y() != rhs.get_y()) \
-           || (lhs.get_z() != rhs.get_z()); \
-}
-
-SCALARTYPE_3_OPERATOR(int_3)
-
-SCALARTYPE_3_OPERATOR(uint_3)
-
-SCALARTYPE_3_OPERATOR(float_3)
-
-SCALARTYPE_3_OPERATOR(double_3)
-
-SCALARTYPE_3_OPERATOR(norm_3)
-
-SCALARTYPE_3_OPERATOR(unorm_3)
-
-#if !__HCC_AMP__
-
-SCALARTYPE_3_OPERATOR(char_3)
-
-SCALARTYPE_3_OPERATOR(uchar_3)
-
-SCALARTYPE_3_OPERATOR(short_3)
-
-SCALARTYPE_3_OPERATOR(ushort_3)
-
-SCALARTYPE_3_OPERATOR(long_3)
-
-SCALARTYPE_3_OPERATOR(ulong_3)
-
-SCALARTYPE_3_OPERATOR(longlong_3)
-
-SCALARTYPE_3_OPERATOR(ulonglong_3)
-
-#endif // if !__HCC_AMP__
-
-#undef SCALARTYPE_3_OPERATOR
-
-inline int_3 operator%(const int_3& lhs, const int_3& rhs) __CPU_GPU__
-{
-  return int_3(lhs.get_x() % rhs.get_x(), lhs.get_y() % rhs.get_y(),
-                lhs.get_z() % rhs.get_z());
-}
-
-inline int_3 operator^(const int_3& lhs, const int_3& rhs) __CPU_GPU__
-{
-  return int_3(lhs.get_x() ^ rhs.get_x(), lhs.get_y() ^ rhs.get_y(),
-                lhs.get_z() ^ rhs.get_z());
-}
-
-inline int_3 operator|(const int_3& lhs, const int_3& rhs) __CPU_GPU__
-{
-  return int_3(lhs.get_x() | rhs.get_x(), lhs.get_y() | rhs.get_y(),
-                lhs.get_z() | rhs.get_z());
-}
-
-inline int_3 operator&(const int_3& lhs, const int_3& rhs) __CPU_GPU__
-{
-  return int_3(lhs.get_x() & rhs.get_x(), lhs.get_y() & rhs.get_y(),
-                lhs.get_z() & rhs.get_z());
-}
-
-inline int_3 operator<<(const int_3& lhs, const int_3& rhs) __CPU_GPU__
-{
-  return int_3(lhs.get_x() << rhs.get_x(), lhs.get_y() << rhs.get_y(),
-                lhs.get_z() << rhs.get_z());
-}
-
-inline int_3 operator>>(const int_3& lhs, const int_3& rhs) __CPU_GPU__
-{
-  return int_3(lhs.get_x() >> rhs.get_x(), lhs.get_y() >> rhs.get_y(),
-                lhs.get_z() >> rhs.get_z());
-}
-
-inline uint_3 operator%(const uint_3& lhs, const uint_3& rhs) __CPU_GPU__
-{
-  return uint_3(lhs.get_x() % rhs.get_x(), lhs.get_y() % rhs.get_y(),
-                 lhs.get_z() % rhs.get_z());
-}
-
-inline uint_3 operator^(const uint_3& lhs, const uint_3& rhs) __CPU_GPU__
-{
-  return uint_3(lhs.get_x() ^ rhs.get_x(), lhs.get_y() ^ rhs.get_y(),
-                 lhs.get_z() ^ rhs.get_z());
-}
-
-inline uint_3 operator|(const uint_3& lhs, const uint_3& rhs) __CPU_GPU__
-{
-  return uint_3(lhs.get_x() | rhs.get_x(), lhs.get_y() | rhs.get_y(),
-                 lhs.get_z() | rhs.get_z());
-}
-
-inline uint_3 operator&(const uint_3& lhs, const uint_3& rhs) __CPU_GPU__
-{
-  return uint_3(lhs.get_x() & rhs.get_x(), lhs.get_y() & rhs.get_y(),
-                 lhs.get_z() & rhs.get_z());
-}
-
-inline uint_3 operator<<(const uint_3& lhs, const uint_3& rhs) __CPU_GPU__
-{
-  return uint_3(lhs.get_x() << rhs.get_x(), lhs.get_y() << rhs.get_y(),
-                 lhs.get_z() << rhs.get_z());
-}
-
-inline uint_3 operator>>(const uint_3& lhs, const uint_3& rhs) __CPU_GPU__
-{
-  return uint_3(lhs.get_x() >> rhs.get_x(), lhs.get_y() >> rhs.get_y(),
-                 lhs.get_z() >> rhs.get_z());
-}
-
-#define SCALARTYPE_4_OPERATOR(ST_4) \
-inline ST_4 operator+(const ST_4& lhs, const ST_4& rhs) __CPU_GPU__ \
-{ \
-  return ST_4(lhs.get_x() + rhs.get_x(), lhs.get_y() + rhs.get_y(), \
-               lhs.get_z() + rhs.get_z(), lhs.get_w() + rhs.get_w()); \
-} \
-\
-inline ST_4 operator-(const ST_4& lhs, const ST_4& rhs) __CPU_GPU__ \
-{ \
-  return ST_4(lhs.get_x() - rhs.get_x(), lhs.get_y() - rhs.get_y(), \
-               lhs.get_z() - rhs.get_z(), lhs.get_w() - rhs.get_w()); \
-} \
-\
-inline ST_4 operator*(const ST_4& lhs, const ST_4& rhs) __CPU_GPU__ \
-{ \
-  return ST_4(lhs.get_x() * rhs.get_x(), lhs.get_y() * rhs.get_y(), \
-               lhs.get_z() * rhs.get_z(), lhs.get_w() * rhs.get_w()); \
-} \
-\
-inline ST_4 operator/(const ST_4& lhs, const ST_4& rhs) __CPU_GPU__ \
-{ \
-  return ST_4(lhs.get_x() / rhs.get_x(), lhs.get_y() / rhs.get_y(), \
-               lhs.get_z() / rhs.get_z(), lhs.get_w() / rhs.get_w()); \
-} \
-\
-inline bool operator==(const ST_4& lhs, const ST_4& rhs) __CPU_GPU__ \
-{ \
-  return (lhs.get_x() == rhs.get_x()) && (lhs.get_y() == rhs.get_y()) \
-           && (lhs.get_z() == rhs.get_z()) && (lhs.get_w() == rhs.get_w()); \
-} \
-\
-inline bool operator!=(const ST_4& lhs, const ST_4& rhs) __CPU_GPU__ \
-{ \
-  return (lhs.get_x() != rhs.get_x()) || (lhs.get_y() != rhs.get_y()) \
-           || (lhs.get_z() != rhs.get_z()) || (lhs.get_w() != rhs.get_w()); \
-}
-
-SCALARTYPE_4_OPERATOR(int_4)
-
-SCALARTYPE_4_OPERATOR(uint_4)
-
-SCALARTYPE_4_OPERATOR(float_4)
-
-SCALARTYPE_4_OPERATOR(double_4)
-
-SCALARTYPE_4_OPERATOR(norm_4)
-
-SCALARTYPE_4_OPERATOR(unorm_4)
-
-#if !__HCC_AMP__
-
-SCALARTYPE_4_OPERATOR(char_4)
-
-SCALARTYPE_4_OPERATOR(uchar_4)
-
-SCALARTYPE_4_OPERATOR(short_4)
-
-SCALARTYPE_4_OPERATOR(ushort_4)
-
-SCALARTYPE_4_OPERATOR(long_4)
-
-SCALARTYPE_4_OPERATOR(ulong_4)
-
-SCALARTYPE_4_OPERATOR(longlong_4)
-
-SCALARTYPE_4_OPERATOR(ulonglong_4)
-
-#endif // if !__HCC_AMP__
-
-#undef SCALARTYPE_4_OPERATOR
-
-inline int_4 operator%(const int_4& lhs, const int_4& rhs) __CPU_GPU__
-{
-  return int_4(lhs.get_x() % rhs.get_x(), lhs.get_y() % rhs.get_y(),
-                lhs.get_z() % rhs.get_z(), lhs.get_w() % rhs.get_w());
-}
-
-inline int_4 operator^(const int_4& lhs, const int_4& rhs) __CPU_GPU__
-{
-  return int_4(lhs.get_x() ^ rhs.get_x(), lhs.get_y() ^ rhs.get_y(),
-                lhs.get_z() ^ rhs.get_z(), lhs.get_w() ^ rhs.get_w());
-}
-
-inline int_4 operator|(const int_4& lhs, const int_4& rhs) __CPU_GPU__
-{
-  return int_4(lhs.get_x() | rhs.get_x(), lhs.get_y() | rhs.get_y(),
-                lhs.get_z() | rhs.get_z(), lhs.get_w() | rhs.get_w());
-}
-
-inline int_4 operator&(const int_4& lhs, const int_4& rhs) __CPU_GPU__
-{
-  return int_4(lhs.get_x() & rhs.get_x(), lhs.get_y() & rhs.get_y(),
-                lhs.get_z() & rhs.get_z(), lhs.get_w() & rhs.get_w());
-}
-
-inline int_4 operator<<(const int_4& lhs, const int_4& rhs) __CPU_GPU__
-{
-  return int_4(lhs.get_x() << rhs.get_x(), lhs.get_y() << rhs.get_y(),
-                lhs.get_z() << rhs.get_z(), lhs.get_w() << rhs.get_w());
-}
-
-inline int_4 operator>>(const int_4& lhs, const int_4& rhs) __CPU_GPU__
-{
-  return int_4(lhs.get_x() >> rhs.get_x(), lhs.get_y() >> rhs.get_y(),
-                lhs.get_z() >> rhs.get_z(), lhs.get_w() >> rhs.get_w());
-}
-
-inline uint_4 operator%(const uint_4& lhs, const uint_4& rhs) __CPU_GPU__
-{
-  return uint_4(lhs.get_x() % rhs.get_x(), lhs.get_y() % rhs.get_y(),
-                 lhs.get_z() % rhs.get_z(), lhs.get_w() % rhs.get_w());
-}
-
-inline uint_4 operator^(const uint_4& lhs, const uint_4& rhs) __CPU_GPU__
-{
-  return uint_4(lhs.get_x() ^ rhs.get_x(), lhs.get_y() ^ rhs.get_y(),
-                 lhs.get_z() ^ rhs.get_z(), lhs.get_w() ^ rhs.get_w());
-}
-
-inline uint_4 operator|(const uint_4& lhs, const uint_4& rhs) __CPU_GPU__
-{
-  return uint_4(lhs.get_x() | rhs.get_x(), lhs.get_y() | rhs.get_y(),
-                 lhs.get_z() | rhs.get_z(), lhs.get_w() | rhs.get_w());
-}
-
-inline uint_4 operator&(const uint_4& lhs, const uint_4& rhs) __CPU_GPU__
-{
-  return uint_4(lhs.get_x() & rhs.get_x(), lhs.get_y() & rhs.get_y(),
-                 lhs.get_z() & rhs.get_z(), lhs.get_w() & rhs.get_w());
-}
-
-inline uint_4 operator<<(const uint_4& lhs, const uint_4& rhs) __CPU_GPU__
-{
-  return uint_4(lhs.get_x() << rhs.get_x(), lhs.get_y() << rhs.get_y(),
-                 lhs.get_z() << rhs.get_z(), lhs.get_w() << rhs.get_w());
-}
-
-inline uint_4 operator>>(const uint_4& lhs, const uint_4& rhs) __CPU_GPU__
-{
-  return uint_4(lhs.get_x() >> rhs.get_x(), lhs.get_y() >> rhs.get_y(),
-                 lhs.get_z() >> rhs.get_z(), lhs.get_w() >> rhs.get_w());
-}
-
-// C++ AMP Specification 10.9 short_vector
-template<typename scalar_type, int size> struct short_vector
-{
-  short_vector()
-  {
-    // FIXME: Bug of Clang, passed under ICC 13 and VC++ 2012
-    // static_assert(false, "short_vector is not supported for this scalar type (T) and length (N)");
-  }
-};
-
-#define SHORT_VECTOR(ST, S, ST_S) \
-template<> \
-struct short_vector<ST, S> \
-{ \
-  typedef ST_S type; \
-};
-
-#if !__HCC_AMP__
-SHORT_VECTOR(unsigned int, 1, uint_1)
-#else
-SHORT_VECTOR(unsigned int, 1, unsigned int)
-#endif
-
-SHORT_VECTOR(unsigned int, 2, uint_2)
-
-SHORT_VECTOR(unsigned int, 3, uint_3)
-
-SHORT_VECTOR(unsigned int, 4, uint_4)
-
-#if !__HCC_AMP__
-SHORT_VECTOR(int, 1, int_1)
-#else
-SHORT_VECTOR(int, 1, int)
-#endif
-
-SHORT_VECTOR(int, 2, int_2)
-
-SHORT_VECTOR(int, 3, int_3)
-
-SHORT_VECTOR(int, 4, int_4)
-
-#if !__HCC_AMP__
-SHORT_VECTOR(float, 1, float_1)
-#else
-SHORT_VECTOR(float, 1, float)
-#endif
-
-SHORT_VECTOR(float, 2, float_2)
-
-SHORT_VECTOR(float, 3, float_3)
-
-SHORT_VECTOR(float, 4, float_4)
-
-SHORT_VECTOR(unorm, 1, unorm)
-
-SHORT_VECTOR(unorm, 2, unorm_2)
-
-SHORT_VECTOR(unorm, 3, unorm_3)
-
-SHORT_VECTOR(unorm, 4, unorm_4)
-
-SHORT_VECTOR(norm, 1, norm)
-
-SHORT_VECTOR(norm, 2, norm_2)
-
-SHORT_VECTOR(norm, 3, norm_3)
-
-SHORT_VECTOR(norm, 4, norm_4)
-
-#if !__HCC_AMP__
-SHORT_VECTOR(double, 1, double_1)
-#else
-SHORT_VECTOR(double, 1, double)
-#endif
-
-SHORT_VECTOR(double, 2, double_2)
-
-SHORT_VECTOR(double, 3, double_3)
-
-SHORT_VECTOR(double, 4, double_4)
-
-#if !__HCC_AMP__
-
-SHORT_VECTOR(char, 1, char_1)
-
-SHORT_VECTOR(char, 2, char_2)
-
-SHORT_VECTOR(char, 3, char_3)
-
-SHORT_VECTOR(char, 4, char_4)
-
-SHORT_VECTOR(unsigned char, 1, uchar_1)
-
-SHORT_VECTOR(unsigned char, 2, uchar_2)
-
-SHORT_VECTOR(unsigned char, 3, uchar_3)
-
-SHORT_VECTOR(unsigned char, 4, uchar_4)
-
-SHORT_VECTOR(short, 1, short_1)
-
-SHORT_VECTOR(short, 2, short_2)
-
-SHORT_VECTOR(short, 3, short_3)
-
-SHORT_VECTOR(short, 4, short_4)
-
-SHORT_VECTOR(unsigned short, 1, ushort_1)
-
-SHORT_VECTOR(unsigned short, 2, ushort_2)
-
-SHORT_VECTOR(unsigned short, 3, ushort_3)
-
-SHORT_VECTOR(unsigned short, 4, ushort_4)
-
-SHORT_VECTOR(long, 1, long_1)
-
-SHORT_VECTOR(long, 2, long_2)
-
-SHORT_VECTOR(long, 3, long_3)
-
-SHORT_VECTOR(long, 4, long_4)
-
-SHORT_VECTOR(unsigned long, 1, ulong_1)
-
-SHORT_VECTOR(unsigned long, 2, ulong_2)
-
-SHORT_VECTOR(unsigned long, 3, ulong_3)
-
-SHORT_VECTOR(unsigned long, 4, ulong_4)
-
-SHORT_VECTOR(long long int, 1, longlong_1)
-
-SHORT_VECTOR(long long int, 2, longlong_2)
-
-SHORT_VECTOR(long long int, 3, longlong_3)
-
-SHORT_VECTOR(long long int, 4, longlong_4)
-
-SHORT_VECTOR(unsigned long long int, 1, ulonglong_1)
-
-SHORT_VECTOR(unsigned long long int, 2, ulonglong_2)
-
-SHORT_VECTOR(unsigned long long int, 3, ulonglong_3)
-
-SHORT_VECTOR(unsigned long long int, 4, ulonglong_4)
-
-#endif // if !__HCC_AMP__
-
-#undef SHORT_VECTOR
-
-// C++ AMP Specification 10.10 short_vector_traits
-template<typename type> struct short_vector_traits
-{
-  short_vector_traits()
-  {
-    // FIXME: Bug of Clang, passed under ICC 13 and VC++ 2012
-    // static_assert(false, "short_vector_traits is not supported for this type (type)");
-  }
-};
-
-#define SHORT_VECTOR_TRAITS(ST, S, ST_S) \
-template<> \
-struct short_vector_traits<ST_S> \
-{ \
-  typedef ST value_type; \
-  static int const size = S; \
-};
-
-#if !__HCC_AMP__
-SHORT_VECTOR_TRAITS(unsigned int, 1, uint_1)
-#else
-SHORT_VECTOR_TRAITS(unsigned int, 1, unsigned int)
-#endif
-
-SHORT_VECTOR_TRAITS(unsigned int, 2, uint_2)
-
-SHORT_VECTOR_TRAITS(unsigned int, 3, uint_3)
-
-SHORT_VECTOR_TRAITS(unsigned int, 4, uint_4)
-
-#if !__HCC_AMP__
-SHORT_VECTOR_TRAITS(int, 1, int_1)
-#else
-SHORT_VECTOR_TRAITS(int, 1, int)
-#endif
-
-SHORT_VECTOR_TRAITS(int, 2, int_2)
-
-SHORT_VECTOR_TRAITS(int, 3, int_3)
-
-SHORT_VECTOR_TRAITS(int, 4, int_4)
-
-#if !__HCC_AMP__
-SHORT_VECTOR_TRAITS(float, 1, float_1)
-#else
-SHORT_VECTOR_TRAITS(float, 1, float)
-#endif
-
-SHORT_VECTOR_TRAITS(float, 2, float_2)
-
-SHORT_VECTOR_TRAITS(float, 3, float_3)
-
-SHORT_VECTOR_TRAITS(float, 4, float_4)
-
-SHORT_VECTOR_TRAITS(unorm, 1, unorm)
-
-SHORT_VECTOR_TRAITS(unorm, 2, unorm_2)
-
-SHORT_VECTOR_TRAITS(unorm, 3, unorm_3)
-
-SHORT_VECTOR_TRAITS(unorm, 4, unorm_4)
-
-SHORT_VECTOR_TRAITS(norm, 1, norm)
-
-SHORT_VECTOR_TRAITS(norm, 2, norm_2)
-
-SHORT_VECTOR_TRAITS(norm, 3, norm_3)
-
-SHORT_VECTOR_TRAITS(norm, 4, norm_4)
-
-#if !__HCC_AMP__
-SHORT_VECTOR_TRAITS(double, 1, double_1)
-#else
-SHORT_VECTOR_TRAITS(double, 1, double)
-#endif
-
-SHORT_VECTOR_TRAITS(double, 2, double_2)
-
-SHORT_VECTOR_TRAITS(double, 3, double_3)
-
-SHORT_VECTOR_TRAITS(double, 4, double_4)
-
-#if !__HCC_AMP__
-
-SHORT_VECTOR_TRAITS(char, 1, char_1)
-
-SHORT_VECTOR_TRAITS(char, 2, char_2)
-
-SHORT_VECTOR_TRAITS(char, 3, char_3)
-
-SHORT_VECTOR_TRAITS(char, 4, char_4)
-
-SHORT_VECTOR_TRAITS(unsigned char, 1, uchar_1)
-
-SHORT_VECTOR_TRAITS(unsigned char, 2, uchar_2)
-
-SHORT_VECTOR_TRAITS(unsigned char, 3, uchar_3)
-
-SHORT_VECTOR_TRAITS(unsigned char, 4, uchar_4)
-
-SHORT_VECTOR_TRAITS(short, 1, short_1)
-
-SHORT_VECTOR_TRAITS(short, 2, short_2)
-
-SHORT_VECTOR_TRAITS(short, 3, short_3)
-
-SHORT_VECTOR_TRAITS(short, 4, short_4)
-
-SHORT_VECTOR_TRAITS(unsigned short, 1, ushort_1)
-
-SHORT_VECTOR_TRAITS(unsigned short, 2, ushort_2)
-
-SHORT_VECTOR_TRAITS(unsigned short, 3, ushort_3)
-
-SHORT_VECTOR_TRAITS(unsigned short, 4, ushort_4)
-
-SHORT_VECTOR_TRAITS(long, 1, long_1)
-
-SHORT_VECTOR_TRAITS(long, 2, long_2)
-
-SHORT_VECTOR_TRAITS(long, 3, long_3)
-
-SHORT_VECTOR_TRAITS(long, 4, long_4)
-
-SHORT_VECTOR_TRAITS(unsigned long, 1, ulong_1)
-
-SHORT_VECTOR_TRAITS(unsigned long, 2, ulong_2)
-
-SHORT_VECTOR_TRAITS(unsigned long, 3, ulong_3)
-
-SHORT_VECTOR_TRAITS(unsigned long, 4, ulong_4)
-
-SHORT_VECTOR_TRAITS(long long int, 1, longlong_1)
-
-SHORT_VECTOR_TRAITS(long long int, 2, longlong_2)
-
-SHORT_VECTOR_TRAITS(long long int, 3, longlong_3)
-
-SHORT_VECTOR_TRAITS(long long int, 4, longlong_4)
-
-SHORT_VECTOR_TRAITS(unsigned long long int, 1, ulonglong_1)
-
-SHORT_VECTOR_TRAITS(unsigned long long int, 2, ulonglong_2)
-
-SHORT_VECTOR_TRAITS(unsigned long long int, 3, ulonglong_3)
-
-SHORT_VECTOR_TRAITS(unsigned long long int, 4, ulonglong_4)
-
-#endif // if !__HCC_AMP__
-
-#undef SHORT_VECTOR_TRAITS
-
-#endif // _KALMAR_SHORT_VECTORS_H
diff --git a/include/pinned_vector.hpp b/include/pinned_vector.hpp
deleted file mode 100644
index 65e94bc1fc9..00000000000
--- a/include/pinned_vector.hpp
+++ /dev/null
@@ -1,52 +0,0 @@
-#pragma once
-
-#ifndef _PINNED_VECTOR_H
-#define _PINNED_VECTOR_H
-
-#include <new>
-#include "hc.hpp"
-#include "hc_am.hpp"
-
-namespace hc
-{
-
-// minimal allocator that uses am_alloc to allocate pinned memory on the host,
-// with comparison functions used by the C++ standard library
-  
-template <class T>
-struct am_allocator {
-  typedef T value_type;
-
-  am_allocator() = default;
-
-  template <class U> am_allocator(const am_allocator<U>&) {}
-
-  T* allocate(std::size_t n) {
-    hc::accelerator acc;
-    auto p = static_cast<T*>(hc::am_alloc(n*sizeof(T), acc, amHostPinned));
-    if(p == nullptr){ throw std::bad_alloc(); }
-    return p;
-  }
-
-  void deallocate(T* p, std::size_t) {
-    // am_free returns an am_status_t; we can't return that, since
-    // allocate is a void function, and we can't throw an exception either,
-    // since deallocate is used in destructors. Hmmm.
-    hc::am_free(p);
-  }
-};
-
-template <class T, class U>
-bool operator==(const am_allocator<T>&, const am_allocator<U>&) { return true; }
-
-template <class T, class U>
-bool operator!=(const am_allocator<T>&, const am_allocator<U>&) { return false; }
-
-
-// convenience alias 
-template<typename T>
-using pinned_vector = std::vector<T, am_allocator<T>>;
-
-} // namespace hc
-
-#endif // _PINNED_VECTOR_H
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 98c1581d261..a4c56359ff8 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -5,7 +5,8 @@ if (HCC_RUNTIME_DEBUG)
   add_compile_options(-g -O0)
 endif (HCC_RUNTIME_DEBUG)
 
-include_directories(${CMAKE_CURRENT_BINARY_DIR})
+include_directories(
+  ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/third_party)
 
 include(GNUInstallDirs)
 set( LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} )
@@ -16,7 +17,6 @@ set( CONFIG_PACKAGE_INSTALL_DIR ${LIB_INSTALL_DIR}/cmake/hcc )
 # C++AMP runtime (mcwamp)
 ####################
 add_mcwamp_shared_library(mcwamp mcwamp.cpp)
-add_mcwamp_library(mcwamp_atomic mcwamp_atomic.cpp)
 
 # Library interface to use runtime
 add_library(hccrt INTERFACE)
@@ -75,12 +75,11 @@ endif (USE_CODEXL_ACTIVITY_LOGGER EQUAL 1)
 # add subdirectories
 ####################
 add_subdirectory(hsa)
-add_subdirectory(cpu)
 
 ####################
 # install targets
 ####################
-install(TARGETS mcwamp mcwamp_atomic hccrt hccshared
+install(TARGETS mcwamp hccrt hccshared
     EXPORT hcc-targets
     RUNTIME DESTINATION bin
     LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
diff --git a/lib/clamp-device.in b/lib/clamp-device.in
index 80342f118a1..9073e00b893 100755
--- a/lib/clamp-device.in
+++ b/lib/clamp-device.in
@@ -187,10 +187,12 @@ fi
 # Invoke HCC-specific opt passes
 # Optimization notes:
 #  -disable-simplify-libcalls:  prevents transforming loops into library calls such as memset, memcopy on GPU
-$OPT -mtriple amdgcn--amdhsa-amdgiz -mcpu=$AMDGPU_TARGET \
-   -load $LIB/LLVMSelectAcceleratorCode@CMAKE_SHARED_LIBRARY_SUFFIX@ \
+$OPT -mtriple amdgcn-amd-amdhsa -mcpu=$AMDGPU_TARGET \
   -load $LIB/LLVMPromotePointerKernArgsToGlobal@CMAKE_SHARED_LIBRARY_SUFFIX@ \
-  -select-accelerator-code -promote-pointer-kernargs-to-global \
+  -load $LIB/LLVMUndefineGlobalsInAcceleratorCode@CMAKE_SHARED_LIBRARY_SUFFIX@ \
+  -load $LIB/LLVMSelectAcceleratorCode@CMAKE_SHARED_LIBRARY_SUFFIX@ \
+  -select-accelerator-code -undefine-globals-in-accelerator-code \
+  -promote-pointer-kernargs-to-global \
   -dce -globaldce -always-inline -infer-address-spaces \
   -amdgpu-internalize-symbols -disable-simplify-libcalls $KMOPTOPT -verify \
   < $2.linked.bc -o $2.opt.bc
@@ -220,9 +222,9 @@ fi
 CODE_OBJECT_FORMAT="-mattr=-code-object-v3"
 
 if [ $KMTHINLTO == "1" ]; then
-  $LLC $KMOPTLLC -mtriple amdgcn--amdhsa-amdgiz -mcpu=$AMDGPU_TARGET $CODE_OBJECT_FORMAT -filetype=obj -o $2 $2.opt.bc
+  $LLC $KMOPTLLC -mtriple amdgcn-amd-amdhsa -mcpu=$AMDGPU_TARGET $CODE_OBJECT_FORMAT -filetype=obj -o $2 $2.opt.bc
 else
-  $LLC $KMOPTLLC -mtriple amdgcn--amdhsa-amdgiz -mcpu=$AMDGPU_TARGET $CODE_OBJECT_FORMAT -filetype=obj -o $2.isabin $2.opt.bc
+  $LLC $KMOPTLLC -mtriple amdgcn-amd-amdhsa -mcpu=$AMDGPU_TARGET $CODE_OBJECT_FORMAT -filetype=obj -o $2.isabin $2.opt.bc
 fi
 
 # error handling for llc
@@ -238,7 +240,7 @@ if [ $KMDUMPISA == "1" ]; then
   else
     cp $2.isabin ${KMDUMPDIR}/dump-$AMDGPU_TARGET.isabin
   fi
-  $LLC $KMOPTLLC -mtriple amdgcn--amdhsa-amdgiz -mcpu=$AMDGPU_TARGET $CODE_OBJECT_FORMAT -filetype=asm -o $2.isa $2.opt.bc
+  $LLC $KMOPTLLC -mtriple amdgcn-amd-amdhsa -mcpu=$AMDGPU_TARGET $CODE_OBJECT_FORMAT -filetype=asm -o $2.isa $2.opt.bc
   mv $2.isa ${KMDUMPDIR}/dump-$AMDGPU_TARGET.isa
 fi
 
diff --git a/lib/cpu/CMakeLists.txt b/lib/cpu/CMakeLists.txt
deleted file mode 100644
index 5968014e475..00000000000
--- a/lib/cpu/CMakeLists.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-####################
-# C++AMP runtime (CPU implementation)
-####################
-add_mcwamp_library_cpu(mcwamp_cpu mcwamp_cpu.cpp)
-install(TARGETS mcwamp_cpu
-    EXPORT hcc-targets
-    RUNTIME DESTINATION bin
-    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
-    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
-    )
-MESSAGE(STATUS "build HCC CPU Runtime")
diff --git a/lib/cpu/mcwamp_cpu.cpp b/lib/cpu/mcwamp_cpu.cpp
deleted file mode 100644
index a86f28faf2c..00000000000
--- a/lib/cpu/mcwamp_cpu.cpp
+++ /dev/null
@@ -1,92 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include <cstdlib>
-#include <cassert>
-#include <iostream>
-#include <map>
-#include <vector>
-
-#include <kalmar_runtime.h>
-#include <kalmar_aligned_alloc.h>
-
-extern "C" void PushArgImpl(void *ker, int idx, size_t sz, const void *v) {}
-
-namespace Kalmar {
-
-class CPUFallbackQueue final : public KalmarQueue
-{
-public:
-
-  CPUFallbackQueue(KalmarDevice* pDev) : KalmarQueue(pDev) {}
-
-  void read(void* device, void* dst, size_t count, size_t offset) override {
-      if (dst != device)
-          memmove(dst, (char*)device + offset, count);
-  }
-
-  void write(void* device, const void* src, size_t count, size_t offset, bool blocking) override {
-      if (src != device)
-          memmove((char*)device + offset, src, count);
-  }
-
-  void copy(void* src, void* dst, size_t count, size_t src_offset, size_t dst_offset, bool blocking) override {
-      if (src != dst)
-          memmove((char*)dst + dst_offset, (char*)src + src_offset, count);
-  }
-
-  void* map(void* device, size_t count, size_t offset, bool modify) override {
-      return (char*)device + offset;
-  }
-
-  void unmap(void* device, void* addr, size_t count, size_t offset, bool modify) override {}
-
-  void Push(void *kernel, int idx, void* device, bool isConst) override {}
-};
-
-class CPUFallbackDevice final : public KalmarDevice
-{
-public:
-    CPUFallbackDevice() : KalmarDevice() {}
-
-    std::wstring get_path() const override { return L"fallback"; }
-    std::wstring get_description() const override { return L"CPU Fallback"; }
-    size_t get_mem() const override { return 0; }
-    bool is_double() const override { return true; }
-    bool is_lim_double() const override { return true; }
-    bool is_unified() const override { return true; }
-    bool is_emulated() const override { return true; }
-    uint32_t get_version() const override { return 0; }
-
-    void* create(size_t count, struct rw_info* /* not used */) override {
-        return kalmar_aligned_alloc(0x1000, count);
-    }
-    void release(void *device, struct rw_info* /* not used */ ) override { 
-        kalmar_aligned_free(device);
-    }
-    std::shared_ptr<KalmarQueue> createQueue(execute_order order = execute_in_order, queue_priority priority = priority_normal) override {
-        return std::shared_ptr<KalmarQueue>(new CPUFallbackQueue(this));
-    }
-};
-
-template <typename T> inline void deleter(T* ptr) { delete ptr; }
-
-class CPUContext final : public KalmarContext
-{
-public:
-    CPUContext() { Devices.push_back(new CPUFallbackDevice); }
-    ~CPUContext() { std::for_each(std::begin(Devices), std::end(Devices), deleter<KalmarDevice>); }
-};
-
-
-static CPUContext ctx;
-
-} // namespace Kalmar
-
-extern "C" void *GetContextImpl() {
-  return &Kalmar::ctx;
-}
diff --git a/lib/hsa/mcwamp_hsa.cpp b/lib/hsa/mcwamp_hsa.cpp
index 42074c122ef..27afddcd44d 100644
--- a/lib/hsa/mcwamp_hsa.cpp
+++ b/lib/hsa/mcwamp_hsa.cpp
@@ -8,6 +8,8 @@
 
 #include "../hc2/headers/types/program_state.hpp"
 
+#include <elfio/elfio_amdgpu.hpp>
+
 #include <algorithm>
 #include <cassert>
 #include <chrono>
@@ -63,7 +65,7 @@
 #define KERNARG_BUFFER_SIZE (512)
 
 // number of pre-allocated kernarg buffers in HSAContext
-// Not required but typically should be greater than HCC_SIGNAL_POOL_SIZE 
+// Not required but typically should be greater than HCC_SIGNAL_POOL_SIZE
 // (some kernels don't allocate signals but nearly all need kernargs)
 #define KERNARG_POOL_SIZE (1024)
 
@@ -1354,13 +1356,13 @@ class HSAQueue final : public KalmarQueue
     bool nextKernelNeedsSysAcquire() const { return _nextKernelNeedsSysAcquire; };
     void setNextKernelNeedsSysAcquire(bool r) { _nextKernelNeedsSysAcquire = r; };
 
-    bool nextSyncNeedsSysRelease() const { 
+    bool nextSyncNeedsSysRelease() const {
       DBOUT( DB_CMD2, "  HSAQueue::nextSyncNeedsSysRelease(): " <<  _nextSyncNeedsSysRelease << "\n");
-      return _nextSyncNeedsSysRelease; 
+      return _nextSyncNeedsSysRelease;
     };
     void setNextSyncNeedsSysRelease(bool r) {
       DBOUT( DB_CMD2, "  HSAQueue::setNextSyncNeedsSysRelease(" <<  r << ")\n");
-      _nextSyncNeedsSysRelease = r; 
+      _nextSyncNeedsSysRelease = r;
     };
 
     uint64_t getSeqNum() const { return queueSeqNum; };
@@ -1462,7 +1464,7 @@ class HSAQueue final : public KalmarQueue
     //
     // Also different modes and optimizations can control when dependencies are added.
     // TODO - return reference if possible to avoid shared ptr overhead.
-    std::shared_ptr<KalmarAsyncOp> detectStreamDeps(hcCommandKind newCommandKind, KalmarAsyncOp *kNewOp) {
+    std::shared_ptr<KalmarAsyncOp> detectStreamDeps(hcCommandKind newCommandKind, KalmarAsyncOp *kNewOp) override {
 
         const auto newOp = static_cast<const HSAOp*> (kNewOp);
 
@@ -1479,8 +1481,8 @@ class HSAQueue final : public KalmarQueue
 
             bool needDep = false;
             if  (newCommandKind != youngestCommandKind) {
-                DBOUT(DB_CMD2, "Set NeedDep (command type changed) " 
-                        << getHcCommandKindString(youngestCommandKind) 
+                DBOUT(DB_CMD2, "Set NeedDep (command type changed) "
+                        << getHcCommandKindString(youngestCommandKind)
                         << "  ->  " << getHcCommandKindString(newCommandKind) << "\n") ;
                 needDep = true;
             };
@@ -1550,7 +1552,7 @@ class HSAQueue final : public KalmarQueue
     bool isEmpty() override {
         // Have to walk asyncOps since it can contain null pointers (if event is waited on and removed)
         // Also not all commands contain signals.
-        
+
         bool isEmpty = true;
 
         const auto& oldest = find_if(
@@ -1768,7 +1770,7 @@ class HSAQueue final : public KalmarQueue
         // do read
         if (dst != device) {
             if (!getDev()->is_unified()) {
-                DBOUT(DB_COPY, "read(" << device << "," << dst << "," << count << "," << offset 
+                DBOUT(DB_COPY, "read(" << device << "," << dst << "," << count << "," << offset
                                 << "): use HSA memory copy\n");
                 hsa_status_t status = HSA_STATUS_SUCCESS;
                 // Make sure host memory is accessible to gpu
@@ -1794,7 +1796,7 @@ class HSAQueue final : public KalmarQueue
                 // Unlock the host memory
                 status = hsa_amd_memory_unlock(dst);
             } else {
-                DBOUT(DB_COPY, "read(" << device << "," << dst << "," << count << "," << offset 
+                DBOUT(DB_COPY, "read(" << device << "," << dst << "," << count << "," << offset
                                 << "): use host memory copy\n");
                 memmove(dst, (char*)device + offset, count);
             }
@@ -1808,7 +1810,7 @@ class HSAQueue final : public KalmarQueue
         // do write
         if (src != device) {
             if (!getDev()->is_unified()) {
-                DBOUT(DB_COPY, "write(" << device << "," << src << "," << count << "," << offset 
+                DBOUT(DB_COPY, "write(" << device << "," << src << "," << count << "," << offset
                                 << "," << blocking << "): use HSA memory copy\n");
                 hsa_status_t status = HSA_STATUS_SUCCESS;
                 // Make sure host memory is accessible to gpu
@@ -1829,7 +1831,7 @@ class HSAQueue final : public KalmarQueue
                 // Unlock the host memory
                 status = hsa_amd_memory_unlock(const_cast<void*>(src));
             } else {
-                DBOUT(DB_COPY, "write(" << device << "," << src << "," << count << "," << offset 
+                DBOUT(DB_COPY, "write(" << device << "," << src << "," << count << "," << offset
                                 << "," << blocking << "): use host memory copy\n");
                 memmove((char*)device + offset, src, count);
             }
@@ -1847,7 +1849,7 @@ class HSAQueue final : public KalmarQueue
         // do copy
         if (src != dst) {
             if (!getDev()->is_unified()) {
-                DBOUT(DB_COPY, "copy(" << src << "," << dst << "," << count << "," << src_offset 
+                DBOUT(DB_COPY, "copy(" << src << "," << dst << "," << count << "," << src_offset
                                << "," << dst_offset << "," << blocking << "): use HSA memory copy\n");
                 hsa_status_t status = HSA_STATUS_SUCCESS;
                 // FIXME: aftre p2p enabled, if this function is not expected to copy between two buffers from different device, then, delete allow_access API call.
@@ -1857,7 +1859,7 @@ class HSAQueue final : public KalmarQueue
                 status = hsa_memory_copy((char*)dst + dst_offset, (char*)src + src_offset, count);
                 STATUS_CHECK(status, __LINE__);
             } else {
-                DBOUT(DB_COPY, "copy(" << src << "," << dst << "," << count << "," << src_offset 
+                DBOUT(DB_COPY, "copy(" << src << "," << dst << "," << count << "," << src_offset
                                << "," << dst_offset << "," << blocking << "): use host memory copy\n");
                 memmove((char*)dst + dst_offset, (char*)src + src_offset, count);
             }
@@ -1877,7 +1879,7 @@ class HSAQueue final : public KalmarQueue
         if (!getDev()->is_unified()) {
             if (DBFLAG(DB_COPY)) {
                 DBWSTREAM << getDev()->get_path();
-                DBSTREAM << ": map( <device> " << device << ", <count> " << count << ", <offset> " << offset 
+                DBSTREAM << ": map( <device> " << device << ", <count> " << count << ", <offset> " << offset
                          << ", <modify> " << modify << "): use HSA memory map\n";
             }
             hsa_status_t status = HSA_STATUS_SUCCESS;
@@ -1901,7 +1903,7 @@ class HSAQueue final : public KalmarQueue
         } else {
             if (DBFLAG(DB_COPY)) {
               DBWSTREAM << getDev()->get_path();
-              DBSTREAM << ": map( <device> " << device << ", <count> " << count << ", <offset> " << offset 
+              DBSTREAM << ": map( <device> " << device << ", <count> " << count << ", <offset> " << offset
                        << ", <modify> " << modify << "): use host memory map\n";
             }
             // for host memory we simply return the pointer plus offset
@@ -1917,7 +1919,7 @@ class HSAQueue final : public KalmarQueue
         if (!getDev()->is_unified()) {
             if (DBFLAG(DB_COPY)) {
                 DBWSTREAM << getDev()->get_path();
-                DBSTREAM << ": unmap( <device> " << device << ", <addr> " << addr << ", <count> " << count 
+                DBSTREAM << ": unmap( <device> " << device << ", <addr> " << addr << ", <count> " << count
                          << ", <offset> " << offset << ", <modify> " << modify << "): use HSA memory unmap\n";
             }
             if (modify) {
@@ -1933,7 +1935,7 @@ class HSAQueue final : public KalmarQueue
         } else {
             if (DBFLAG(DB_COPY)) {
                 DBWSTREAM << getDev()->get_path();
-                DBSTREAM << ": unmap( <device> " << device << ", <addr> " << addr << ", <count> " << count 
+                DBSTREAM << ": unmap( <device> " << device << ", <addr> " << addr << ", <count> " << count
                          << ", <offset> " << offset << ", <modify> " << modify <<"): use host memory unmap\n";
             }
             // for host memory there's nothing to be done
@@ -1962,7 +1964,7 @@ class HSAQueue final : public KalmarQueue
 
     void* getHSAAgent() override;
 
-    void* getHostAgent() override;
+    void* getHostAgent();
 
     void* getHSAAMRegion() override;
 
@@ -2152,7 +2154,7 @@ class HSAQueue final : public KalmarQueue
     void copy_ext(const void *src, void *dst, size_t size_bytes, hc::hcCommandKind copyDir, const hc::AmPointerInfo &srcPtrInfo, const hc::AmPointerInfo &dstPtrInfo,
                   const Kalmar::KalmarDevice *copyDevice, bool forceUnpinnedCopy) override ;
 
-    void copy2d_ext(const void *src, void *dst, size_t width, size_t height, size_t srcPitch, size_t dstPitch, hc::hcCommandKind copyDir, const hc::AmPointerInfo &srcPtrInfo, const hc::AmPointerInfo &dstPtrInfo, const Kalmar::KalmarDevice *copyDevice, bool forceUnpinnedCopy);
+    void copy2d_ext(const void *src, void *dst, size_t width, size_t height, size_t srcPitch, size_t dstPitch, hc::hcCommandKind copyDir, const hc::AmPointerInfo &srcPtrInfo, const hc::AmPointerInfo &dstPtrInfo, const Kalmar::KalmarDevice *copyDevice, bool forceUnpinnedCopy) override;
 
     void copy_ext(const void *src, void *dst, size_t size_bytes, hc::hcCommandKind copyDir, const hc::AmPointerInfo &srcPtrInfo, const hc::AmPointerInfo &dstPtrInfo, bool foo) override ;
 
@@ -2622,7 +2624,7 @@ class HSADevice final : public KalmarDevice
         return (useCoarseGrainedRegion == false);
     }
     bool is_emulated() const override { return false; }
-    uint32_t get_version() const { return ((static_cast<unsigned int>(versionMajor) << 16) | versionMinor); }
+    uint32_t get_version() const override { return ((static_cast<unsigned int>(versionMajor) << 16) | versionMinor); }
 
     bool has_cpu_accessible_am() const override { return cpu_accessible_am; }
 
@@ -2733,6 +2735,7 @@ class HSADevice final : public KalmarDevice
             case hc::EF_AMDGPU_MACH_AMDGCN_GFX803 : triple.append("803"); break;
             case hc::EF_AMDGPU_MACH_AMDGCN_GFX900 : triple.append("900"); break;
             case hc::EF_AMDGPU_MACH_AMDGCN_GFX906 : triple.append("906"); break;
+            default: return false;
         }
 
         const auto isa{get_isa_name_from_triple(std::move(triple))};
@@ -3033,7 +3036,7 @@ class HSADevice final : public KalmarDevice
     }
 
 
-    bool has_cpu_accessible_am() override {
+    bool has_cpu_accessible_am() {
         return cpu_accessible_am;
     };
 
@@ -3224,19 +3227,15 @@ class HSADevice final : public KalmarDevice
     void memcpySymbol(void* symbolAddr, void* hostptr, size_t count, size_t offset = 0, enum hcCommandKind kind = hcMemcpyHostToDevice) override {
         hsa_status_t status;
 
-        if (executables.size() != 0) {
-            // copy data
-            if (kind == hcMemcpyHostToDevice) {
-                // host -> device
-                status = hsa_memory_copy(symbolAddr, (char*)hostptr + offset, count);
-                STATUS_CHECK(status, __LINE__);
-            } else if (kind == hcMemcpyDeviceToHost) {
-                // device -> host
-                status = hsa_memory_copy(hostptr, (char*)symbolAddr + offset, count);
-                STATUS_CHECK(status, __LINE__);
-            }
-        } else {
-            throw Kalmar::runtime_exception("HSA executable NOT built yet!", 0);
+        // copy data
+        if (kind == hcMemcpyHostToDevice) {
+            // host -> device
+            status = hsa_memory_copy(symbolAddr, (char*)hostptr + offset, count);
+            STATUS_CHECK(status, __LINE__);
+        } else if (kind == hcMemcpyDeviceToHost) {
+            // device -> host
+            status = hsa_memory_copy(hostptr, (char*)symbolAddr + offset, count);
+            STATUS_CHECK(status, __LINE__);
         }
     }
 
@@ -3682,7 +3681,7 @@ class HSAContext final : public KalmarContext
 
     void initPrintfBuffer() override {
 
-        if (HCC_ENABLE_PRINTF) { 
+        if (HCC_ENABLE_PRINTF) {
           if (hc::printf_buffer != nullptr) {
             // Check whether the printf buffer is still valid
             // because it may have been annihilated by HIP's hipDeviceReset().
@@ -3767,7 +3766,7 @@ void HSAContext::ReadHccEnv()
     GET_ENV_INT (HCC_D2H_PININPLACE_THRESHOLD, "Min size (in KB) to use pin-in-place for D2H copy if ChooseBest algorithm selected");
 
     GET_ENV_INT (HCC_STAGING_BUFFER_SIZE, "Unpinned copy engine staging buffer size in KB");
-  
+
     // Change the default GPU
     GET_ENV_INT (HCC_DEFAULT_GPU, "Change the default GPU (Default is device 0)");
 
@@ -3972,7 +3971,7 @@ HSADevice::HSADevice(hsa_agent_t a, hsa_agent_t host, int x_accSeqNum) :
 }
 
 inline void*
-HSADevice::getHSAAgent() override {
+HSADevice::getHSAAgent() {
     return static_cast<void*>(&getAgent());
 }
 
@@ -4028,7 +4027,7 @@ HSAQueue::HSAQueue(KalmarDevice* pDev, hsa_agent_t agent, execute_order order, q
 }
 
 
-void HSAQueue::dispose() override {
+void HSAQueue::dispose() {
     hsa_status_t status;
 
     DBOUT(DB_INIT, "HSAQueue::dispose() " << this << "in\n");
@@ -4098,34 +4097,34 @@ void HSAQueue::releaseLockedRocrQueue()
 }
 
 inline void*
-HSAQueue::getHSAAgent() override {
+HSAQueue::getHSAAgent() {
     return static_cast<void*>(&(static_cast<HSADevice*>(getDev())->getAgent()));
 }
 inline void*
-HSAQueue::getHostAgent() override {
+HSAQueue::getHostAgent() {
     return static_cast<void*>(&(static_cast<HSADevice*>(getDev())->getHostAgent()));
 }
 inline void*
-HSAQueue::getHSAAMRegion() override {
+HSAQueue::getHSAAMRegion() {
     return static_cast<void*>(&(static_cast<HSADevice*>(getDev())->getHSAAMRegion()));
 }
 inline void*
-HSAQueue::getHSACoherentAMHostRegion() override {
+HSAQueue::getHSACoherentAMHostRegion() {
     return static_cast<void*>(&(static_cast<HSADevice*>(getDev())->getHSACoherentAMHostRegion()));
 }
 inline void*
-HSAQueue::getHSAAMHostRegion() override {
+HSAQueue::getHSAAMHostRegion() {
     return static_cast<void*>(&(static_cast<HSADevice*>(getDev())->getHSAAMHostRegion()));
 }
 
 
 inline void*
-HSAQueue::getHSAKernargRegion() override {
+HSAQueue::getHSAKernargRegion() {
     return static_cast<void*>(&(static_cast<HSADevice*>(getDev())->getHSAKernargRegion()));
 }
 
 void HSAQueue::copy_ext(const void *src, void *dst, size_t size_bytes, hc::hcCommandKind copyDir, const hc::AmPointerInfo &srcPtrInfo, const hc::AmPointerInfo &dstPtrInfo,
-              const Kalmar::KalmarDevice *copyDevice, bool forceUnpinnedCopy) override {
+              const Kalmar::KalmarDevice *copyDevice, bool forceUnpinnedCopy) {
     // wait for all previous async commands in this queue to finish
     // TODO - can remove this synchronization, copy is tail-synchronous not required on front end.
     this->wait();
@@ -4148,7 +4147,7 @@ void HSAQueue::copy_ext(const void *src, void *dst, size_t size_bytes, hc::hcCom
 
 
 // TODO - remove me
-void HSAQueue::copy_ext(const void *src, void *dst, size_t size_bytes, hc::hcCommandKind copyDir, const hc::AmPointerInfo &srcPtrInfo, const hc::AmPointerInfo &dstPtrInfo, bool foo) override {
+void HSAQueue::copy_ext(const void *src, void *dst, size_t size_bytes, hc::hcCommandKind copyDir, const hc::AmPointerInfo &srcPtrInfo, const hc::AmPointerInfo &dstPtrInfo, bool foo) {
 
     const Kalmar::KalmarDevice *copyDevice;
     if (srcPtrInfo._isInDeviceMem) {
@@ -4179,7 +4178,7 @@ void HSAQueue::copy2d_ext(const void *src, void *dst, size_t width, size_t heigh
 
 std::shared_ptr<KalmarAsyncOp> HSAQueue::EnqueueAsyncCopyExt(const void* src, void* dst, size_t size_bytes,
                                                    hcCommandKind copyDir, const hc::AmPointerInfo &srcPtrInfo, const hc::AmPointerInfo &dstPtrInfo,
-                                                   const Kalmar::KalmarDevice *copyDevice) override {
+                                                   const Kalmar::KalmarDevice *copyDevice) {
 
     hsa_status_t status = HSA_STATUS_SUCCESS;
 
@@ -4199,7 +4198,7 @@ std::shared_ptr<KalmarAsyncOp> HSAQueue::EnqueueAsyncCopyExt(const void* src, vo
 
 std::shared_ptr<KalmarAsyncOp> HSAQueue::EnqueueAsyncCopy2dExt(const void* src, void* dst, size_t width, size_t height, size_t srcPitch, size_t dstPitch,
                                                    hcCommandKind copyDir, const hc::AmPointerInfo &srcPtrInfo, const hc::AmPointerInfo &dstPtrInfo,
-                                                   const Kalmar::KalmarDevice *copyDevice) override {
+                                                   const Kalmar::KalmarDevice *copyDevice) {
 
 
     hsa_status_t status = HSA_STATUS_SUCCESS;
@@ -4219,7 +4218,7 @@ std::shared_ptr<KalmarAsyncOp> HSAQueue::EnqueueAsyncCopy2dExt(const void* src,
 };
 
 // enqueue an async copy command
-std::shared_ptr<KalmarAsyncOp> HSAQueue::EnqueueAsyncCopy(const void *src, void *dst, size_t size_bytes) override {
+std::shared_ptr<KalmarAsyncOp> HSAQueue::EnqueueAsyncCopy(const void *src, void *dst, size_t size_bytes) {
     hsa_status_t status = HSA_STATUS_SUCCESS;
 
     // create shared_ptr instance
@@ -4272,7 +4271,7 @@ std::shared_ptr<KalmarAsyncOp> HSAQueue::EnqueueAsyncCopy(const void *src, void
 void
 HSAQueue::dispatch_hsa_kernel(const hsa_kernel_dispatch_packet_t *aql,
                          const void * args, size_t argSize,
-                         hc::completion_future *cf, const char *kernelName) override
+                         hc::completion_future *cf, const char *kernelName)
 {
     uint16_t dims = (aql->setup >> HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS) &
                     ((1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_WIDTH_DIMENSIONS) - 1);
@@ -4694,14 +4693,14 @@ HSADispatch::dispose() {
 }
 
 inline uint64_t
-HSADispatch::getBeginTimestamp() override {
+HSADispatch::getBeginTimestamp() {
     hsa_amd_profiling_dispatch_time_t time;
     hsa_amd_profiling_get_dispatch_time(_agent, _signal, &time);
     return time.start;
 }
 
 inline uint64_t
-HSADispatch::getEndTimestamp() override {
+HSADispatch::getEndTimestamp() {
     hsa_amd_profiling_dispatch_time_t time;
     hsa_amd_profiling_get_dispatch_time(_agent, _signal, &time);
     return time.end;
@@ -4737,12 +4736,12 @@ HSADispatch::setLaunchConfiguration(const int dims, size_t *globalDims, size_t *
         // throw an error
         if (localDims[i] > workgroup_max_dim[i]) {
           std::stringstream msg;
-          msg << "The extent of the tile (" << localDims[i] 
+          msg << "The extent of the tile (" << localDims[i]
               << ") exceeds the device limit (" << workgroup_max_dim[i] << ").";
           throw Kalmar::runtime_exception(msg.str().c_str(), -1);
         } else if (localDims[i] > globalDims[i]) {
           std::stringstream msg;
-          msg << "The extent of the tile (" << localDims[i] 
+          msg << "The extent of the tile (" << localDims[i]
               << ") exceeds the compute grid extent (" << globalDims[i] << ").";
           throw Kalmar::runtime_exception(msg.str().c_str(), -1);
         }
@@ -4763,7 +4762,7 @@ HSADispatch::setLaunchConfiguration(const int dims, size_t *globalDims, size_t *
         for (unsigned int i = 1; ; i<<=1) {
           if (i == recommended_flat_workgroup_size
               || i >= globalDims[0]) {
-            workgroup_size[0] = 
+            workgroup_size[0] =
               std::min(i, static_cast<unsigned int>(globalDims[0]));
             break;
           }
@@ -4778,7 +4777,7 @@ HSADispatch::setLaunchConfiguration(const int dims, size_t *globalDims, size_t *
         for (unsigned int i = 1; ; i<<=1) {
           if (i == recommended_flat_workgroup_size
               || i >= globalDims[0]) {
-            workgroup_size[0] = 
+            workgroup_size[0] =
               std::min(i, static_cast<unsigned int>(globalDims[0]));
             break;
           }
@@ -4793,14 +4792,14 @@ HSADispatch::setLaunchConfiguration(const int dims, size_t *globalDims, size_t *
           }
           else if (flat_group_size == recommended_flat_workgroup_size
               || j >= globalDims[1]) {
-            workgroup_size[1] = 
+            workgroup_size[1] =
               std::min(j, static_cast<unsigned int>(globalDims[1]));
             break;
           }
         }
 
         // compute the group size for the 3rd dimension
-        workgroup_size[2] = recommended_flat_workgroup_size / 
+        workgroup_size[2] = recommended_flat_workgroup_size /
                               (workgroup_size[0] * workgroup_size[1]);
       }
     }
@@ -4812,7 +4811,7 @@ HSADispatch::setLaunchConfiguration(const int dims, size_t *globalDims, size_t *
       constexpr unsigned int num_work_items_per_simd = 64;
       constexpr unsigned int num_simds_per_cu = 4;
       const unsigned int workitem_vgpr_count = std::max((unsigned int)kernel->workitem_vgpr_count, 1u);
-      unsigned int max_flat_group_size = (max_num_vgprs_per_work_item / workitem_vgpr_count) 
+      unsigned int max_flat_group_size = (max_num_vgprs_per_work_item / workitem_vgpr_count)
                                            * num_work_items_per_simd * num_simds_per_cu;
       return max_flat_group_size;
     };
@@ -4822,7 +4821,7 @@ HSADispatch::setLaunchConfiguration(const int dims, size_t *globalDims, size_t *
       const unsigned int max_num_work_items_per_cu = calculate_kernel_max_flat_workgroup_size();
       if (actual_flat_group_size > max_num_work_items_per_cu) {
         std::stringstream msg;
-        msg << "The number of work items (" << actual_flat_group_size 
+        msg << "The number of work items (" << actual_flat_group_size
             << ") per work group exceeds the limit (" << max_num_work_items_per_cu << ") of kernel "
             << kernel->kernelName << " .";
         throw Kalmar::runtime_exception(msg.str().c_str(), -1);
@@ -5056,14 +5055,14 @@ HSABarrier::dispose() {
 }
 
 inline uint64_t
-HSABarrier::getBeginTimestamp() override {
+HSABarrier::getBeginTimestamp() {
     hsa_amd_profiling_dispatch_time_t time;
     hsa_amd_profiling_get_dispatch_time(_agent, _signal, &time);
     return time.start;
 }
 
 inline uint64_t
-HSABarrier::getEndTimestamp() override {
+HSABarrier::getEndTimestamp() {
     hsa_amd_profiling_dispatch_time_t time;
     hsa_amd_profiling_get_dispatch_time(_agent, _signal, &time);
     return time.end;
@@ -5090,12 +5089,12 @@ HSAOp::HSAOp(Kalmar::KalmarQueue *queue, hc::hcCommandKind commandKind) :
     apiStartTick = Kalmar::ctx.getSystemTicks();
 };
 
-Kalmar::HSAQueue *HSAOp::hsaQueue() const 
-{ 
-    return static_cast<Kalmar::HSAQueue *> (this->getQueue()); 
+Kalmar::HSAQueue *HSAOp::hsaQueue() const
+{
+    return static_cast<Kalmar::HSAQueue *> (this->getQueue());
 };
 
-bool HSAOp::isReady() override {
+bool HSAOp::isReady() {
     bool ready = (hsa_signal_load_scacquire(_signal) == 0);
     if (ready && hsaQueue()) {
         hsaQueue()->removeAsyncOp(this);
@@ -5428,7 +5427,7 @@ HSACopy::enqueueAsyncCopyCommand(const Kalmar::HSADevice *copyDevice, const hc::
         // We need to ensure the copy waits for preceding commands the HCC queue to complete, if those commands exist.
         // The copy has to be set so that it depends on the completion_signal of the youngest command in the queue.
         if (depAsyncOp || fenceScope != hc::no_scope) {
-        
+
             // Normally we can use the input signal to hsa_amd_memory_async_copy to ensure the copy waits for youngest op.
             // However, two cases require special handling:
             //    - the youngest op may not have a completion signal - this is optional for kernel launch commands.
@@ -5585,14 +5584,14 @@ HSACopy::dispose() {
 }
 
 inline uint64_t
-HSACopy::getBeginTimestamp() override {
+HSACopy::getBeginTimestamp() {
     hsa_amd_profiling_async_copy_time_t time;
     hsa_amd_profiling_get_async_copy_time(_signal, &time);
     return time.start;
 }
 
 inline uint64_t
-HSACopy::getEndTimestamp() override {
+HSACopy::getEndTimestamp() {
     hsa_amd_profiling_async_copy_time_t time;
     hsa_amd_profiling_get_async_copy_time(_signal, &time);
     return time.end;
diff --git a/lib/mcwamp.cpp b/lib/mcwamp.cpp
index 8d4d7320659..de47342bb0b 100644
--- a/lib/mcwamp.cpp
+++ b/lib/mcwamp.cpp
@@ -16,10 +16,13 @@
 #include <cstddef>
 #include <tuple>
 
+#include <hc.hpp>
 #include <mutex>
 
 #include <dlfcn.h>
 
+// weak symbols of kernel codes
+
 // Kernel bundle
 extern "C" char * kernel_bundle_source[] asm ("_binary_kernel_bundle_start") __attribute__((visibility("default")));
 
@@ -395,44 +398,4 @@ extern "C" void __attribute__((constructor)) __hcc_shared_library_init() {
 }
 
 extern "C" void __attribute__((destructor)) __hcc_shared_library_fini() {
-}
-
-// conversion routines between float and half precision
-static inline std::uint32_t f32_as_u32(float f) { union { float f; std::uint32_t u; } v; v.f = f; return v.u; }
-static inline float u32_as_f32(std::uint32_t u) { union { float f; std::uint32_t u; } v; v.u = u; return v.f; }
-static inline int clamp_int(int i, int l, int h) { return std::min(std::max(i, l), h); }
-
-// half � float, the f16 is in the low 16 bits of the input argument �a�
-static inline float __convert_half_to_float(std::uint32_t a) noexcept {
-  std::uint32_t u = ((a << 13) + 0x70000000U) & 0x8fffe000U;
-  std::uint32_t v = f32_as_u32(u32_as_f32(u) * 0x1.0p+112f) + 0x38000000U;
-  u = (a & 0x7fff) != 0 ? v : u;
-  return u32_as_f32(u) * 0x1.0p-112f;
-}
-
-// float � half with nearest even rounding
-// The lower 16 bits of the result is the bit pattern for the f16
-static inline std::uint32_t __convert_float_to_half(float a) noexcept {
-  std::uint32_t u = f32_as_u32(a);
-  int e = static_cast<int>((u >> 23) & 0xff) - 127 + 15;
-  std::uint32_t m = ((u >> 11) & 0xffe) | ((u & 0xfff) != 0);
-  std::uint32_t i = 0x7c00 | (m != 0 ? 0x0200 : 0);
-  std::uint32_t n = ((std::uint32_t)e << 12) | m;
-  std::uint32_t s = (u >> 16) & 0x8000;
-  int b = clamp_int(1-e, 0, 13);
-  std::uint32_t d = (0x1000 | m) >> b;
-  d |= (d << b) != (0x1000 | m);
-  std::uint32_t v = e < 1 ? d : n;
-  v = (v >> 2) + (((v & 0x7) == 3) | ((v & 0x7) > 5));
-  v = e > 30 ? 0x7c00 : v;
-  v = e == 143 ? i : v;
-  return s | v;
-}
-
-extern "C" float __gnu_h2f_ieee(unsigned short h){
-  return __convert_half_to_float((std::uint32_t) h);
-}
-
-extern "C" unsigned short __gnu_f2h_ieee(float f){
-  return (unsigned short)__convert_float_to_half(f);
-}
+}
\ No newline at end of file
diff --git a/lib/mcwamp_atomic.cpp b/lib/mcwamp_atomic.cpp
deleted file mode 100644
index 01b964b2515..00000000000
--- a/lib/mcwamp_atomic.cpp
+++ /dev/null
@@ -1,183 +0,0 @@
-#include <mutex>
-#include <algorithm>
-
-// FIXME : need to consider how to let hc namespace could also use functions here
-namespace Concurrency {
-
-std::mutex afx_u, afx_i, afx_f;
-unsigned int atomic_exchange_unsigned(unsigned int *x, unsigned int y) {
-    std::lock_guard<std::mutex> guard(afx_u);
-    unsigned int old = *x;
-    *x = y;
-    return old;
-}
-int atomic_exchange_int(int *x, int y) {
-    std::lock_guard<std::mutex> guard(afx_i);
-    int old = *x;
-    *x = y;
-    return old;
-}
-float atomic_exchange_float(float* x, float y) {
-    std::lock_guard<std::mutex> guard(afx_f);
-    int old = *x;
-    *x = y;
-    return old;
-}
-
-std::mutex afcas_u, afcas_i;
-unsigned int atomic_compare_exchange_unsigned(unsigned int *x, unsigned int y, unsigned int z) {
-    std::lock_guard<std::mutex> guard(afcas_u);
-    unsigned int old = *x;
-    if (*x == y) {
-        *x = z;
-    }
-    return old;
-}
-int atomic_compare_exchange_int(int *x, int y, int z) {
-    std::lock_guard<std::mutex> guard(afcas_i);
-    int old = *x;
-    if (*x == y) {
-        *x = z;
-    }
-    return old;
-}
-
-std::mutex afa_u, afa_i, afa_f;
-unsigned int atomic_add_unsigned(unsigned int *x, unsigned int y) {
-    std::lock_guard<std::mutex> guard(afa_u);
-    unsigned int old = *x;
-    *x += y;
-    return old;
-}
-int atomic_add_int(int *x, int y) {
-    std::lock_guard<std::mutex> guard(afa_i);
-    int old = *x;
-    *x += y;
-    return old;
-}
-float atomic_add_float(float* x, float y) {
-    std::lock_guard<std::mutex> guard(afa_f);
-    float old = *x;
-    *x += y;
-    return old;
-}
-
-std::mutex afs_u, afs_i, afs_f;
-unsigned int atomic_sub_unsigned(unsigned int *x, unsigned int y) {
-    std::lock_guard<std::mutex> guard(afa_u);
-    unsigned int old = *x;
-    *x -= y;
-    return old;
-}
-int atomic_sub_int(int *x, int y) {
-    std::lock_guard<std::mutex> guard(afa_i);
-    int old = *x;
-    *x -= y;
-    return old;
-}
-float atomic_sub_float(float* x, float y) {
-    std::lock_guard<std::mutex> guard(afa_f);
-    float old = *x;
-    *x -= y;
-    return old;
-}
-
-std::mutex afand_u, afand_i;
-unsigned int atomic_and_unsigned(unsigned int *x, unsigned int y) {
-    std::lock_guard<std::mutex> guard(afand_u);
-    unsigned int old = *x;
-    *x &= y;
-    return old;
-}
-int atomic_and_int(int *x, int y) {
-    std::lock_guard<std::mutex> guard(afand_i);
-    int old = *x;
-    *x &= y;
-    return old;
-}
-
-std::mutex afor_u, afor_i;
-unsigned int atomic_or_unsigned(unsigned int *x, unsigned int y) {
-    std::lock_guard<std::mutex> guard(afor_u);
-    unsigned int old = *x;
-    *x |= y;
-    return old;
-}
-int atomic_or_int(int *x, int y) {
-    std::lock_guard<std::mutex> guard(afor_i);
-    int old = *x;
-    *x |= y;
-    return old;
-}
-
-std::mutex afxor_u, afxor_i;
-unsigned int atomic_xor_unsigned(unsigned int *x, unsigned int y) {
-    std::lock_guard<std::mutex> guard(afxor_u);
-    unsigned int old = *x;
-    *x ^= y;
-    return old;
-}
-int atomic_xor_int(int *x, int y) {
-    std::lock_guard<std::mutex> guard(afxor_i);
-    int old = *x;
-    *x ^= y;
-    return old;
-}
-
-std::mutex afmax_u, afmax_i;
-unsigned int atomic_max_unsigned(unsigned int *p, unsigned int val) {
-    std::lock_guard<std::mutex> guard(afmax_u);
-    unsigned int old = *p;
-    *p = std::max(*p, val);
-    return old;
-}
-int atomic_max_int(int *p, int val) {
-    std::lock_guard<std::mutex> guard(afmax_i);
-    int old = *p;
-    *p = std::max(*p, val);
-    return old;
-}
-
-std::mutex afmin_u, afmin_i;
-unsigned int atomic_min_unsigned(unsigned int *p, unsigned int val) {
-    std::lock_guard<std::mutex> guard(afmin_u);
-    unsigned int old = *p;
-    *p = std::min(*p, val);
-    return old;
-}
-int atomic_min_int(int *p, int val) {
-    std::lock_guard<std::mutex> guard(afmin_i);
-    int old = *p;
-    *p = std::min(*p, val);
-    return old;
-}
-
-std::mutex afi_u, afi_i;
-unsigned int atomic_inc_unsigned(unsigned int *p) {
-    std::lock_guard<std::mutex> guard(afi_u);
-    unsigned int old = *p;
-    *p += 1;
-    return old;
-}
-int atomic_inc_int(int *p) {
-    std::lock_guard<std::mutex> guard(afi_i);
-    int old = *p;
-    *p += 1;
-    return old;
-}
-
-std::mutex afd_u, afd_i;
-unsigned int atomic_dec_unsigned(unsigned int *p) {
-    std::lock_guard<std::mutex> guard(afd_u);
-    unsigned int old = *p;
-    *p -= 1;
-    return old;
-}
-int atomic_dec_int(int *p) {
-    std::lock_guard<std::mutex> guard(afd_i);
-    int old = *p;
-    *p -= 1;
-    return old;
-}
-
-}
diff --git a/lld b/lld
index 0c61d0e5fff..a89add36a54 160000
--- a/lld
+++ b/lld
@@ -1 +1 @@
-Subproject commit 0c61d0e5fff65d51b0d177470a51eea7654dd712
+Subproject commit a89add36a5426e5dc295baf64182cd30e77fa6b2
diff --git a/scripts/cmake/MCWAMP.cmake b/scripts/cmake/MCWAMP.cmake
index 49c155ad40c..c5eb18618a3 100644
--- a/scripts/cmake/MCWAMP.cmake
+++ b/scripts/cmake/MCWAMP.cmake
@@ -52,6 +52,7 @@ endmacro(amp_target name )
 ####################
 macro(add_mcwamp_library name )
   add_library( ${name} ${ARGN} )
+  target_compile_definitions(${name} PUBLIC __HIPCC__)
   amp_target(${name})
   # LLVM and Clang shall be compiled beforehand
   add_dependencies(${name} llvm-link opt clang rocdl)
@@ -70,6 +71,7 @@ endmacro(add_mcwamp_shared_library name )
 ####################
 macro(add_mcwamp_library_cpu name )
   add_library( ${name} SHARED ${ARGN} )
+  target_compile_definitions(${name} PUBLIC __HIPCC__)
   amp_target(${name})
   # LLVM and Clang shall be compiled beforehand
   add_dependencies(${name} llvm-link opt clang rocdl)
@@ -81,6 +83,7 @@ endmacro(add_mcwamp_library_cpu name )
 ####################
 macro(add_mcwamp_library_hsa name )
   add_library( ${name} SHARED ${ARGN} )
+  target_compile_definitions(${name} PUBLIC __HIPCC__)
   amp_target(${name})
   # LLVM and Clang shall be compiled beforehand
   add_dependencies(${name} llvm-link opt clang hc_am rocdl)
@@ -93,6 +96,7 @@ endmacro(add_mcwamp_library_hsa name )
 
 macro(add_mcwamp_library_hc_am name )
   add_library( ${name} SHARED ${ARGN} )
+  target_compile_definitions(${name} PUBLIC __HIPCC__)
   amp_target(${name})
   # LLVM and Clang shall be compiled beforehand
   add_dependencies(${name} llvm-link opt clang rocdl)
diff --git a/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.15/test.cpp b/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.15/test.cpp
index 72613888567..0c767b14a64 100644
--- a/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.15/test.cpp
+++ b/tests/Conformance/2_Cxx_Lang_Exte/2_3_Expr_Invo_Rest_Func/2_3_2_Func_Over/2_3_2_1_Over_Reso/Overloading/Overloading.15/test.cpp
@@ -8,7 +8,7 @@
 
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-int f(int &) [[hc]] 
+int f(int &) [[hc]]
 {
     return 0;
 }
diff --git a/tests/Unit/AM/am_aligned_alloc.cpp b/tests/Unit/AM/am_aligned_alloc.cpp
index b992b550241..a42c4cd420b 100644
--- a/tests/Unit/AM/am_aligned_alloc.cpp
+++ b/tests/Unit/AM/am_aligned_alloc.cpp
@@ -1,9 +1,9 @@
-// RUN: %hc %s -lhc_am -o %t.out && %t.out
+// RUN: %hc %s -o %t.out && %t.out
 
 #include <cstdlib>
 #include <cstdio>
-#include <hc.hpp>
-#include <hc_am.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_am.hpp>
 #include <iostream>
 
 #define TRACKER_PRINT(_target)\
@@ -28,9 +28,9 @@ int main()
     TRACKER_PRINT(a);
     TRACKER_PRINT(b);
     hc::AmPointerInfo amPointerInfo(NULL, NULL, NULL, 0, acc, 0, 0);
-    am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, b);
+    am_status_t status = hc::am_memtracker_get_info(&amPointerInfo, b);
     if (status == AM_SUCCESS) {
-       if (amPointerInfo._hostPointer == NULL) {
+       if (amPointerInfo.host_pointer == NULL) {
            hc::am_free(b);
        }
        else { 
@@ -42,9 +42,9 @@ int main()
            ret = false;
     }
 
-    status = hc::am_memtracker_getinfo(&amPointerInfo, a);
+    status = hc::am_memtracker_get_info(&amPointerInfo, a);
     if (status == AM_SUCCESS) {
-       if (amPointerInfo._hostPointer == NULL)
+       if (amPointerInfo.host_pointer == NULL)
            hc::am_free(a);
        else {
            printf("Failed device pointer check for a\n");
diff --git a/tests/Unit/AM/am_alloc.cpp b/tests/Unit/AM/am_alloc.cpp
index be4c0cd93ac..88c773f2c00 100644
--- a/tests/Unit/AM/am_alloc.cpp
+++ b/tests/Unit/AM/am_alloc.cpp
@@ -1,10 +1,10 @@
 
-// RUN: %hc %s -lhc_am -o %t.out && %t.out
+// RUN: %hc %s  -o %t.out && %t.out
 
 #include <cstdlib>
 #include <cstdio>
-#include <hc.hpp>
-#include <hc_am.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_am.hpp>
 #include <iostream>
 
 #define WAIT_ACCELERATOR_VIEW 2
@@ -63,8 +63,8 @@ int main()
 
 
         char *a = am_alloc(10000, defaultAcc, 0);
-        char *b = am_alloc(20000, defaultAcc, amHostPinned);
-        char *c = am_alloc(cSize, defaultAcc, amHostCoherent);
+        char *b = am_alloc(20000, defaultAcc, am_host_pinned);
+        char *c = am_alloc(cSize, defaultAcc, am_host_coherent);
 
         // Simple tests to verify that the memory allocations to all 3 regions succeeded.
         assert(a);
@@ -88,7 +88,7 @@ int main()
             if (!a->get_is_emulated()) {
 
                 int *hostPtr = nullptr;
-                hostPtr = am_alloc(sizeElements, *a, amHostCoherent);
+                hostPtr = am_alloc(sizeElements, *a, am_host_coherent);
                 assert(hostPtr);
 
                 std::cout << "test: alloc coherent host mem on accelerator#" << a->get_seqnum() << " + accelerator_view::wait()\n";
@@ -103,7 +103,7 @@ int main()
                 assert (hc::am_free(hostPtr) == AM_SUCCESS);
 
 
-                hostPtr = am_alloc(sizeElements, *a, amHostPinned);
+                hostPtr = am_alloc(sizeElements, *a, am_host_pinned);
                 assert(hostPtr);
                 std::cout << "test: alloc non-coherent host mem on accelerator#" << a->get_seqnum() << " + accelerator_view::wait()\n";
                 accessFromAllAccs(numElements, hostPtr, WAIT_ACCELERATOR_VIEW);
diff --git a/tests/Unit/AM/am_memtracker.cpp b/tests/Unit/AM/am_memtracker.cpp
index ab2e6ff8f3a..629c264c4c8 100644
--- a/tests/Unit/AM/am_memtracker.cpp
+++ b/tests/Unit/AM/am_memtracker.cpp
@@ -1,10 +1,10 @@
 
-// RUN: %hc %s -lhc_am -o %t.out && %t.out
+// RUN: %hc %s  -o %t.out && %t.out
 
 #include <cstdlib>
 #include <cstdio>
-#include <hc.hpp>
-#include <hc_am.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_am.hpp>
 #include <iostream>
 
 #define TRACKER_PRINT(_target)\
diff --git a/tests/Unit/AMDGPU/activelanecount.cpp b/tests/Unit/AMDGPU/activelanecount.cpp
index a28a8ccc775..01fb708c376 100644
--- a/tests/Unit/AMDGPU/activelanecount.cpp
+++ b/tests/Unit/AMDGPU/activelanecount.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AMDGPU/activelanemask.cpp b/tests/Unit/AMDGPU/activelanemask.cpp
index ed584419950..d3add1d4f26 100644
--- a/tests/Unit/AMDGPU/activelanemask.cpp
+++ b/tests/Unit/AMDGPU/activelanemask.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AMDGPU/ballot.cpp b/tests/Unit/AMDGPU/ballot.cpp
index 51f6dd419ee..ce7289bb63c 100644
--- a/tests/Unit/AMDGPU/ballot.cpp
+++ b/tests/Unit/AMDGPU/ballot.cpp
@@ -1,34 +1,34 @@
 
-// RUN: %hc %s -o %t.out && %t.out
+// RUN: %hc %s -g3 -o %t.out && %t.out
+
+#include <hc/hc.hpp>
 
 #include <iostream>
 #include <cassert>
-#include <hc.hpp>
 
 #define WAVEFRONT_SIZE (64) // as of now, all HSA agents have wavefront size of 64
 #define TEST_DEBUG (0)
 
-int main() {
-
-  hc::array_view<uint64_t,1> a(WAVEFRONT_SIZE);
-
-  hc::extent<1> e(WAVEFRONT_SIZE);
-
-  int errors = 0;
-  for (int i = 0; i <  WAVEFRONT_SIZE; i++) {
-    hc::parallel_for_each(e,[=](hc::index<1> idx) [[hc]] {
-      uint64_t d = hc::__ballot(1);
-      if (idx[0]==i)
-        a[0] = d;
-    }).wait();
-    if (a[0] != 0xFFFFFFFFFFFFFFFF) {
-       errors++;
+int main()
+{
+    hc::array_view<uint64_t,1> a{WAVEFRONT_SIZE};
+
+    hc::extent<1> e{WAVEFRONT_SIZE};
+
+    int errors = 0;
+    for (int i = 0; i != WAVEFRONT_SIZE; ++i) {
+        hc::parallel_for_each(e, [=](hc::index<1> idx) [[hc]] {
+            uint64_t d = hc::__ballot(1);
+            if (idx[0] == i) a[0] = d;
+        });
+        if (a[0] != 0xFFFFFFFFFFFFFFFF) {
+          ++errors;
+        }
+        //#if TEST_DEBUG
+            std::cout << "(i=" << i << "): 0x" << std::hex << a[0] << std::endl;
+            std::cout << std::dec;
+        //#endif
     }
-#if TEST_DEBUG
-    std::cout << "(i=" << i << "): 0x" << std::hex << a[0] << std::endl;
-    std::cout << std::dec;
-#endif
-  }
 
-  return !(errors==0);
+    return errors != 0;
 }
diff --git a/tests/Unit/AMDGPU/bitextract.cpp b/tests/Unit/AMDGPU/bitextract.cpp
index 66d27c5def0..8fc31ccb041 100644
--- a/tests/Unit/AMDGPU/bitextract.cpp
+++ b/tests/Unit/AMDGPU/bitextract.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AMDGPU/bitinsert.cpp b/tests/Unit/AMDGPU/bitinsert.cpp
index 92d906450c7..30de2e0006e 100644
--- a/tests/Unit/AMDGPU/bitinsert.cpp
+++ b/tests/Unit/AMDGPU/bitinsert.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AMDGPU/bitselect.cpp b/tests/Unit/AMDGPU/bitselect.cpp
index 9996336d32b..83628e2c1a9 100644
--- a/tests/Unit/AMDGPU/bitselect.cpp
+++ b/tests/Unit/AMDGPU/bitselect.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AMDGPU/clock.cpp b/tests/Unit/AMDGPU/clock.cpp
index b244fb8b1ea..8405831e4fd 100644
--- a/tests/Unit/AMDGPU/clock.cpp
+++ b/tests/Unit/AMDGPU/clock.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 #include <vector>
diff --git a/tests/Unit/AMDGPU/clock2.cpp b/tests/Unit/AMDGPU/clock2.cpp
index bc7b3a61df7..8585a71129b 100644
--- a/tests/Unit/AMDGPU/clock2.cpp
+++ b/tests/Unit/AMDGPU/clock2.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 #include <vector>
diff --git a/tests/Unit/AMDGPU/firstbit.cpp b/tests/Unit/AMDGPU/firstbit.cpp
index 494b7561d75..dd5f12b6f92 100644
--- a/tests/Unit/AMDGPU/firstbit.cpp
+++ b/tests/Unit/AMDGPU/firstbit.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AMDGPU/laneid.cpp b/tests/Unit/AMDGPU/laneid.cpp
index b551b1165e7..841cdcf6114 100644
--- a/tests/Unit/AMDGPU/laneid.cpp
+++ b/tests/Unit/AMDGPU/laneid.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AMDGPU/popcount.cpp b/tests/Unit/AMDGPU/popcount.cpp
index 7c3f4e2c066..318975640ad 100644
--- a/tests/Unit/AMDGPU/popcount.cpp
+++ b/tests/Unit/AMDGPU/popcount.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AMDGPU/register-control.cpp b/tests/Unit/AMDGPU/register-control.cpp
index 85df138a0af..3fff7251ed7 100644
--- a/tests/Unit/AMDGPU/register-control.cpp
+++ b/tests/Unit/AMDGPU/register-control.cpp
@@ -2,7 +2,7 @@
 // RUN: %llvm-dis %T/dump-gfx803.opt.bc -f -o - | %FileCheck %s
 // RUN: %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <vector>
 
 #define GRID_SIZE (1024)
@@ -11,16 +11,17 @@ int main() {
   using namespace hc;
   array<unsigned int, 1> table(GRID_SIZE);
   extent<1> ex(GRID_SIZE);
-  // CHECK-LABEL: define weak_odr amdgpu_kernel void @"_ZZ4mainEN3$_019__cxxamp_trampolineEPjii"
-  // CHECK-SAME:({{[^)]*}}){{[^#]*}}#[[ATTR0:[0-9]+]]
-  // CHECK: attributes #[[ATTR0]] = {{{.*}}"amdgpu-flat-work-group-size"="1,10" "amdgpu-max-work-group-dim"="10,1,1" "amdgpu-waves-per-eu"="5,6"
-  auto k = [&](index<1>& idx) [[hc]]
-                              [[hc_waves_per_eu(5,6)]]
-                              [[hc_flat_workgroup_size(1,10)]]
-                              [[hc_max_workgroup_dim(10,1,1)]]{
-    table(idx) = idx[0];
-  };
-  parallel_for_each(ex, k ).wait();
+  // CHECK-LABEL: define weak_odr amdgpu_kernel void {{.*Kernel_emitter.*}}"
+  // CHECK-SAME: {{[^#]*}}#[[ATTR0:[0-9]+]]
+  auto k = make_callable_with_AMDGPU_attributes<
+    Waves_per_eu<5, 6>,
+    Flat_workgroup_size<1, 10>
+    #if defined(NON_CLANG_ATTRIBUTES)
+      , Max_workgroup_dim<10, 1, 1>
+    #endif
+    >([&](index<1>& idx) [[hc]] { table(idx) = idx[0]; }
+  );
+  parallel_for_each(ex, k).wait();
 
   // verify result
   bool ret = true;
@@ -32,3 +33,4 @@ int main() {
   return !(ret == true);
 }
 
+// CHECK: attributes #[[ATTR0]] = {{{.*}}"amdgpu-flat-work-group-size"="1,10" "amdgpu-waves-per-eu"="5,6"
\ No newline at end of file
diff --git a/tests/Unit/AMDGPU/shfl.cpp b/tests/Unit/AMDGPU/shfl.cpp
index 8b8a793636e..97fac287c75 100644
--- a/tests/Unit/AMDGPU/shfl.cpp
+++ b/tests/Unit/AMDGPU/shfl.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 #include <vector>
diff --git a/tests/Unit/AMDGPU/shfl_down.cpp b/tests/Unit/AMDGPU/shfl_down.cpp
index ecddfa2301f..644bb7b7466 100644
--- a/tests/Unit/AMDGPU/shfl_down.cpp
+++ b/tests/Unit/AMDGPU/shfl_down.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 #include <vector>
diff --git a/tests/Unit/AMDGPU/shfl_scan.cpp b/tests/Unit/AMDGPU/shfl_scan.cpp
index 9236f110186..0913908b186 100644
--- a/tests/Unit/AMDGPU/shfl_scan.cpp
+++ b/tests/Unit/AMDGPU/shfl_scan.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 #include <vector>
diff --git a/tests/Unit/AMDGPU/shfl_up.cpp b/tests/Unit/AMDGPU/shfl_up.cpp
index 3856c772cbe..c2d96206ab4 100644
--- a/tests/Unit/AMDGPU/shfl_up.cpp
+++ b/tests/Unit/AMDGPU/shfl_up.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 #include <vector>
diff --git a/tests/Unit/AMDGPU/shfl_xor.cpp b/tests/Unit/AMDGPU/shfl_xor.cpp
index 207732d4ba6..6b15b0a3788 100644
--- a/tests/Unit/AMDGPU/shfl_xor.cpp
+++ b/tests/Unit/AMDGPU/shfl_xor.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 #include <vector>
diff --git a/tests/Unit/AMDGPU/vote_any_all.cpp b/tests/Unit/AMDGPU/vote_any_all.cpp
index ae7f7263329..a2b39cf0264 100644
--- a/tests/Unit/AMDGPU/vote_any_all.cpp
+++ b/tests/Unit/AMDGPU/vote_any_all.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AMDGPU/vote_ballot.cpp b/tests/Unit/AMDGPU/vote_ballot.cpp
index cf44c79fd50..2879f5b75b5 100644
--- a/tests/Unit/AMDGPU/vote_ballot.cpp
+++ b/tests/Unit/AMDGPU/vote_ballot.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AMDGPU/wavesize.cpp b/tests/Unit/AMDGPU/wavesize.cpp
index 038215c5b80..5a47d0ea7e5 100644
--- a/tests/Unit/AMDGPU/wavesize.cpp
+++ b/tests/Unit/AMDGPU/wavesize.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 #include <vector>
diff --git a/tests/Unit/AcceleratorViewCopy/avcopy_classic.cpp b/tests/Unit/AcceleratorViewCopy/avcopy_classic.cpp
index e919a5acd0d..14fdb82d70e 100644
--- a/tests/Unit/AcceleratorViewCopy/avcopy_classic.cpp
+++ b/tests/Unit/AcceleratorViewCopy/avcopy_classic.cpp
@@ -1,13 +1,11 @@
-// RUN: %hc %s -o %t.out -lhc_am -L/opt/rocm/lib -lhsa-runtime64 && %t.out
+// RUN: %hc %s -o %t.out && %t.out
 //
 // Test "classic" GPU pattern of H2D copies, followed by Kernels, followed by
 // D2H.
 // Test allows toggling explicit host-side syncs (via accelerator-view waits) vs
 // relying on efficient GPU hardware dependencies.
-#include <hc.hpp>
-#include <hc_am.hpp>
-
-#include "/opt/rocm/include/hsa/hsa.h"
+#include <hc/hc.hpp>
+#include <hc/hc_am.hpp>
 
 #include <algorithm>
 #include <cassert>
@@ -33,10 +31,11 @@ std::unique_ptr<T[], decltype(hc::am_free)*> hostAlloc(
             hc::am_alloc(
                 sizeof(T) * cnt,
                 const_cast<hc::accelerator&>(acc),
-                amHostPinned))
+                am_host_pinned))
                     : new T[cnt],
         host_pinned ? hc::am_free
-                    : [](void* p) { delete [] static_cast<T*>(p); return 0; }};
+                    : [](void* p) {
+                        delete [] static_cast<T*>(p); return AM_SUCCESS; }};
 
     return p;
 }
diff --git a/tests/Unit/AcceleratorViewCopy/avcopy_with_offsets_host_locked.cpp b/tests/Unit/AcceleratorViewCopy/avcopy_with_offsets_host_locked.cpp
index 49ad27ec0bf..010115fd86a 100644
--- a/tests/Unit/AcceleratorViewCopy/avcopy_with_offsets_host_locked.cpp
+++ b/tests/Unit/AcceleratorViewCopy/avcopy_with_offsets_host_locked.cpp
@@ -1,11 +1,11 @@
-// RUN: %hc %s -o %t.out -lhc_am && %t.out
+// RUN: %hc %s -o %t.out  && %t.out
 
 // Test hc::acclerator_view::copy()
 // with GPU buffers having offsets from the result of am_alloc
 // with CPU buffers page locked, and have offsets
 
-#include <hc.hpp>
-#include <hc_am.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_am.hpp>
 
 void rocm_device_synchronize()
 {
@@ -54,7 +54,7 @@ bool test() {
 
   // execute a kernel to populate data on GPU
   hc::extent<1> e(N);
-  hc::parallel_for_each(e,[=](hc::index<1> idx)__HC__{
+  hc::parallel_for_each(e,[=](hc::index<1> idx)[[hc]]{
     a[idx[0]] = 5;
   });
 
diff --git a/tests/Unit/AcceleratorViewCopy/avcopy_with_offsets_host_unlocked.cpp b/tests/Unit/AcceleratorViewCopy/avcopy_with_offsets_host_unlocked.cpp
index fd41e558798..2c8105178da 100644
--- a/tests/Unit/AcceleratorViewCopy/avcopy_with_offsets_host_unlocked.cpp
+++ b/tests/Unit/AcceleratorViewCopy/avcopy_with_offsets_host_unlocked.cpp
@@ -1,11 +1,11 @@
-// RUN: %hc %s -o %t.out -lhc_am && %t.out
+// RUN: %hc %s -o %t.out  && %t.out
 
 // Test hc::acclerator_view::copy()
 // with GPU buffers having offsets from the result of am_alloc
 // with CPU buffers un-locked, and have offsets
 
-#include <hc.hpp>
-#include <hc_am.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_am.hpp>
 
 void rocm_device_synchronize()
 {
@@ -48,7 +48,7 @@ bool test() {
 
   // execute a kernel to populate data on GPU
   hc::extent<1> e(N);
-  hc::parallel_for_each(e,[=](hc::index<1> idx)__HC__{
+  hc::parallel_for_each(e,[=](hc::index<1> idx)[[hc]]{
     a[idx[0]] = 5;
   });
 
diff --git a/tests/Unit/AcceleratorViewCopy/avfunc_l5_sync.cpp b/tests/Unit/AcceleratorViewCopy/avfunc_l5_sync.cpp
index 53f235d5f2e..19a422a372b 100644
--- a/tests/Unit/AcceleratorViewCopy/avfunc_l5_sync.cpp
+++ b/tests/Unit/AcceleratorViewCopy/avfunc_l5_sync.cpp
@@ -1,9 +1,9 @@
-// RUN: %hc %s -o %t.out -lhc_am && %t.out 
+// RUN: %hc %s -o %t.out  && %t.out 
 #include <stdlib.h>
 #include <iostream>
 
-#include <hc.hpp>
-#include <hc_am.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_am.hpp>
 
 
 // A few helper routines for writing tests:
@@ -57,8 +57,7 @@ inline void assertTwoArrays(int *A, int *B, size_t length){
 
 void runKernel(hc::accelerator_view &av, int *Ad){
     hc::parallel_for_each(av, hc::extent<1>(LEN), [=](hc::index<1> idx)[[hc]]{
-        int i = amp_get_global_id(0);
-        Ad[i] = Ad[i] + 1;
+        Ad[idx[0]] = Ad[idx[0]] + 1;
     });
 }
 
diff --git a/tests/Unit/AcceleratorViewCopy/common2.h b/tests/Unit/AcceleratorViewCopy/common2.h
index 093eaeef77d..875dee77b3e 100644
--- a/tests/Unit/AcceleratorViewCopy/common2.h
+++ b/tests/Unit/AcceleratorViewCopy/common2.h
@@ -11,7 +11,7 @@ inline hc::completion_future MemcpyAsync(hc::accelerator_view &av, const void *S
 }
 
 inline void* HostAlloc(hc::accelerator &Acc, size_t Size){
-    return hc::am_alloc(Size, Acc, amHostPinned);
+    return hc::am_alloc(Size, Acc, am_host_pinned);
 }
 
 inline void* DeviceAlloc(hc::accelerator &Acc, size_t Size){
diff --git a/tests/Unit/AcceleratorViewCopy/copy_coherency.cpp b/tests/Unit/AcceleratorViewCopy/copy_coherency.cpp
index e22e2ba4f6a..536bb6970f8 100644
--- a/tests/Unit/AcceleratorViewCopy/copy_coherency.cpp
+++ b/tests/Unit/AcceleratorViewCopy/copy_coherency.cpp
@@ -1,9 +1,9 @@
-// RUN: %hc %s -o %t.out -lhc_am -L/opt/rocm/lib -lhsa-runtime64 && %t.out
+// RUN: %hc %s -o %t.out && %t.out
 //
 // Test coherency and flushes.  Need to flush GPU caches before H2D copy
 
-#include <hc.hpp>
-#include <hc_am.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_am.hpp>
 
 
 void memsetIntKernel(hc::accelerator_view &av, int * ptr, int val, size_t numElements)
@@ -68,7 +68,7 @@ void singleAccelerator(int numElements)
 
     //int * A  = hc::am_alloc(sizeElements, acc, 0);
     int * B  = hc::am_alloc(sizeElements, acc, 0);
-    int * Bh = hc::am_alloc(sizeElements, acc, amHostPinned);
+    int * Bh = hc::am_alloc(sizeElements, acc, am_host_pinned);
 
     if (1) {
         printf ("test: running same-stream copy coherency test\n");
@@ -179,7 +179,7 @@ void multiAccelerator(int numElements)
     const size_t sizeElements = numElements * sizeof(int);
     int * dataGpu0 = hc::am_alloc(sizeElements, gpus[0], 0);
     int * dataGpu1 = hc::am_alloc(sizeElements, gpus[1], 0);
-    int * dataHost = hc::am_alloc(sizeElements, gpus[1], amHostPinned);
+    int * dataHost = hc::am_alloc(sizeElements, gpus[1], am_host_pinned);
 
     hc::accelerator_view av0 = gpus[0].create_view();
     hc::accelerator_view av1 = gpus[1].create_view();
@@ -259,14 +259,8 @@ int main()
     const size_t sizeElements = numElements * sizeof(int);
     printf ("info: buffer size = %6.2f MB\n", sizeElements / 1024.0 / 1024.0);
 
-    if (1) {
-        singleAccelerator(numElements);
-    }
-
-    // TODO - need to re-enable multi-GPU tests:
-    if (0) {
-        multiAccelerator(numElements);
-    }
+    singleAccelerator(numElements);
+    multiAccelerator(numElements);
 
     printf ("passed!\n");
     return 0;
diff --git a/tests/Unit/AcceleratorViewCopy/copy_coherency2.cpp b/tests/Unit/AcceleratorViewCopy/copy_coherency2.cpp
index 3862f2fa90a..51ab53f128b 100644
--- a/tests/Unit/AcceleratorViewCopy/copy_coherency2.cpp
+++ b/tests/Unit/AcceleratorViewCopy/copy_coherency2.cpp
@@ -1,9 +1,9 @@
-// RUN: %hc %s -o %t.out -lhc_am -L/opt/rocm/lib -lhsa-runtime64 && %t.out
+// RUN: %hc %s -o %t.out && %t.out
 //
 // Test coherency and flushes.  Need to flush GPU caches before H2D copy
 
-#include <hc.hpp>
-#include <hc_am.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_am.hpp>
 
 
 void memsetIntKernel(hc::accelerator_view &av, int * ptr, int val, size_t numElements)
@@ -44,8 +44,8 @@ void test(int numElements)
 
     int * B  = hc::am_alloc(sizeElements, acc, 0);
     int * C  = hc::am_alloc(sizeElements, acc, 0);
-    int * Bh = hc::am_alloc(sizeElements, acc, amHostPinned);
-    int * Ch = hc::am_alloc(sizeElements, acc, amHostPinned);
+    int * Bh = hc::am_alloc(sizeElements, acc, am_host_pinned);
+    int * Ch = hc::am_alloc(sizeElements, acc, am_host_pinned);
 
     const int expected = 42;
     memsetIntKernel(av, Bh, expected, numElements);
diff --git a/tests/Unit/AmpMath/amp_math_acos.cpp b/tests/Unit/AmpMath/amp_math_acos.cpp
index 9abb24a7e96..bd7a2ab592f 100644
--- a/tests/Unit/AmpMath/amp_math_acos.cpp
+++ b/tests/Unit/AmpMath/amp_math_acos.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_acos_precise_math.cpp b/tests/Unit/AmpMath/amp_math_acos_precise_math.cpp
index 238fbb3ef92..b52d1255cf5 100644
--- a/tests/Unit/AmpMath/amp_math_acos_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_acos_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_acosf.cpp b/tests/Unit/AmpMath/amp_math_acosf.cpp
index 5ec24ef47ac..50b0063ea51 100644
--- a/tests/Unit/AmpMath/amp_math_acosf.cpp
+++ b/tests/Unit/AmpMath/amp_math_acosf.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_acosh_precise_math.cpp b/tests/Unit/AmpMath/amp_math_acosh_precise_math.cpp
index d4a10f01fe4..d44ad4a1fa6 100644
--- a/tests/Unit/AmpMath/amp_math_acosh_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_acosh_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_asin.cpp b/tests/Unit/AmpMath/amp_math_asin.cpp
index 9680e1ea4bd..578d29a5c5f 100644
--- a/tests/Unit/AmpMath/amp_math_asin.cpp
+++ b/tests/Unit/AmpMath/amp_math_asin.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_asin_precise_math.cpp b/tests/Unit/AmpMath/amp_math_asin_precise_math.cpp
index 6a4e9c3a9de..bfb3336058f 100644
--- a/tests/Unit/AmpMath/amp_math_asin_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_asin_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_asinf.cpp b/tests/Unit/AmpMath/amp_math_asinf.cpp
index a9089332eb1..6e23faca482 100644
--- a/tests/Unit/AmpMath/amp_math_asinf.cpp
+++ b/tests/Unit/AmpMath/amp_math_asinf.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_asinh_precise_math.cpp b/tests/Unit/AmpMath/amp_math_asinh_precise_math.cpp
index 04ec1f3e58e..e54a7cca476 100644
--- a/tests/Unit/AmpMath/amp_math_asinh_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_asinh_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_atan.cpp b/tests/Unit/AmpMath/amp_math_atan.cpp
index 3b4451b174f..7afdab0a6de 100644
--- a/tests/Unit/AmpMath/amp_math_atan.cpp
+++ b/tests/Unit/AmpMath/amp_math_atan.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_atan2.cpp b/tests/Unit/AmpMath/amp_math_atan2.cpp
index cc4cfe95c44..60357557976 100644
--- a/tests/Unit/AmpMath/amp_math_atan2.cpp
+++ b/tests/Unit/AmpMath/amp_math_atan2.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_atan2_precise_math.cpp b/tests/Unit/AmpMath/amp_math_atan2_precise_math.cpp
index 3e8c4cf25b6..1e6648e1da2 100644
--- a/tests/Unit/AmpMath/amp_math_atan2_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_atan2_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_atan2f.cpp b/tests/Unit/AmpMath/amp_math_atan2f.cpp
index 07570bdcea8..e3ba4ae1542 100644
--- a/tests/Unit/AmpMath/amp_math_atan2f.cpp
+++ b/tests/Unit/AmpMath/amp_math_atan2f.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_atan2f_precise_math.cpp b/tests/Unit/AmpMath/amp_math_atan2f_precise_math.cpp
index 589d2c8f613..529e6105ad0 100644
--- a/tests/Unit/AmpMath/amp_math_atan2f_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_atan2f_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_atan_precise_math.cpp b/tests/Unit/AmpMath/amp_math_atan_precise_math.cpp
index 80ecfcc9710..5354f74222f 100644
--- a/tests/Unit/AmpMath/amp_math_atan_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_atan_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_atanf.cpp b/tests/Unit/AmpMath/amp_math_atanf.cpp
index 2dda8882c30..b2ce03769e6 100644
--- a/tests/Unit/AmpMath/amp_math_atanf.cpp
+++ b/tests/Unit/AmpMath/amp_math_atanf.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_atanh_precise_math.cpp b/tests/Unit/AmpMath/amp_math_atanh_precise_math.cpp
index 9ba2c066c3f..8500b28c5f3 100644
--- a/tests/Unit/AmpMath/amp_math_atanh_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_atanh_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_cbrt_precise_math.cpp b/tests/Unit/AmpMath/amp_math_cbrt_precise_math.cpp
index 177130cf982..a5425ef2fb6 100644
--- a/tests/Unit/AmpMath/amp_math_cbrt_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_cbrt_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_cbrtf_precise_math.cpp b/tests/Unit/AmpMath/amp_math_cbrtf_precise_math.cpp
index 096132089c3..717d7b0b948 100644
--- a/tests/Unit/AmpMath/amp_math_cbrtf_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_cbrtf_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_ceil.cpp b/tests/Unit/AmpMath/amp_math_ceil.cpp
index 9fa9408e35a..0e64c171a4c 100644
--- a/tests/Unit/AmpMath/amp_math_ceil.cpp
+++ b/tests/Unit/AmpMath/amp_math_ceil.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_ceil_precise_math.cpp b/tests/Unit/AmpMath/amp_math_ceil_precise_math.cpp
index 85ab3f9ada9..0f865125fa4 100644
--- a/tests/Unit/AmpMath/amp_math_ceil_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_ceil_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_ceilf.cpp b/tests/Unit/AmpMath/amp_math_ceilf.cpp
index 5380c04bdc8..b278b4c03a1 100644
--- a/tests/Unit/AmpMath/amp_math_ceilf.cpp
+++ b/tests/Unit/AmpMath/amp_math_ceilf.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_copysign_precise_math.cpp b/tests/Unit/AmpMath/amp_math_copysign_precise_math.cpp
index 8c73534f785..496874e82f2 100644
--- a/tests/Unit/AmpMath/amp_math_copysign_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_copysign_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_copysignf_precise_math.cpp b/tests/Unit/AmpMath/amp_math_copysignf_precise_math.cpp
index a75eb7abb50..f5a09b692f0 100644
--- a/tests/Unit/AmpMath/amp_math_copysignf_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_copysignf_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_cos.cpp b/tests/Unit/AmpMath/amp_math_cos.cpp
index 75455e3c889..3a686aab8c7 100644
--- a/tests/Unit/AmpMath/amp_math_cos.cpp
+++ b/tests/Unit/AmpMath/amp_math_cos.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_cos_precise_math.cpp b/tests/Unit/AmpMath/amp_math_cos_precise_math.cpp
index ea14cb4c3df..817e52787cf 100644
--- a/tests/Unit/AmpMath/amp_math_cos_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_cos_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_cosf.cpp b/tests/Unit/AmpMath/amp_math_cosf.cpp
index 3cd40409fe8..b6de1616aa4 100644
--- a/tests/Unit/AmpMath/amp_math_cosf.cpp
+++ b/tests/Unit/AmpMath/amp_math_cosf.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_cosh.cpp b/tests/Unit/AmpMath/amp_math_cosh.cpp
index e687b424f44..9393c4a59b3 100644
--- a/tests/Unit/AmpMath/amp_math_cosh.cpp
+++ b/tests/Unit/AmpMath/amp_math_cosh.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_cosh_precise_math.cpp b/tests/Unit/AmpMath/amp_math_cosh_precise_math.cpp
index 477e4382119..93658dec440 100644
--- a/tests/Unit/AmpMath/amp_math_cosh_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_cosh_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_coshf.cpp b/tests/Unit/AmpMath/amp_math_coshf.cpp
index 972c95d3360..d0eb9b17234 100644
--- a/tests/Unit/AmpMath/amp_math_coshf.cpp
+++ b/tests/Unit/AmpMath/amp_math_coshf.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_coshf_precise_math.cpp b/tests/Unit/AmpMath/amp_math_coshf_precise_math.cpp
index eb2df4ae66e..1c2d497722d 100644
--- a/tests/Unit/AmpMath/amp_math_coshf_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_coshf_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_cospi_precise_math.cpp b/tests/Unit/AmpMath/amp_math_cospi_precise_math.cpp
index 1663ce5a165..47f5c00d985 100644
--- a/tests/Unit/AmpMath/amp_math_cospi_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_cospi_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <cmath>
 #include <cassert>
diff --git a/tests/Unit/AmpMath/amp_math_cospif_precise_math.cpp b/tests/Unit/AmpMath/amp_math_cospif_precise_math.cpp
index 1663ce5a165..47f5c00d985 100644
--- a/tests/Unit/AmpMath/amp_math_cospif_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_cospif_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <cmath>
 #include <cassert>
diff --git a/tests/Unit/AmpMath/amp_math_erf_precise_math.cpp b/tests/Unit/AmpMath/amp_math_erf_precise_math.cpp
index cce847c3789..83e9e4ac0ca 100644
--- a/tests/Unit/AmpMath/amp_math_erf_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_erf_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_erfc_precise_math.cpp b/tests/Unit/AmpMath/amp_math_erfc_precise_math.cpp
index 4c7a576ce13..c84bac0610e 100644
--- a/tests/Unit/AmpMath/amp_math_erfc_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_erfc_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_erfcf_precise_math.cpp b/tests/Unit/AmpMath/amp_math_erfcf_precise_math.cpp
index d4fa4a617f1..c0d9bb31673 100644
--- a/tests/Unit/AmpMath/amp_math_erfcf_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_erfcf_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_erff_precise_math.cpp b/tests/Unit/AmpMath/amp_math_erff_precise_math.cpp
index ad216c20aae..fac9eede43a 100644
--- a/tests/Unit/AmpMath/amp_math_erff_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_erff_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_exp.cpp b/tests/Unit/AmpMath/amp_math_exp.cpp
index fd834fac3d6..136e760ccee 100644
--- a/tests/Unit/AmpMath/amp_math_exp.cpp
+++ b/tests/Unit/AmpMath/amp_math_exp.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_exp10.cpp b/tests/Unit/AmpMath/amp_math_exp10.cpp
index 66650f0d006..3909442b60b 100644
--- a/tests/Unit/AmpMath/amp_math_exp10.cpp
+++ b/tests/Unit/AmpMath/amp_math_exp10.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_exp10_precise_math.cpp b/tests/Unit/AmpMath/amp_math_exp10_precise_math.cpp
index b99e6fce951..369446f5302 100644
--- a/tests/Unit/AmpMath/amp_math_exp10_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_exp10_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_exp10f.cpp b/tests/Unit/AmpMath/amp_math_exp10f.cpp
index df81ec74530..4edcd59508d 100644
--- a/tests/Unit/AmpMath/amp_math_exp10f.cpp
+++ b/tests/Unit/AmpMath/amp_math_exp10f.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_exp2.cpp b/tests/Unit/AmpMath/amp_math_exp2.cpp
index 30ad66243d3..d0ada00cb6e 100644
--- a/tests/Unit/AmpMath/amp_math_exp2.cpp
+++ b/tests/Unit/AmpMath/amp_math_exp2.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_exp2_precise_math.cpp b/tests/Unit/AmpMath/amp_math_exp2_precise_math.cpp
index a763caa8a43..c242709740a 100644
--- a/tests/Unit/AmpMath/amp_math_exp2_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_exp2_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_exp2f.cpp b/tests/Unit/AmpMath/amp_math_exp2f.cpp
index daf80343061..05a0f5fec07 100644
--- a/tests/Unit/AmpMath/amp_math_exp2f.cpp
+++ b/tests/Unit/AmpMath/amp_math_exp2f.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_exp_precise_math.cpp b/tests/Unit/AmpMath/amp_math_exp_precise_math.cpp
index c22643b5218..5090f45b04d 100644
--- a/tests/Unit/AmpMath/amp_math_exp_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_exp_precise_math.cpp
@@ -5,8 +5,8 @@
 // random failure on fiji. Re-enable it after JIRA
 // ticket 136805 is fixed.
 
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_expf.cpp b/tests/Unit/AmpMath/amp_math_expf.cpp
index a75ab23e944..ff861dd2a44 100644
--- a/tests/Unit/AmpMath/amp_math_expf.cpp
+++ b/tests/Unit/AmpMath/amp_math_expf.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_expf_precise_math.cpp b/tests/Unit/AmpMath/amp_math_expf_precise_math.cpp
index 2cb11c0efcc..f487acaa1b6 100644
--- a/tests/Unit/AmpMath/amp_math_expf_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_expf_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_expm1.cpp b/tests/Unit/AmpMath/amp_math_expm1.cpp
index 37c26d55550..e1457865446 100644
--- a/tests/Unit/AmpMath/amp_math_expm1.cpp
+++ b/tests/Unit/AmpMath/amp_math_expm1.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_expm1_precise_math.cpp b/tests/Unit/AmpMath/amp_math_expm1_precise_math.cpp
index 7edd5c46923..156bb100d6b 100644
--- a/tests/Unit/AmpMath/amp_math_expm1_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_expm1_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_expm1f.cpp b/tests/Unit/AmpMath/amp_math_expm1f.cpp
index 989188c4c6b..e61f1d3623a 100644
--- a/tests/Unit/AmpMath/amp_math_expm1f.cpp
+++ b/tests/Unit/AmpMath/amp_math_expm1f.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_fdim_precise_math.cpp b/tests/Unit/AmpMath/amp_math_fdim_precise_math.cpp
index 2d16239175d..f5110f0e4a4 100644
--- a/tests/Unit/AmpMath/amp_math_fdim_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_fdim_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_floor.cpp b/tests/Unit/AmpMath/amp_math_floor.cpp
index 3fafec888c3..7621dac497e 100644
--- a/tests/Unit/AmpMath/amp_math_floor.cpp
+++ b/tests/Unit/AmpMath/amp_math_floor.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_floor_precise_math.cpp b/tests/Unit/AmpMath/amp_math_floor_precise_math.cpp
index 7979f24978d..25f41efbae2 100644
--- a/tests/Unit/AmpMath/amp_math_floor_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_floor_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_floorf.cpp b/tests/Unit/AmpMath/amp_math_floorf.cpp
index b0b6aab026c..562c6c953a9 100644
--- a/tests/Unit/AmpMath/amp_math_floorf.cpp
+++ b/tests/Unit/AmpMath/amp_math_floorf.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_fma_precise_math.cpp b/tests/Unit/AmpMath/amp_math_fma_precise_math.cpp
index 671e68e9ae3..800e219dd9e 100644
--- a/tests/Unit/AmpMath/amp_math_fma_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_fma_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_fmaf_precise_math.cpp b/tests/Unit/AmpMath/amp_math_fmaf_precise_math.cpp
index b6372b0cb85..a1b1040eaa2 100644
--- a/tests/Unit/AmpMath/amp_math_fmaf_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_fmaf_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_fmax.cpp b/tests/Unit/AmpMath/amp_math_fmax.cpp
index 607e24b4bf9..6fbc10372e2 100644
--- a/tests/Unit/AmpMath/amp_math_fmax.cpp
+++ b/tests/Unit/AmpMath/amp_math_fmax.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_fmax_precise_math.cpp b/tests/Unit/AmpMath/amp_math_fmax_precise_math.cpp
index 67cdb67443b..fda10b34a81 100644
--- a/tests/Unit/AmpMath/amp_math_fmax_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_fmax_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_fmaxf.cpp b/tests/Unit/AmpMath/amp_math_fmaxf.cpp
index ffc61ddc861..a5fdda09617 100644
--- a/tests/Unit/AmpMath/amp_math_fmaxf.cpp
+++ b/tests/Unit/AmpMath/amp_math_fmaxf.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_fmin.cpp b/tests/Unit/AmpMath/amp_math_fmin.cpp
index fdd7268a429..38d73c2f06c 100644
--- a/tests/Unit/AmpMath/amp_math_fmin.cpp
+++ b/tests/Unit/AmpMath/amp_math_fmin.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_fmin_precise_math.cpp b/tests/Unit/AmpMath/amp_math_fmin_precise_math.cpp
index 8d29f3e07ac..16e14edb8c5 100644
--- a/tests/Unit/AmpMath/amp_math_fmin_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_fmin_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_fminf.cpp b/tests/Unit/AmpMath/amp_math_fminf.cpp
index 71e29460c01..10a22030e97 100644
--- a/tests/Unit/AmpMath/amp_math_fminf.cpp
+++ b/tests/Unit/AmpMath/amp_math_fminf.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_fmod.cpp b/tests/Unit/AmpMath/amp_math_fmod.cpp
index a8f90298d3a..4293f058079 100644
--- a/tests/Unit/AmpMath/amp_math_fmod.cpp
+++ b/tests/Unit/AmpMath/amp_math_fmod.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_fmod_precise_math.cpp b/tests/Unit/AmpMath/amp_math_fmod_precise_math.cpp
index 35e308a1524..a3578f7841b 100644
--- a/tests/Unit/AmpMath/amp_math_fmod_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_fmod_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_fmodf.cpp b/tests/Unit/AmpMath/amp_math_fmodf.cpp
index eca54443b20..1554cd483e6 100644
--- a/tests/Unit/AmpMath/amp_math_fmodf.cpp
+++ b/tests/Unit/AmpMath/amp_math_fmodf.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_hypot_precise_math.cpp b/tests/Unit/AmpMath/amp_math_hypot_precise_math.cpp
index 9ab5e532f78..b580d3b7e2f 100644
--- a/tests/Unit/AmpMath/amp_math_hypot_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_hypot_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_ilogb.cpp b/tests/Unit/AmpMath/amp_math_ilogb.cpp
index 5812a69ac0a..fe88e657835 100644
--- a/tests/Unit/AmpMath/amp_math_ilogb.cpp
+++ b/tests/Unit/AmpMath/amp_math_ilogb.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_ilogb_precise_math.cpp b/tests/Unit/AmpMath/amp_math_ilogb_precise_math.cpp
index 37d8c8621ad..125196bbddc 100644
--- a/tests/Unit/AmpMath/amp_math_ilogb_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_ilogb_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_ilogbf.cpp b/tests/Unit/AmpMath/amp_math_ilogbf.cpp
index bd7de28f7e6..4f6b6c334f1 100644
--- a/tests/Unit/AmpMath/amp_math_ilogbf.cpp
+++ b/tests/Unit/AmpMath/amp_math_ilogbf.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_isfinite.cpp b/tests/Unit/AmpMath/amp_math_isfinite.cpp
index c3798778b51..ff9484eeaa2 100644
--- a/tests/Unit/AmpMath/amp_math_isfinite.cpp
+++ b/tests/Unit/AmpMath/amp_math_isfinite.cpp
@@ -1,9 +1,9 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <limits>
-#include <hc_math.hpp>
+#include <hc/hc_math.hpp>
 
 using namespace hc;
 
diff --git a/tests/Unit/AmpMath/amp_math_isfinite_precise_math.cpp b/tests/Unit/AmpMath/amp_math_isfinite_precise_math.cpp
index ed27f4d845f..37e3f86dcf3 100644
--- a/tests/Unit/AmpMath/amp_math_isfinite_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_isfinite_precise_math.cpp
@@ -1,9 +1,9 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <limits>
-#include <hc_math.hpp>
+#include <hc/hc_math.hpp>
 #include <cmath>
 #include <cassert>
 
diff --git a/tests/Unit/AmpMath/amp_math_isinf.cpp b/tests/Unit/AmpMath/amp_math_isinf.cpp
index c88e2eee8c0..b6a9806e250 100644
--- a/tests/Unit/AmpMath/amp_math_isinf.cpp
+++ b/tests/Unit/AmpMath/amp_math_isinf.cpp
@@ -1,10 +1,10 @@
 
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <limits>
-#include <hc_math.hpp>
+#include <hc/hc_math.hpp>
 
 using namespace hc;
 
diff --git a/tests/Unit/AmpMath/amp_math_isinf_precise_math.cpp b/tests/Unit/AmpMath/amp_math_isinf_precise_math.cpp
index 638de6ff1bb..cc458c44bee 100644
--- a/tests/Unit/AmpMath/amp_math_isinf_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_isinf_precise_math.cpp
@@ -1,9 +1,9 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <limits>
-#include <hc_math.hpp>
+#include <hc/hc_math.hpp>
 #include <cmath>
 #include <cassert>
 
diff --git a/tests/Unit/AmpMath/amp_math_isnan.cpp b/tests/Unit/AmpMath/amp_math_isnan.cpp
index fec6d69fa2a..fcd8231ab93 100644
--- a/tests/Unit/AmpMath/amp_math_isnan.cpp
+++ b/tests/Unit/AmpMath/amp_math_isnan.cpp
@@ -1,9 +1,9 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <limits>
-#include <hc_math.hpp>
+#include <hc/hc_math.hpp>
 #include <cmath>
 #include <cassert>
 
diff --git a/tests/Unit/AmpMath/amp_math_isnan_precise_math.cpp b/tests/Unit/AmpMath/amp_math_isnan_precise_math.cpp
index d9bb1851170..26254ac1257 100644
--- a/tests/Unit/AmpMath/amp_math_isnan_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_isnan_precise_math.cpp
@@ -1,9 +1,9 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <limits>
-#include <hc_math.hpp>
+#include <hc/hc_math.hpp>
 #include <cmath>
 #include <cassert>
 
diff --git a/tests/Unit/AmpMath/amp_math_isnormal.cpp b/tests/Unit/AmpMath/amp_math_isnormal.cpp
index fca610102fc..4e29abe7d5e 100644
--- a/tests/Unit/AmpMath/amp_math_isnormal.cpp
+++ b/tests/Unit/AmpMath/amp_math_isnormal.cpp
@@ -1,9 +1,9 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <limits>
-#include <hc_math.hpp>
+#include <hc/hc_math.hpp>
 
 using namespace hc;
 
diff --git a/tests/Unit/AmpMath/amp_math_isnormal_precise_math.cpp b/tests/Unit/AmpMath/amp_math_isnormal_precise_math.cpp
index c7781acba7e..ed9877b78d5 100644
--- a/tests/Unit/AmpMath/amp_math_isnormal_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_isnormal_precise_math.cpp
@@ -1,9 +1,9 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <limits>
-#include <hc_math.hpp>
+#include <hc/hc_math.hpp>
 #include <cmath>
 #include <cassert>
 
diff --git a/tests/Unit/AmpMath/amp_math_ldexp.cpp b/tests/Unit/AmpMath/amp_math_ldexp.cpp
index e38f9aff1ae..64840618a45 100644
--- a/tests/Unit/AmpMath/amp_math_ldexp.cpp
+++ b/tests/Unit/AmpMath/amp_math_ldexp.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_ldexp_precise_math.cpp b/tests/Unit/AmpMath/amp_math_ldexp_precise_math.cpp
index d3084093adc..40eef985f85 100644
--- a/tests/Unit/AmpMath/amp_math_ldexp_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_ldexp_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_ldexpf.cpp b/tests/Unit/AmpMath/amp_math_ldexpf.cpp
index d777d3e03b6..27027b72df5 100644
--- a/tests/Unit/AmpMath/amp_math_ldexpf.cpp
+++ b/tests/Unit/AmpMath/amp_math_ldexpf.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_ldexpf_precise_math.cpp b/tests/Unit/AmpMath/amp_math_ldexpf_precise_math.cpp
index 71c3bd4cd1c..a3482908027 100644
--- a/tests/Unit/AmpMath/amp_math_ldexpf_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_ldexpf_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_log.cpp b/tests/Unit/AmpMath/amp_math_log.cpp
index 9b4b814ac6d..59f36391c57 100644
--- a/tests/Unit/AmpMath/amp_math_log.cpp
+++ b/tests/Unit/AmpMath/amp_math_log.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_log10.cpp b/tests/Unit/AmpMath/amp_math_log10.cpp
index e075cbefbf4..682fb7cbdbd 100644
--- a/tests/Unit/AmpMath/amp_math_log10.cpp
+++ b/tests/Unit/AmpMath/amp_math_log10.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_log10_precise_math.cpp b/tests/Unit/AmpMath/amp_math_log10_precise_math.cpp
index eb19cab2a30..18124b6b81e 100644
--- a/tests/Unit/AmpMath/amp_math_log10_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_log10_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_log10f.cpp b/tests/Unit/AmpMath/amp_math_log10f.cpp
index 21b3d1489bb..26cb19bddee 100644
--- a/tests/Unit/AmpMath/amp_math_log10f.cpp
+++ b/tests/Unit/AmpMath/amp_math_log10f.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_log1p_precise_math.cpp b/tests/Unit/AmpMath/amp_math_log1p_precise_math.cpp
index b87d0152527..74cb5e6cd32 100644
--- a/tests/Unit/AmpMath/amp_math_log1p_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_log1p_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_log1pf_precise_math.cpp b/tests/Unit/AmpMath/amp_math_log1pf_precise_math.cpp
index c2fe37042d8..dc71f43065d 100644
--- a/tests/Unit/AmpMath/amp_math_log1pf_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_log1pf_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_log2.cpp b/tests/Unit/AmpMath/amp_math_log2.cpp
index b07e8c7634c..157d3dd5592 100644
--- a/tests/Unit/AmpMath/amp_math_log2.cpp
+++ b/tests/Unit/AmpMath/amp_math_log2.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_log2_precise_math.cpp b/tests/Unit/AmpMath/amp_math_log2_precise_math.cpp
index 53a05051165..4135f57fcc4 100644
--- a/tests/Unit/AmpMath/amp_math_log2_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_log2_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_log2f.cpp b/tests/Unit/AmpMath/amp_math_log2f.cpp
index 701f2b0e410..25b8681e6db 100644
--- a/tests/Unit/AmpMath/amp_math_log2f.cpp
+++ b/tests/Unit/AmpMath/amp_math_log2f.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_log_precise_math.cpp b/tests/Unit/AmpMath/amp_math_log_precise_math.cpp
index 605d05ab85f..c9731f84f23 100644
--- a/tests/Unit/AmpMath/amp_math_log_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_log_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_logb_precise_math.cpp b/tests/Unit/AmpMath/amp_math_logb_precise_math.cpp
index 3d1e95f0545..e4f6ca013f4 100644
--- a/tests/Unit/AmpMath/amp_math_logb_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_logb_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_logbf_precise_math.cpp b/tests/Unit/AmpMath/amp_math_logbf_precise_math.cpp
index 760befeaeaf..6fe5a9700de 100644
--- a/tests/Unit/AmpMath/amp_math_logbf_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_logbf_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_max.cpp b/tests/Unit/AmpMath/amp_math_max.cpp
index 1425ee96dd4..cb117301047 100644
--- a/tests/Unit/AmpMath/amp_math_max.cpp
+++ b/tests/Unit/AmpMath/amp_math_max.cpp
@@ -2,8 +2,8 @@
 
 #if !DISABLED_PENDING_REMOVAL
   // RUN: %cxxamp %s -o %t.out && %t.out
-  #include <hc.hpp>
-  #include <hc_math.hpp>
+  #include <hc/hc.hpp>
+  #include <hc/hc_math.hpp>
 
   #include <iostream>
   #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_max_precise_math.cpp b/tests/Unit/AmpMath/amp_math_max_precise_math.cpp
index 21c226bbfb8..951a4b3d8e4 100644
--- a/tests/Unit/AmpMath/amp_math_max_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_max_precise_math.cpp
@@ -2,8 +2,8 @@
 
 #if !DISABLED_PENDING_REMOVAL
   // RUN: %cxxamp %s -o %t.out && %t.out
-  #include <hc.hpp>
-  #include <hc_math.hpp>
+  #include <hc/hc.hpp>
+  #include <hc/hc_math.hpp>
 
   #include <iostream>
   #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_min.cpp b/tests/Unit/AmpMath/amp_math_min.cpp
index a90ac8705b0..b34cd1a5b97 100644
--- a/tests/Unit/AmpMath/amp_math_min.cpp
+++ b/tests/Unit/AmpMath/amp_math_min.cpp
@@ -2,8 +2,8 @@
 
 #if !DISABLED_PENDING_REMOVAL
   // RUN: %cxxamp %s -o %t.out && %t.out
-  #include <hc.hpp>
-  #include <hc_math.hpp>
+  #include <hc/hc.hpp>
+  #include <hc/hc_math.hpp>
 
   #include <iostream>
   #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_min_precise_math.cpp b/tests/Unit/AmpMath/amp_math_min_precise_math.cpp
index bde8bd48892..4330ada2ec8 100644
--- a/tests/Unit/AmpMath/amp_math_min_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_min_precise_math.cpp
@@ -2,8 +2,8 @@
 
 #if !DISABLED_PENDING_REMOVAL
   // RUN: %cxxamp %s -o %t.out && %t.out
-  #include <hc.hpp>
-  #include <hc_math.hpp>
+  #include <hc/hc.hpp>
+  #include <hc/hc_math.hpp>
 
   #include <iostream>
   #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_nearbyint_precise_math.cpp b/tests/Unit/AmpMath/amp_math_nearbyint_precise_math.cpp
index 227696d658b..433ff407762 100644
--- a/tests/Unit/AmpMath/amp_math_nearbyint_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_nearbyint_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_nextafter_precise_math.cpp b/tests/Unit/AmpMath/amp_math_nextafter_precise_math.cpp
index f8f6b33d4fb..40dad3282ad 100644
--- a/tests/Unit/AmpMath/amp_math_nextafter_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_nextafter_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_pow.cpp b/tests/Unit/AmpMath/amp_math_pow.cpp
index dcadf3ef8dc..9dc0ed73b77 100644
--- a/tests/Unit/AmpMath/amp_math_pow.cpp
+++ b/tests/Unit/AmpMath/amp_math_pow.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_pow_precise_math.cpp b/tests/Unit/AmpMath/amp_math_pow_precise_math.cpp
index ba151313935..c3b2b127b54 100644
--- a/tests/Unit/AmpMath/amp_math_pow_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_pow_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_powf.cpp b/tests/Unit/AmpMath/amp_math_powf.cpp
index bb40e3474aa..de9634d545e 100644
--- a/tests/Unit/AmpMath/amp_math_powf.cpp
+++ b/tests/Unit/AmpMath/amp_math_powf.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_rcbrt_precise_math.cpp b/tests/Unit/AmpMath/amp_math_rcbrt_precise_math.cpp
index 417c7f5ee03..d74ac3854fa 100644
--- a/tests/Unit/AmpMath/amp_math_rcbrt_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_rcbrt_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_rcbrtf_precise_math.cpp b/tests/Unit/AmpMath/amp_math_rcbrtf_precise_math.cpp
index c8a5e51a649..7cd821aa962 100644
--- a/tests/Unit/AmpMath/amp_math_rcbrtf_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_rcbrtf_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_remainder_precise_math.cpp b/tests/Unit/AmpMath/amp_math_remainder_precise_math.cpp
index c5b07f5b0d5..b93c3461ad3 100644
--- a/tests/Unit/AmpMath/amp_math_remainder_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_remainder_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_remainderf_precise_math.cpp b/tests/Unit/AmpMath/amp_math_remainderf_precise_math.cpp
index 97c5620ac71..5fdc969caf5 100644
--- a/tests/Unit/AmpMath/amp_math_remainderf_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_remainderf_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_round.cpp b/tests/Unit/AmpMath/amp_math_round.cpp
index c7d856d448a..0c415558abd 100644
--- a/tests/Unit/AmpMath/amp_math_round.cpp
+++ b/tests/Unit/AmpMath/amp_math_round.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_round_precise_math.cpp b/tests/Unit/AmpMath/amp_math_round_precise_math.cpp
index 11684ab785c..0f21563bd6e 100644
--- a/tests/Unit/AmpMath/amp_math_round_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_round_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_roundf.cpp b/tests/Unit/AmpMath/amp_math_roundf.cpp
index 89a902810ac..e4899041906 100644
--- a/tests/Unit/AmpMath/amp_math_roundf.cpp
+++ b/tests/Unit/AmpMath/amp_math_roundf.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_rsqrt.cpp b/tests/Unit/AmpMath/amp_math_rsqrt.cpp
index 4392363bc22..357be871f26 100644
--- a/tests/Unit/AmpMath/amp_math_rsqrt.cpp
+++ b/tests/Unit/AmpMath/amp_math_rsqrt.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_rsqrt_precise_math.cpp b/tests/Unit/AmpMath/amp_math_rsqrt_precise_math.cpp
index 57951c36293..598bc0f331d 100644
--- a/tests/Unit/AmpMath/amp_math_rsqrt_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_rsqrt_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_rsqrtf.cpp b/tests/Unit/AmpMath/amp_math_rsqrtf.cpp
index 42a09cf3afe..ffd01664768 100644
--- a/tests/Unit/AmpMath/amp_math_rsqrtf.cpp
+++ b/tests/Unit/AmpMath/amp_math_rsqrtf.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_scalb_precise_math.cpp b/tests/Unit/AmpMath/amp_math_scalb_precise_math.cpp
index cad62bb926e..30be418df33 100644
--- a/tests/Unit/AmpMath/amp_math_scalb_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_scalb_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_scalbn_precise_math.cpp b/tests/Unit/AmpMath/amp_math_scalbn_precise_math.cpp
index fb09922fd79..35df0ecdf96 100644
--- a/tests/Unit/AmpMath/amp_math_scalbn_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_scalbn_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_scalbnf_precise_math.cpp b/tests/Unit/AmpMath/amp_math_scalbnf_precise_math.cpp
index 41992683df1..cf1778582da 100644
--- a/tests/Unit/AmpMath/amp_math_scalbnf_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_scalbnf_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_signbit.cpp b/tests/Unit/AmpMath/amp_math_signbit.cpp
index 380682bcded..f1ce4403640 100644
--- a/tests/Unit/AmpMath/amp_math_signbit.cpp
+++ b/tests/Unit/AmpMath/amp_math_signbit.cpp
@@ -1,9 +1,9 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <limits>
-#include <hc_math.hpp>
+#include <hc/hc_math.hpp>
 
 using namespace hc;
 
diff --git a/tests/Unit/AmpMath/amp_math_signbit_precise_math.cpp b/tests/Unit/AmpMath/amp_math_signbit_precise_math.cpp
index 03d41914cf0..a0ba8fd6e3f 100644
--- a/tests/Unit/AmpMath/amp_math_signbit_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_signbit_precise_math.cpp
@@ -1,9 +1,9 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <limits>
-#include <hc_math.hpp>
+#include <hc/hc_math.hpp>
 #include <cmath>
 #include <cassert>
 
diff --git a/tests/Unit/AmpMath/amp_math_signbitf.cpp b/tests/Unit/AmpMath/amp_math_signbitf.cpp
index 4be8d8d012f..c7a39ec57de 100644
--- a/tests/Unit/AmpMath/amp_math_signbitf.cpp
+++ b/tests/Unit/AmpMath/amp_math_signbitf.cpp
@@ -1,9 +1,9 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <limits>
-#include <hc_math.hpp>
+#include <hc/hc_math.hpp>
 #include <cmath>
 #include <cassert>
 
diff --git a/tests/Unit/AmpMath/amp_math_sin.cpp b/tests/Unit/AmpMath/amp_math_sin.cpp
index b77a65de3e1..671e3420f02 100644
--- a/tests/Unit/AmpMath/amp_math_sin.cpp
+++ b/tests/Unit/AmpMath/amp_math_sin.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_sin_precise_math.cpp b/tests/Unit/AmpMath/amp_math_sin_precise_math.cpp
index 26db8298551..e5852910017 100644
--- a/tests/Unit/AmpMath/amp_math_sin_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_sin_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_sinf.cpp b/tests/Unit/AmpMath/amp_math_sinf.cpp
index bdb53ac477b..475759b6ce5 100644
--- a/tests/Unit/AmpMath/amp_math_sinf.cpp
+++ b/tests/Unit/AmpMath/amp_math_sinf.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_sinh.cpp b/tests/Unit/AmpMath/amp_math_sinh.cpp
index ada64d00a6a..79990112551 100644
--- a/tests/Unit/AmpMath/amp_math_sinh.cpp
+++ b/tests/Unit/AmpMath/amp_math_sinh.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_sinh_precise_math.cpp b/tests/Unit/AmpMath/amp_math_sinh_precise_math.cpp
index b214938bafd..d55c525513c 100644
--- a/tests/Unit/AmpMath/amp_math_sinh_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_sinh_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_sinpi_precise_math.cpp b/tests/Unit/AmpMath/amp_math_sinpi_precise_math.cpp
index 1890e848c35..7d224b97dbd 100644
--- a/tests/Unit/AmpMath/amp_math_sinpi_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_sinpi_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_sinpif_precise_math.cpp b/tests/Unit/AmpMath/amp_math_sinpif_precise_math.cpp
index e1d2b2b8131..57c0ec2d7ec 100644
--- a/tests/Unit/AmpMath/amp_math_sinpif_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_sinpif_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_sqrt.cpp b/tests/Unit/AmpMath/amp_math_sqrt.cpp
index 5846a7b1d75..7d570c62a5c 100644
--- a/tests/Unit/AmpMath/amp_math_sqrt.cpp
+++ b/tests/Unit/AmpMath/amp_math_sqrt.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_sqrt_precise_math.cpp b/tests/Unit/AmpMath/amp_math_sqrt_precise_math.cpp
index 4102cd6e5cf..6e6c5899556 100644
--- a/tests/Unit/AmpMath/amp_math_sqrt_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_sqrt_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_sqrtf.cpp b/tests/Unit/AmpMath/amp_math_sqrtf.cpp
index 437b4d87ac9..16f8173687f 100644
--- a/tests/Unit/AmpMath/amp_math_sqrtf.cpp
+++ b/tests/Unit/AmpMath/amp_math_sqrtf.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_tan.cpp b/tests/Unit/AmpMath/amp_math_tan.cpp
index 2b2ca7a0f3b..f5ff139c866 100644
--- a/tests/Unit/AmpMath/amp_math_tan.cpp
+++ b/tests/Unit/AmpMath/amp_math_tan.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_tan_precise_math.cpp b/tests/Unit/AmpMath/amp_math_tan_precise_math.cpp
index 5bae874aa49..858b83d9c74 100644
--- a/tests/Unit/AmpMath/amp_math_tan_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_tan_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_tanf.cpp b/tests/Unit/AmpMath/amp_math_tanf.cpp
index ccb20c00a3d..f1513d275a1 100644
--- a/tests/Unit/AmpMath/amp_math_tanf.cpp
+++ b/tests/Unit/AmpMath/amp_math_tanf.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_tanh.cpp b/tests/Unit/AmpMath/amp_math_tanh.cpp
index 24c5b1c156c..e5af3f5fb5f 100644
--- a/tests/Unit/AmpMath/amp_math_tanh.cpp
+++ b/tests/Unit/AmpMath/amp_math_tanh.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_tanh_precise_math.cpp b/tests/Unit/AmpMath/amp_math_tanh_precise_math.cpp
index 7878e43976e..74a1b1b178d 100644
--- a/tests/Unit/AmpMath/amp_math_tanh_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_tanh_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_tanpi_precise_math.cpp b/tests/Unit/AmpMath/amp_math_tanpi_precise_math.cpp
index 2afbda1df50..0f8b90ce25b 100644
--- a/tests/Unit/AmpMath/amp_math_tanpi_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_tanpi_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_tgamma_precise_math.cpp b/tests/Unit/AmpMath/amp_math_tgamma_precise_math.cpp
index 9228636cc93..f359df81fcb 100644
--- a/tests/Unit/AmpMath/amp_math_tgamma_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_tgamma_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <cmath>
diff --git a/tests/Unit/AmpMath/amp_math_tgammaf_precise_math.cpp b/tests/Unit/AmpMath/amp_math_tgammaf_precise_math.cpp
index 3264f253573..18a7cfea0b7 100644
--- a/tests/Unit/AmpMath/amp_math_tgammaf_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_tgammaf_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_trunc.cpp b/tests/Unit/AmpMath/amp_math_trunc.cpp
index 2f9006c319b..7780f0e4109 100644
--- a/tests/Unit/AmpMath/amp_math_trunc.cpp
+++ b/tests/Unit/AmpMath/amp_math_trunc.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_trunc_precise_math.cpp b/tests/Unit/AmpMath/amp_math_trunc_precise_math.cpp
index be0507ae015..30966639d78 100644
--- a/tests/Unit/AmpMath/amp_math_trunc_precise_math.cpp
+++ b/tests/Unit/AmpMath/amp_math_trunc_precise_math.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpMath/amp_math_truncf.cpp b/tests/Unit/AmpMath/amp_math_truncf.cpp
index dcdb8dcca17..c2f0efd888d 100644
--- a/tests/Unit/AmpMath/amp_math_truncf.cpp
+++ b/tests/Unit/AmpMath/amp_math_truncf.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AmpShortVectors/amp_short_vectors_2files.h b/tests/Unit/AmpShortVectors/amp_short_vectors_2files.h
index 6757352af26..91226bf1f6a 100644
--- a/tests/Unit/AmpShortVectors/amp_short_vectors_2files.h
+++ b/tests/Unit/AmpShortVectors/amp_short_vectors_2files.h
@@ -1,6 +1,6 @@
-#include <hc.hpp>
-#include <hc_math.hpp>
-#include <hc_short_vector.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
+#include <hc/hc_short_vector.hpp>
 
 using namespace hc;
 using namespace hc::short_vector;
diff --git a/tests/Unit/AmpShortVectors/amp_short_vectors_2files_1.cpp b/tests/Unit/AmpShortVectors/amp_short_vectors_2files_1.cpp
index 825f169dfad..f4a2b4bdfb8 100644
--- a/tests/Unit/AmpShortVectors/amp_short_vectors_2files_1.cpp
+++ b/tests/Unit/AmpShortVectors/amp_short_vectors_2files_1.cpp
@@ -19,8 +19,8 @@ void add(const array_view<float,1> &gbIn,const array_view<float_2,1> &gbOut)
 
 	if(me < 64)
 	{
-	  gbOut[me].x = gbIn[me];
-	  gbOut[me].y = gbIn[me];
+	  gbOut[me].set_x(gbIn[me]);
+	  gbOut[me].set_y(gbIn[me]);
 	}
 
    });
@@ -34,8 +34,8 @@ int main()
     for(int i = 0; i< 64;i++)
     {
       gbIn[i] = i + 1;
-      gbOut[i].x = i + 1;
-      gbOut[i].y = i + 1;
+      gbOut[i].set_x(i + 1);
+      gbOut[i].set_y(i + 1);
     }
     
     const hc::array_view<float, 1> gbInA(64, gbIn);
@@ -72,8 +72,8 @@ void sub(const array_view<float,1> &gbIn,const array_view<float_2,1> &gbOut)
 
 	if(me < 64)
 	{
-	  gbOut[me].x = gbIn[me];
-	  gbOut[me].y = gbIn[me];
+	  gbOut[me].set_x(gbIn[me]);
+	  gbOut[me].set_y(gbIn[me]);
 	}
 
    });
diff --git a/tests/Unit/AmpShortVectors/amp_short_vectors_double_3_addon.cpp b/tests/Unit/AmpShortVectors/amp_short_vectors_double_3_addon.cpp
index 33634f1aab7..41a4348bf82 100644
--- a/tests/Unit/AmpShortVectors/amp_short_vectors_double_3_addon.cpp
+++ b/tests/Unit/AmpShortVectors/amp_short_vectors_double_3_addon.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_short_vector.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_short_vector.hpp>
 
 using namespace hc;
 using namespace hc::short_vector;
diff --git a/tests/Unit/AmpShortVectors/amp_short_vectors_float_2_addon.cpp b/tests/Unit/AmpShortVectors/amp_short_vectors_float_2_addon.cpp
index 521c6fd8bd9..0729dbbe884 100644
--- a/tests/Unit/AmpShortVectors/amp_short_vectors_float_2_addon.cpp
+++ b/tests/Unit/AmpShortVectors/amp_short_vectors_float_2_addon.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_short_vector.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_short_vector.hpp>
 
 using namespace hc;
 using namespace hc::short_vector;
diff --git a/tests/Unit/AmpShortVectors/amp_short_vectors_int_4_addon.cpp b/tests/Unit/AmpShortVectors/amp_short_vectors_int_4_addon.cpp
index 38ea632f3be..1d9476d2038 100644
--- a/tests/Unit/AmpShortVectors/amp_short_vectors_int_4_addon.cpp
+++ b/tests/Unit/AmpShortVectors/amp_short_vectors_int_4_addon.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_short_vector.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_short_vector.hpp>
 
 using namespace hc;
 using namespace hc::short_vector;
diff --git a/tests/Unit/AmpShortVectors/amp_short_vectors_norm.cpp b/tests/Unit/AmpShortVectors/amp_short_vectors_norm.cpp
index 8006fa59b44..f2277940b15 100644
--- a/tests/Unit/AmpShortVectors/amp_short_vectors_norm.cpp
+++ b/tests/Unit/AmpShortVectors/amp_short_vectors_norm.cpp
@@ -1,7 +1,7 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
-#include <hc_short_vector.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
+#include <hc/hc_short_vector.hpp>
 
 using namespace hc;
 using namespace hc::short_vector;
diff --git a/tests/Unit/AmpShortVectors/amp_short_vectors_short_vector.cpp b/tests/Unit/AmpShortVectors/amp_short_vectors_short_vector.cpp
index 526074e10cb..9423663540a 100644
--- a/tests/Unit/AmpShortVectors/amp_short_vectors_short_vector.cpp
+++ b/tests/Unit/AmpShortVectors/amp_short_vectors_short_vector.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_short_vector.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_short_vector.hpp>
 
 using namespace hc;
 using namespace hc::short_vector;
@@ -22,7 +22,7 @@ bool eqTypes() { return is_same<T, U>::result; }
 int main(void) {
 
   {
-    bool ret = eqTypes<short_vector<unsigned int, 1>::type, unsigned int>();
+    bool ret = eqTypes<short_vector<unsigned int, 1>::type, uint_1>();
     assert(ret);
   }
 
@@ -42,7 +42,7 @@ int main(void) {
   }
 
   {
-    bool ret = eqTypes<short_vector<int, 1>::type, int>();
+    bool ret = eqTypes<short_vector<int, 1>::type, int_1>();
     assert(ret);
   }
 
@@ -62,7 +62,7 @@ int main(void) {
   }
 
   {
-    bool ret = eqTypes<short_vector<float, 1>::type, float>();
+    bool ret = eqTypes<short_vector<float, 1>::type, float_1>();
     assert(ret);
   }
 
@@ -82,7 +82,7 @@ int main(void) {
   }
 
   {
-    bool ret = eqTypes<short_vector<unorm, 1>::type, unorm>();
+    bool ret = eqTypes<short_vector<unorm, 1>::type, unorm_1>();
     assert(ret);
   }
 
@@ -102,7 +102,7 @@ int main(void) {
   }
 
   {
-    bool ret = eqTypes<short_vector<norm, 1>::type, norm>();
+    bool ret = eqTypes<short_vector<norm, 1>::type, norm_1>();
     assert(ret);
   }
 
@@ -122,7 +122,7 @@ int main(void) {
   }
 
   {
-    bool ret = eqTypes<short_vector<double, 1>::type, double>();
+    bool ret = eqTypes<short_vector<double, 1>::type, double_1>();
     assert(ret);
   }
 
diff --git a/tests/Unit/AmpShortVectors/amp_short_vectors_short_vector_traits.cpp b/tests/Unit/AmpShortVectors/amp_short_vectors_short_vector_traits.cpp
index eb5f956c4f8..0286754b2d3 100644
--- a/tests/Unit/AmpShortVectors/amp_short_vectors_short_vector_traits.cpp
+++ b/tests/Unit/AmpShortVectors/amp_short_vectors_short_vector_traits.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_short_vector.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_short_vector.hpp>
 
 using namespace hc;
 using namespace hc::short_vector;
@@ -16,42 +16,42 @@ template<typename T> struct is_same<T, T>
   static const bool result = true;
 };
 
-template<typename T, typename U> 
+template<typename T, typename U>
 bool eqTypes() { return is_same<T, U>::result; }
 
 int main(void) {
 
   {
-    bool ret1 = eqTypes<short_vector_traits<unsigned int>::value_type, 
+    bool ret1 = eqTypes<short_vector_traits<uint_1>::value_type,
                          unsigned int>();
-    bool ret2 = short_vector_traits<unsigned int>::size == 1;
+    bool ret2 = short_vector_traits<uint_1>::size == 1;
     assert(ret1 && ret2);
   }
 
   {
-    bool ret1 = eqTypes<short_vector_traits<uint_2>::value_type, 
+    bool ret1 = eqTypes<short_vector_traits<uint_2>::value_type,
                          unsigned int>();
     bool ret2 = short_vector_traits<uint_2>::size == 2;
     assert(ret1 && ret2);
   }
 
   {
-    bool ret1 = eqTypes<short_vector_traits<uint_3>::value_type, 
+    bool ret1 = eqTypes<short_vector_traits<uint_3>::value_type,
                          unsigned int>();
     bool ret2 = short_vector_traits<uint_3>::size == 3;
     assert(ret1 && ret2);
   }
 
   {
-    bool ret1 = eqTypes<short_vector_traits<uint_4>::value_type, 
+    bool ret1 = eqTypes<short_vector_traits<uint_4>::value_type,
                          unsigned int>();
     bool ret2 = short_vector_traits<uint_4>::size == 4;
     assert(ret1 && ret2);
   }
 
   {
-    bool ret1 = eqTypes<short_vector_traits<int>::value_type, int>();
-    bool ret2 = short_vector_traits<int>::size == 1;
+    bool ret1 = eqTypes<short_vector_traits<int_1>::value_type, int>();
+    bool ret2 = short_vector_traits<int_1>::size == 1;
     assert(ret1 && ret2);
   }
 
@@ -74,8 +74,8 @@ int main(void) {
   }
 
   {
-    bool ret1 = eqTypes<short_vector_traits<float>::value_type, float>();
-    bool ret2 = short_vector_traits<float>::size == 1;
+    bool ret1 = eqTypes<short_vector_traits<float_1>::value_type, float>();
+    bool ret2 = short_vector_traits<float_1>::size == 1;
     assert(ret1 && ret2);
   }
 
@@ -98,8 +98,8 @@ int main(void) {
   }
 
   {
-    bool ret1 = eqTypes<short_vector_traits<unorm>::value_type, unorm>();
-    bool ret2 = short_vector_traits<unorm>::size == 1;
+    bool ret1 = eqTypes<short_vector_traits<unorm_1>::value_type, unorm>();
+    bool ret2 = short_vector_traits<unorm_1>::size == 1;
     assert(ret1 && ret2);
   }
 
@@ -122,8 +122,8 @@ int main(void) {
   }
 
   {
-    bool ret1 = eqTypes<short_vector_traits<norm>::value_type, norm>();
-    bool ret2 = short_vector_traits<norm>::size == 1;
+    bool ret1 = eqTypes<short_vector_traits<norm_1>::value_type, norm>();
+    bool ret2 = short_vector_traits<norm_1>::size == 1;
     assert(ret1 && ret2);
   }
 
@@ -146,8 +146,8 @@ int main(void) {
   }
 
   {
-    bool ret1 = eqTypes<short_vector_traits<double>::value_type, double>();
-    bool ret2 = short_vector_traits<double>::size == 1;
+    bool ret1 = eqTypes<short_vector_traits<double_1>::value_type, double>();
+    bool ret2 = short_vector_traits<double_1>::size == 1;
     assert(ret1 && ret2);
   }
 
@@ -170,4 +170,4 @@ int main(void) {
   }
 
   return 0;
-}
+}
\ No newline at end of file
diff --git a/tests/Unit/AmpShortVectors/amp_short_vectors_uint_2_addon.cpp b/tests/Unit/AmpShortVectors/amp_short_vectors_uint_2_addon.cpp
index 96a2c1449d2..6a453dfa9d5 100644
--- a/tests/Unit/AmpShortVectors/amp_short_vectors_uint_2_addon.cpp
+++ b/tests/Unit/AmpShortVectors/amp_short_vectors_uint_2_addon.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_short_vector.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_short_vector.hpp>
 
 using namespace hc;
 using namespace hc::short_vector;
diff --git a/tests/Unit/AmpShortVectors/amp_short_vectors_unorm.cpp b/tests/Unit/AmpShortVectors/amp_short_vectors_unorm.cpp
index 78d08e6a110..c5d322dabf8 100644
--- a/tests/Unit/AmpShortVectors/amp_short_vectors_unorm.cpp
+++ b/tests/Unit/AmpShortVectors/amp_short_vectors_unorm.cpp
@@ -1,7 +1,7 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
-#include <hc_short_vector.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
+#include <hc/hc_short_vector.hpp>
 
 using namespace hc;
 using namespace hc::short_vector;
diff --git a/tests/Unit/AmpShortVectors/hc_short_vector_device.cpp b/tests/Unit/AmpShortVectors/hc_short_vector_device.cpp
index 956187fdb5c..0bb395bfc74 100644
--- a/tests/Unit/AmpShortVectors/hc_short_vector_device.cpp
+++ b/tests/Unit/AmpShortVectors/hc_short_vector_device.cpp
@@ -1,6 +1,6 @@
 // RUN: %hc %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_short_vector.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_short_vector.hpp>
 
 #define GRID_SIZE (64)
 
diff --git a/tests/Unit/AmpShortVectors/hc_short_vector_device2.cpp b/tests/Unit/AmpShortVectors/hc_short_vector_device2.cpp
index c9b6da61daf..cc319f52078 100644
--- a/tests/Unit/AmpShortVectors/hc_short_vector_device2.cpp
+++ b/tests/Unit/AmpShortVectors/hc_short_vector_device2.cpp
@@ -1,8 +1,8 @@
 // RUN: %hc %s -o %t.out && %t.out
 
 #include <algorithm>
-#include <hc.hpp>
-#include <hc_short_vector.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_short_vector.hpp>
 
 using namespace hc;
 using namespace hc::short_vector;
@@ -83,47 +83,61 @@ int main() {
 
   int errors = 0;
 
-  #if 0
   errors += run_tests<short1,1024>();
   errors += run_tests<short2,1024>();
   errors += run_tests<short4,1024>();
-  errors += run_tests<short8,1024>();
+  #if defined(EXTENDED_VECTOR_TYPES)
+    errors += run_tests<short8,1024>();
   #endif
 
   errors += run_tests<int1,1024>();
   errors += run_tests<int2,1024>();
   errors += run_tests<int4,1024>();
-  errors += run_tests<int8,1024>();
+  #if defined(EXTENDED_VECTOR_TYPES)
+    errors += run_tests<int8,1024>();
+  #endif
 
   errors += run_tests<uint1,1024>();
   errors += run_tests<uint2,1024>();
   errors += run_tests<uint4,1024>();
-  errors += run_tests<uint8,1024>();
+  #if defined(EXTENDED_VECTOR_TYPES)
+    errors += run_tests<uint8,1024>();
+  #endif
 
   errors += run_tests<long1,1024>();
   errors += run_tests<long2,1024>();
   errors += run_tests<long4,1024>();
-  errors += run_tests<long8,1024>();
+  #if defined(EXTENDED_VECTOR_TYPES)
+    errors += run_tests<long8,1024>();
+  #endif
 
   errors += run_tests<ulong1,1024>();
   errors += run_tests<ulong2,1024>();
   errors += run_tests<ulong4,1024>();
-  errors += run_tests<ulong8,1024>();
+  #if defined(EXTENDED_VECTOR_TYPES)
+    errors += run_tests<ulong8,1024>();
+  #endif
 
   errors += run_tests<half1,1024>();
   errors += run_tests<half2,1024>();
   errors += run_tests<half4,1024>();
-  errors += run_tests<half8,1024>();
+  #if defined(EXTENDED_VECTOR_TYPES)
+    errors += run_tests<half8,1024>();
+  #endif
 
   errors += run_tests<float1,1024>();
   errors += run_tests<float2,1024>();
   errors += run_tests<float4,1024>();
-  errors += run_tests<float8,1024>();
+  #if defined(EXTENDED_VECTOR_TYPES)
+    errors += run_tests<float8,1024>();
+  #endif
 
   errors += run_tests<double1,1024>();
   errors += run_tests<double2,1024>();
   errors += run_tests<double4,1024>();
-  errors += run_tests<double8,1024>();
+  #if defined(EXTENDED_VECTOR_TYPES)
+    errors += run_tests<double8,1024>();
+  #endif
 
   return errors;
 }
diff --git a/tests/Unit/AsyncPFE/accelerator_view_wait.cpp b/tests/Unit/AsyncPFE/accelerator_view_wait.cpp
index 61ee2fffd13..0b9d107c75f 100644
--- a/tests/Unit/AsyncPFE/accelerator_view_wait.cpp
+++ b/tests/Unit/AsyncPFE/accelerator_view_wait.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -I%hsa_header_path -L%hsa_library_path -lhsa-runtime64 -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 #include <random>
@@ -65,17 +65,6 @@ int main() {
   // launch kernel
   hc::completion_future fut1 = execute<1,1>(av1, av2, av3);
 
-  // obtain native handle
-  void* handle1 = fut1.get_native_handle();
-
-  // retrieve HSA signal value
-  hsa_signal_value_t signal_value1;
-  signal_value1 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle1));
-#if TEST_DEBUG
-  std::cout << "signal value #1: " << signal_value1 << "\n";
-#endif
-
-
   // initialize test data
   std::vector<int> table4(32);
   std::vector<int> table5(32);
@@ -91,17 +80,6 @@ int main() {
   // launch kernel
   hc::completion_future fut2 = execute<32,4>(av4, av5, av6);
 
-  // obtain native handle
-  void* handle2 = fut2.get_native_handle();
-
-  // retrieve HSA signal value
-  hsa_signal_value_t signal_value2;
-  signal_value2 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle2));
-#if TEST_DEBUG
-  std::cout << "signal value #2: " << signal_value2 << "\n";
-#endif
-
-
   // initialize test data
   std::vector<int> table7(1024);
   std::vector<int> table8(1024);
@@ -117,72 +95,13 @@ int main() {
   // launch kernel
   hc::completion_future fut3 = execute<1024, 16>(av7, av8, av9);
 
-  // obtain native handle
-  void* handle3 = fut3.get_native_handle();
-
-  // retrieve HSA signal value
-  hsa_signal_value_t signal_value3;
-  signal_value3 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle3));
-#if TEST_DEBUG
-  std::cout << "signal value #3: " << signal_value3 << "\n";
-#endif
-
   // wait on all commands on the default queue to finish
   hc::accelerator().get_default_view().wait();
 
-  // after acclerator_view::wait(), all signals shall become 0 because all
-  // kernels are completed
-  signal_value1 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle1));
-#if TEST_DEBUG
-  std::cout << "signal value #1: " << signal_value1 << "\n";
-#endif
-  // signal value shall be 0 after the kernel is completed
-  ret &= (signal_value1 == 0);
-
-  signal_value2 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle2));
-#if TEST_DEBUG
-  std::cout << "signal value #2: " << signal_value2 << "\n";
-#endif
-  // signal value shall be 0 after the kernel is completed
-  ret &= (signal_value2 == 0);
-
-  signal_value3 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle3));
-#if TEST_DEBUG
-  std::cout << "signal value #3: " << signal_value3 << "\n";
-#endif
-  // signal value shall be 0 after the kernel is completed
-  ret &= (signal_value3 == 0);
-
-  // wait on all commands on the default queue to finish again
-  // the signal values should still be 0
-  hc::accelerator().get_default_view().wait();
-
-  signal_value1 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle1));
-#if TEST_DEBUG
-  std::cout << "signal value #1: " << signal_value1 << "\n";
-#endif
-  // signal value shall be 0 after the kernel is completed
-  ret &= (signal_value1 == 0);
-
-  signal_value2 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle2));
-#if TEST_DEBUG
-  std::cout << "signal value #2: " << signal_value2 << "\n";
-#endif
-  // signal value shall be 0 after the kernel is completed
-  ret &= (signal_value2 == 0);
-
-  signal_value3 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle3));
-#if TEST_DEBUG
-  std::cout << "signal value #3: " << signal_value3 << "\n";
-#endif
-  // signal value shall be 0 after the kernel is completed
-  ret &= (signal_value3 == 0);
-
   // verify computation result
   ret &= verify<1>(av1, av2, av3);
   ret &= verify<32>(av4, av5, av6);
   ret &= verify<1024>(av7, av8, av9);
 
   return !(ret == true);
-}
-
+}
\ No newline at end of file
diff --git a/tests/Unit/AsyncPFE/accelerator_view_wait2.cpp b/tests/Unit/AsyncPFE/accelerator_view_wait2.cpp
index 7b055dd14d3..44d9d6cf472 100644
--- a/tests/Unit/AsyncPFE/accelerator_view_wait2.cpp
+++ b/tests/Unit/AsyncPFE/accelerator_view_wait2.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/AsyncPFE/accelerator_view_wait3.cpp b/tests/Unit/AsyncPFE/accelerator_view_wait3.cpp
index 69f4ee939a6..98a50bce8e9 100644
--- a/tests/Unit/AsyncPFE/accelerator_view_wait3.cpp
+++ b/tests/Unit/AsyncPFE/accelerator_view_wait3.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 #include <random>
@@ -90,7 +90,7 @@ bool test(bool useWaitMode, hc::hcWaitMode mode = hc::hcWaitModeBlocked) {
   if (!useWaitMode) {
     hc::accelerator().get_default_view().wait();
   } else {
-    hc::accelerator().get_default_view().wait(mode);
+    hc::accelerator().get_default_view().wait();
   }
 
   ret &= verify<1>(av1, av2, av3);
diff --git a/tests/Unit/AsyncPFE/async_array_add.cpp b/tests/Unit/AsyncPFE/async_array_add.cpp
index 0fb8ec0a85f..e2ef75cc55b 100644
--- a/tests/Unit/AsyncPFE/async_array_add.cpp
+++ b/tests/Unit/AsyncPFE/async_array_add.cpp
@@ -4,7 +4,7 @@
 #include <iostream>
 #include <random>
 #include <future>
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // FIXME: HSA runtime seems buggy in case LOOP_COUNT is very big
 // (ex: 1024 * 1024).
@@ -64,7 +64,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/AsyncPFE/async_array_add_2d.cpp b/tests/Unit/AsyncPFE/async_array_add_2d.cpp
index 622a6d015ad..b32149c0f31 100644
--- a/tests/Unit/AsyncPFE/async_array_add_2d.cpp
+++ b/tests/Unit/AsyncPFE/async_array_add_2d.cpp
@@ -4,7 +4,7 @@
 #include <iostream>
 #include <random>
 #include <future>
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // FIXME: HSA runtime seems buggy in case LOOP_COUNT is very big
 // (ex: 1024 * 1024).
@@ -66,7 +66,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/AsyncPFE/async_array_add_3d.cpp b/tests/Unit/AsyncPFE/async_array_add_3d.cpp
index 8c3581a97ad..e2630a31358 100644
--- a/tests/Unit/AsyncPFE/async_array_add_3d.cpp
+++ b/tests/Unit/AsyncPFE/async_array_add_3d.cpp
@@ -4,7 +4,7 @@
 #include <iostream>
 #include <random>
 #include <future>
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // FIXME: HSA runtime seems buggy in case LOOP_COUNT is very big
 // (ex: 1024 * 1024).
@@ -66,7 +66,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/AsyncPFE/async_array_add_4d.cpp b/tests/Unit/AsyncPFE/async_array_add_4d.cpp
deleted file mode 100644
index 8fdab391577..00000000000
--- a/tests/Unit/AsyncPFE/async_array_add_4d.cpp
+++ /dev/null
@@ -1,75 +0,0 @@
-
-// RUN: %hc %s -o %t.out && %t.out
-
-#include <iostream>
-#include <random>
-#include <future>
-#include <hc.hpp>
-
-// FIXME: HSA runtime seems buggy in case LOOP_COUNT is very big
-// (ex: 1024 * 1024).
-#define LOOP_COUNT (1)
-
-// test HC with fine-grained SVM
-// requires HSA Full Profile to operate successfully
-// An example which shows how to launch a kernel asynchronously
-bool test() {
-  // define inputs and output
-  const int vecSize = 4096;
-  const int dimSize = 8;
-
-  int table_a[vecSize];
-  int table_b[vecSize];
-  int table_c[vecSize];
-  int *p_a = &table_a[0];
-  int *p_b = &table_b[0];
-  int *p_c = &table_c[0];
-
-  // initialize test data
-  std::random_device rd;
-  std::uniform_int_distribution<int32_t> int_dist;
-  for (int i = 0; i < vecSize; ++i) {
-    table_a[i] = int_dist(rd);
-    table_b[i] = int_dist(rd);
-  }
-
-  // launch kernel
-  const int dim[] { dimSize, dimSize, dimSize, dimSize };
-  hc::extent<4> e(dim);
-  hc::completion_future fut = hc::parallel_for_each(
-    e,
-    [=](hc::index<4> idx) [[hc]] {
-      int fidx = idx[0] * dimSize * dimSize * dimSize + idx[1] * dimSize * dimSize + idx[2] * dimSize + idx[3];
-      for (int i = 0; i < LOOP_COUNT; ++i)
-        p_c[fidx] = p_a[fidx] + p_b[fidx];
-  });
-
-  fut.wait();
-
-  // verify
-  int error = 0;
-  for(unsigned i = 0; i < vecSize; i++) {
-    error += table_c[i] - (table_a[i] + table_b[i]);
-  }
-  if (error == 0) {
-    std::cout << "Verify success!\n";
-  } else {
-    std::cout << "Verify failed!\n";
-  }
-
-  return (error == 0);
-}
-
-int main() {
-  bool ret = true;
-
-  // only conduct the test in case we are running on a HSA full profile stack
-  hc::accelerator acc;
-  if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
-    ret &= test();
-  }
-
-  return !(ret == true);
-}
-
diff --git a/tests/Unit/AsyncPFE/async_array_add_multiple.cpp b/tests/Unit/AsyncPFE/async_array_add_multiple.cpp
index c1a3ff39cd9..3b3c298fe4a 100644
--- a/tests/Unit/AsyncPFE/async_array_add_multiple.cpp
+++ b/tests/Unit/AsyncPFE/async_array_add_multiple.cpp
@@ -7,7 +7,7 @@
 #include <vector>
 #include <algorithm>
 #include <utility>
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // FIXME: HSA runtime seems buggy in case LOOP_COUNT is very big
 // (ex: 1024 * 1024).
@@ -78,7 +78,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/AsyncPFE/async_array_add_multiple_2d.cpp b/tests/Unit/AsyncPFE/async_array_add_multiple_2d.cpp
index 61dac054291..0f9344a584e 100644
--- a/tests/Unit/AsyncPFE/async_array_add_multiple_2d.cpp
+++ b/tests/Unit/AsyncPFE/async_array_add_multiple_2d.cpp
@@ -7,7 +7,7 @@
 #include <vector>
 #include <algorithm>
 #include <utility>
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // FIXME: HSA runtime seems buggy in case LOOP_COUNT is very big
 // (ex: 1024 * 1024).
@@ -83,7 +83,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/AsyncPFE/async_array_add_multiple_3d.cpp b/tests/Unit/AsyncPFE/async_array_add_multiple_3d.cpp
index 55bd22a5488..dfb04eae327 100644
--- a/tests/Unit/AsyncPFE/async_array_add_multiple_3d.cpp
+++ b/tests/Unit/AsyncPFE/async_array_add_multiple_3d.cpp
@@ -7,7 +7,7 @@
 #include <vector>
 #include <algorithm>
 #include <utility>
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // FIXME: HSA runtime seems buggy in case LOOP_COUNT is very big
 // (ex: 1024 * 1024).
@@ -83,7 +83,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/AsyncPFE/async_array_add_multiple_4d.cpp b/tests/Unit/AsyncPFE/async_array_add_multiple_4d.cpp
deleted file mode 100644
index abe25da5a8e..00000000000
--- a/tests/Unit/AsyncPFE/async_array_add_multiple_4d.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-
-// RUN: %hc %s -o %t.out && %t.out
-
-#include <iostream>
-#include <random>
-#include <future>
-#include <vector>
-#include <algorithm>
-#include <utility>
-#include <hc.hpp>
-
-// FIXME: HSA runtime seems buggy in case LOOP_COUNT is very big
-// (ex: 1024 * 1024).
-#define LOOP_COUNT (1)
-
-// test HC with fine-grained SVM
-// requires HSA Full Profile to operate successfully
-// An example which shows how to launch a kernel asynchronously
-bool test() {
-  // define inputs and output
-  const int vecSize = 16384;
-  const int dimSize = 8;
-
-  int table_a[vecSize];
-  int table_b[vecSize];
-  int table_c[vecSize];
-  int *p_a = &table_a[0];
-  int *p_b = &table_b[0];
-  int *p_c = &table_c[0];
-
-  // initialize test data
-  std::random_device rd;
-  std::uniform_int_distribution<int32_t> int_dist;
-  for (int i = 0; i < vecSize; ++i) {
-    table_a[i] = int_dist(rd);
-    table_b[i] = int_dist(rd);
-  }
-
-  // the vector to store handles to each async pfe 
-  std::vector<hc::completion_future> futures;
-
-  // divide the array into 4 quarters
-  // each quarter contains 4096 elements
-  // treat each quarter as a 8*8*8*8 4D array
-  const int dim[] { dimSize, dimSize, dimSize, dimSize };
-  hc::extent<4> e(dim);
-
-#define ASYNC_KERNEL_DISPATCH(x, y) \
-  hc::parallel_for_each( \
-    e, \
-    [=](hc::index<4> idx) [[hc]] { \
-      const int offset = vecSize / (x) * (y); \
-      const int fidx = idx[0] * dimSize * dimSize * dimSize + idx[1] * dimSize * dimSize + idx[2] * dimSize + idx[3]; \
-      for (int i = 0; i < LOOP_COUNT; ++i) \
-        p_c[fidx + offset] = p_a[fidx + offset] + p_b[fidx + offset]; \
-  })
-
-  // asynchronously launch each quarter
-  futures.push_back(std::move(ASYNC_KERNEL_DISPATCH(4, 0)));
-  futures.push_back(std::move(ASYNC_KERNEL_DISPATCH(4, 1)));
-  futures.push_back(std::move(ASYNC_KERNEL_DISPATCH(4, 2)));
-  futures.push_back(std::move(ASYNC_KERNEL_DISPATCH(4, 3)));
-
-  // wait for all kernels to finish execution
-  std::for_each(futures.cbegin(), futures.cend(), [](const hc::completion_future& fut) { fut.wait(); });
-
-  // verify
-  int error = 0;
-  for(unsigned i = 0; i < vecSize; i++) {
-    error += table_c[i] - (table_a[i] + table_b[i]);
-  }
-  if (error == 0) {
-    std::cout << "Verify success!\n";
-  } else {
-    std::cout << "Verify failed!\n";
-  }
-
-  return (error == 0);
-}
-
-int main() {
-  bool ret = true;
-
-  // only conduct the test in case we are running on a HSA full profile stack
-  hc::accelerator acc;
-  if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
-    ret &= test();
-  }
-
-  return !(ret == true);
-}
-
diff --git a/tests/Unit/AsyncPFE/async_array_add_multiple_tiled.cpp b/tests/Unit/AsyncPFE/async_array_add_multiple_tiled.cpp
index 9fdf0e055d0..9f1da94952c 100644
--- a/tests/Unit/AsyncPFE/async_array_add_multiple_tiled.cpp
+++ b/tests/Unit/AsyncPFE/async_array_add_multiple_tiled.cpp
@@ -7,7 +7,7 @@
 #include <vector>
 #include <algorithm>
 #include <utility>
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // FIXME: HSA runtime seems buggy in case LOOP_COUNT is very big
 // (ex: 1024 * 1024).
@@ -80,7 +80,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/AsyncPFE/async_array_add_multiple_tiled_2d.cpp b/tests/Unit/AsyncPFE/async_array_add_multiple_tiled_2d.cpp
index ae58fc0b103..d311a1d9462 100644
--- a/tests/Unit/AsyncPFE/async_array_add_multiple_tiled_2d.cpp
+++ b/tests/Unit/AsyncPFE/async_array_add_multiple_tiled_2d.cpp
@@ -7,7 +7,7 @@
 #include <vector>
 #include <algorithm>
 #include <utility>
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // FIXME: HSA runtime seems buggy in case LOOP_COUNT is very big
 // (ex: 1024 * 1024).
@@ -83,7 +83,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/AsyncPFE/async_array_add_multiple_tiled_3d.cpp b/tests/Unit/AsyncPFE/async_array_add_multiple_tiled_3d.cpp
index e19add7e735..f4c764dd6dc 100644
--- a/tests/Unit/AsyncPFE/async_array_add_multiple_tiled_3d.cpp
+++ b/tests/Unit/AsyncPFE/async_array_add_multiple_tiled_3d.cpp
@@ -7,7 +7,7 @@
 #include <vector>
 #include <algorithm>
 #include <utility>
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // FIXME: HSA runtime seems buggy in case LOOP_COUNT is very big
 // (ex: 1024 * 1024).
@@ -83,7 +83,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/AsyncPFE/async_array_add_then.cpp b/tests/Unit/AsyncPFE/async_array_add_then.cpp
index bf944d64ce3..9f5107efb44 100644
--- a/tests/Unit/AsyncPFE/async_array_add_then.cpp
+++ b/tests/Unit/AsyncPFE/async_array_add_then.cpp
@@ -4,7 +4,7 @@
 #include <iostream>
 #include <random>
 #include <future>
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test HC with fine-grained SVM
 // requires HSA Full Profile to operate successfully
@@ -82,7 +82,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/AsyncPFE/async_array_add_tiled.cpp b/tests/Unit/AsyncPFE/async_array_add_tiled.cpp
index 811894e6d1a..cd183d5e0c8 100644
--- a/tests/Unit/AsyncPFE/async_array_add_tiled.cpp
+++ b/tests/Unit/AsyncPFE/async_array_add_tiled.cpp
@@ -4,7 +4,7 @@
 #include <iostream>
 #include <random>
 #include <future>
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // FIXME: HSA runtime seems buggy in case LOOP_COUNT is very big
 // (ex: 1024 * 1024).
@@ -65,7 +65,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/AsyncPFE/async_array_add_tiled_2d.cpp b/tests/Unit/AsyncPFE/async_array_add_tiled_2d.cpp
index a7e996e907b..7150ba38cf3 100644
--- a/tests/Unit/AsyncPFE/async_array_add_tiled_2d.cpp
+++ b/tests/Unit/AsyncPFE/async_array_add_tiled_2d.cpp
@@ -4,7 +4,7 @@
 #include <iostream>
 #include <random>
 #include <future>
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // FIXME: HSA runtime seems buggy in case LOOP_COUNT is very big
 // (ex: 1024 * 1024).
@@ -66,7 +66,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/AsyncPFE/async_array_add_tiled_3d.cpp b/tests/Unit/AsyncPFE/async_array_add_tiled_3d.cpp
index feb08c8cadc..801625a2784 100644
--- a/tests/Unit/AsyncPFE/async_array_add_tiled_3d.cpp
+++ b/tests/Unit/AsyncPFE/async_array_add_tiled_3d.cpp
@@ -4,7 +4,7 @@
 #include <iostream>
 #include <random>
 #include <future>
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // FIXME: HSA runtime seems buggy in case LOOP_COUNT is very big
 // (ex: 1024 * 1024).
@@ -66,7 +66,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/AsyncPFE/async_av_dependent1.cpp b/tests/Unit/AsyncPFE/async_av_dependent1.cpp
index fd2dcb4a5cd..606174c3fa7 100644
--- a/tests/Unit/AsyncPFE/async_av_dependent1.cpp
+++ b/tests/Unit/AsyncPFE/async_av_dependent1.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -I%hsa_header_path -L%hsa_library_path -lhsa-runtime64 -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 
@@ -27,7 +27,7 @@ bool test1D() {
   // dependency graph
   // pfe1: av1 + av2 -> av3
   // pfe2: av2 + av3 -> av1
-  // pfe3: av3 + av1 -> av2 
+  // pfe3: av3 + av1 -> av2
   // pfe2 depends on pfe1
   // pfe3 depends on pfe2
 
@@ -58,13 +58,6 @@ bool test1D() {
   std::cout << "after pfe1\n";
 #endif
 
-  void* handle1 = fut1.get_native_handle();
-  hsa_signal_value_t signal_value1;
-  signal_value1 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle1));
-#if TEST_DEBUG
-  std::cout << "signal value #1: " << signal_value1 << "\n";
-#endif
-
 #if TEST_DEBUG
   std::cout << "launch pfe2\n";
 #endif
@@ -81,17 +74,6 @@ bool test1D() {
   std::cout << "after pfe2\n";
 #endif
 
-  void* handle2 = fut2.get_native_handle();
-  hsa_signal_value_t signal_value2;
-  signal_value1 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle1));
-  signal_value2 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle2));
-#if TEST_DEBUG
-  std::cout << "signal value #1: " << signal_value1 << "\n";
-  std::cout << "signal value #2: " << signal_value2 << "\n";
-#endif
-  // signal_value1 MUST be 0 because the new kernel must wait on the previous one be completed
-  ret &= (signal_value1 == 0);
-
 #if TEST_DEBUG
   std::cout << "launch pfe3\n";
 #endif
@@ -108,37 +90,9 @@ bool test1D() {
   std::cout << "after pfe3\n";
 #endif
 
-  void* handle3 = fut3.get_native_handle();
-  hsa_signal_value_t signal_value3;
-  signal_value1 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle1));
-  signal_value2 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle2));
-  signal_value3 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle3));
-#if TEST_DEBUG
-  std::cout << "signal value #1: " << signal_value1 << "\n";
-  std::cout << "signal value #2: " << signal_value2 << "\n";
-  std::cout << "signal value #3: " << signal_value3 << "\n";
-#endif
-  // signal_value2 MUST be 0 because the new kernel must wait on the previous one be completed
-  ret &= (signal_value2 == 0);
-
   // wait on all kernels to be finished
   hc::accelerator().get_default_view().wait();
 
-  signal_value1 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle1));
-  signal_value2 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle2));
-  signal_value3 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle3));
-#if TEST_DEBUG
-  std::cout << "signal value #1: " << signal_value1 << "\n";
-  std::cout << "signal value #2: " << signal_value2 << "\n";
-  std::cout << "signal value #3: " << signal_value3 << "\n";
-#endif
-  // signal_value1 MUST be 0 because all kernels are finished at this point
-  ret &= (signal_value1 == 0);
-  // signal_value2 MUST be 0 because all kernels are finished at this point
-  ret &= (signal_value2 == 0);
-  // signal_value3 MUST be 0 because all kernels are finished at this point
-  ret &= (signal_value3 == 0);
-
 #define SHOW_CONTENT_1D(str,av,table) \
   { \
     std::cout << str << "\n"; \
@@ -186,5 +140,4 @@ int main() {
   ret &= test1D<1024, 256>();
 
   return !(ret == true);
-}
-
+}
\ No newline at end of file
diff --git a/tests/Unit/AsyncPFE/async_av_dependent2.cpp b/tests/Unit/AsyncPFE/async_av_dependent2.cpp
index aa877a2def7..4b5146de2db 100644
--- a/tests/Unit/AsyncPFE/async_av_dependent2.cpp
+++ b/tests/Unit/AsyncPFE/async_av_dependent2.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -I%hsa_header_path -L%hsa_library_path -lhsa-runtime64 -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 
@@ -27,7 +27,7 @@ bool test1D() {
   // dependency graph
   // pfe1: av1 + av2 -> av3
   // pfe2: av2 + av3 -> av1
-  // pfe3: av3 + av1 -> av2 
+  // pfe3: av3 + av1 -> av2
   // pfe2 depends on pfe1
   // pfe3 depends on pfe2
 
@@ -58,13 +58,6 @@ bool test1D() {
   std::cout << "after pfe1\n";
 #endif
 
-  void* handle1 = fut1.get_native_handle();
-  hsa_signal_value_t signal_value1;
-  signal_value1 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle1));
-#if TEST_DEBUG
-  std::cout << "signal value #1: " << signal_value1 << "\n";
-#endif
-
 #if TEST_DEBUG
   std::cout << "launch pfe2\n";
 #endif
@@ -81,17 +74,6 @@ bool test1D() {
   std::cout << "after pfe2\n";
 #endif
 
-  void* handle2 = fut2.get_native_handle();
-  hsa_signal_value_t signal_value2;
-  signal_value1 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle1));
-  signal_value2 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle2));
-#if TEST_DEBUG
-  std::cout << "signal value #1: " << signal_value1 << "\n";
-  std::cout << "signal value #2: " << signal_value2 << "\n";
-#endif
-  // signal_value1 MUST be 0 because the new kernel must wait on the previous one be completed
-  ret &= (signal_value1 == 0);
-
 #if TEST_DEBUG
   std::cout << "launch pfe3\n";
 #endif
@@ -108,37 +90,9 @@ bool test1D() {
   std::cout << "after pfe3\n";
 #endif
 
-  void* handle3 = fut3.get_native_handle();
-  hsa_signal_value_t signal_value3;
-  signal_value1 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle1));
-  signal_value2 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle2));
-  signal_value3 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle3));
-#if TEST_DEBUG
-  std::cout << "signal value #1: " << signal_value1 << "\n";
-  std::cout << "signal value #2: " << signal_value2 << "\n";
-  std::cout << "signal value #3: " << signal_value3 << "\n";
-#endif
-  // signal_value2 MUST be 0 because the new kernel must wait on the previous one be completed
-  ret &= (signal_value2 == 0);
-
   // wait on the last future object
   fut3.wait();
 
-  signal_value1 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle1));
-  signal_value2 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle2));
-  signal_value3 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle3));
-#if TEST_DEBUG
-  std::cout << "signal value #1: " << signal_value1 << "\n";
-  std::cout << "signal value #2: " << signal_value2 << "\n";
-  std::cout << "signal value #3: " << signal_value3 << "\n";
-#endif
-  // signal_value1 MUST be 0 because all kernels are finished at this point
-  ret &= (signal_value1 == 0);
-  // signal_value2 MUST be 0 because all kernels are finished at this point
-  ret &= (signal_value2 == 0);
-  // signal_value3 MUST be 0 because all kernels are finished at this point
-  ret &= (signal_value3 == 0);
-
 #define SHOW_CONTENT_1D(str,av,table) \
   { \
     std::cout << str << "\n"; \
@@ -186,5 +140,4 @@ int main() {
   ret &= test1D<1024, 256>();
 
   return !(ret == true);
-}
-
+}
\ No newline at end of file
diff --git a/tests/Unit/AsyncPFE/async_av_dependent3.cpp b/tests/Unit/AsyncPFE/async_av_dependent3.cpp
index ba6881d5fca..fb4d2fc4ac7 100644
--- a/tests/Unit/AsyncPFE/async_av_dependent3.cpp
+++ b/tests/Unit/AsyncPFE/async_av_dependent3.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 
diff --git a/tests/Unit/AsyncPFE/async_av_dependent4.cpp b/tests/Unit/AsyncPFE/async_av_dependent4.cpp
index 445bbaff5b1..9571380b809 100644
--- a/tests/Unit/AsyncPFE/async_av_dependent4.cpp
+++ b/tests/Unit/AsyncPFE/async_av_dependent4.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -I%hsa_header_path -L%hsa_library_path -lhsa-runtime64 -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 
@@ -27,7 +27,7 @@ bool test1D() {
   // dependency graph
   // pfe1: av1 + av2 -> av3
   // pfe2: av1 - av2 -> av4
-  // pfe3: av3 * av4 -> av5 
+  // pfe3: av3 * av4 -> av5
   // pfe1 and pfe2 are independent
   // pfe3 depends on pfe1 and pfe2
 
@@ -62,13 +62,6 @@ bool test1D() {
   std::cout << "after pfe1\n";
 #endif
 
-  void* handle1 = fut1.get_native_handle();
-  hsa_signal_value_t signal_value1;
-  signal_value1 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle1));
-#if TEST_DEBUG
-  std::cout << "signal value #1: " << signal_value1 << "\n";
-#endif
-
 #if TEST_DEBUG
   std::cout << "launch pfe2\n";
 #endif
@@ -85,15 +78,6 @@ bool test1D() {
   std::cout << "after pfe2\n";
 #endif
 
-  void* handle2 = fut2.get_native_handle();
-  hsa_signal_value_t signal_value2;
-  signal_value1 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle1));
-  signal_value2 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle2));
-#if TEST_DEBUG
-  std::cout << "signal value #1: " << signal_value1 << "\n";
-  std::cout << "signal value #2: " << signal_value2 << "\n";
-#endif
-
 #if TEST_DEBUG
   std::cout << "launch pfe3\n";
 #endif
@@ -110,39 +94,9 @@ bool test1D() {
   std::cout << "after pfe3\n";
 #endif
 
-  void* handle3 = fut3.get_native_handle();
-  hsa_signal_value_t signal_value3;
-  signal_value1 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle1));
-  signal_value2 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle2));
-  signal_value3 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle3));
-#if TEST_DEBUG
-  std::cout << "signal value #1: " << signal_value1 << "\n";
-  std::cout << "signal value #2: " << signal_value2 << "\n";
-  std::cout << "signal value #3: " << signal_value3 << "\n";
-#endif
-  // signal_value1 MUST be 0 because the new kernel must wait on the previous one be completed
-  ret &= (signal_value1 == 0);
-  // signal_value2 MUST be 0 because the new kernel must wait on the previous one be completed
-  ret &= (signal_value2 == 0);
-
   // wait on all kernels to be finished
   hc::accelerator().get_default_view().wait();
 
-  signal_value1 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle1));
-  signal_value2 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle2));
-  signal_value3 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle3));
-#if TEST_DEBUG
-  std::cout << "signal value #1: " << signal_value1 << "\n";
-  std::cout << "signal value #2: " << signal_value2 << "\n";
-  std::cout << "signal value #3: " << signal_value3 << "\n";
-#endif
-  // signal_value1 MUST be 0 because all kernels are finished at this point
-  ret &= (signal_value1 == 0);
-  // signal_value2 MUST be 0 because all kernels are finished at this point
-  ret &= (signal_value2 == 0);
-  // signal_value3 MUST be 0 because all kernels are finished at this point
-  ret &= (signal_value3 == 0);
-
 #define SHOW_CONTENT_1D(str,av,table) \
   { \
     std::cout << str << "\n"; \
@@ -173,7 +127,7 @@ bool test1D() {
     } \
   } \
 
-  VERIFY_CONTENT_1D(av3, 2);
+  VERIFY_CONTENT_1D(av3, 0);
   VERIFY_CONTENT_1D(av4, 0);
   VERIFY_CONTENT_1D(av5, 0);
 
diff --git a/tests/Unit/AsyncPFE/async_av_dependent5.cpp b/tests/Unit/AsyncPFE/async_av_dependent5.cpp
index 06132e15bed..625338a9b36 100644
--- a/tests/Unit/AsyncPFE/async_av_dependent5.cpp
+++ b/tests/Unit/AsyncPFE/async_av_dependent5.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -I%hsa_header_path -L%hsa_library_path -lhsa-runtime64 -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 
@@ -27,7 +27,7 @@ bool test1D() {
   // dependency graph
   // pfe1: av1 + av2 -> av3
   // pfe2: av1 - av2 -> av4
-  // pfe3: av3 * av4 -> av5 
+  // pfe3: av3 * av4 -> av5
   // pfe1 and pfe2 are independent
   // pfe3 depends on pfe1 and pfe2
 
@@ -62,13 +62,6 @@ bool test1D() {
   std::cout << "after pfe1\n";
 #endif
 
-  void* handle1 = fut1.get_native_handle();
-  hsa_signal_value_t signal_value1;
-  signal_value1 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle1));
-#if TEST_DEBUG
-  std::cout << "signal value #1: " << signal_value1 << "\n";
-#endif
-
 #if TEST_DEBUG
   std::cout << "launch pfe2\n";
 #endif
@@ -85,15 +78,6 @@ bool test1D() {
   std::cout << "after pfe2\n";
 #endif
 
-  void* handle2 = fut2.get_native_handle();
-  hsa_signal_value_t signal_value2;
-  signal_value1 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle1));
-  signal_value2 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle2));
-#if TEST_DEBUG
-  std::cout << "signal value #1: " << signal_value1 << "\n";
-  std::cout << "signal value #2: " << signal_value2 << "\n";
-#endif
-
 #if TEST_DEBUG
   std::cout << "launch pfe3\n";
 #endif
@@ -110,39 +94,9 @@ bool test1D() {
   std::cout << "after pfe3\n";
 #endif
 
-  void* handle3 = fut3.get_native_handle();
-  hsa_signal_value_t signal_value3;
-  signal_value1 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle1));
-  signal_value2 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle2));
-  signal_value3 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle3));
-#if TEST_DEBUG
-  std::cout << "signal value #1: " << signal_value1 << "\n";
-  std::cout << "signal value #2: " << signal_value2 << "\n";
-  std::cout << "signal value #3: " << signal_value3 << "\n";
-#endif
-  // signal_value1 MUST be 0 because the new kernel must wait on the previous one be completed
-  ret &= (signal_value1 == 0);
-  // signal_value2 MUST be 0 because the new kernel must wait on the previous one be completed
-  ret &= (signal_value2 == 0);
-
   // wait on the last future object
   fut3.wait();
 
-  signal_value1 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle1));
-  signal_value2 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle2));
-  signal_value3 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle3));
-#if TEST_DEBUG
-  std::cout << "signal value #1: " << signal_value1 << "\n";
-  std::cout << "signal value #2: " << signal_value2 << "\n";
-  std::cout << "signal value #3: " << signal_value3 << "\n";
-#endif
-  // signal_value1 MUST be 0 because all kernels are finished at this point
-  ret &= (signal_value1 == 0);
-  // signal_value2 MUST be 0 because all kernels are finished at this point
-  ret &= (signal_value2 == 0);
-  // signal_value3 MUST be 0 because all kernels are finished at this point
-  ret &= (signal_value3 == 0);
-
 #define SHOW_CONTENT_1D(str,av,table) \
   { \
     std::cout << str << "\n"; \
@@ -173,7 +127,7 @@ bool test1D() {
     } \
   } \
 
-  VERIFY_CONTENT_1D(av3, 2);
+  VERIFY_CONTENT_1D(av3, 0);
   VERIFY_CONTENT_1D(av4, 0);
   VERIFY_CONTENT_1D(av5, 0);
 
@@ -190,5 +144,4 @@ int main() {
   ret &= test1D<1024, 256>();
 
   return !(ret == true);
-}
-
+}
\ No newline at end of file
diff --git a/tests/Unit/AsyncPFE/async_av_dependent6.cpp b/tests/Unit/AsyncPFE/async_av_dependent6.cpp
index 5b6059173b6..4d24e1137d5 100644
--- a/tests/Unit/AsyncPFE/async_av_dependent6.cpp
+++ b/tests/Unit/AsyncPFE/async_av_dependent6.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 
@@ -122,7 +122,7 @@ bool test1D() {
     } \
   } \
 
-  VERIFY_CONTENT_1D(av3, 2);
+  VERIFY_CONTENT_1D(av3, 0);
   VERIFY_CONTENT_1D(av4, 0);
   VERIFY_CONTENT_1D(av5, 0);
 
diff --git a/tests/Unit/AsyncPFE/async_av_dependent7.cpp b/tests/Unit/AsyncPFE/async_av_dependent7.cpp
index 4872eda95c4..7cc6eb56846 100644
--- a/tests/Unit/AsyncPFE/async_av_dependent7.cpp
+++ b/tests/Unit/AsyncPFE/async_av_dependent7.cpp
@@ -1,8 +1,10 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
+#include <algorithm>
+#include <cassert>
 #include <iostream>
 
 // loop to deliberately slow down kernel execution
@@ -13,100 +15,98 @@
 /// test implicit synchronization of array_view and kernel dispatches
 ///
 template<size_t grid_size, size_t tile_size>
-void test1D() {
-
-
-  // dependency graph
-  // pfe1: av1 + av2 -> av3
-  // pfe2: av2 + av3 -> av1
-  // pfe3: av3 + av1 -> av2 
-  // pfe2 depends on pfe1
-  // pfe3 depends on pfe2
-
-  std::vector<int> table1(grid_size);
-  std::vector<int> table2(grid_size);
-  std::vector<int> table3(grid_size);
-
-  for (int i = 0; i < grid_size; ++i) {
-    table1[i] = i;
-    table2[i] = i;
-  }
-
-  hc::array_view<int, 1> av1(grid_size, table1);
-  hc::array_view<int, 1> av2(grid_size, table2);
-  hc::array_view<int, 1> av3(grid_size, table3);
-
-#if TEST_DEBUG
-  std::cout << "launch pfe1\n";
-#endif
-
-  hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] {
-    // av3 = i * 2
-    for (int i = 0; i < LOOP_COUNT; ++i)
-      av3(idx) = av1(idx) + av2(idx);
-  });
-
-#if TEST_DEBUG
-  std::cout << "after pfe1\n";
-#endif
-
-#if TEST_DEBUG
-  std::cout << "launch pfe2\n";
-#endif
-
-  // this kernel dispatch shall implicitly wait for the previous one to complete
-  // because they access the same array_view instances and write to them
-  hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] {
-    // av1 = i * 3
-    for (int i = 0; i < LOOP_COUNT; ++i)
-      av1(idx) = av2(idx) + av3(idx);
-  });
-
-#if TEST_DEBUG
-  std::cout << "after pfe2\n";
-#endif
-
-#if TEST_DEBUG
-  std::cout << "launch pfe3\n";
-#endif
-
-  // this kernel dispatch shall implicitly wait for the previous one to complete
-  // because they access the same array_view instances and write to them
-  hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] {
-    // av2 = i * 5
-    for (int i = 0; i < LOOP_COUNT; ++i)
-      av2(idx) = av1(idx) + av3(idx);
-  });
-
-#if TEST_DEBUG
-  std::cout << "after pfe3\n";
-#endif
-
-  // now there must be 1 pending async operations for the accelerator_view
-  // pfe1 and pfe2 must be completed by now
-  assert(hc::accelerator().get_default_view().get_pending_async_ops() == 1);
-
-  // for this test case we deliberately NOT wait on kernels
-  // we want to check when array_view instances go to destruction
-  // would all dependent kernels be waited or not 
+bool test1D()
+{
+    // dependency graph
+    // pfe1: av1 + av2 -> av3
+    // pfe2: av2 + av3 -> av1
+    // pfe3: av3 + av1 -> av2
+    // pfe2 depends on pfe1
+    // pfe3 depends on pfe2
+
+    std::vector<int> table1(grid_size);
+    std::vector<int> table2(grid_size);
+    std::vector<int> table3(grid_size);
+
+    for (int i = 0; i < grid_size; ++i) {
+        table1[i] = i;
+        table2[i] = i;
+    }
+
+    {
+        hc::array_view<int, 1> av1(grid_size, table1);
+        hc::array_view<int, 1> av2(grid_size, table2);
+        hc::array_view<int, 1> av3(grid_size, table3);
+
+        #if TEST_DEBUG
+            std::cout << "launch pfe1\n";
+        #endif
+
+        hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] {
+            // av3 = i * 2
+            for (int i = 0; i < LOOP_COUNT; ++i)
+            av3(idx) = av1(idx) + av2(idx);
+        });
+
+        // this kernel dispatch shall implicitly wait for the previous one to complete
+        // because they access the same array_view instances and write to them
+        hc::parallel_for_each(
+            hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] {
+            // av1 = i * 3
+            for (int i = 0; i < LOOP_COUNT; ++i)
+            av1(idx) = av2(idx) + av3(idx);
+        });
+
+        #if TEST_DEBUG
+            std::cout << "after pfe2\n";
+        #endif
+
+        #if TEST_DEBUG
+            std::cout << "launch pfe3\n";
+        #endif
+
+        // this kernel dispatch shall implicitly wait for the previous one to complete
+        // because they access the same array_view instances and write to them
+        hc::parallel_for_each(
+            hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] {
+            // av2 = i * 5
+            for (int i = 0; i < LOOP_COUNT; ++i)
+            av2(idx) = av1(idx) + av3(idx);
+        });
+
+        #if TEST_DEBUG
+            std::cout << "after pfe3\n";
+        #endif
+
+        // now there must be 1 pending async operations for the accelerator_view
+        // pfe1 and pfe2 must be completed by now
+        //assert(hc::accelerator().get_default_view().get_pending_async_ops() == 1);
+
+        // for this test case we deliberately NOT wait on kernels
+        // we want to check when array_view instances go to destruction
+        // would all dependent kernels be waited or not
+    }
+
+    for (decltype(grid_size) i = 0u; i != grid_size; ++i) {
+        if (table1[i] != 3 * i) return false;
+        if (table2[i] != 5 * i) return false;
+        if (table3[i] != 2 * i) return false;
+    }
+
+    return true;
 }
 
-int main() {
-  bool ret = true;
+int main()
+{
+    bool ret = true;
 
-  hc::accelerator_view av = hc::accelerator().get_default_view();
+    hc::accelerator_view av = hc::accelerator().get_default_view();
 
-  test1D<32, 16>();
-  assert(av.get_pending_async_ops() == 0);
-  test1D<64, 8>();
-  assert(av.get_pending_async_ops() == 0);
-  test1D<128, 32>();
-  assert(av.get_pending_async_ops() == 0);
-  test1D<256, 64>();
-  assert(av.get_pending_async_ops() == 0);
-  test1D<1024, 256>();
-  assert(av.get_pending_async_ops() == 0);
-
-  return !(ret == true);
-}
+    ret = test1D<32, 16>() && ret;
+    ret = test1D<64, 8>() && ret;
+    ret = test1D<128, 32>() && ret;
+    ret = test1D<256, 64>() && ret;
+    ret = test1D<1024, 256>() && ret;
 
+    return !(ret == true);
+}
\ No newline at end of file
diff --git a/tests/Unit/AsyncPFE/async_av_dependent8.cpp b/tests/Unit/AsyncPFE/async_av_dependent8.cpp
index db3f080e95b..020ae2426b0 100644
--- a/tests/Unit/AsyncPFE/async_av_dependent8.cpp
+++ b/tests/Unit/AsyncPFE/async_av_dependent8.cpp
@@ -1,9 +1,12 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
+#include <cassert>
+#include <cstddef>
 #include <iostream>
+#include <vector>
 
 // loop to deliberately slow down kernel execution
 #define LOOP_COUNT (10240)
@@ -16,117 +19,88 @@
 /// test implicit synchronization of array_view and kernel dispatches
 ///
 template<size_t grid_size, size_t tile_size>
-void test1D() {
-
-
-  // dependency graph
-  // pfe1: av1 + av2 -> av3
-  // pfe2: av1 - av2 -> av4
-  // pfe3: av3 * av4 -> av5 
-  // pfe1 and pfe2 are independent
-  // pfe3 depends on pfe1 and pfe2
-
-  std::vector<int> table1(grid_size);
-  std::vector<int> table2(grid_size);
-  std::vector<int> table3(grid_size);
-  std::vector<int> table4(grid_size);
-  std::vector<int> table5(grid_size);
-
-  for (int i = 0; i < grid_size; ++i) {
-    table1[i] = i;
-    table2[i] = i;
-  }
-
-  hc::array_view<const int, 1> av1(grid_size, table1);
-  hc::array_view<const int, 1> av2(grid_size, table2);
-  hc::array_view<int, 1> av3(grid_size, table3);
-  hc::array_view<int, 1> av4(grid_size, table3);
-  hc::array_view<int, 1> av5(grid_size, table3);
-
-#if TEST_DEBUG
-  std::cout << "launch pfe1\n";
-#endif
-
-  hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] {
-    // av3 = i * 2
-    for (int i = 0; i < LOOP_COUNT; ++i)
-      av3(idx) = av1(idx) + av2(idx);
-  });
-
-#if TEST_DEBUG
-  std::cout << "after pfe1    get_pending_async_ops=" 
-            << hc::accelerator().get_default_view().get_pending_async_ops()
-            << "\n";
-#endif
-
-#if TEST_DEBUG
-  std::cout << "launch pfe2\n";
-#endif
-
-  // this kernel dispatch shall NOT implicitly wait for the previous one to complete
-  // because av1 and av2 are read-only, and this kernel writes to av4, NOT av3
-  hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] {
-    // av4 = 0
-    for (int i = 0; i < LOOP_COUNT; ++i)
-      av4(idx) = av1(idx) - av2(idx);
-  });
-
-#if TEST_DEBUG
-  std::cout << "after pfe2    get_pending_async_ops=" 
-            << hc::accelerator().get_default_view().get_pending_async_ops()
-            << "\n";
-#endif
-
-  // HCC_OPT_FLUSH adds extra barrier to flush before a write copy:
-  const int expectedPendingOps = HCC_OPT_FLUSH ? 3 : 2;
-
-  // now there must be 2 pending async operations for the accelerator_view
-  // because pfe1 and pfe2 are independent
-  assert (hc::accelerator().get_default_view().get_pending_async_ops() == expectedPendingOps);
-
-#if TEST_DEBUG
-  std::cout << "launch pfe3\n";
-#endif
-
-  // this kernel dispatch shall implicitly wait for the previous two to complete
-  // because they access the same array_view instances and write to them
-  hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] {
-    // av5 = 0
-    for (int i = 0; i < LOOP_COUNT; ++i)
-      av5(idx) = av3(idx) * av4(idx);
-  });
-
-#if TEST_DEBUG
-  std::cout << "after pfe3    get_pending_async_ops=" 
-            << hc::accelerator().get_default_view().get_pending_async_ops()
-            << "\n";
-#endif
-
-  // now there must be 1 pending async operations for the accelerator_view
-  // pfe1 and pfe2 must be completed by now
-  assert (hc::accelerator().get_default_view().get_pending_async_ops() == 1);
-
-  // for this test case we deliberately NOT wait on kernels
-  // we want to check when array_view instances go to destruction
-  // would all dependent kernels be waited or not 
-
+void test1D()
+{
+    // dependency graph
+    // pfe1: av1 + av2 -> av3
+    // pfe2: av1 - av2 -> av4
+    // pfe3: av3 * av4 -> av5
+    // pfe1 and pfe2 are independent
+    // pfe3 depends on pfe1 and pfe2
+
+    std::vector<int> table1(grid_size);
+    std::vector<int> table2(grid_size);
+    std::vector<int> table3(grid_size);
+    std::vector<int> table4(grid_size);
+    std::vector<int> table5(grid_size);
+
+    for (int i = 0; i < grid_size; ++i) {
+        table1[i] = i;
+        table2[i] = i;
+    }
+
+    {
+        hc::array_view<const int, 1> av1(grid_size, table1);
+        hc::array_view<const int, 1> av2(grid_size, table2);
+        hc::array_view<int, 1> av3(grid_size, table3);
+        hc::array_view<int, 1> av4(grid_size, table4);
+        hc::array_view<int, 1> av5(grid_size, table5);
+
+        #if TEST_DEBUG
+            std::cout << "launch pfe1\n";
+        #endif
+
+        hc::parallel_for_each(
+            hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] {
+            // av3 = i * 2
+            for (int i = 0; i < LOOP_COUNT; ++i)
+            av3(idx) = av1(idx) + av2(idx);
+        });
+
+        #if TEST_DEBUG
+            std::cout << "launch pfe2\n";
+        #endif
+
+        // this kernel dispatch shall NOT implicitly wait for the previous one
+        // to complete because av1 and av2 are read-only, and this kernel writes
+        // to av4, NOT av3
+        hc::parallel_for_each(
+            hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] {
+            // av4 = 0
+            for (int i = 0; i < LOOP_COUNT; ++i)
+            av4(idx) = av1(idx) - av2(idx);
+        });
+
+        #if TEST_DEBUG
+            std::cout << "launch pfe3\n";
+        #endif
+
+        // this kernel dispatch shall implicitly wait for the previous two to complete
+        // because they access the same array_view instances and write to them
+        hc::parallel_for_each(
+            hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] {
+            // av5 = 0
+            for (int i = 0; i < LOOP_COUNT; ++i)
+            av5(idx) = av3(idx) * av4(idx);
+        });
+    }
+
+    for (decltype(grid_size) i = 0u; i != grid_size; ++i) {
+        assert(table3[i] == 2 * i);
+        assert(table4[i] == 0);
+        assert(table5[i] == 0);
+    }
 }
 
-int main() {
-
-  hc::accelerator_view av = hc::accelerator().get_default_view();
+int main()
+{
+    hc::accelerator_view av = hc::accelerator().get_default_view();
 
-  test1D<32, 16>() ;
-  assert( (av.get_pending_async_ops() == 0) );
-  test1D<64, 8>() ;
-  assert( (av.get_pending_async_ops() == 0) );
-  test1D<128, 32>() ;
-  assert( (av.get_pending_async_ops() == 0) );
-  test1D<256, 64>() ;
-  assert( (av.get_pending_async_ops() == 0) );
-  test1D<1024, 256>() ;
-  assert( (av.get_pending_async_ops() == 0) );
-
-  return 0;
-}
+    test1D<32, 16>();
+    test1D<64, 8>();
+    test1D<128, 32>();
+    test1D<256, 64>();
+    test1D<1024, 256>();
 
+    return 0;
+}
\ No newline at end of file
diff --git a/tests/Unit/AsyncPFE/async_av_independent1.cpp b/tests/Unit/AsyncPFE/async_av_independent1.cpp
index 118a7b2c0dd..8f42ac5353b 100644
--- a/tests/Unit/AsyncPFE/async_av_independent1.cpp
+++ b/tests/Unit/AsyncPFE/async_av_independent1.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -I%hsa_header_path -L%hsa_library_path -lhsa-runtime64 -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 
@@ -30,7 +30,7 @@ bool test1D() {
   // dependency graph
   // pfe1: av1 + av2 -> av3
   // pfe2: av1 + av2 -> av4
-  // pfe3: av1 + av2 -> av5 
+  // pfe3: av1 + av2 -> av5
   // pfe1, pfe2, pfe3 are all independent
 
   std::vector<int> table1(grid_size);
@@ -66,12 +66,6 @@ bool test1D() {
   std::cout << "after pfe1\n";
 #endif
 
-  void* handle1 = fut1.get_native_handle();
-  hsa_signal_value_t signal_value1;
-#if TEST_DEBUG
-  std::cout << "signal value #1: " << signal_value1 << "\n";
-#endif
-
 #if TEST_DEBUG
   std::cout << "launch pfe2\n";
 #endif
@@ -88,11 +82,6 @@ bool test1D() {
   std::cout << "after pfe2\n";
 #endif
 
-  void* handle2 = fut2.get_native_handle();
-  hsa_signal_value_t signal_value2;
-#if TEST_DEBUG
-  std::cout << "launch pfe3\n";
-#endif
 
   // this kernel dispatch shall NOT implicitly wait for the previous one to complete
   // because the array_view written is NOT used by the previous kernels
@@ -106,35 +95,9 @@ bool test1D() {
   std::cout << "after pfe3\n";
 #endif
 
-  void* handle3 = fut3.get_native_handle();
-  hsa_signal_value_t signal_value3;
-  signal_value1 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle1));
-  signal_value2 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle2));
-  signal_value3 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle3));
-#if TEST_DEBUG
-  std::cout << "signal value #1: " << signal_value1 << "\n";
-  std::cout << "signal value #2: " << signal_value2 << "\n";
-  std::cout << "signal value #3: " << signal_value3 << "\n";
-#endif
-
   // wait on all kernels to be completed
   hc::accelerator().get_default_view().wait();
 
-  signal_value1 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle1));
-  signal_value2 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle2));
-  signal_value3 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle3));
-#if TEST_DEBUG
-  std::cout << "signal value #1: " << signal_value1 << "\n";
-  std::cout << "signal value #2: " << signal_value2 << "\n";
-  std::cout << "signal value #3: " << signal_value3 << "\n";
-#endif
-  // signal_value1 MUST be 0 because all kernels are finished at this point
-  ret &= (signal_value1 == 0);
-  // signal_value2 MUST be 0 because all kernels are finished at this point
-  ret &= (signal_value2 == 0);
-  // signal_value3 MUST be 0 because all kernels are finished at this point
-  ret &= (signal_value3 == 0);
-
 #define SHOW_CONTENT_1D(str,av,table) \
   { \
     std::cout << str << "\n"; \
@@ -187,5 +150,4 @@ int main() {
   ret &= test1D<1024, 256>();
 
   return !(ret == true);
-}
-
+}
\ No newline at end of file
diff --git a/tests/Unit/AsyncPFE/async_av_independent2.cpp b/tests/Unit/AsyncPFE/async_av_independent2.cpp
index 08315767a5e..66d3ef8c66d 100644
--- a/tests/Unit/AsyncPFE/async_av_independent2.cpp
+++ b/tests/Unit/AsyncPFE/async_av_independent2.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 
diff --git a/tests/Unit/AsyncPFE/async_av_independent3.cpp b/tests/Unit/AsyncPFE/async_av_independent3.cpp
index 20bca907d97..ef64b5febc4 100644
--- a/tests/Unit/AsyncPFE/async_av_independent3.cpp
+++ b/tests/Unit/AsyncPFE/async_av_independent3.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 
diff --git a/tests/Unit/AsyncPFE/async_av_independent4.cpp b/tests/Unit/AsyncPFE/async_av_independent4.cpp
index ceb2bd9c236..6a9469996a4 100644
--- a/tests/Unit/AsyncPFE/async_av_independent4.cpp
+++ b/tests/Unit/AsyncPFE/async_av_independent4.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 
@@ -20,91 +20,93 @@
 /// read from the same read-only array_view instances, and write to DIFFERENT
 /// output array_view instances.
 template<size_t grid_size, size_t tile_size>
-void test1D() {
-
-
-  // dependency graph
-  // pfe1: av1 + av2 -> av3
-  // pfe2: av1 + av2 -> av4
-  // pfe3: av1 + av2 -> av5 
-  // pfe1, pfe2, pfe3 are all independent
-
-  std::vector<int> table1(grid_size);
-  std::vector<int> table2(grid_size);
-
-  std::vector<int> table3(grid_size);
-  std::vector<int> table4(grid_size);
-  std::vector<int> table5(grid_size);
-
-  for (int i = 0; i < grid_size; ++i) {
-    table1[i] = i;
-    table2[i] = i;
-  }
-
-  hc::array_view<const int, 1> av1(grid_size, table1);
-  hc::array_view<const int, 1> av2(grid_size, table2);
-
-  hc::array_view<int, 1> av3(grid_size, table3);
-  hc::array_view<int, 1> av4(grid_size, table4);
-  hc::array_view<int, 1> av5(grid_size, table5);
-
-#if TEST_DEBUG
-  std::cout << "launch pfe1\n";
-#endif
-
-  hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] {
-    // av3 = i * 2
-    for (int i = 0; i < LOOP_COUNT; ++i)
-      av3(idx) = av1(idx) + av2(idx);
-  });
-
-#if TEST_DEBUG
-  std::cout << "after pfe1\n";
-#endif
-
-#if TEST_DEBUG
-  std::cout << "launch pfe2\n";
-#endif
-
-  // this kernel dispatch shall NOT implicitly wait for the previous one to complete
-  // because the array_view written is NOT used by the previous kernels
-  hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] {
-    // av4 = i * 2
-    for (int i = 0; i < LOOP_COUNT; ++i)
-      av4(idx) = av1(idx) + av2(idx);
-  });
-
-#if TEST_DEBUG
-  std::cout << "after pfe2\n";
-#endif
-
-#if TEST_DEBUG
-  std::cout << "launch pfe3\n";
-#endif
-
-  // this kernel dispatch shall NOT implicitly wait for the previous one to complete
-  // because the array_view written is NOT used by the previous kernels
-  hc::parallel_for_each(hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] {
-    // av5 = i * 2
-    for (int i = 0; i < LOOP_COUNT; ++i)
-      av5(idx) = av1(idx) + av2(idx);
-  });
-
-#if TEST_DEBUG
-  std::cout << "after pfe3    get_pending_async_ops=" 
-            << hc::accelerator().get_default_view().get_pending_async_ops()
-            << "\n";
-#endif
-
-  const int expectedPendingOps = HCC_OPT_FLUSH ? 5 : 3;
-
-  // now there must be 3 pending async operations for the accelerator_view
-  assert (hc::accelerator().get_default_view().get_pending_async_ops() == expectedPendingOps);
-
-  // for this test case we deliberately NOT wait on kernels
-  // we want to check when array_view instances go to destruction
-  // would all dependent kernels be waited or not 
-
+bool test1D()
+{
+    // dependency graph
+    // pfe1: av1 + av2 -> av3
+    // pfe2: av1 + av2 -> av4
+    // pfe3: av1 + av2 -> av5
+    // pfe1, pfe2, pfe3 are all independent
+
+    std::vector<int> table1(grid_size);
+    std::vector<int> table2(grid_size);
+
+    std::vector<int> table3(grid_size);
+    std::vector<int> table4(grid_size);
+    std::vector<int> table5(grid_size);
+
+    for (int i = 0; i < grid_size; ++i) {
+        table1[i] = i;
+        table2[i] = i;
+    }
+
+    {
+        hc::array_view<const int, 1> av1(grid_size, table1);
+        hc::array_view<const int, 1> av2(grid_size, table2);
+
+        hc::array_view<int, 1> av3(grid_size, table3);
+        hc::array_view<int, 1> av4(grid_size, table4);
+        hc::array_view<int, 1> av5(grid_size, table5);
+
+        #if TEST_DEBUG
+        std::cout << "launch pfe1\n";
+        #endif
+
+        hc::parallel_for_each(
+            hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] {
+            // av3 = i * 2
+            for (int i = 0; i < LOOP_COUNT; ++i)
+            av3(idx) = av1(idx) + av2(idx);
+        });
+
+        #if TEST_DEBUG
+        std::cout << "after pfe1\n";
+        #endif
+
+        #if TEST_DEBUG
+        std::cout << "launch pfe2\n";
+        #endif
+
+        // this kernel dispatch shall NOT implicitly wait for the previous one
+        // to complete because the array_view written is NOT used by the
+        // previous kernels
+        hc::parallel_for_each(
+            hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] {
+            // av4 = i * 2
+            for (int i = 0; i < LOOP_COUNT; ++i)
+            av4(idx) = av1(idx) + av2(idx);
+        });
+
+        #if TEST_DEBUG
+            std::cout << "after pfe2\n";
+        #endif
+
+        #if TEST_DEBUG
+            std::cout << "launch pfe3\n";
+        #endif
+
+        // this kernel dispatch shall NOT implicitly wait for the previous one
+        // to complete because the array_view written is NOT used by the
+        // previous kernels
+        hc::parallel_for_each(
+            hc::extent<1>(grid_size), [=](hc::index<1>& idx) [[hc]] {
+            // av5 = i * 2
+            for (int i = 0; i < LOOP_COUNT; ++i)
+            av5(idx) = av1(idx) + av2(idx);
+        });
+
+        // for this test case we deliberately NOT wait on kernels
+        // we want to check when array_view instances go to destruction
+        // would all dependent kernels be waited or not
+    }
+
+    for (decltype(grid_size) i = 0u; i != grid_size; ++i) {
+        if (table3[i] != 2 * i) return false;
+        if (table4[i] != 2 * i) return false;
+        if (table5[i] != 2 * i) return false;
+    }
+
+    return true;
 }
 
 int main() {
@@ -112,16 +114,11 @@ int main() {
 
   hc::accelerator_view av = hc::accelerator().get_default_view();
 
-  test1D<32, 16>();
-  assert(av.get_pending_async_ops() == 0);
-  test1D<64, 8>();
-  assert(av.get_pending_async_ops() == 0);
-  test1D<128, 32>();
-  assert(av.get_pending_async_ops() == 0);
-  test1D<256, 64>();
-  assert(av.get_pending_async_ops() == 0);
-  test1D<1024, 256>();
-  assert(av.get_pending_async_ops() == 0);
+  ret = test1D<32, 16>() && ret;
+  ret = test1D<64, 8>() && ret;
+  ret = test1D<128, 32>() && ret;
+  ret = test1D<256, 64>() && ret;
+  ret = test1D<1024, 256>() && ret;
 
   return !(ret == true);
-}
+}
\ No newline at end of file
diff --git a/tests/Unit/AsyncPFE/completion_future_wait.cpp b/tests/Unit/AsyncPFE/completion_future_wait.cpp
index d7b1118e00d..4b5f583ad1b 100644
--- a/tests/Unit/AsyncPFE/completion_future_wait.cpp
+++ b/tests/Unit/AsyncPFE/completion_future_wait.cpp
@@ -1,12 +1,13 @@
 
 // RUN: %hc %s -I%hsa_header_path -L%hsa_library_path -lhsa-runtime64 -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
+
+#include <hsa/hsa.h>
 
 #include <iostream>
 #include <random>
-
-#include <hsa/hsa.h>
+#include <vector>
 
 #define LOOP_COUNT (1024)
 
@@ -65,42 +66,11 @@ int main() {
   // launch kernel
   hc::completion_future fut = execute<1024, 16>(av1, av2, av3);
 
-  // obtain native handle
-  void* handle = fut.get_native_handle();
-
-  // retrieve HSA signal value
-  hsa_signal_value_t signal_value;
-  signal_value = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle));
-#if TEST_DEBUG
-  std::cout << "signal value: " << signal_value << "\n";
-#endif
-
   // wait on the future
   fut.wait();
 
-  // after completion_future::wait(), the signal shall become 0 because the
-  // kernel is completed
-  signal_value = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle));
-#if TEST_DEBUG
-  std::cout << "signal value: " << signal_value << "\n";
-#endif
-  // signal value shall be 0 after the kernel is completed
-  ret &= (signal_value == 0);
-
-  // wait on the future again
-  // the signal values should still be 0
-  fut.wait();
-
-  signal_value = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(handle));
-#if TEST_DEBUG
-  std::cout << "signal value: " << signal_value << "\n";
-#endif
-  // signal value shall be 0 after the kernel is completed
-  ret &= (signal_value == 0);
-
   // verify computation result
   ret &= verify<1024>(av1, av2, av3);
 
   return !(ret == true);
-}
-
+}
\ No newline at end of file
diff --git a/tests/Unit/AsyncPFE/completion_future_wait2.cpp b/tests/Unit/AsyncPFE/completion_future_wait2.cpp
index 0e5b50bb127..3f7b3b60f5e 100644
--- a/tests/Unit/AsyncPFE/completion_future_wait2.cpp
+++ b/tests/Unit/AsyncPFE/completion_future_wait2.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 #include <random>
@@ -61,14 +61,14 @@ bool test(bool useWaitMode, hc::hcWaitMode mode = hc::hcWaitModeBlocked) {
   if (!useWaitMode) {
     fut.wait();
   } else {
-    fut.wait(mode);
+    fut.wait();
   }
 
   // wait on the future again
   if (!useWaitMode) {
     fut.wait();
   } else {
-    fut.wait(mode);
+    fut.wait();
   }
 
   // verify computation result
diff --git a/tests/Unit/Atomic/atomic_add_float_global.cpp b/tests/Unit/Atomic/atomic_add_float_global.cpp
index ee64993f865..81aeb4a92a0 100644
--- a/tests/Unit/Atomic/atomic_add_float_global.cpp
+++ b/tests/Unit/Atomic/atomic_add_float_global.cpp
@@ -1,5 +1,5 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <vector>
@@ -13,28 +13,32 @@ using namespace hc;
 #define INIT 0.5f
 
 int main(void) {
-  const int vecSize = 100;
+  #if defined(FLOAT_ATOMICS)
+    const int vecSize = 100;
 
-  // Alloc & init input data
-  std::vector<T> init(vecSize, INIT);
-  array<T, 1> count(vecSize, init.begin());
+    // Alloc & init input data
+    std::vector<T> init(vecSize, INIT);
+    array<T, 1> count(vecSize, init.begin());
 
-  parallel_for_each(count.get_extent(), [=, &count](index<1> idx) [[hc]] {
-    for(unsigned i = 0; i < vecSize; i++) {
-      atomic_fetch_add(&count[i], INIT);
-    }
-  });
+    parallel_for_each(count.get_extent(), [=, &count](index<1> idx) [[hc]] {
+      for(unsigned i = 0; i < vecSize; i++) {
+        atomic_fetch_add(&count[i], INIT);
+      }
+    });
 
-  array_view<T, 1> av(count);
+    array_view<T, 1> av(count);
 
-  bool ret = true;
-  float sum = std::accumulate(init.begin(), init.end(), 0.0f);
-  sum += INIT;
-  for(unsigned i = 0; i < vecSize; ++i) {
-      if(fabs(av[i] - sum) > TOLERANCE) {
-        ret = false;
-      }
-  }
+    bool ret = true;
+    float sum = std::accumulate(init.begin(), init.end(), 0.0f);
+    sum += INIT;
+    for(unsigned i = 0; i < vecSize; ++i) {
+        if(fabs(av[i] - sum) > TOLERANCE) {
+          ret = false;
+        }
+    }
 
-  return !(ret == true);
+    return !(ret == true);
+  #else
+    return EXIT_SUCCESS;
+  #endif
 }
diff --git a/tests/Unit/Atomic/atomic_add_float_local.cpp b/tests/Unit/Atomic/atomic_add_float_local.cpp
index db06a4eca12..c2d4dbe9b28 100644
--- a/tests/Unit/Atomic/atomic_add_float_local.cpp
+++ b/tests/Unit/Atomic/atomic_add_float_local.cpp
@@ -1,5 +1,5 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <math.h>
@@ -11,43 +11,45 @@ using namespace hc;
 #define TOLERANCE 1e-5
 
 int main(void) {
-  const int vecSize = 100;
-  const int tile_size = 10;
-
-  // Alloc & init input data
-  extent<2> e_a(vecSize, vecSize);
-  std::vector<T> va(vecSize * vecSize, INIT);
-  array_view<T, 2> av_a(e_a, va); 
-
-  extent<2> compute_domain(e_a);
-  parallel_for_each(compute_domain.tile(tile_size, tile_size), [=] (tiled_index<2> tidx) [[hc]] {
-    index<2> localIdx = tidx.local;
-    index<2> globalIdx = tidx.global;
-
-    tile_static T localA[tile_size][tile_size];
-    localA[localIdx[0]][localIdx[1]] = 0;
-    tidx.barrier.wait();
-
-    for(int i = 0; i < tile_size; i++) {
-      for(int j = 0; j < tile_size; j++) {
-        atomic_fetch_add(&(localA[i][j]), INIT);
+  #if defined(FLOAT_ATOMICS)
+    const int vecSize = 100;
+    const int tile_size = 10;
+
+    // Alloc & init input data
+    extent<2> e_a(vecSize, vecSize);
+    std::vector<T> va(vecSize * vecSize, INIT);
+    array_view<T, 2> av_a(e_a, va); 
+
+    extent<2> compute_domain(e_a);
+    parallel_for_each(
+      compute_domain.tile(tile_size, tile_size), [=](tiled_index<2> tidx) [[hc]] {
+      tile_static T localA[tile_size][tile_size];
+      localA[tidx.local[0]][tidx.local[1]] = 0;
+      tidx.barrier.wait();
+
+      for(int i = 0; i < tile_size; i++) {
+        for(int j = 0; j < tile_size; j++) {
+          atomic_fetch_add(&(localA[i][j]), INIT);
+        }
       }
-    }
-  tidx.barrier.wait();
-  av_a[globalIdx[0]][globalIdx[1]] = localA[localIdx[0]][localIdx[1]];
-  });
-
-  // accumlate tile_size * tile_size times
-  float sum = 0.0f;
-  for (int i = 0; i < tile_size * tile_size; ++i)
-    sum += INIT;
-  for(unsigned i = 0; i < vecSize; i++) {
-    for(unsigned j = 0; j < vecSize; j++) {
-      if(fabs(av_a(i, j) - sum) > TOLERANCE) {
-        return 1;
+    tidx.barrier.wait();
+    av_a[tidx.global] = localA[tidx.local[0]][tidx.local[1]];
+    });
+
+    // accumlate tile_size * tile_size times
+    float sum = 0.0f;
+    for (int i = 0; i < tile_size * tile_size; ++i)
+      sum += INIT;
+    for(unsigned i = 0; i < vecSize; i++) {
+      for(unsigned j = 0; j < vecSize; j++) {
+        if(fabs(av_a(i, j) - sum) > TOLERANCE) {
+          return 1;
+        }
       }
     }
-  }
 
-  return 0;
+    return 0;
+  #else
+    return EXIT_SUCCESS;
+  #endif
 }
diff --git a/tests/Unit/Atomic/atomic_add_global.cpp b/tests/Unit/Atomic/atomic_add_global.cpp
index 52bd2acf3bd..64f5445487c 100644
--- a/tests/Unit/Atomic/atomic_add_global.cpp
+++ b/tests/Unit/Atomic/atomic_add_global.cpp
@@ -1,5 +1,5 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <vector>
diff --git a/tests/Unit/Atomic/atomic_add_local.cpp b/tests/Unit/Atomic/atomic_add_local.cpp
index bfe8fa16320..c67c803725b 100644
--- a/tests/Unit/Atomic/atomic_add_local.cpp
+++ b/tests/Unit/Atomic/atomic_add_local.cpp
@@ -1,5 +1,5 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 
@@ -18,21 +18,19 @@ int main(void) {
   array_view<int, 2> av_a(e_a, va); 
 
   extent<2> compute_domain(e_a);
-  parallel_for_each(compute_domain.tile(tile_size, tile_size), [=] (tiled_index<2> tidx) [[hc]] {
-    index<2> localIdx = tidx.local;
-    index<2> globalIdx = tidx.global;
-
+  parallel_for_each(
+    compute_domain.tile(tile_size, tile_size), [=](tiled_index<2> tidx) [[hc]] {
     tile_static unsigned localA[tile_size][tile_size];
-    localA[localIdx[0]][localIdx[1]] = 0;
+    localA[tidx.local[0]][tidx.local[1]] = 0;
     tidx.barrier.wait();
 
     for(int i = 0; i < tile_size; i++) {
       for(int j = 0; j < tile_size; j++) {
-        atomic_fetch_add(&(localA[i][j]), 1);
+        atomic_fetch_add(&(localA[i][j]), 1u);
       }
     }
   tidx.barrier.wait();
-  av_a[globalIdx[0]][globalIdx[1]] = localA[localIdx[0]][localIdx[1]];
+  av_a[tidx.global] = localA[tidx.local[0]][tidx.local[1]];
   });
 
   for(unsigned i = 0; i < vecSize; i++) {
diff --git a/tests/Unit/Atomic/atomic_and_global.cpp b/tests/Unit/Atomic/atomic_and_global.cpp
index 13d1ad75cbb..082f6a6d197 100644
--- a/tests/Unit/Atomic/atomic_and_global.cpp
+++ b/tests/Unit/Atomic/atomic_and_global.cpp
@@ -1,5 +1,5 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <vector>
diff --git a/tests/Unit/Atomic/atomic_and_local.cpp b/tests/Unit/Atomic/atomic_and_local.cpp
index 3f735a8234f..e47cb193714 100644
--- a/tests/Unit/Atomic/atomic_and_local.cpp
+++ b/tests/Unit/Atomic/atomic_and_local.cpp
@@ -1,5 +1,5 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 
@@ -18,12 +18,10 @@ int main(void) {
   array_view<int, 2> av_a(e_a, va); 
 
   extent<2> compute_domain(e_a);
-  parallel_for_each(compute_domain.tile(tile_size, tile_size), [=] (tiled_index<2> tidx) [[hc]] {
-    index<2> localIdx = tidx.local;
-    index<2> globalIdx = tidx.global;
-
+  parallel_for_each(
+    compute_domain.tile(tile_size, tile_size), [=](tiled_index<2> tidx) [[hc]] {
     tile_static int localA[tile_size][tile_size];
-    localA[localIdx[0]][localIdx[1]] = 0;
+    localA[tidx.local[0]][tidx.local[1]] = 0;
     tidx.barrier.wait();
 
     for(int i = 0; i < tile_size; i++) {
@@ -32,7 +30,7 @@ int main(void) {
       }
     }
   tidx.barrier.wait();
-  av_a[globalIdx[0]][globalIdx[1]] = localA[localIdx[0]][localIdx[1]];
+  av_a[tidx.global] = localA[tidx.local[0]][tidx.local[1]];
   });
 
   for(unsigned i = 0; i < vecSize; i++) {
diff --git a/tests/Unit/Atomic/atomic_compare_exchange_global.cpp b/tests/Unit/Atomic/atomic_compare_exchange_global.cpp
index b800e8a9516..ce5f2f55689 100644
--- a/tests/Unit/Atomic/atomic_compare_exchange_global.cpp
+++ b/tests/Unit/Atomic/atomic_compare_exchange_global.cpp
@@ -1,5 +1,5 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <vector>
diff --git a/tests/Unit/Atomic/atomic_compare_exchange_local.cpp b/tests/Unit/Atomic/atomic_compare_exchange_local.cpp
index 28a8cb3f47b..6d452059025 100644
--- a/tests/Unit/Atomic/atomic_compare_exchange_local.cpp
+++ b/tests/Unit/Atomic/atomic_compare_exchange_local.cpp
@@ -1,5 +1,5 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 
@@ -18,13 +18,12 @@ int main(void) {
   array_view<int, 2> av_a(e_a, va); 
 
   extent<2> compute_domain(e_a);
-  parallel_for_each(compute_domain.tile(tile_size, tile_size), [=] (tiled_index<2> tidx) [[cpu, hc]] {
-    index<2> localIdx = tidx.local;
-    index<2> globalIdx = tidx.global;
+  parallel_for_each(
+    compute_domain.tile(tile_size, tile_size), [=](tiled_index<2> tidx) [[hc]] {
     int v = 0;
 
     tile_static int localA[tile_size][tile_size];
-    localA[localIdx[0]][localIdx[1]] = 0;
+    localA[tidx.local[0]][tidx.local[1]] = 0;
     tidx.barrier.wait();
 
     for(int i = 0; i < tile_size; i++) {
@@ -33,7 +32,7 @@ int main(void) {
       }
     }
     tidx.barrier.wait();
-    av_a[globalIdx[0]][globalIdx[1]] = localA[localIdx[0]][localIdx[1]];
+    av_a[tidx.global] = localA[tidx.local[0]][tidx.local[1]];
   });
 
   for(int i = 0; i < vecSize; ++i) {
diff --git a/tests/Unit/Atomic/atomic_dec_global.cpp b/tests/Unit/Atomic/atomic_dec_global.cpp
index 7f814ed6be6..181481cc6f5 100644
--- a/tests/Unit/Atomic/atomic_dec_global.cpp
+++ b/tests/Unit/Atomic/atomic_dec_global.cpp
@@ -1,5 +1,5 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <vector>
@@ -9,7 +9,7 @@ int main(void) {
   const int vecSize = 100;
 
   // Alloc & init input data
-  int init[vecSize] { 0 };
+  int init[vecSize]{};
   array<int, 1> count(vecSize, std::begin(init));
 
   parallel_for_each(count.get_extent(), [=, &count](index<1> idx) [[hc]] {
diff --git a/tests/Unit/Atomic/atomic_dec_local.cpp b/tests/Unit/Atomic/atomic_dec_local.cpp
index 929b8d200eb..971908994dd 100644
--- a/tests/Unit/Atomic/atomic_dec_local.cpp
+++ b/tests/Unit/Atomic/atomic_dec_local.cpp
@@ -1,5 +1,5 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 
@@ -18,12 +18,10 @@ int main(void) {
   array_view<int, 2> av_a(e_a, va); 
 
   extent<2> compute_domain(e_a);
-  parallel_for_each(compute_domain.tile(tile_size, tile_size), [=] (tiled_index<2> tidx) [[cpu, hc]] {
-    index<2> localIdx = tidx.local;
-    index<2> globalIdx = tidx.global;
-
+  parallel_for_each(
+    compute_domain.tile(tile_size, tile_size), [=](tiled_index<2> tidx) [[hc]] {
     tile_static int localA[tile_size][tile_size];
-    localA[localIdx[0]][localIdx[1]] = 0;
+    localA[tidx.local[0]][tidx.local[1]] = 0;
     tidx.barrier.wait();
 
     for(int i = 0; i < tile_size; i++) {
@@ -32,7 +30,7 @@ int main(void) {
       }
     }
   tidx.barrier.wait();
-  av_a[globalIdx[0]][globalIdx[1]] = localA[localIdx[0]][localIdx[1]];
+  av_a[tidx.global] = localA[tidx.local[0]][tidx.local[1]];
   });
 
   for(unsigned i = 0; i < vecSize; i++) {
diff --git a/tests/Unit/Atomic/atomic_exchange_float_global.cpp b/tests/Unit/Atomic/atomic_exchange_float_global.cpp
index a2c92214e9e..4c4ebc25e4f 100644
--- a/tests/Unit/Atomic/atomic_exchange_float_global.cpp
+++ b/tests/Unit/Atomic/atomic_exchange_float_global.cpp
@@ -1,5 +1,5 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <vector>
diff --git a/tests/Unit/Atomic/atomic_exchange_float_local.cpp b/tests/Unit/Atomic/atomic_exchange_float_local.cpp
index e803c4e6ae1..f4f068993f2 100644
--- a/tests/Unit/Atomic/atomic_exchange_float_local.cpp
+++ b/tests/Unit/Atomic/atomic_exchange_float_local.cpp
@@ -1,5 +1,5 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <math.h>
@@ -20,12 +20,13 @@ int main(void) {
   array_view<T, 2> av_a(e_a, va); 
 
   extent<2> compute_domain(e_a);
-  parallel_for_each(compute_domain.tile(tile_size, tile_size), [=] (tiled_index<2> tidx) [[cpu, hc]] {
+  parallel_for_each(
+    compute_domain.tile(tile_size, tile_size), [=](tiled_index<2> tidx) [[hc]] {
     index<2> localIdx = tidx.local;
     index<2> globalIdx = tidx.global;
 
     tile_static T localA[tile_size][tile_size];
-    localA[localIdx[0]][localIdx[1]] = 0;
+    localA[tidx.local[0]][tidx.local[1]] = 0;
     tidx.barrier.wait();
 
     for(int i = 0; i < tile_size; i++) {
@@ -34,7 +35,7 @@ int main(void) {
       }
     }
     tidx.barrier.wait();
-    av_a[globalIdx[0]][globalIdx[1]] = localA[localIdx[0]][localIdx[1]];
+    av_a[tidx.global] = localA[tidx.local[0]][tidx.local[1]];
   });
 
   for(int i = 0; i < vecSize; ++i) {
diff --git a/tests/Unit/Atomic/atomic_exchange_global.cpp b/tests/Unit/Atomic/atomic_exchange_global.cpp
index 2ef397a0e42..e5ea2f42529 100644
--- a/tests/Unit/Atomic/atomic_exchange_global.cpp
+++ b/tests/Unit/Atomic/atomic_exchange_global.cpp
@@ -1,5 +1,5 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <vector>
diff --git a/tests/Unit/Atomic/atomic_exchange_local.cpp b/tests/Unit/Atomic/atomic_exchange_local.cpp
index 0958334c864..7b4a59099d6 100644
--- a/tests/Unit/Atomic/atomic_exchange_local.cpp
+++ b/tests/Unit/Atomic/atomic_exchange_local.cpp
@@ -1,5 +1,5 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 
@@ -18,12 +18,10 @@ int main(void) {
   array_view<int, 2> av_a(e_a, va); 
 
   extent<2> compute_domain(e_a);
-  parallel_for_each(compute_domain.tile(tile_size, tile_size), [=] (tiled_index<2> tidx) [[cpu, hc]] {
-    index<2> localIdx = tidx.local;
-    index<2> globalIdx = tidx.global;
-
+  parallel_for_each(
+    compute_domain.tile(tile_size, tile_size), [=](tiled_index<2> tidx) [[hc]] {
     tile_static int localA[tile_size][tile_size];
-    localA[localIdx[0]][localIdx[1]] = 0;
+    localA[tidx.local[0]][tidx.local[1]] = 0;
     tidx.barrier.wait();
 
     for(int i = 0; i < tile_size; i++) {
@@ -32,7 +30,7 @@ int main(void) {
       }
     }
     tidx.barrier.wait();
-    av_a[globalIdx[0]][globalIdx[1]] = localA[localIdx[0]][localIdx[1]];
+    av_a[tidx.global] = localA[tidx.local[0]][tidx.local[1]];
   });
 
   for(int i = 0; i < vecSize; ++i) {
diff --git a/tests/Unit/Atomic/atomic_inc_global.cpp b/tests/Unit/Atomic/atomic_inc_global.cpp
index 63e0dc3d0da..729ab909b44 100644
--- a/tests/Unit/Atomic/atomic_inc_global.cpp
+++ b/tests/Unit/Atomic/atomic_inc_global.cpp
@@ -1,5 +1,5 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <vector>
diff --git a/tests/Unit/Atomic/atomic_inc_local.cpp b/tests/Unit/Atomic/atomic_inc_local.cpp
index bced3c2947f..6acf3bd929a 100644
--- a/tests/Unit/Atomic/atomic_inc_local.cpp
+++ b/tests/Unit/Atomic/atomic_inc_local.cpp
@@ -1,5 +1,5 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 
@@ -18,12 +18,10 @@ int main(void) {
   array_view<int, 2> av_a(e_a, va); 
 
   extent<2> compute_domain(e_a);
-  parallel_for_each(compute_domain.tile(tile_size, tile_size), [=] (tiled_index<2> tidx) [[cpu, hc]] {
-    index<2> localIdx = tidx.local;
-    index<2> globalIdx = tidx.global;
-
+  parallel_for_each(
+    compute_domain.tile(tile_size, tile_size), [=](tiled_index<2> tidx) [[hc]] {
     tile_static unsigned localA[tile_size][tile_size];
-    localA[localIdx[0]][localIdx[1]] = 0;
+    localA[tidx.local[0]][tidx.local[1]] = 0;
     tidx.barrier.wait();
 
     for(int i = 0; i < tile_size; i++) {
@@ -32,7 +30,7 @@ int main(void) {
       }
     }
   tidx.barrier.wait();
-  av_a[globalIdx[0]][globalIdx[1]] = localA[localIdx[0]][localIdx[1]];
+  av_a[tidx.global] = localA[tidx.local[0]][tidx.local[1]];
   });
 
   for(unsigned i = 0; i < vecSize; i++) {
diff --git a/tests/Unit/Atomic/atomic_max_global.cpp b/tests/Unit/Atomic/atomic_max_global.cpp
index b45d672e705..2a689e347b0 100644
--- a/tests/Unit/Atomic/atomic_max_global.cpp
+++ b/tests/Unit/Atomic/atomic_max_global.cpp
@@ -1,5 +1,5 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <vector>
diff --git a/tests/Unit/Atomic/atomic_max_local.cpp b/tests/Unit/Atomic/atomic_max_local.cpp
index 05f07b8683c..7148630b1d3 100644
--- a/tests/Unit/Atomic/atomic_max_local.cpp
+++ b/tests/Unit/Atomic/atomic_max_local.cpp
@@ -1,5 +1,5 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 
@@ -18,12 +18,10 @@ int main(void) {
   array_view<int, 2> av_a(e_a, va); 
 
   extent<2> compute_domain(e_a);
-  parallel_for_each(compute_domain.tile(tile_size, tile_size), [=] (tiled_index<2> tidx) [[cpu, hc]] {
-    index<2> localIdx = tidx.local;
-    index<2> globalIdx = tidx.global;
-
+  parallel_for_each(
+    compute_domain.tile(tile_size, tile_size), [=](tiled_index<2> tidx) [[hc]] {
     tile_static int localA[tile_size][tile_size];
-    localA[localIdx[0]][localIdx[1]] = 0;
+    localA[tidx.local[0]][tidx.local[1]] = 0;
     tidx.barrier.wait();
 
     for(int i = 0; i < tile_size; i++) {
@@ -32,7 +30,7 @@ int main(void) {
       }
     }
   tidx.barrier.wait();
-  av_a[globalIdx[0]][globalIdx[1]] = localA[localIdx[0]][localIdx[1]];
+  av_a[tidx.global] = localA[tidx.local[0]][tidx.local[1]];
   });
 
   for(unsigned i = 0; i < vecSize; i++) {
diff --git a/tests/Unit/Atomic/atomic_min_global.cpp b/tests/Unit/Atomic/atomic_min_global.cpp
index 858668cf405..be8271c51b8 100644
--- a/tests/Unit/Atomic/atomic_min_global.cpp
+++ b/tests/Unit/Atomic/atomic_min_global.cpp
@@ -1,5 +1,5 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <vector>
diff --git a/tests/Unit/Atomic/atomic_min_local.cpp b/tests/Unit/Atomic/atomic_min_local.cpp
index 6d48dbc3a6d..da5a38a1063 100644
--- a/tests/Unit/Atomic/atomic_min_local.cpp
+++ b/tests/Unit/Atomic/atomic_min_local.cpp
@@ -1,5 +1,5 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 
@@ -18,12 +18,10 @@ int main(void) {
   array_view<int, 2> av_a(e_a, va); 
 
   extent<2> compute_domain(e_a);
-  parallel_for_each(compute_domain.tile(tile_size, tile_size), [=] (tiled_index<2> tidx) [[cpu, hc]] {
-    index<2> localIdx = tidx.local;
-    index<2> globalIdx = tidx.global;
-
+  parallel_for_each(
+    compute_domain.tile(tile_size, tile_size), [=](tiled_index<2> tidx) [[hc]] {
     tile_static int localA[tile_size][tile_size];
-    localA[localIdx[0]][localIdx[1]] = 0;
+    localA[tidx.local[0]][tidx.local[1]] = 0;
     tidx.barrier.wait();
 
     for(int i = 0; i < tile_size; i++) {
@@ -32,7 +30,7 @@ int main(void) {
       }
     }
   tidx.barrier.wait();
-  av_a[globalIdx[0]][globalIdx[1]] = localA[localIdx[0]][localIdx[1]];
+  av_a[tidx.global] = localA[tidx.local[0]][tidx.local[1]];
   });
 
   for(unsigned i = 0; i < vecSize; i++) {
diff --git a/tests/Unit/Atomic/atomic_or_global.cpp b/tests/Unit/Atomic/atomic_or_global.cpp
index 35e3e41736e..ac2d4909b74 100644
--- a/tests/Unit/Atomic/atomic_or_global.cpp
+++ b/tests/Unit/Atomic/atomic_or_global.cpp
@@ -1,5 +1,5 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <vector>
diff --git a/tests/Unit/Atomic/atomic_or_local.cpp b/tests/Unit/Atomic/atomic_or_local.cpp
index 0e9e6e29451..8900a56eb85 100644
--- a/tests/Unit/Atomic/atomic_or_local.cpp
+++ b/tests/Unit/Atomic/atomic_or_local.cpp
@@ -1,5 +1,5 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 
@@ -18,12 +18,10 @@ int main(void) {
   array_view<int, 2> av_a(e_a, va); 
 
   extent<2> compute_domain(e_a);
-  parallel_for_each(compute_domain.tile(tile_size, tile_size), [=] (tiled_index<2> tidx) [[cpu, hc]] {
-    index<2> localIdx = tidx.local;
-    index<2> globalIdx = tidx.global;
-
+  parallel_for_each(
+    compute_domain.tile(tile_size, tile_size), [=](tiled_index<2> tidx) [[hc]] {
     tile_static int localA[tile_size][tile_size];
-    localA[localIdx[0]][localIdx[1]] = 0;
+    localA[tidx.local[0]][tidx.local[1]] = 0;
     tidx.barrier.wait();
 
     for(int i = 0; i < tile_size; i++) {
@@ -32,7 +30,7 @@ int main(void) {
       }
     }
   tidx.barrier.wait();
-  av_a[globalIdx[0]][globalIdx[1]] = localA[localIdx[0]][localIdx[1]];
+  av_a[tidx.global] = localA[tidx.local[0]][tidx.local[1]];
   });
 
   for(unsigned i = 0; i < vecSize; i++) {
diff --git a/tests/Unit/Atomic/atomic_sub_float_global.cpp b/tests/Unit/Atomic/atomic_sub_float_global.cpp
index 92fb93dd001..c3caa4301df 100644
--- a/tests/Unit/Atomic/atomic_sub_float_global.cpp
+++ b/tests/Unit/Atomic/atomic_sub_float_global.cpp
@@ -1,5 +1,5 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <vector>
@@ -13,28 +13,32 @@ using namespace hc;
 #define INIT 0.5f
 
 int main(void) {
-  const int vecSize = 100;
+  #if defined(FLOAT_ATOMICS)
+    const int vecSize = 100;
 
-  // Alloc & init input data
-  std::vector<T> init(vecSize, INIT);
-  array<T, 1> count(vecSize, init.begin());
+    // Alloc & init input data
+    std::vector<T> init(vecSize, INIT);
+    array<T, 1> count(vecSize, init.begin());
 
-  parallel_for_each(count.get_extent(), [=, &count](index<1> idx) [[hc]] {
-    for(unsigned i = 0; i < vecSize; i++) {
-      atomic_fetch_sub(&count[i], INIT);
-    }
-  });
+    parallel_for_each(count.get_extent(), [=, &count](index<1> idx) [[hc]] {
+      for(unsigned i = 0; i < vecSize; i++) {
+        atomic_fetch_sub(&count[i], INIT);
+      }
+    });
 
-  array_view<T, 1> av(count);
+    array_view<T, 1> av(count);
 
-  bool ret = true;
-  float sum = -std::accumulate(init.begin(), init.end(), 0.0f);
-  sum += INIT;
-  for(unsigned i = 0; i < vecSize; ++i) {
-      if(fabs(av[i] - sum) > TOLERANCE) {
-        ret = false;
-      }
-  }
+    bool ret = true;
+    float sum = -std::accumulate(init.begin(), init.end(), 0.0f);
+    sum += INIT;
+    for(unsigned i = 0; i < vecSize; ++i) {
+        if(fabs(av[i] - sum) > TOLERANCE) {
+          ret = false;
+        }
+    }
 
-  return !(ret == true);
+    return !(ret == true);
+  #else
+    return EXIT_SUCCESS;
+  #endif
 }
diff --git a/tests/Unit/Atomic/atomic_sub_float_local.cpp b/tests/Unit/Atomic/atomic_sub_float_local.cpp
index 7448d1a937b..aff7ae6fbd5 100644
--- a/tests/Unit/Atomic/atomic_sub_float_local.cpp
+++ b/tests/Unit/Atomic/atomic_sub_float_local.cpp
@@ -1,5 +1,5 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <math.h>
@@ -11,43 +11,45 @@ using namespace hc;
 #define TOLERANCE 1e-5
 
 int main(void) {
-  const int vecSize = 100;
-  const int tile_size = 10;
-
-  // Alloc & init input data
-  extent<2> e_a(vecSize, vecSize);
-  std::vector<T> va(vecSize * vecSize, INIT);
-  array_view<T, 2> av_a(e_a, va); 
-
-  extent<2> compute_domain(e_a);
-  parallel_for_each(compute_domain.tile(tile_size, tile_size), [=] (tiled_index<2> tidx) [[cpu, hc]] {
-    index<2> localIdx = tidx.local;
-    index<2> globalIdx = tidx.global;
-
-    tile_static T localA[tile_size][tile_size];
-    localA[localIdx[0]][localIdx[1]] = 0;
-    tidx.barrier.wait();
-
-    for(int i = 0; i < tile_size; i++) {
-      for(int j = 0; j < tile_size; j++) {
-        atomic_fetch_sub(&(localA[i][j]), INIT);
+  #if defined(FLOAT_ATOMICS)
+    const int vecSize = 100;
+    const int tile_size = 10;
+
+    // Alloc & init input data
+    extent<2> e_a(vecSize, vecSize);
+    std::vector<T> va(vecSize * vecSize, INIT);
+    array_view<T, 2> av_a(e_a, va); 
+
+    extent<2> compute_domain(e_a);
+    parallel_for_each(
+      compute_domain.tile(tile_size, tile_size), [=](tiled_index<2> tidx) [[hc]] {
+      tile_static T localA[tile_size][tile_size];
+      localA[tidx.local[0]][tidx.local[1]] = 0;
+      tidx.barrier.wait();
+
+      for(int i = 0; i < tile_size; i++) {
+        for(int j = 0; j < tile_size; j++) {
+          atomic_fetch_sub(&(localA[i][j]), INIT);
+        }
       }
-    }
-  tidx.barrier.wait();
-  av_a[globalIdx[0]][globalIdx[1]] = localA[localIdx[0]][localIdx[1]];
-  });
-
-  // accumlate tile_size * tile_size times
-  float sum = 0.0f;
-  for (int i = 0; i < tile_size * tile_size; ++i)
-    sum -= INIT;
-  for(unsigned i = 0; i < vecSize; i++) {
-    for(unsigned j = 0; j < vecSize; j++) {
-      if(fabs(av_a(i, j) - sum) > TOLERANCE) {
-        return 1;
+    tidx.barrier.wait();
+    av_a[tidx.global] = localA[tidx.local[0]][tidx.local[1]];
+    });
+
+    // accumlate tile_size * tile_size times
+    float sum = 0.0f;
+    for (int i = 0; i < tile_size * tile_size; ++i)
+      sum -= INIT;
+    for(unsigned i = 0; i < vecSize; i++) {
+      for(unsigned j = 0; j < vecSize; j++) {
+        if(fabs(av_a(i, j) - sum) > TOLERANCE) {
+          return 1;
+        }
       }
     }
-  }
 
-  return 0;
+    return 0;
+  #else
+    return EXIT_SUCCESS;
+  #endif
 }
diff --git a/tests/Unit/Atomic/atomic_sub_global.cpp b/tests/Unit/Atomic/atomic_sub_global.cpp
index aec8d22eaaa..f22b9fda3c4 100644
--- a/tests/Unit/Atomic/atomic_sub_global.cpp
+++ b/tests/Unit/Atomic/atomic_sub_global.cpp
@@ -1,5 +1,5 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <vector>
diff --git a/tests/Unit/Atomic/atomic_sub_local.cpp b/tests/Unit/Atomic/atomic_sub_local.cpp
index 64847472961..0f66b6ae001 100644
--- a/tests/Unit/Atomic/atomic_sub_local.cpp
+++ b/tests/Unit/Atomic/atomic_sub_local.cpp
@@ -1,5 +1,5 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 
@@ -18,12 +18,10 @@ int main(void) {
   array_view<int, 2> av_a(e_a, va); 
 
   extent<2> compute_domain(e_a);
-  parallel_for_each(compute_domain.tile(tile_size, tile_size), [=] (tiled_index<2> tidx) [[cpu, hc]] {
-    index<2> localIdx = tidx.local;
-    index<2> globalIdx = tidx.global;
-
+  parallel_for_each(
+    compute_domain.tile(tile_size, tile_size), [=](tiled_index<2> tidx) [[hc]] {
     tile_static int localA[tile_size][tile_size];
-    localA[localIdx[0]][localIdx[1]] = 0;
+    localA[tidx.local[0]][tidx.local[1]] = 0;
     tidx.barrier.wait();
 
     for(int i = 0; i < tile_size; i++) {
@@ -32,7 +30,7 @@ int main(void) {
       }
     }
   tidx.barrier.wait();
-  av_a[globalIdx[0]][globalIdx[1]] = localA[localIdx[0]][localIdx[1]];
+  av_a[tidx.global] = localA[tidx.local[0]][tidx.local[1]];
   });
 
   for(unsigned i = 0; i < vecSize; i++) {
diff --git a/tests/Unit/Atomic/atomic_xor_global.cpp b/tests/Unit/Atomic/atomic_xor_global.cpp
index 545bd41b7d5..409337ec9b6 100644
--- a/tests/Unit/Atomic/atomic_xor_global.cpp
+++ b/tests/Unit/Atomic/atomic_xor_global.cpp
@@ -1,7 +1,7 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
+
 #include <stdlib.h>
-#include <iostream>
 #include <vector>
 using namespace hc;
 
diff --git a/tests/Unit/Atomic/atomic_xor_local.cpp b/tests/Unit/Atomic/atomic_xor_local.cpp
index fa67fed4367..d43f54492e4 100644
--- a/tests/Unit/Atomic/atomic_xor_local.cpp
+++ b/tests/Unit/Atomic/atomic_xor_local.cpp
@@ -1,5 +1,5 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 
@@ -18,12 +18,10 @@ int main(void) {
   array_view<int, 2> av_a(e_a, va); 
 
   extent<2> compute_domain(e_a);
-  parallel_for_each(compute_domain.tile(tile_size, tile_size), [=] (tiled_index<2> tidx) [[cpu, hc]] {
-    index<2> localIdx = tidx.local;
-    index<2> globalIdx = tidx.global;
-
+  parallel_for_each(
+    compute_domain.tile(tile_size, tile_size), [=](tiled_index<2> tidx) [[hc]] {
     tile_static int localA[tile_size][tile_size];
-    localA[localIdx[0]][localIdx[1]] = 0;
+    localA[tidx.local[0]][tidx.local[1]] = 0;
     tidx.barrier.wait();
 
     for(int i = 0; i < tile_size; i++) {
@@ -31,8 +29,8 @@ int main(void) {
         atomic_fetch_xor(&(localA[i][j]), 1);
       }
     }
-  tidx.barrier.wait();
-  av_a[globalIdx[0]][globalIdx[1]] = localA[localIdx[0]][localIdx[1]];
+    tidx.barrier.wait();
+    av_a[tidx.global] = localA[tidx.local[0]][tidx.local[1]];
   });
 
   for(unsigned i = 0; i < vecSize; i++) {
diff --git a/tests/Unit/CXXLangExt/array_array.cpp b/tests/Unit/CXXLangExt/array_array.cpp
index 09a4bf3454a..77ed231d75d 100644
--- a/tests/Unit/CXXLangExt/array_array.cpp
+++ b/tests/Unit/CXXLangExt/array_array.cpp
@@ -2,10 +2,10 @@
 // RUN: %hc %s -o %t.out && %t.out
 
 #include <iostream>
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with fine-grained SVM
 // require HSA Full Profile to operate successfully
@@ -54,7 +54,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/CXXLangExt/array_pointer.cpp b/tests/Unit/CXXLangExt/array_pointer.cpp
index 43531acce7b..5176f54369a 100644
--- a/tests/Unit/CXXLangExt/array_pointer.cpp
+++ b/tests/Unit/CXXLangExt/array_pointer.cpp
@@ -2,10 +2,10 @@
 // RUN: %hc %s -o %t.out && %t.out
 
 #include <iostream>
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with fine-grained SVM
 // requires HSA Full Profile to operate successfully
@@ -48,7 +48,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/CXXLangExt/enum.cpp b/tests/Unit/CXXLangExt/enum.cpp
index 609b142c508..502cbd6515c 100644
--- a/tests/Unit/CXXLangExt/enum.cpp
+++ b/tests/Unit/CXXLangExt/enum.cpp
@@ -21,10 +21,10 @@
 // RUN: %hc -DTYPE="unsigned long long"  %s -o %t.out && %t.out
 
 #include <iostream>
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with fine-grained SVM
 // requires HSA Full Profile to operate successfully
@@ -74,7 +74,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/CXXLangExt/function_declarator_Varargs.cpp b/tests/Unit/CXXLangExt/function_declarator_Varargs.cpp
index aa5ad0fc04f..fd8eee9491d 100644
--- a/tests/Unit/CXXLangExt/function_declarator_Varargs.cpp
+++ b/tests/Unit/CXXLangExt/function_declarator_Varargs.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 void NoEllipsisAllowed(int x, ...) [[hc]] {}
 
diff --git a/tests/Unit/CXXLangExt/local_param_ret.cpp b/tests/Unit/CXXLangExt/local_param_ret.cpp
index e7fec540c5e..28923af171d 100644
--- a/tests/Unit/CXXLangExt/local_param_ret.cpp
+++ b/tests/Unit/CXXLangExt/local_param_ret.cpp
@@ -30,9 +30,9 @@
 
 // RUN: %hc -DTYPE="wchar_t"  %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <cmath>
 #include <iostream>
@@ -81,7 +81,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/CXXLangExt/local_param_ret_half-float.cpp b/tests/Unit/CXXLangExt/local_param_ret_half-float.cpp
index 710112f7d18..8525ab79652 100644
--- a/tests/Unit/CXXLangExt/local_param_ret_half-float.cpp
+++ b/tests/Unit/CXXLangExt/local_param_ret_half-float.cpp
@@ -2,7 +2,7 @@
 // RUN: %hc -DTYPE="half float" %s -o %t.out && %t.out
 
 #include <iostream>
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 TYPE func(TYPE arg) [[hc]]
 {
diff --git a/tests/Unit/CXXLangExt/local_param_ret_pointer-to-function.cpp b/tests/Unit/CXXLangExt/local_param_ret_pointer-to-function.cpp
index 181999a4ffc..0843a34b83d 100644
--- a/tests/Unit/CXXLangExt/local_param_ret_pointer-to-function.cpp
+++ b/tests/Unit/CXXLangExt/local_param_ret_pointer-to-function.cpp
@@ -2,10 +2,10 @@
 // RUN: %hc %s -o %t.out && %t.out
 
 #include <iostream>
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with fine-grained SVM
 // requires HSA Full Profile to operate successfully
@@ -50,7 +50,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/CXXLangExt/local_param_ret_pointer.cpp b/tests/Unit/CXXLangExt/local_param_ret_pointer.cpp
index 33aac7f0b13..fb287f97256 100644
--- a/tests/Unit/CXXLangExt/local_param_ret_pointer.cpp
+++ b/tests/Unit/CXXLangExt/local_param_ret_pointer.cpp
@@ -30,9 +30,9 @@
 
 // RUN: %hc -DTYPE="wchar_t"  %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <cmath>
 #include <iostream>
@@ -82,7 +82,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/CXXLangExt/local_param_ret_ref-to-pointer.cpp b/tests/Unit/CXXLangExt/local_param_ret_ref-to-pointer.cpp
index f86eeb0e762..f8d91a44e91 100644
--- a/tests/Unit/CXXLangExt/local_param_ret_ref-to-pointer.cpp
+++ b/tests/Unit/CXXLangExt/local_param_ret_ref-to-pointer.cpp
@@ -30,9 +30,9 @@
 
 // RUN: %hc -DTYPE="wchar_t"  %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <cmath>
 #include <iostream>
@@ -83,7 +83,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/CXXLangExt/local_param_ret_ref.cpp b/tests/Unit/CXXLangExt/local_param_ret_ref.cpp
index 9ef2038bee9..5927e94f934 100644
--- a/tests/Unit/CXXLangExt/local_param_ret_ref.cpp
+++ b/tests/Unit/CXXLangExt/local_param_ret_ref.cpp
@@ -30,9 +30,9 @@
 
 // RUN: %hc -DTYPE="wchar_t"  %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <cmath>
 #include <iostream>
@@ -82,7 +82,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/CXXLangExt/local_param_ret_static-local.cpp b/tests/Unit/CXXLangExt/local_param_ret_static-local.cpp
index c10e3e1fd52..20f3ae172dc 100644
--- a/tests/Unit/CXXLangExt/local_param_ret_static-local.cpp
+++ b/tests/Unit/CXXLangExt/local_param_ret_static-local.cpp
@@ -28,10 +28,8 @@
 
 // RUN: %hc -DTYPE="wchar_t"  %s -o %t.out && %t.out
 
-#include <hc.hpp>
-
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <algorithm>
 #include <iostream>
@@ -76,7 +74,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/CXXLangExt/statement_asm.cpp b/tests/Unit/CXXLangExt/statement_asm.cpp
index 00f500324d0..35bc4a8d6a4 100644
--- a/tests/Unit/CXXLangExt/statement_asm.cpp
+++ b/tests/Unit/CXXLangExt/statement_asm.cpp
@@ -2,7 +2,7 @@
 // RUN: %hc %s -o %t.out && %t.out
 
 #include <iostream>
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 void func () [[hc]]
 {
diff --git a/tests/Unit/CXXLangExt/statement_global-variable.cpp b/tests/Unit/CXXLangExt/statement_global-variable.cpp
index 5a03208c98e..0606e0145f5 100644
--- a/tests/Unit/CXXLangExt/statement_global-variable.cpp
+++ b/tests/Unit/CXXLangExt/statement_global-variable.cpp
@@ -2,7 +2,7 @@
 // RUN: %hc %s -o %t.out && %t.out
 
 #include <iostream>
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 [[hc]] int flag;
 
diff --git a/tests/Unit/CXXLangExt/statement_goto_label.cpp b/tests/Unit/CXXLangExt/statement_goto_label.cpp
index 5aff19ae8ed..d6e8cba6a1c 100644
--- a/tests/Unit/CXXLangExt/statement_goto_label.cpp
+++ b/tests/Unit/CXXLangExt/statement_goto_label.cpp
@@ -2,10 +2,10 @@
 // RUN: %hc %s -o %t.out && %t.out
 
 #include <iostream>
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with fine-grained SVM
 // requires HSA Full Profile to operate successfully
@@ -51,7 +51,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/CXXLangExt/statement_recursion.cpp b/tests/Unit/CXXLangExt/statement_recursion.cpp
index 991a160fb84..6bb1454d196 100644
--- a/tests/Unit/CXXLangExt/statement_recursion.cpp
+++ b/tests/Unit/CXXLangExt/statement_recursion.cpp
@@ -2,9 +2,9 @@
 // RUN: %hc %s -o %t.out && %t.out
 
 #include <iostream>
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
-int fib(int x) restrict (amp, cpu) {
+int fib(int x) [[cpu, hc]] {
   if (x == 0) return 0;
   if (x == 1) return 1;
   return fib(x - 1) + fib(x - 2);
diff --git a/tests/Unit/CXXLangExt/struct_class_union.cpp b/tests/Unit/CXXLangExt/struct_class_union.cpp
index cd6f7315ea7..64532d23736 100644
--- a/tests/Unit/CXXLangExt/struct_class_union.cpp
+++ b/tests/Unit/CXXLangExt/struct_class_union.cpp
@@ -29,9 +29,9 @@
 // RUN: %hc -DTYPE="bool"  %s -o %t.out && %t.out
 
 // RUN: %hc -DTYPE="wchar_t"  %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <cmath>
 #include <iostream>
@@ -93,7 +93,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/CXXLangExt/struct_class_union_bitfields.cpp b/tests/Unit/CXXLangExt/struct_class_union_bitfields.cpp
index 2ac0e5700bd..52fdde352e6 100644
--- a/tests/Unit/CXXLangExt/struct_class_union_bitfields.cpp
+++ b/tests/Unit/CXXLangExt/struct_class_union_bitfields.cpp
@@ -2,10 +2,10 @@
 // RUN: %hc %s -o %t.out && %t.out
 
 #include <iostream>
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with fine-grained SVM
 // requires HSA Full Profile to operate successfully
@@ -65,7 +65,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/CXXLangExt/struct_class_union_half-float.cpp b/tests/Unit/CXXLangExt/struct_class_union_half-float.cpp
index 9cb2e53e379..e9e796aca39 100644
--- a/tests/Unit/CXXLangExt/struct_class_union_half-float.cpp
+++ b/tests/Unit/CXXLangExt/struct_class_union_half-float.cpp
@@ -2,7 +2,7 @@
 // RUN: %hc -DTYPE="half float" %s -o %t.out && %t.out
 
 #include <iostream>
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 struct S {
   TYPE var;
diff --git a/tests/Unit/CXXLangExt/struct_class_union_pointer.cpp b/tests/Unit/CXXLangExt/struct_class_union_pointer.cpp
index b8697ad48e2..a8651abfece 100644
--- a/tests/Unit/CXXLangExt/struct_class_union_pointer.cpp
+++ b/tests/Unit/CXXLangExt/struct_class_union_pointer.cpp
@@ -30,9 +30,8 @@
 
 // RUN: %hc -DTYPE="wchar_t"  %s -o %t.out && %t.out
 
-#include <hc.hpp>
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <cmath>
 #include <iostream>
@@ -95,7 +94,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/CXXLangExt/struct_class_union_ref.cpp b/tests/Unit/CXXLangExt/struct_class_union_ref.cpp
index 64cd0e48bda..c6983e72d2a 100644
--- a/tests/Unit/CXXLangExt/struct_class_union_ref.cpp
+++ b/tests/Unit/CXXLangExt/struct_class_union_ref.cpp
@@ -30,9 +30,8 @@
 
 // RUN: %hc -DTYPE="wchar_t"  %s -o %t.out && %t.out
 
-#include <hc.hpp>
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <cmath>
 #include <iostream>
@@ -42,13 +41,13 @@
 
 struct S {
   TYPE & ref;
-  explicit S(TYPE &var) restrict (amp) : ref(var) {};
+  explicit S(TYPE &var) [[hc]] : ref(var) {};
 };
 
 class C {
 public:
   TYPE & ref;
-  explicit C(TYPE &var) restrict (amp) : ref(var) {};
+  explicit C(TYPE &var) [[hc]] : ref(var) {};
 };
 
 bool test() {
@@ -89,7 +88,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/CXXLangExt/struct_class_union_unaligned-member.cpp b/tests/Unit/CXXLangExt/struct_class_union_unaligned-member.cpp
index e2c4ed63233..aae5bf272ef 100644
--- a/tests/Unit/CXXLangExt/struct_class_union_unaligned-member.cpp
+++ b/tests/Unit/CXXLangExt/struct_class_union_unaligned-member.cpp
@@ -2,10 +2,9 @@
 // RUN: %hc %s -o %t.out && %t.out
 
 #include <iostream>
-#include <hc.hpp>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with fine-grained SVM
 // requires HSA Full Profile to operate successfully
@@ -54,7 +53,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/CXXLangExt/struct_class_virtual-base-class.cpp b/tests/Unit/CXXLangExt/struct_class_virtual-base-class.cpp
index 0d854eeac0d..a6c230c4384 100644
--- a/tests/Unit/CXXLangExt/struct_class_virtual-base-class.cpp
+++ b/tests/Unit/CXXLangExt/struct_class_virtual-base-class.cpp
@@ -2,7 +2,7 @@
 // RUN: %hc %s -o %t.out && %t.out
 
 #include <iostream>
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 class MyBaseClass
 {
diff --git a/tests/Unit/CXXLangExt/tile_static.cpp b/tests/Unit/CXXLangExt/tile_static.cpp
index 59835807f1b..1788b5d05a2 100644
--- a/tests/Unit/CXXLangExt/tile_static.cpp
+++ b/tests/Unit/CXXLangExt/tile_static.cpp
@@ -8,7 +8,7 @@
 // RUN: %hc -DTYPE="signed short"  %s -o %t.out && %t.out
 // RUN: %hc -DTYPE="unsigned short"  %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <iostream>
 
 // A test which uses different types not allowed in C++AMP specification
diff --git a/tests/Unit/CaptureByCopy/test1.cpp b/tests/Unit/CaptureByCopy/test1.cpp
index fb1b6b009d5..9d98f4b535d 100644
--- a/tests/Unit/CaptureByCopy/test1.cpp
+++ b/tests/Unit/CaptureByCopy/test1.cpp
@@ -1,13 +1,13 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <atomic>
 #include <iostream>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with fine-grained SVM
 // requires HSA Full Profile to operate successfully
@@ -147,7 +147,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
 
     ret &= test1(user_functor());
     ret &= test2(user_functor());
diff --git a/tests/Unit/CaptureByCopy/test2.cpp b/tests/Unit/CaptureByCopy/test2.cpp
index ee30b9eaba7..f0c4ed9cd76 100644
--- a/tests/Unit/CaptureByCopy/test2.cpp
+++ b/tests/Unit/CaptureByCopy/test2.cpp
@@ -1,13 +1,13 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <atomic>
 #include <iostream>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with fine-grained SVM
 // requires HSA Full Profile to operate successfully
@@ -152,7 +152,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
 
     ret &= test1<int, SIZE>(user_functor<int>());
     ret &= test1<unsigned, SIZE>(user_functor<unsigned>());
diff --git a/tests/Unit/CaptureByCopy/test3.cpp b/tests/Unit/CaptureByCopy/test3.cpp
index eb8bb87b389..1ca357d5438 100644
--- a/tests/Unit/CaptureByCopy/test3.cpp
+++ b/tests/Unit/CaptureByCopy/test3.cpp
@@ -1,14 +1,12 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
-
 #include <atomic>
 #include <iostream>
 #include <random>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with fine-grained SVM
 // requires HSA Full Profile to operate successfully
@@ -151,7 +149,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
 
     // setup RNG
     std::random_device rd;
diff --git a/tests/Unit/CaptureByCopy/test4.cpp b/tests/Unit/CaptureByCopy/test4.cpp
index 911ee980964..c73883fef93 100644
--- a/tests/Unit/CaptureByCopy/test4.cpp
+++ b/tests/Unit/CaptureByCopy/test4.cpp
@@ -1,14 +1,12 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
-
 #include <atomic>
 #include <iostream>
 #include <random>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with fine-grained SVM
 // requires HSA Full Profile to operate successfully
@@ -156,7 +154,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
 
     // setup RNG
     std::random_device rd;
diff --git a/tests/Unit/CaptureByRef/test1.cpp b/tests/Unit/CaptureByRef/test1.cpp
index 5241c4ac94f..7320ef774e7 100644
--- a/tests/Unit/CaptureByRef/test1.cpp
+++ b/tests/Unit/CaptureByRef/test1.cpp
@@ -1,11 +1,10 @@
 
 // RUN: %hc %s -o %t.out && %t.out
-#include <hc.hpp>
 #include <iostream>
 #include <cstdlib>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with fine-grained SVM
 // requires HSA Full Profile to operate successfully
@@ -49,7 +48,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/CaptureByRef/test10.cpp b/tests/Unit/CaptureByRef/test10.cpp
index f06ed28dca8..9f29892c3fa 100644
--- a/tests/Unit/CaptureByRef/test10.cpp
+++ b/tests/Unit/CaptureByRef/test10.cpp
@@ -1,12 +1,11 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
 #include <iostream>
 #include <cstdlib>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with fine-grained SVM
 // requires HSA Full Profile to operate successfully
@@ -57,7 +56,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/CaptureByRef/test11.cpp b/tests/Unit/CaptureByRef/test11.cpp
index 1b036fe9791..ec9dfd1f8aa 100644
--- a/tests/Unit/CaptureByRef/test11.cpp
+++ b/tests/Unit/CaptureByRef/test11.cpp
@@ -1,12 +1,11 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
 #include <iostream>
 #include <cstdlib>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with fine-grained SVM
 // requires HSA Full Profile to operate successfully
@@ -82,7 +81,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/CaptureByRef/test12.cpp b/tests/Unit/CaptureByRef/test12.cpp
index 2fe0b880581..79cdc6b05e6 100644
--- a/tests/Unit/CaptureByRef/test12.cpp
+++ b/tests/Unit/CaptureByRef/test12.cpp
@@ -1,12 +1,11 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
 #include <iostream>
 #include <cstdlib>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with fine-grained SVM
 // requires HSA Full Profile to operate successfully
@@ -54,7 +53,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/CaptureByRef/test13.cpp b/tests/Unit/CaptureByRef/test13.cpp
index 4f570985f13..542039f60c4 100644
--- a/tests/Unit/CaptureByRef/test13.cpp
+++ b/tests/Unit/CaptureByRef/test13.cpp
@@ -1,12 +1,12 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <iostream>
 #include <cstdlib>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with fine-grained SVM
 // requires HSA Full Profile to operate successfully
@@ -62,7 +62,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/CaptureByRef/test14.cpp b/tests/Unit/CaptureByRef/test14.cpp
index 8d2cda5bb93..1e5df0a9848 100644
--- a/tests/Unit/CaptureByRef/test14.cpp
+++ b/tests/Unit/CaptureByRef/test14.cpp
@@ -1,12 +1,12 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <iostream>
 #include <cstdlib>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with fine-grained SVM
 // requires HSA Full Profile to operate successfully
@@ -69,7 +69,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/CaptureByRef/test15.cpp b/tests/Unit/CaptureByRef/test15.cpp
index b246f034e30..e7b4326cc55 100644
--- a/tests/Unit/CaptureByRef/test15.cpp
+++ b/tests/Unit/CaptureByRef/test15.cpp
@@ -1,12 +1,12 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <iostream>
 #include <cstdlib>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with fine-grained SVM
 // requires HSA Full Profile to operate successfully
@@ -67,7 +67,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/CaptureByRef/test2.cpp b/tests/Unit/CaptureByRef/test2.cpp
index ad22024d8b1..e8feae11e4e 100644
--- a/tests/Unit/CaptureByRef/test2.cpp
+++ b/tests/Unit/CaptureByRef/test2.cpp
@@ -1,12 +1,12 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <iostream>
 #include <cstdlib>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with fine-grained SVM
 // requires HSA Full Profile to operate successfully
@@ -51,7 +51,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/CaptureByRef/test3.cpp b/tests/Unit/CaptureByRef/test3.cpp
index db7d0b8495a..2bcdedd58f0 100644
--- a/tests/Unit/CaptureByRef/test3.cpp
+++ b/tests/Unit/CaptureByRef/test3.cpp
@@ -1,12 +1,12 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <iostream>
 #include <cstdlib>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with fine-grained SVM
 // requires HSA Full Profile to operate successfully
@@ -47,7 +47,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/CaptureByRef/test4.cpp b/tests/Unit/CaptureByRef/test4.cpp
index 2386def9df9..8e9f3ce9eef 100644
--- a/tests/Unit/CaptureByRef/test4.cpp
+++ b/tests/Unit/CaptureByRef/test4.cpp
@@ -1,12 +1,12 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <iostream>
 #include <cstdlib>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with fine-grained SVM
 // requires HSA Full Profile to operate successfully
@@ -48,7 +48,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/CaptureByRef/test5.cpp b/tests/Unit/CaptureByRef/test5.cpp
index 0a570cff190..80110bfd571 100644
--- a/tests/Unit/CaptureByRef/test5.cpp
+++ b/tests/Unit/CaptureByRef/test5.cpp
@@ -1,12 +1,12 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <iostream>
 #include <cstdlib>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with fine-grained SVM
 // requires HSA Full Profile to operate successfully
@@ -50,7 +50,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/CaptureByRef/test6.cpp b/tests/Unit/CaptureByRef/test6.cpp
index 1395cec6486..59366303ffa 100644
--- a/tests/Unit/CaptureByRef/test6.cpp
+++ b/tests/Unit/CaptureByRef/test6.cpp
@@ -1,12 +1,12 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <iostream>
 #include <cstdlib>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with fine-grained SVM
 // requires HSA Full Profile to operate successfully
@@ -51,7 +51,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/CaptureByRef/test7.cpp b/tests/Unit/CaptureByRef/test7.cpp
index 3fba370dbb2..a453410d463 100644
--- a/tests/Unit/CaptureByRef/test7.cpp
+++ b/tests/Unit/CaptureByRef/test7.cpp
@@ -1,12 +1,12 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <iostream>
 #include <cstdlib>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with fine-grained SVM
 // requires HSA Full Profile to operate successfully
@@ -52,7 +52,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/CaptureByRef/test8.cpp b/tests/Unit/CaptureByRef/test8.cpp
index 95411117af3..541363fb424 100644
--- a/tests/Unit/CaptureByRef/test8.cpp
+++ b/tests/Unit/CaptureByRef/test8.cpp
@@ -1,12 +1,11 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
 #include <iostream>
 #include <cstdlib>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with fine-grained SVM
 // requires HSA Full Profile to operate successfully
@@ -56,7 +55,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/CaptureByRef/test9.cpp b/tests/Unit/CaptureByRef/test9.cpp
deleted file mode 100644
index e2045e9e36f..00000000000
--- a/tests/Unit/CaptureByRef/test9.cpp
+++ /dev/null
@@ -1,70 +0,0 @@
-
-// RUN: %hc %s -o %t.out && %t.out
-
-#include <hc.hpp>
-#include <iostream>
-#include <cstdlib>
-
-// added for checking HSA profile
-#include <hc.hpp>
-
-// test C++AMP with fine-grained SVM
-// requires HSA Full Profile to operate successfully
-
-#define VECTOR_SIZE (16)
-
-bool test() {
-  using namespace hc;
-
-  int p = rand() % 15 + 1;
-
-  int table[VECTOR_SIZE][VECTOR_SIZE][VECTOR_SIZE][VECTOR_SIZE];
-  int table2[VECTOR_SIZE][VECTOR_SIZE][VECTOR_SIZE][VECTOR_SIZE];
-  for (int i = 0; i < VECTOR_SIZE; ++i) {
-    for (int j = 0; j < VECTOR_SIZE; ++j) {
-      for (int k = 0; k < VECTOR_SIZE; ++k) {
-        for (int l = 0; l < VECTOR_SIZE; ++l) {
-          table[i][j][k][l] = rand() % 255 + 1;
-        }
-      }
-    }
-  }
-
-  int dim[4] { VECTOR_SIZE, VECTOR_SIZE, VECTOR_SIZE, VECTOR_SIZE };
-  extent<4> ex(dim);
-  parallel_for_each(ex, [&](index<4> idx) [[hc]] {
-    // capture multiple 4D array types and scalar type by reference
-    table2[idx[0]][idx[1]][idx[2]][idx[3]] = table[idx[0]][idx[1]][idx[2]][idx[3]] * p;
-  });
-
-  // verify result
-  for (int i = 0; i < VECTOR_SIZE; ++i) {
-    for (int j = 0; j < VECTOR_SIZE; ++j) {
-      for (int k = 0; k < VECTOR_SIZE; ++k) {
-        for (int l = 0; l < VECTOR_SIZE; ++l) {
-          if (table2[i][j][k][l] != table[i][j][k][l] * p) {
-            std::cout << "Failed at (" << i << "," << j << "," << k << "," << l << ")" << std::endl;
-            return false;
-          }
-        }
-      }
-    }
-  }
-
-  std::cout << "Passed" << std::endl;
-  return true;
-}
-
-int main() {
-  bool ret = true;
-
-  // only conduct the test in case we are running on a HSA full profile stack
-  hc::accelerator acc;
-  if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
-    ret &= test();
-  }
-
-  return !(ret == true);
-}
-
diff --git a/tests/Unit/Codegen/barrier_should_not_unwind.cpp b/tests/Unit/Codegen/barrier_should_not_unwind.cpp
index 62a695bdc71..f4c841038d1 100644
--- a/tests/Unit/Codegen/barrier_should_not_unwind.cpp
+++ b/tests/Unit/Codegen/barrier_should_not_unwind.cpp
@@ -1,5 +1,5 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 using namespace hc;
 
 void
diff --git a/tests/Unit/Codegen/compile_error_for_arraytype.cpp b/tests/Unit/Codegen/compile_error_for_arraytype.cpp
deleted file mode 100644
index 27210b977ee..00000000000
--- a/tests/Unit/Codegen/compile_error_for_arraytype.cpp
+++ /dev/null
@@ -1,25 +0,0 @@
-// RUN: %amp_device -D__KALMAR_ACCELERATOR__ %s -emit-llvm -c -S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s
-
-//////////////////////////////////////////////////////////////////////////////////
-// Do not delete or add any line; it is referred to by absolute line number in the
-// FileCheck lines below
-//////////////////////////////////////////////////////////////////////////////////
-class baz {
- public:
-  void cho(void) [[hc]] {};
-  int bar;
-  int* n[10];
-};
-// CHECK: compile_error_for_arraytype.cpp:[[@LINE-2]]:3: error: the field type is not amp-compatible
-// CHECK-NEXT: int* n[10];
-// CHECK-NEXT: ^
-
-
-int kerker(void) [[cpu, hc]] {
-  baz bl;
-  return 0;
-}
-// CHECK: compile_error_for_arraytype.cpp:[[@LINE-3]]:3: error: 'class baz': unsupported type in amp restricted code
-// CHECK-NEXT: baz bl;
-// CHECK-NEXT: ^
-
diff --git a/tests/Unit/Codegen/deser_decl.cpp b/tests/Unit/Codegen/deser_decl.cpp
deleted file mode 100644
index 8f74b5c0f95..00000000000
--- a/tests/Unit/Codegen/deser_decl.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-// RUN: %amp_device -c -S -emit-llvm %s -o -|%cppfilt|%FileCheck %s
-class base{
- public:
-  __attribute__((annotate("deserialize"))) /* For compiler */
-  base(float a_,float b_) [[hc]] :a(a_), b(b_) {}
-  float a;
-  float b;
-};
-class baz {
- public:
-#if 0 // This declaration is supposed to be generated
-  __attribute__((annotate("deserialize"))) /* For compiler */
-  baz(float a, float b, int foo) [[hc]];
-#endif
-  void cho(void) [[hc]] {};
-
-  base B;
-  int bar;
-};
-
-int kerker(void) [[cpu, hc]] {
-  // Will pass if deserializer declaration and definition are generated
-  baz bl(0.0, 0.0, 1);
-  return bl.bar;
-}
-// The definition should be generated by clang
-// CHECK: define {{.*}}void @baz::baz(float, float, int)(
diff --git a/tests/Unit/Codegen/deser_decl_support_inheritclass.cpp b/tests/Unit/Codegen/deser_decl_support_inheritclass.cpp
deleted file mode 100644
index 7cfde43a628..00000000000
--- a/tests/Unit/Codegen/deser_decl_support_inheritclass.cpp
+++ /dev/null
@@ -1,28 +0,0 @@
-// RUN: %amp_device -c -S -emit-llvm %s -o-|%cppfilt|%FileCheck %s
-class base {
- public:
-  __attribute__((annotate("deserialize"))) /* For compiler */
-  base(float a_,float b_) [[hc]] :a(a_), b(b_) {}
-  float a;
-  float b;
-};
-
-class baz:public base {
-  public:
-#if 0 // This declaration is supposed to be generated
-  __attribute__((annotate("deserialize"))) /* For compiler */
-  baz(float a, float b, int foo) [[hc]];
-#endif
-  void cho(void) [[hc]] {};
-
-  int bar;
-};
-
-int kerker(void) [[cpu, hc]] {
-  // Will pass if deserializer declaration and definition are generated
-  baz bl(0.0, 0.0, 1);
-  return bl.bar;
-}
-
-// The definition should be generated by clang
-// CHECK: define {{.*}}void @baz::baz(float, float, int)(
diff --git a/tests/Unit/Codegen/deser_def.cpp b/tests/Unit/Codegen/deser_def.cpp
deleted file mode 100644
index 249e11bc17e..00000000000
--- a/tests/Unit/Codegen/deser_def.cpp
+++ /dev/null
@@ -1,23 +0,0 @@
-// RUN: %amp_device -c -S -emit-llvm %s -o -|%cppfilt|%FileCheck %s
-class baz {
- public:
-  int cho(void) [[hc]] {
-    return 45;
-  }
-  baz(void): foo(1234) {}
-  __attribute__((used)) /* Forcing this function to be present in the output */
-  __attribute__((annotate("auto_deserialize"))) /* For compiler */
-  baz(int foo_, float bar_) [[hc]];
-  // :foo(foo_), bar(bar_) {}
- private:
-  int foo;
-  float bar;
-};
-
-int kerker(void) [[cpu, hc]] {
-  baz b1;
-  baz bll(1, 2.0);
-  return b1.cho()+bll.cho();
-}
-// The definition should be generated by clang
-// CHECK: define {{.*}}void @baz::baz(int, float)(%class.baz*{{.*}}, i32{{.*}}, float{{.*}})
diff --git a/tests/Unit/Codegen/deser_def_body.cpp b/tests/Unit/Codegen/deser_def_body.cpp
deleted file mode 100644
index 63df28ccad0..00000000000
--- a/tests/Unit/Codegen/deser_def_body.cpp
+++ /dev/null
@@ -1,26 +0,0 @@
-// RUN: %gtest_amp %s -o %t && %t
-#include <stdlib.h>
-#ifndef __KALMAR_ACCELERATOR__ //gtest requires rtti, but amp_device forbids rtti
-#include <gtest/gtest.h>
-#endif
-class baz {
- public:
-  baz(void): foo(1234) {}
-  __attribute__((annotate("auto_deserialize"))) baz(int foo_, float bar_) [[cpu, hc]];
-  //:foo(foo_), bar(bar_) {}
-  int foo;
-  float bar;
-};
-
- __attribute__((annotate("user_deserialize")))
-int fake_use(void)
-  [[hc]] {
-  baz bll(1, 2.0);
-  return bll.foo;
-}
-#ifndef __KALMAR_ACCELERATOR__
-TEST(GPUCodeGen, Constructor) {
-  baz bll(1, 2.0);
-  EXPECT_EQ(bll.foo, 1);
-}
-#endif
diff --git a/tests/Unit/Codegen/deser_def_body_compound.cpp b/tests/Unit/Codegen/deser_def_body_compound.cpp
deleted file mode 100644
index c2d8c06672e..00000000000
--- a/tests/Unit/Codegen/deser_def_body_compound.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-// XFAIL: *
-// RUN: %gtest_amp %s -o %t && %t
-#include <stdlib.h>
-#ifndef __KALMAR_ACCELERATOR__ //gtest requires rtti, but amp_device forbids rtti
-#include <gtest/gtest.h>
-#endif
-class Member {
- public:
-  // Compiler-generated constructor
-  __attribute__((noinline))
-  __attribute__((annotate("auto_deserialize"))) Member(float, int) [[hc]];
-  float bzzt;
-  int zzz;
-};
-
-class baz {
- public:
-  // Compiler-generated constructor
-  __attribute__((annotate("auto_deserialize"))) baz(float m1, int m2,
-    int foo_, float bar_) [[cpu, hc]];
-
-  Member m;
-  int foo;
-  float bar;
-};
-
-__attribute__((annotate("user_deserialize")))
-int fake_use(void) [[hc]] {
-  baz bll(0.0, 0,  1, 2.0);
-  return bll.foo;
-}
-#ifndef __KALMAR_ACCELERATOR__
-TEST(GPUCodeGen, ConstructorCompound) {
-  float local_float = 2.78f;
-  baz bll(local_float, 2, 1, 2.0);
-  EXPECT_EQ(bll.foo, 1);
-  EXPECT_EQ(bll.m.bzzt, local_float);
-  EXPECT_EQ(bll.m.zzz, 2);
-}
-#endif
diff --git a/tests/Unit/Codegen/deser_def_body_compound_support_inheritclass.cpp b/tests/Unit/Codegen/deser_def_body_compound_support_inheritclass.cpp
deleted file mode 100644
index 8bd05d4fea1..00000000000
--- a/tests/Unit/Codegen/deser_def_body_compound_support_inheritclass.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-// RUN: %amp_device -D__KALMAR_ACCELERATOR__=1 %s -c -o %t.device.o
-// RUN: %gtest_amp %s %t.device.o -o %t && %t
-// XFAIL: *
-
-#include <stdlib.h>
-#ifndef __KALMAR_ACCELERATOR__ //gtest requires rtti, but amp_device forbids rtti
-#include <gtest/gtest.h>
-#endif 
-class Member {
- public:
-  // Compiler-generated constructor
-  __attribute__((annotate("auto_deserialize"))) Member(float, int) [[cpu, hc]];
-  float bzzt;
-  int zzz;
-};
-
-class base {
- public:
-  // Compiler-generated constructor
-  __attribute__((annotate("auto_deserialize"))) base(float m1, int m2,
-    int foo_, float bar_) [[cpu, hc]];
-
-  Member m;
-  int foo;
-  float bar;
-};
-
-class baz :public base {
- public:
-  // Compiler-generated constructor
-  __attribute__((annotate("auto_deserialize"))) baz(float m1, int m2,
-    int foo_, float bar_, int bar_foo_) [[cpu, hc]];
-  int baz_foo;
-};
-
-__attribute__((annotate("user_deserialize")))
-int fake_use(void) [[hc]] {
-  baz bll(0, 0,  1, 2.0, 1);
-  return bll.foo;
-}
-#ifndef __KALMAR_ACCELERATOR__
-TEST(GPUCodeGen, ConstructorCompound) {
-  float local_float = 2.78f;
-  baz bll(local_float, 2, 1, 2.0,1);
-  EXPECT_EQ(bll.foo, 1);
-  EXPECT_EQ(bll.m.bzzt, local_float);
-  EXPECT_EQ(bll.m.zzz, 2);
-  EXPECT_EQ(bll.baz_foo, 1);
-}
-#endif
diff --git a/tests/Unit/Codegen/deser_def_ref.cpp b/tests/Unit/Codegen/deser_def_ref.cpp
deleted file mode 100644
index 3083dcfa4f1..00000000000
--- a/tests/Unit/Codegen/deser_def_ref.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-// XFAIL: *
-// RUN: %amp_device -c -D__KALMAR_ACCELERATOR__=1 -S -emit-llvm %s -o -|%cppfilt|%FileCheck %s
-// RUN: %amp_device -c -D__KALMAR_ACCELERATOR__=1 %s -o %t.device.o
-// RUN: %gtest_amp %s %t.device.o -o %t && %t
-#ifndef __KALMAR_ACCELERATOR__
-#include <gtest/gtest.h>
-#endif
-class base{
- public:
-  __attribute__((annotate("deserialize"))) /* For compiler */
-  base(int a_,float b_) [[hc]] :a(a_), b(b_) {}
-  int a;
-  float b;
-};
-class baz {
- public:
-#if 0 // This declaration is supposed to be generated
-  __attribute__((annotate("deserialize"))) /* For compiler */
-  baz(base&, int foo) [[hc]];
-#endif
-  void cho(void) [[hc]] {};
-
-  base &B; // No reference type is considered amp-compatible
-  int bar;
-};
-
-#ifdef __KALMAR_ACCELERATOR__
-int kerker(void) [[cpu, hc]] {
-  base b(1234, 0.0f);
-  // Will pass if deserializer declaration and definition are generated
-  baz bl(b, 1);
-  return bl.B.a;
-}
-#else
-extern int kerker(void) [[cpu, hc]];
-TEST(GPUCodeGen, ConstructorWithRef) {
-  EXPECT_EQ(kerker(), 1234);
-}
-#endif
-// The definition should be generated by clang
-// CHECK: define {{.*}}void @baz::baz(base&, int)(
diff --git a/tests/Unit/Codegen/index_operator_test.cpp b/tests/Unit/Codegen/index_operator_test.cpp
index 81f57c81e25..d24481729cf 100644
--- a/tests/Unit/Codegen/index_operator_test.cpp
+++ b/tests/Unit/Codegen/index_operator_test.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 int main(void)
 {
     hc::index<1> a(1), b;
diff --git a/tests/Unit/Codegen/indirect-func-arg.cpp b/tests/Unit/Codegen/indirect-func-arg.cpp
index bcabcb75612..d2009ac0c88 100644
--- a/tests/Unit/Codegen/indirect-func-arg.cpp
+++ b/tests/Unit/Codegen/indirect-func-arg.cpp
@@ -4,12 +4,12 @@
 // RUN: %llvm-dis %T/indirect-func-arg/dump-gfx803.opt.bc -f -o - | %FileCheck %s
 // RUN: %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <vector>
 
 #define GRID_SIZE (1024)
 
-// CHECK-LABEL: define weak_odr amdgpu_kernel void @"_ZZ4mainEN3$_019__cxxamp_trampolineEPjiii"(i32*, i32, i32, i32)
+// CHECK-LABEL: define weak_odr amdgpu_kernel void {{.*Kernel_emitter.*}}
 struct A {
   int x[8];
   A()[[hc]] {
diff --git a/tests/Unit/Codegen/opt_level0.cpp b/tests/Unit/Codegen/opt_level0.cpp
index 3bfe1063632..9baa1e9677e 100644
--- a/tests/Unit/Codegen/opt_level0.cpp
+++ b/tests/Unit/Codegen/opt_level0.cpp
@@ -3,7 +3,7 @@
 #include <iostream>
 #include <functional>
 #include <vector>
-#include <hc.hpp>
+#include <hc/hc.hpp>
 using namespace hc;
 
 #define N 10
diff --git a/tests/Unit/Codegen/opt_level1.cpp b/tests/Unit/Codegen/opt_level1.cpp
index 9ff7ebe1163..d5262e976bf 100644
--- a/tests/Unit/Codegen/opt_level1.cpp
+++ b/tests/Unit/Codegen/opt_level1.cpp
@@ -3,7 +3,7 @@
 #include <iostream>
 #include <functional>
 #include <vector>
-#include <hc.hpp>
+#include <hc/hc.hpp>
 using namespace hc;
 
 #define N 10
diff --git a/tests/Unit/Codegen/restric_overload.cpp b/tests/Unit/Codegen/restric_overload.cpp
index 573d57f1fcd..9bdcf9370ef 100644
--- a/tests/Unit/Codegen/restric_overload.cpp
+++ b/tests/Unit/Codegen/restric_overload.cpp
@@ -1,6 +1,6 @@
 // RUN: %gtest_amp %s -O2 -o %t && %t
 #include <stdlib.h>
-#ifndef __KALMAR_ACCELERATOR__ //gtest requires rtti, but amp_device forbids rtti
+#ifndef __HCC_ACCELERATOR__ //gtest requires rtti, but amp_device forbids rtti
 #include <gtest/gtest.h>
 #endif
 class baz {
@@ -15,7 +15,7 @@ int fake_use(void) [[cpu, hc]] {
   baz_cpu.foo(); //call the one with [[cpu]]
   return baz_cpu.bar;
 }
-#ifndef __KALMAR_ACCELERATOR__
+#ifndef __HCC_ACCELERATOR__
 TEST(GPUCodeGen, Constructor) {
  EXPECT_EQ(2, fake_use());
 }
diff --git a/tests/Unit/Codegen/ser_decl.cpp b/tests/Unit/Codegen/ser_decl.cpp
deleted file mode 100644
index e0877f31fcc..00000000000
--- a/tests/Unit/Codegen/ser_decl.cpp
+++ /dev/null
@@ -1,32 +0,0 @@
-// RUN: %cxxamp -emit-llvm -S -c %s -o -|%cppfilt|%FileCheck %s
-#include <cstdlib> //for size_t
-//Serialization object decl
-namespace hc {
-class Serialize {
- public:
-  void Append(size_t x, const void *s);
-};
-}
-
-class baz {
- public:
-  int cho(void) [[hc]] {
-    return 45;
-  }
-  baz(void): foo(1234) {}
-#if 0
-  __attribute__((annotate("serialize")))/* For compiler */
-  __cxxamp_serialize(hc::Serialize& s) const;
-#endif
- private:
-  int foo;
-};
-
-int kerker(void) [[cpu, hc]] {
-  baz b1;
-  hc::Serialize s;
-  b1.__cxxamp_serialize(s);
-  return b1.cho();
-}
-// The definition should be generated by clang
-// CHECK: define {{.*}}void @baz::__cxxamp_serialize(hc::Serialize&)
diff --git a/tests/Unit/Codegen/ser_decl_ref.cpp b/tests/Unit/Codegen/ser_decl_ref.cpp
deleted file mode 100644
index 8304200250e..00000000000
--- a/tests/Unit/Codegen/ser_decl_ref.cpp
+++ /dev/null
@@ -1,42 +0,0 @@
-XFAIL: *
-// RUN: %cxxamp -emit-llvm -S -c %s -o -|%cppfilt|%FileCheck %s
-//Serialization object decl
-#include <cstdlib>
-namespace hc {
-class Serialize {
- public:
-  void Append(size_t x, const void *s);
-};
-}
-
-class base{
- public:
-  __attribute__((annotate("user_deserialize"))) /* For compiler */
-  base(int a_,float b_) [[cpu, hc]] :a(a_), b(b_) {}
-  int cho(void) [[hc]];
-  int a;
-  float b;
-};
-class baz {
- public:
-#if 0 // This declaration is supposed to be generated
-  __attribute__((annotate("deserialize"))) /* For compiler */
-  baz(base&, int foo) [[hc]];
-#endif
-  int cho(void) [[hc]] { return 0; };
-
-  base &B; //  reference object is not allowed in amp codes
-  int bar;
-};
-
-int kerker(void) [[cpu, hc]] {
-  base b(1234, 0.0f);
-  // Will pass if deserializer declaration and definition are generated
-  baz bl(b, 1);
-  hc::Serialize s;
-  bl.__cxxamp_serialize(s);
-  return bl.cho();
-}
-// The definition should be generated by clang
-// CHECK: define {{.*}}void @baz::__cxxamp_serialize(hc::Serialize&)
-// CHECK: call void @base::__cxxamp_serialize(hc::Serialize&)
diff --git a/tests/Unit/Codegen/ser_def.cpp b/tests/Unit/Codegen/ser_def.cpp
deleted file mode 100644
index 7e020049bc5..00000000000
--- a/tests/Unit/Codegen/ser_def.cpp
+++ /dev/null
@@ -1,29 +0,0 @@
-// RUN: %cxxamp -emit-llvm -S -c %s -o -|%cppfilt|%FileCheck %s
-#include <cstdlib> //for size_t
-//Serialization object decl
-namespace hc {
-class Serialize {
- public:
-  void Append(size_t x, const void *s);
-};
-}
-
-class baz {
- public:
-  int cho(void) [[hc]] {
-    return 45;
-  }
-  baz(void): foo(1234) {}
-  void __cxxamp_serialize(hc::Serialize& s);
- private:
-  int foo;
-};
-
-int kerker(void) [[cpu, hc]] {
-  baz b1;
-  hc::Serialize s;
-  b1.__cxxamp_serialize(s);
-  return b1.cho();
-}
-// The definition should be generated by clang
-// CHECK: define {{.*}}void @baz::__cxxamp_serialize(hc::Serialize&)
diff --git a/tests/Unit/Codegen/ser_def_body.cpp b/tests/Unit/Codegen/ser_def_body.cpp
deleted file mode 100644
index bcb46cc5a1c..00000000000
--- a/tests/Unit/Codegen/ser_def_body.cpp
+++ /dev/null
@@ -1,58 +0,0 @@
-// RUN: %cxxamp -emit-llvm -S -c %s -o -|%cppfilt|%FileCheck %s
-// RUN: %gtest_amp %s -DUSING_GTEST=1 -o %t && %t
-#include <cstdlib> //for size_t
-//Serialization object decl
-namespace hc {
-class Serialize {
- public:
-  Serialize():x(0) {}
-  void Append(size_t sz, const void *s) {
-    x++;
-  }
-  int x;
-};
-template<typename T>
-class gmac_array {
- public:
-  __attribute__((annotate("serialize")))/* For compiler */
-   void __cxxamp_serialize(Serialize& s) const {
-     s.Append(0, NULL);
-   }
-   T t;
-};
-}
-class nontemplate {
-  public:
-  __attribute__((annotate("serialize")))/* For compiler */
-    void __cxxamp_serialize(hc::Serialize& s) const {
-      s.Append(0, NULL);
-    }
-};
-class baz {
- public:
-  __attribute__((annotate("serialize")))/* For compiler */
-  void __cxxamp_serialize(hc::Serialize& s) const;
- private:
-  hc::gmac_array<float> foo;
-  hc::gmac_array<float> bar;
-  nontemplate nt;
-};
-
-int kerker(void) [[cpu, hc]] {
-  baz b1;
-  hc::Serialize s;
-  b1.__cxxamp_serialize(s);
-  return 1;
-}
-#ifdef USING_GTEST
-// The definition should be generated by clang
-// CHECK: call {{.*}}void @hc::gmac_array<float>::__cxxamp_serialize
-// Executable tests
-#include <gtest/gtest.h>
-TEST(Serialization, Call) {
-  baz bl;
-  hc::Serialize s;
-  bl.__cxxamp_serialize(s);
-  EXPECT_EQ(3, s.x);
-}
-#endif
diff --git a/tests/Unit/Codegen/ser_def_body_support_inheritclass.cpp b/tests/Unit/Codegen/ser_def_body_support_inheritclass.cpp
deleted file mode 100644
index db846788fa9..00000000000
--- a/tests/Unit/Codegen/ser_def_body_support_inheritclass.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-// RUN: %cxxamp -emit-llvm -S -c %s -o -|%cppfilt|%FileCheck %s
-// RUN: %gtest_amp %s -DUSING_GTEST=1 -o %t && %t
-#include <cstdlib> //for size_t
-#include <iostream>
-//Serialization object decl
-namespace hc {
-class Serialize {
- public:
-  Serialize():x(0) {}
-  void Append(size_t sz, const void *s) {
-    x+=1;
-  }
-  int x;
-};
-template<typename T>
-class gmac_array {
- public:
-  __attribute__((annotate("serialize")))/* For compiler */
-  void __cxxamp_serialize(Serialize& s) const {
-    s.Append(0, NULL);
-  }
-  T t;
-};
-}
-class base {
- public:
-  __attribute__((annotate("serialize")))/* For compiler */
-  void __cxxamp_serialize(hc::Serialize& s) const;
- private:
-  hc::gmac_array<float> a;
-  int i;
-};
-class derive:public base {
- public:
-  __attribute__((annotate("serialize")))/* For compiler */
-  void __cxxamp_serialize(hc::Serialize& s) const;
- private:
-  float f;
-  hc::gmac_array<float> b;
-};
-
-int kerker(void) [[cpu, hc]] {
-  derive b1;
-  hc::Serialize s;
-  b1.__cxxamp_serialize(s);
-  return 1;
-}
-
-// The definition should be generated by clang
-// CHECK: define {{.*}}derive::__cxxamp_serialize
-// CHECK: call {{.*}}void @base::__cxxamp_serialize(hc::Serialize&) const
-// CHECK: }
-
-#ifdef USING_GTEST
-// Executable tests
-#include <gtest/gtest.h>
-TEST(Serialization, Call) {
-  derive bl;
-  hc::Serialize s;
-  bl.__cxxamp_serialize(s);
-  EXPECT_EQ(4, s.x);
-}
-#endif
diff --git a/tests/Unit/Codegen/ser_def_body_support_scalar.cpp b/tests/Unit/Codegen/ser_def_body_support_scalar.cpp
deleted file mode 100644
index 736c7db8242..00000000000
--- a/tests/Unit/Codegen/ser_def_body_support_scalar.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-// RUN: %cxxamp -emit-llvm -S -c %s -o -|%cppfilt|%FileCheck %s
-// RUN: %gtest_amp %s -DUSING_GTEST=1 -o %t && %t
-#include <cstdlib> //for size_t
-#include <iostream>
-//Serialization object decl
-namespace hc {
-class Serialize {
- public:
-  Serialize():x(0) {}
-  void Append(size_t sz, const void *s) {
-    x++;
-  }
-  int x;
-};
-template<typename T>
-class gmac_array {
- public:
-  __attribute__((annotate("serialize")))/* For compiler */
-  void __cxxamp_serialize(Serialize& s) const {
-    s.Append(0, NULL);
-  }
-  T t;
-};
-}
-class baz {
- public:
-  __attribute__((annotate("serialize")))/* For compiler */
-  void __cxxamp_serialize(hc::Serialize& s) const;
- private:
-  hc::gmac_array<float> foo;
-  int i;
-  float f;
-};
-
-int kerker(void) [[cpu, hc]] {
-  baz b1;
-  hc::Serialize s;
-  b1.__cxxamp_serialize(s);
-  return 1;
-}
-
-// The definition should be generated by clang
-// CHECK: define {{.*}}baz::__cxxamp_serialize
-// CHECK: call {{.*}}void @hc::gmac_array<float>::__cxxamp_serialize
-// CHECK: call {{.*}}void @hc::Serialize::Append
-// CHECK: }
-
-#ifdef USING_GTEST
-// Executable tests
-#include <gtest/gtest.h>
-TEST(Serialization, Call) {
-  baz bl;
-  hc::Serialize s;
-  bl.__cxxamp_serialize(s);
-  EXPECT_EQ(3, s.x);
-}
-#endif
diff --git a/tests/Unit/Codegen/signature.cpp b/tests/Unit/Codegen/signature.cpp
deleted file mode 100644
index bfad292c5d2..00000000000
--- a/tests/Unit/Codegen/signature.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-// XFAIL: *
-// RUN: %amp_device -O2 -D__KALMAR_ACCELERATOR__=1 %s -c -o %t.device.o
-// RUN: %gtest_amp %s %t.device.o -O2 -o %t && %t
-#include <stdlib.h>
-#ifndef __KALMAR_ACCELERATOR__ //gtest requires rtti, but amp_device forbids rtti
-#include <gtest/gtest.h>
-#endif
-class member {
- public:
-   void cho(void) [[hc]] {};
-  member(int i) {
-    _i = i+1;
-  }
-  int _i;
-};
-class base {
- public:
-  void cho(void) [[hc]] {};
-  base(float f) {
-    _f = f+1;
-  }
-  float _f;
-};
-class baz: public base {
- public:
-  void cho(void) [[hc]] {};
-  // User-defined constructor with same signature as generated
-  // deserializer
-  baz(float f, int bar_, int i): base(f), bar(bar_), m(i){}
-  int bar;
-  member m;
-};
-#ifdef __KALMAR_ACCELERATOR__
-__attribute__((annotate("user_deserialize")))
-float fake_use(void) [[hc]] {
-  baz bll(1.1, 2, 1); // calls the deserializer
-  return bll._f;
-}
-#else
-extern float fake_use(void);
-TEST(GPUCodeGen, Constructor) {
- baz user(1.1f, 2, 1); //calls user-defined constructor
- EXPECT_EQ(user._f, 2.1f);
- EXPECT_EQ(1.1f, fake_use()); //fake_use calls the generated constructor
-}
-#endif
diff --git a/tests/Unit/Codegen/trampoline.cpp b/tests/Unit/Codegen/trampoline.cpp
deleted file mode 100644
index 392a43acb0c..00000000000
--- a/tests/Unit/Codegen/trampoline.cpp
+++ /dev/null
@@ -1,25 +0,0 @@
-// RUN: %amp_device -D__KALMAR_ACCELERATOR__ -c -S -emit-llvm %s -o -|%cppfilt|%FileCheck %s
-#include "hc.hpp"
-class baz {
- public:
-  void operator()(hc::index<1> idx) [[hc]] {
-  }
-#if 0
-  // The declaration and body of this function will be generated
-  static __attribute__((annotate("__cxxamp_trampoline")))
-  void __cxxamp_trampoline(int, float) [[hc]];
-#endif
- private:
-  int foo;
-  float bar;
-};
-template<typename Foo>
-void kerker(void) [[hc]] {
-  // This reference triggers declaration&definition of __cxxamp_trampoline
-  int* b = reinterpret_cast<int*>(&Foo::__cxxamp_trampoline);
-}
-void kk(void) [[hc]] {
-  kerker<baz>();
-}
-// The definition should be generated by clang
-// CHECK: define {{.*}}void @baz::__cxxamp_trampoline
diff --git a/tests/Unit/Codegen/trampoline_byref.cpp b/tests/Unit/Codegen/trampoline_byref.cpp
deleted file mode 100644
index ed6dc084a94..00000000000
--- a/tests/Unit/Codegen/trampoline_byref.cpp
+++ /dev/null
@@ -1,25 +0,0 @@
-// RUN: %amp_device -D__KALMAR_ACCELERATOR__ -c -S -emit-llvm %s -o -|%cppfilt|%FileCheck %s
-#include "hc.hpp"
-class baz {
- public:
-  void operator()(hc::index<1> &idx) [[hc]] {
-  }
-#if 0
-  // The declaration and body of this function will be generated
-  static __attribute__((annotate("__cxxamp_trampoline")))
-  void __cxxamp_trampoline(int foo, float bar) [[hc]];
-#endif
- private:
-  int foo;
-  float bar;
-};
-template<typename Foo>
-void kerker(void) [[hc]] {
-  // This reference triggers declaration&definition of __cxxamp_trampoline
-  int* b = reinterpret_cast<int*>(&Foo::__cxxamp_trampoline);
-}
-void kk(void) [[hc]] {
-  kerker<baz>();
-}
-// The definition should be generated by clang
-// CHECK: define {{.*}}void @baz::__cxxamp_trampoline
diff --git a/tests/Unit/Codegen/trampoline_name.cpp b/tests/Unit/Codegen/trampoline_name.cpp
deleted file mode 100644
index 06d01ce7ef5..00000000000
--- a/tests/Unit/Codegen/trampoline_name.cpp
+++ /dev/null
@@ -1,21 +0,0 @@
-// RUN: %gtest_amp %s -o %t && %t
-#include <stdlib.h>
-#include <hc.hpp>
-#include <gtest/gtest.h>
-// the functor to test
-class baz {
- public:
-  void operator()(hc::index<1> idx) [[hc]] {}
-  int foo;
-  float bar;
-};
-
-TEST(GPUCodeGen, TrampolineName) {
-  // Inject the trampoline declaration
-  void* bar = reinterpret_cast<void*>(&baz::__cxxamp_trampoline);
-  // An injected member function __cxxamp_trampoline_name
-  // should return the mangled name of the trampoline
-  // hardcoded for now..
-  EXPECT_EQ(std::string("_ZN3baz19__cxxamp_trampolineEif"),
-    std::string(baz::__cxxamp_trampoline_name()));
-}
diff --git a/tests/Unit/Codegen/tworef.cpp b/tests/Unit/Codegen/tworef.cpp
index 03768436ab5..37f719d2a57 100644
--- a/tests/Unit/Codegen/tworef.cpp
+++ b/tests/Unit/Codegen/tworef.cpp
@@ -1,5 +1,5 @@
-// RUN: %amp_device -D__KALMAR_ACCELERATOR__ -c -S -emit-llvm %s
-#include <hc.hpp>
+// RUN: %amp_device -D__HCC_ACCELERATOR__ -c -S -emit-llvm %s
+#include <hc/hc.hpp>
 
 using namespace hc;
 
diff --git a/tests/Unit/Codegen/vector_addition_using_array.cpp b/tests/Unit/Codegen/vector_addition_using_array.cpp
index 33774c954a9..d7b84892f51 100644
--- a/tests/Unit/Codegen/vector_addition_using_array.cpp
+++ b/tests/Unit/Codegen/vector_addition_using_array.cpp
@@ -1,9 +1,9 @@
-// RUN: %cxxamp -Werror %s -o %t.out && %t.out
+// RUN: %cxxamp %s -o %t.out && %t.out
 #include <stdlib.h>
 #include <iostream>
 #include <functional>
 #include <vector>
-#include <hc.hpp>
+#include <hc/hc.hpp>
 using namespace hc;
 
 #define N 10
@@ -23,11 +23,12 @@ void vectorAdd_by_array(const std::vector<float>& vecA, const std::vector<float>
    array_view<float> cv(C);
    extent<1> e(N);
 
-   parallel_for_each(e,
-         [=](index<1> idx) [[hc]] { cv[idx] = av[idx] + bv[idx]; }); 
+   parallel_for_each(e, [=](index<1> idx) [[hc]] {
+       cv[idx] = av[idx] + bv[idx];
+    });
 }
 
-int main(void)
+int main()
 {
     std::vector<float> vecA(N);
     std::vector<float> vecB(N);
diff --git a/tests/Unit/CompilerRT/host_half_conv1.cpp b/tests/Unit/CompilerRT/host_half_conv1.cpp
index c8e24c88f45..8d387bc1976 100644
--- a/tests/Unit/CompilerRT/host_half_conv1.cpp
+++ b/tests/Unit/CompilerRT/host_half_conv1.cpp
@@ -1,7 +1,7 @@
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
-#include <hc_defines.h>
+#include <hc/hc.hpp>
+#include <hc/hc_defines.hpp>
 #include <iostream>
 
 int main() {
diff --git a/tests/Unit/CompilerRT/host_half_conv2.cpp b/tests/Unit/CompilerRT/host_half_conv2.cpp
index 6bcdd1cc18c..86e3c66c6ba 100644
--- a/tests/Unit/CompilerRT/host_half_conv2.cpp
+++ b/tests/Unit/CompilerRT/host_half_conv2.cpp
@@ -1,7 +1,7 @@
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
-#include <hc_defines.h>
+#include <hc/hc.hpp>
+#include <hc/hc_defines.hpp>
 #include <iostream>
 
 int main() {
diff --git a/tests/Unit/Copy/copy.cpp b/tests/Unit/Copy/copy.cpp
index d9fb8812a40..cb2d569cbdc 100644
--- a/tests/Unit/Copy/copy.cpp
+++ b/tests/Unit/Copy/copy.cpp
@@ -1,11 +1,12 @@
 // XFAIL: *
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <stdlib.h>
+#include <hc/hc.hpp>
+
+#include <cmath>
+#include <cstdlib>
 #include <iostream>
-#include <vector>
 #include <numeric>
-#include <math.h>
+#include <vector>
 
 using namespace hc;
 
@@ -17,26 +18,26 @@ int main(void) {
   std::vector<accelerator> accs = accelerator::get_all();
   accelerator gpu_acc;
   for (auto& it: accs)
-    if (it != accelerator(accelerator::cpu_accelerator)) {
+    if (it != accelerator{accelerator::cpu_accelerator()}) {
       gpu_acc = it;
       break;
     }
   accelerator_view gpu_av = gpu_acc.get_default_view();
 
   std::vector<T> source(vecSize, INIT + 1);
-  array<T, 1> src(vecSize, source.begin());
+  array<T, 1> src(vecSize, source.begin(), gpu_av);
 
   std::vector<T> destination(vecSize, INIT);
-  array<T, 1> dest(vecSize, destination.begin());
+  array<T, 1> dest(vecSize, destination.begin(), gpu_av);
 
   // array that holds original value of dest
   std::vector<T> target(vecSize, 0);
-  array<T, 1> tgt(vecSize, target.begin());
+  array<T, 1> tgt(vecSize, target.begin(), gpu_av);
 
   // Run in a separate thread
   std::thread t([&]() {
      parallel_for_each(
-       gpu_av, dest.get_extent(), [=, &dest, &tgt](hc::index<1> idx) [[hc]] {
+       gpu_av, dest.get_extent(), [=, &dest, &tgt](index<1> idx) [[hc]] {
      for(unsigned i = 0; i < vecSize; i++)
        for (unsigned j = 0; j < vecSize; j++)
          tgt[idx] = dest[i];
@@ -44,10 +45,11 @@ int main(void) {
     });
   t.join();
 
-  // At this point, the copying needs to wait for availability of dest in thread t
-  // otherwise, undefined behavior happens in PFE since dest[i] is not deterministic
+  // At this point, the copying needs to wait for availability of dest in thread
+  // t otherwise, undefined behavior happens in PFE since dest[i] is not
+  // deterministic.
   copy(src, dest);
-  
+
   // Verify tgt on CPU
   array_view<T> av(tgt);
   bool ret = true;
@@ -58,4 +60,4 @@ int main(void) {
       }
   }
   return !(ret == true);
-}
+}
\ No newline at end of file
diff --git a/tests/Unit/DataContainers/array_view.cpp b/tests/Unit/DataContainers/array_view.cpp
index 50329e83862..7d2934c8e7c 100644
--- a/tests/Unit/DataContainers/array_view.cpp
+++ b/tests/Unit/DataContainers/array_view.cpp
@@ -3,12 +3,16 @@
 // What's in the comment above indicates it will build this file using
 // -std=c++amp and all other necessary flags to build. Then the system will 
 // run the built program and check its results with all google test cases.
-#include <stdlib.h>
-#include <hc.hpp>
+#include <hc/hc.hpp>
+
 #include <gtest/gtest.h>
 
+#include <cstdlib>
+
 #define N0 5000
 
+using namespace hc;
+
 int init1D(std::vector<int>& vec) {
   int n = N0;
   for (int i = 0; i < n; ++i) {
@@ -22,14 +26,14 @@ TEST(ClassArrayView, Constructor) {
   int old_vec0 = vec[0];
   // Testing line 2251 of C++AMP Language and Programming Model version 1.0
   {
-    hc::array_view<int> av(sizeVec, vec);
+    array_view<int> av(sizeVec, vec);
     EXPECT_EQ(vec[0], av[0]);
     av[0]+=1234;
   }
   // Synchronize back at destruction time
   EXPECT_EQ(old_vec0+1234, vec[0]);
   {
-    hc::array_view<int> av(sizeVec, vec);
+    array_view<int> av(sizeVec, vec);
     EXPECT_EQ(vec[0], av[0]);
     old_vec0 = vec[0]++;
     av.refresh();
@@ -38,10 +42,10 @@ TEST(ClassArrayView, Constructor) {
   // Testing line 2554 of C++AMP LPM v 1.0
   {
     int foo[]={123, 456, 789};
-    hc::array_view<int> av(3, foo);
+    array_view<int> av(3, foo);
     EXPECT_EQ(foo[2], av[2]);
     {
-      hc::array_view<int> bv(av);
+      array_view<int> bv(av);
       EXPECT_EQ(av[1], bv[1]);
     }
     // Line 2178 of C++AMP LPM v 1.0
diff --git a/tests/Unit/DataContainers/array_view_2d.1.cpp b/tests/Unit/DataContainers/array_view_2d.1.cpp
index 22ae034d6c1..944b11c370c 100644
--- a/tests/Unit/DataContainers/array_view_2d.1.cpp
+++ b/tests/Unit/DataContainers/array_view_2d.1.cpp
@@ -1,7 +1,7 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
 #include <iostream> 
-#include <hc.hpp> 
+#include <hc/hc.hpp> 
 using namespace hc; 
 int main() 
 {
diff --git a/tests/Unit/DataContainers/array_view_2d.2.cpp b/tests/Unit/DataContainers/array_view_2d.2.cpp
index 21984fc9154..e6fa2065371 100644
--- a/tests/Unit/DataContainers/array_view_2d.2.cpp
+++ b/tests/Unit/DataContainers/array_view_2d.2.cpp
@@ -1,7 +1,7 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
 #include <iostream> 
-#include <hc.hpp> 
+#include <hc/hc.hpp> 
 using namespace hc; 
 int main() 
 {
diff --git a/tests/Unit/DataContainers/array_view_2d.3.cpp b/tests/Unit/DataContainers/array_view_2d.3.cpp
index aaeadd808b8..c34e43e70a5 100644
--- a/tests/Unit/DataContainers/array_view_2d.3.cpp
+++ b/tests/Unit/DataContainers/array_view_2d.3.cpp
@@ -1,7 +1,7 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
 #include <iostream> 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <vector>
 using namespace hc; 
 int main() 
diff --git a/tests/Unit/DataContainers/extent.cpp b/tests/Unit/DataContainers/extent.cpp
index 701dc10acc5..81fbc6a1348 100644
--- a/tests/Unit/DataContainers/extent.cpp
+++ b/tests/Unit/DataContainers/extent.cpp
@@ -3,7 +3,7 @@
 // What's in the comment above indicates it will build this file using
 // -std=c++amp and all other necessary flags to build. Then the system will 
 // run the built program and check its results with all google test cases.
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <gtest/gtest.h>
 
 #define N0 10
diff --git a/tests/Unit/Design/2d.cpp b/tests/Unit/Design/2d.cpp
index 291d79b94f7..9e64280d99d 100644
--- a/tests/Unit/Design/2d.cpp
+++ b/tests/Unit/Design/2d.cpp
@@ -1,77 +1,75 @@
-//_view RUN: %gtest_amp %s -o %t.out && %t.out
+    //_view RUN: %gtest_amp %s -o %t.out && %t.out
+
+#include <hc/hc.hpp>
 
-#include <hc.hpp>
-#include <stdlib.h>
-#include <iostream>
-#ifndef __KALMAR_ACCELERATOR__
 #include <gtest/gtest.h>
-#endif
+
+#include <cstdlib>
+#include <iostream>
+#include <vector>
+
+using namespace hc;
 
 class myVecAdd {
- public:
-  // CPU-side constructor. Written by the user
-  myVecAdd(hc::array_view<int, 2>& a,
-    hc::array_view<int, 2> &b,
-    hc::array_view<int, 2> &c):
-    a_(a), b_(b), c_(c) {
-  }
-  void operator() (hc::index<2> idx) [[hc]] {
-    c_[idx] = a_[idx]+b_[idx];
-  }
-  void operator() (hc::tiled_index<2> idx) [[hc]] {
-    c_[idx] = a_[idx]+b_[idx];
-  }
- private:
-  hc::array_view<int, 2> &c_;
-  hc::array_view<int, 2> a_, b_;
+    array_view<int, 2> a_, b_, c_;
+public:
+    // CPU-side constructor. Written by the user
+    myVecAdd(
+        array_view<int, 2>& a, array_view<int, 2>& b, array_view<int, 2>& c)
+        : a_(a), b_(b), c_(c)
+    {}
+
+    void operator()(index<2> idx) const [[hc]] { c_[idx] = a_[idx]+b_[idx]; }
+    void operator()(tiled_index<2> idx) const [[hc]]
+    {
+        c_[idx] = a_[idx] + b_[idx];
+    }
 };
-void bar(void) [[cpu, hc]] {
-  int* foo = reinterpret_cast<int*>(&myVecAdd::__cxxamp_trampoline);
-}
-#ifndef __KALMAR_ACCELERATOR__
+
 #define M 20
 #define N 40
-TEST(Design, Final) {
-  std::vector<int> vector_a(M*N),
-                   vector_b(M*N);
-  for (int i = 0; i < M*N; i++) {
-    vector_a[i] = 100.0f * rand() / RAND_MAX;
-    vector_b[i] = 100.0f * rand() / RAND_MAX;
-  }
-  hc::extent<2> e(M, N);
-  hc::array_view<int, 2> av(e, vector_a);
-  EXPECT_EQ(vector_a[2], av(0,2));
-  hc::array_view<int, 2> bv(e, vector_b);
-  { // Test untiled version
-    hc::array_view<int, 2> c(e);
-    myVecAdd mf(av, bv, c);
-    hc::parallel_for_each(e, mf);
-    int error=0;
-    for(int i = 0; i < M; i++) {
-      for(int j = 0; j < N; j++) {
-	std::cout << "av[" <<i<<","<<j<<"] = "<<av(i,j)<<"\n";
-	std::cout << "bv[" <<i<<","<<j<<"] = "<<bv(i,j)<<"\n";
-	std::cout << "c[" <<i<<","<<j<<"] = "<<c(i,j)<<"\n";
-	error += abs(c(i, j) - (av(i, j) + bv(i, j)));
-      }
+
+TEST(Design, Final)
+{
+    std::vector<int> vector_a(M * N), vector_b(M * N);
+
+    for (int i = 0; i < M * N; i++) {
+        vector_a[i] = 100.0f * rand() / RAND_MAX;
+        vector_b[i] = 100.0f * rand() / RAND_MAX;
+    }
+    extent<2> e(M, N);
+    array_view<int, 2> av(e, vector_a);
+    EXPECT_EQ(vector_a[2], av(0, 2));
+    array_view<int, 2> bv(e, vector_b);
+    { // Test untiled version
+        array_view<int, 2> c(e);
+        myVecAdd mf(av, bv, c);
+        parallel_for_each(e, mf);
+        int error=0;
+        for(int i = 0; i < M; i++) {
+            for(int j = 0; j < N; j++) {
+                std::cout << "av[" <<i<<","<<j<<"] = "<<av(i,j)<<"\n";
+                std::cout << "bv[" <<i<<","<<j<<"] = "<<bv(i,j)<<"\n";
+                std::cout << "c[" <<i<<","<<j<<"] = "<<c(i,j)<<"\n";
+                error += abs(c(i, j) - (av(i, j) + bv(i, j)));
+            }
+        }
+        EXPECT_EQ(0, error);
     }
-    EXPECT_EQ(0, error);
-  }
-  {
-   // Test tiled version
-    hc::array_view<int, 2> c(e);
-    myVecAdd mf(av, bv, c);
-    hc::parallel_for_each(e.tile(4, 4), mf);
-    int error=0;
-    for(int i = 0; i < M; i++) {
-      for(int j = 0; j < N; j++) {
-	std::cout << "av[" <<i<<","<<j<<"] = "<<av(i,j)<<"\n";
-	std::cout << "bv[" <<i<<","<<j<<"] = "<<bv(i,j)<<"\n";
-	std::cout << "c[" <<i<<","<<j<<"] = "<<c(i,j)<<"\n";
-	error += abs(c(i, j) - (av(i, j) + bv(i, j)));
-      }
+    {
+        // Test tiled version
+        array_view<int, 2> c(e);
+        myVecAdd mf(av, bv, c);
+        parallel_for_each(e.tile(4, 4), mf);
+        int error=0;
+        for(int i = 0; i < M; i++) {
+            for(int j = 0; j < N; j++) {
+                std::cout << "av[" <<i<<","<<j<<"] = "<<av(i,j)<<"\n";
+                std::cout << "bv[" <<i<<","<<j<<"] = "<<bv(i,j)<<"\n";
+                std::cout << "c[" <<i<<","<<j<<"] = "<<c(i,j)<<"\n";
+                error += abs(c(i, j) - (av(i, j) + bv(i, j)));
+            }
+        }
+        EXPECT_EQ(0, error);
     }
-    EXPECT_EQ(0, error);
-  }
-}
-#endif
+}
\ No newline at end of file
diff --git a/tests/Unit/Design/5d.support.cpp b/tests/Unit/Design/5d.support.cpp
deleted file mode 100644
index 1244ef2e51c..00000000000
--- a/tests/Unit/Design/5d.support.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-// RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-using namespace hc;
-
-template<typename _type, int _rank>
-bool test_array_rank(int extval = _rank)
-{
-    int *data = new int[_rank];
-    for (int i = 0; i < _rank; i++)
-        data[i] = extval;
-
-    extent<_rank> e(data);
-    array<_type, _rank> a1(e);
-
-    parallel_for_each(e, [&](index<_rank> idx) [[hc]] {
-        a1[idx] = 1;
-    });
-
-    // is the rank correct
-    if (a1.rank != _rank)
-    {
-        return false;
-    }
-
-    // verify data
-    std::vector<_type> vdata = a1;
-    for (unsigned int i = 0; i < e.size(); i++)
-    {
-        if (vdata[i] != 1)
-            return false;
-    }
-
-    return true;
-}
-
-int main()
-{
-	int result = 1;
-
-	result &= ((test_array_rank<int, 1>()));
-	result &= ((test_array_rank<int, 5>()));
-    
-    return !result;
-}
diff --git a/tests/Unit/Design/addr_space.cpp b/tests/Unit/Design/addr_space.cpp
index e7468591950..6abc70c1525 100644
--- a/tests/Unit/Design/addr_space.cpp
+++ b/tests/Unit/Design/addr_space.cpp
@@ -1,8 +1,8 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
-#include <hc_math.hpp>
+#include <hc/hc_math.hpp>
 #include <random>
 
 using namespace hc;
diff --git a/tests/Unit/Design/array_view_extent.cpp b/tests/Unit/Design/array_view_extent.cpp
index 7c15cb4f31e..bd49ca934e6 100644
--- a/tests/Unit/Design/array_view_extent.cpp
+++ b/tests/Unit/Design/array_view_extent.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 #include <iostream> 
-#include <hc.hpp> 
+#include <hc/hc.hpp> 
 using namespace hc; 
 int main() 
 {
diff --git a/tests/Unit/Design/array_view_extent_2d.cpp b/tests/Unit/Design/array_view_extent_2d.cpp
index ee2c6d71da4..71a5ae02ce3 100644
--- a/tests/Unit/Design/array_view_extent_2d.cpp
+++ b/tests/Unit/Design/array_view_extent_2d.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 #include <iostream> 
-#include <hc.hpp> 
+#include <hc/hc.hpp> 
 using namespace hc; 
 int main() 
 {
diff --git a/tests/Unit/Design/array_view_extent_2d_tile.cpp b/tests/Unit/Design/array_view_extent_2d_tile.cpp
index ee862622ce4..440ce5c60e9 100644
--- a/tests/Unit/Design/array_view_extent_2d_tile.cpp
+++ b/tests/Unit/Design/array_view_extent_2d_tile.cpp
@@ -1,18 +1,17 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <iostream> 
-#include <hc.hpp> 
-using namespace hc; 
-int main() 
+#include <iostream>
+#include <hc/hc.hpp>
+using namespace hc;
+int main()
 {
   int v[10] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
   extent<2> e(5, 2);
 
-  array_view<int, 2> av(e, v); 
+  array_view<int, 2> av(e, v);
   assert(av.get_extent() == e);
   // Testing tiled_index
-  parallel_for_each(av.get_extent().tile(1,2),
-    [=](tiled_index<2> idx) [[hc]] { 
-    av[idx] -= 1; 
+  parallel_for_each(av.get_extent().tile(1, 2), [=](tiled_index<2> idx) [[hc]] {
+    av[idx] -= 1;
   });
   assert(av.get_extent() == e);
   for(unsigned int i = 0; i < av.get_extent()[0]; i++)
diff --git a/tests/Unit/Design/double_lamda_in_one_fuction.cpp b/tests/Unit/Design/double_lamda_in_one_fuction.cpp
index 76f13089c60..96e85409068 100644
--- a/tests/Unit/Design/double_lamda_in_one_fuction.cpp
+++ b/tests/Unit/Design/double_lamda_in_one_fuction.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 #include <iostream> 
-#include <hc.hpp> 
+#include <hc/hc.hpp> 
 using namespace hc;
 int main() {
   int v[11] = {0,1,2,3,4,5,6,7,8,9,10};
diff --git a/tests/Unit/Design/lambda.cpp b/tests/Unit/Design/lambda.cpp
index 488ba607ca2..567c35e783b 100644
--- a/tests/Unit/Design/lambda.cpp
+++ b/tests/Unit/Design/lambda.cpp
@@ -1,5 +1,5 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 int main(void){
@@ -17,7 +17,7 @@ int main(void){
   for (hc::index<1> i(0); i[0] < vecSize; i++) {
     ga[i] = 100.0f * rand() / RAND_MAX;
     gb[i] = 100.0f * rand() / RAND_MAX;
-    sum += a[i] + b[i];
+    sum += ga[i] + gb[i];
   }
 
   hc::parallel_for_each(
diff --git a/tests/Unit/Design/lambda_tiled.cpp b/tests/Unit/Design/lambda_tiled.cpp
index a4086349f35..6a805f57054 100644
--- a/tests/Unit/Design/lambda_tiled.cpp
+++ b/tests/Unit/Design/lambda_tiled.cpp
@@ -1,5 +1,5 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 int main(void){
@@ -21,12 +21,10 @@ int main(void){
   for (hc::index<1> i(0); i[0] < vecSize; i++) {
     ga[i] = 100.0f * rand() / RAND_MAX;
     gb[i] = 100.0f * rand() / RAND_MAX;
-    sum += a[i] + b[i];
+    sum += ga[i] + gb[i];
   }
 
-  hc::parallel_for_each(
-    et,
-    [=](hc::tiled_index<1> idx) [[hc]] {
+  hc::parallel_for_each(et, [=](hc::tiled_index<1> idx) [[hc]] {
     gc[idx] = ga[idx]+gb[idx];
   });
 
diff --git a/tests/Unit/Design/lambda_tiled_local.cpp b/tests/Unit/Design/lambda_tiled_local.cpp
index e6a36511bc5..83585261ef5 100644
--- a/tests/Unit/Design/lambda_tiled_local.cpp
+++ b/tests/Unit/Design/lambda_tiled_local.cpp
@@ -1,5 +1,5 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 int main(void){
@@ -21,12 +21,10 @@ int main(void){
   for (hc::index<1> i(0); i[0] < vecSize; i++) {
     ga[i] = 100.0f * rand() / RAND_MAX;
     gb[i] = 100.0f * rand() / RAND_MAX;
-    sum += a[i] + b[i];
+    sum += ga[i] + gb[i];
   }
 
-  hc::parallel_for_each(
-    et,
-    [=](hc::tiled_index<1> idx) [[hc]] {
+  hc::parallel_for_each(et, [=](hc::tiled_index<1> idx) [[hc]] {
     tile_static int shm[TILE];
     shm[idx.local[0]] = ga[idx];
     idx.barrier.wait();
diff --git a/tests/Unit/Design/overload.cpp b/tests/Unit/Design/overload.cpp
index f359fe92641..daf429a60e1 100644
--- a/tests/Unit/Design/overload.cpp
+++ b/tests/Unit/Design/overload.cpp
@@ -1,5 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
+
 using namespace hc;
 
 int f() [[hc]] { return 55; }
@@ -16,9 +17,7 @@ bool TestOnDevice()
     array<int, 1> a((extent<1>(1)));
     array_view<int> A(a);
     extent<1> ex(1);
-    parallel_for_each(ex, [&](index<1> idx) [[cpu, hc]] {
-        A(idx) = g();
-    });
+    parallel_for_each(ex, [=](index<1> idx) [[hc]] { A(idx) = g(); });
     return A[0] == 55;
 }
 
diff --git a/tests/Unit/Design/pass_by_ref.cpp b/tests/Unit/Design/pass_by_ref.cpp
index c14975e46fd..e95d486b6f6 100644
--- a/tests/Unit/Design/pass_by_ref.cpp
+++ b/tests/Unit/Design/pass_by_ref.cpp
@@ -1,32 +1,25 @@
-// RUN: %gtest_amp %s -o %t.out 
+// RUN: %gtest_amp %s -o %t.out
 // RUN: %t.out
 
-#include <hc.hpp>
-#include <stdlib.h>
-#include <iostream>
-#ifndef __KALMAR_ACCELERATOR__
+#include <hc/hc.hpp>
+
 #include <gtest/gtest.h>
-#endif
+
+#include <cstdlib>
+#include <iostream>
 
 class myVecAdd {
  public:
   // CPU-side constructor. Written by the user
-  myVecAdd(hc::array_view<int>& a,
-    hc::array_view<int> &b,
-    hc::array_view<int, 1> &c):
-    a_(a), b_(b), c_(c) {
-  }
-  void operator() (hc::index<1> idx) [[hc]] {
-    c_[idx] = a_[idx]+b_[idx];
-  }
+  myVecAdd(
+    hc::array_view<int>& a, hc::array_view<int> &b, hc::array_view<int, 1> &c)
+    : a_(a), b_(b), c_(c)
+  {}
+  void operator()(hc::index<1> idx) const [[hc]] { c_[idx] = a_[idx]+b_[idx]; }
  private:
-  hc::array_view<int> a_, b_;
-  hc::array_view<int>& c_;
+  hc::array_view<int> a_, b_, c_;
 };
-void bar(void) [[cpu, hc]] {
-  int* foo = reinterpret_cast<int*>(&myVecAdd::__cxxamp_trampoline);
-}
-#ifndef __KALMAR_ACCELERATOR__
+
 TEST(Design, Final) {
   const int vecSize = 100;
 
@@ -42,12 +35,10 @@ TEST(Design, Final) {
   for (hc::index<1> i(0); i[0] < vecSize; i++) {
     ga[i] = 100.0f * rand() / RAND_MAX;
     gb[i] = 100.0f * rand() / RAND_MAX;
-    sum += a[i] + b[i];
+    sum += ga[i] + gb[i];
   }
 
-  hc::parallel_for_each(
-    e,
-    mf);
+  hc::parallel_for_each(e, mf);
 
   int error = 0;
   for(unsigned i = 0; i < vecSize; i++) {
@@ -55,4 +46,4 @@ TEST(Design, Final) {
   }
   EXPECT_EQ(error, 0);
 }
-#endif
+
diff --git a/tests/Unit/Design/quick_prototype_vector_add_using_gmac.cpp b/tests/Unit/Design/quick_prototype_vector_add_using_gmac.cpp
index 69b529bbad9..23fc99aa3b1 100644
--- a/tests/Unit/Design/quick_prototype_vector_add_using_gmac.cpp
+++ b/tests/Unit/Design/quick_prototype_vector_add_using_gmac.cpp
@@ -1,11 +1,11 @@
 // RUN: %gtest_amp %s -o %t.out && %t.out
 
-#include <hc.hpp>
-#include <stdlib.h>
-#include <iostream>
-#ifndef __KALMAR_ACCELERATOR__
+#include <hc/hc.hpp>
+
 #include <gtest/gtest.h>
-#endif
+
+#include <cstdlib>
+#include <iostream>
 
 class myVecAdd {
  public:
@@ -15,16 +15,12 @@ class myVecAdd {
     hc::array_view<int> &c):
     a_(a), b_(b), c_(c) {
   }
-  void operator() (hc::index<1> idx) [[hc]] {
+  void operator() (hc::index<1> idx) const [[hc]] {
     c_[idx] = a_[idx]+b_[idx];
   }
  private:
   hc::array_view<int> a_, b_, c_;
 };
-void bar(void) [[cpu, hc]] {
-  int* foo = reinterpret_cast<int*>(&myVecAdd::__cxxamp_trampoline);
-}
-#ifndef __KALMAR_ACCELERATOR__
 TEST(Design, Final) {
   const int vecSize = 100;
 
@@ -42,17 +38,14 @@ TEST(Design, Final) {
   for (hc::index<1> i(0); i[0] < vecSize; i++) {
     ga[i] = 100.0f * rand() / RAND_MAX;
     gb[i] = 100.0f * rand() / RAND_MAX;
-    sum += a[i] + b[i];
+    sum += ga[i] + gb[i];
   }
   myVecAdd mf(ga, gb, gc);
-  hc::parallel_for_each(
-    e,
-    mf);
+  hc::parallel_for_each(e, mf);
 
   int error = 0;
   for(unsigned i = 0; i < vecSize; i++) {
     error += gc[i] - (ga[i] + gb[i]);
   }
   EXPECT_EQ(error, 0);
-}
-#endif
+}
\ No newline at end of file
diff --git a/tests/Unit/Design/transpose.cpp b/tests/Unit/Design/transpose.cpp
index 90e127d2d3a..22ba660c091 100644
--- a/tests/Unit/Design/transpose.cpp
+++ b/tests/Unit/Design/transpose.cpp
@@ -6,7 +6,7 @@
 // Implement C++ AMP version of matrix transpose
 //----------------------------------------------------------------------------
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <cmath>
 #include <assert.h>
 #include <iostream>
@@ -221,7 +221,7 @@ void transpose_tiled_truncate_option_b(
   // Transform matrix to be multiple of 16*16 and transpose.
   auto b  = data.section(index<2>(0,0), e_truncated);
   auto b_t = data_transpose.section(index<2>(0,0),
-                 transpose(static_cast<extent<2>>(e_truncated)));
+                 transpose(static_cast<const extent<2>&>(e_truncated)));
   transpose_tiled_even<_value_type, _tile_size>(b, b_t);
 
   // leftover processing
@@ -310,5 +310,4 @@ int main() {
                       "transpose_tiled_truncate_option_b");
 #endif
   return 0;
-}
-
+}
\ No newline at end of file
diff --git a/tests/Unit/Design/veccadd3.cpp b/tests/Unit/Design/veccadd3.cpp
index 074c960797a..91f10d11d4c 100644
--- a/tests/Unit/Design/veccadd3.cpp
+++ b/tests/Unit/Design/veccadd3.cpp
@@ -1,5 +1,5 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <iostream>
 
 using namespace hc;
diff --git a/tests/Unit/DispatchAql/dispatch_hsa_kernel.cpp b/tests/Unit/DispatchAql/dispatch_hsa_kernel.cpp
index 4621e7c9c96..5f847a5eaca 100644
--- a/tests/Unit/DispatchAql/dispatch_hsa_kernel.cpp
+++ b/tests/Unit/DispatchAql/dispatch_hsa_kernel.cpp
@@ -1,6 +1,6 @@
-// RUN: %hc %s %S/hsacodelib.CPP -I/opt/rocm/include -L/opt/rocm/lib -lhsa-runtime64 -lhc_am -o %t.out && %t.out %S/vcpy_isa.hsaco
+// RUN: %hc %s %S/hsacodelib.CPP -o %t.out && %t.out %S/vcpy_isa.hsaco
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 #include <random>
@@ -9,7 +9,7 @@
 #include <hsa/hsa.h>
 
 #include "hsacodelib.h"
-#include <hc_am.hpp>
+#include <hc/hc_am.hpp>
 
 int p_db = 1;
 int p_wait = 1;
diff --git a/tests/Unit/DispatchAql/hsacodelib.CPP b/tests/Unit/DispatchAql/hsacodelib.CPP
index ca920eb7bbe..94f72910279 100644
--- a/tests/Unit/DispatchAql/hsacodelib.CPP
+++ b/tests/Unit/DispatchAql/hsacodelib.CPP
@@ -2,7 +2,7 @@
 #include <fstream>
 #include <assert.h>
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <grid_launch.h>
 
 #include <hsa/hsa.h>
diff --git a/tests/Unit/DynamicTileStatic/test1.cpp b/tests/Unit/DynamicTileStatic/test1.cpp
index d0f02dcbc73..6827eb4c6dd 100644
--- a/tests/Unit/DynamicTileStatic/test1.cpp
+++ b/tests/Unit/DynamicTileStatic/test1.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 
@@ -14,7 +14,7 @@ bool test() {
   size_t size1 = acc.get_max_tile_static_size();
   std::cout << "Max tile static size of accelerator: " << size1 << "\n";
 
-  size_t size2 = av.get_max_tile_static_size();
+  size_t size2 = av.get_accelerator().get_max_tile_static_size();
   std::cout << "Max tile static size of accelerator_view: " << size2 << "\n";
 
   // size1 and size2 shall agree
diff --git a/tests/Unit/DynamicTileStatic/test10.cpp b/tests/Unit/DynamicTileStatic/test10.cpp
index 1eb54406dc7..cdd6e54254c 100644
--- a/tests/Unit/DynamicTileStatic/test10.cpp
+++ b/tests/Unit/DynamicTileStatic/test10.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 
diff --git a/tests/Unit/DynamicTileStatic/test11.cpp b/tests/Unit/DynamicTileStatic/test11.cpp
index 78124344c0c..753e456e5c1 100644
--- a/tests/Unit/DynamicTileStatic/test11.cpp
+++ b/tests/Unit/DynamicTileStatic/test11.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 
diff --git a/tests/Unit/DynamicTileStatic/test12.cpp b/tests/Unit/DynamicTileStatic/test12.cpp
index 4d007ef3729..c852dff0b0a 100644
--- a/tests/Unit/DynamicTileStatic/test12.cpp
+++ b/tests/Unit/DynamicTileStatic/test12.cpp
@@ -1,12 +1,10 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 
-#define __KERNEL__ __attribute__((amp))
-
 template<size_t GRID_SIZE, size_t TILE_SIZE>
 bool test() {
   using namespace hc;
@@ -16,9 +14,8 @@ bool test() {
   tiled_extent<1> ex(GRID_SIZE, TILE_SIZE);
   ex.set_dynamic_group_segment_size(1024);
   
-  completion_future fut = parallel_for_each(hc::accelerator().get_default_view(),
-                    ex,
-                    __KERNEL__ [=](tiled_index<1>& tidx) {
+  completion_future fut = parallel_for_each(
+    hc::accelerator().get_default_view(), ex, [=](tiled_index<1>& tidx) [[hc]] {
     tile_static int lds1[TILE_SIZE];
 
     // obtain workitem absolute index and workgroup index
diff --git a/tests/Unit/DynamicTileStatic/test13.cpp b/tests/Unit/DynamicTileStatic/test13.cpp
index 82c8f8eb140..a337a623f16 100644
--- a/tests/Unit/DynamicTileStatic/test13.cpp
+++ b/tests/Unit/DynamicTileStatic/test13.cpp
@@ -1,12 +1,10 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 
-#define __KERNEL__ __attribute__((amp))
-
 template<size_t GRID_SIZE, size_t TILE_SIZE>
 bool test() {
   using namespace hc;
@@ -14,10 +12,9 @@ bool test() {
 
   array_view<int, 1> av(GRID_SIZE);
   tiled_extent<1> ex(GRID_SIZE, TILE_SIZE, 1024);
-  
-  completion_future fut = parallel_for_each(hc::accelerator().get_default_view(),
-                    ex,
-                    __KERNEL__ [=](tiled_index<1>& tidx) {
+
+  completion_future fut = parallel_for_each(
+    hc::accelerator().get_default_view(), ex, [=](tiled_index<1>& tidx) [[hc]] {
     tile_static int lds1[TILE_SIZE];
 
     // obtain workitem absolute index and workgroup index
diff --git a/tests/Unit/DynamicTileStatic/test14.cpp b/tests/Unit/DynamicTileStatic/test14.cpp
index 27321f4fab4..c217eae0a83 100644
--- a/tests/Unit/DynamicTileStatic/test14.cpp
+++ b/tests/Unit/DynamicTileStatic/test14.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 
diff --git a/tests/Unit/DynamicTileStatic/test15.cpp b/tests/Unit/DynamicTileStatic/test15.cpp
index 316ac0f2687..18586e35f92 100644
--- a/tests/Unit/DynamicTileStatic/test15.cpp
+++ b/tests/Unit/DynamicTileStatic/test15.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 #include <iomanip>
diff --git a/tests/Unit/DynamicTileStatic/test16.cpp b/tests/Unit/DynamicTileStatic/test16.cpp
index 05aad86dc3c..88aac876423 100644
--- a/tests/Unit/DynamicTileStatic/test16.cpp
+++ b/tests/Unit/DynamicTileStatic/test16.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/DynamicTileStatic/test2.cpp b/tests/Unit/DynamicTileStatic/test2.cpp
index a8a3dd95dd9..6d2f5f84e34 100644
--- a/tests/Unit/DynamicTileStatic/test2.cpp
+++ b/tests/Unit/DynamicTileStatic/test2.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/DynamicTileStatic/test3.cpp b/tests/Unit/DynamicTileStatic/test3.cpp
index b7cac9ba9c8..5facbdef836 100644
--- a/tests/Unit/DynamicTileStatic/test3.cpp
+++ b/tests/Unit/DynamicTileStatic/test3.cpp
@@ -1,8 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 
@@ -22,7 +21,9 @@ bool test1D() {
   hc::array_view<int, 1> av3(grid_size, table3);
   hc::array_view<int, 1> av4(grid_size, table4);
 
-  hc::parallel_for_each(hc::extent<1>(grid_size).tile(tile_size), [=](hc::tiled_index<1>& idx) [[hc]] {
+  hc::parallel_for_each(
+    hc::extent<1>(grid_size).tile(tile_size),
+    [=](hc::tiled_index<1>& idx) [[hc]] {
     av1(idx) = idx.global[0];
     av2(idx) = idx.local[0];
     av3(idx) = idx.tile[0];
@@ -41,7 +42,9 @@ bool test1D() {
   hc::array_view<int, 1> av7(grid_size, table7);
   hc::array_view<int, 1> av8(grid_size, table8);
 
-  hc::completion_future fut = hc::parallel_for_each(hc::tiled_extent<1>(grid_size, tile_size), [=](hc::tiled_index<1>& idx) [[hc]] {
+  hc::completion_future fut = hc::parallel_for_each(
+    hc::tiled_extent<1>(grid_size, tile_size),
+    [=](hc::tiled_index<1>& idx) [[hc]] {
     av5(idx) = idx.global[0];
     av6(idx) = idx.local[0];
     av7(idx) = idx.tile[0];
@@ -122,7 +125,9 @@ bool test2D() {
   hc::array_view<int, 2> av7(grid_size_0, grid_size_1, table7);
   hc::array_view<int, 2> av8(grid_size_0, grid_size_1, table8);
 
-  hc::parallel_for_each(hc::extent<2>(grid_size_0, grid_size_1).tile(tile_size_0, tile_size_1), [=](hc::tiled_index<2>& idx) [[hc]] {
+  hc::parallel_for_each(
+    hc::extent<2>(grid_size_0, grid_size_1).tile(tile_size_0, tile_size_1),
+    [=](hc::tiled_index<2>& idx) [[hc]] {
     av1(idx) = idx.global[0];
     av2(idx) = idx.global[1];
     av3(idx) = idx.local[0];
@@ -152,7 +157,9 @@ bool test2D() {
   hc::array_view<int, 2> av15(grid_size_0, grid_size_1, table15);
   hc::array_view<int, 2> av16(grid_size_0, grid_size_1, table16);
 
-  hc::completion_future fut = hc::parallel_for_each(hc::tiled_extent<2>(grid_size_0, grid_size_1, tile_size_0, tile_size_1), [=](hc::tiled_index<2>& idx) [[hc]] {
+  hc::completion_future fut = hc::parallel_for_each(
+    hc::tiled_extent<2>(grid_size_0, grid_size_1, tile_size_0, tile_size_1),
+    [=](hc::tiled_index<2>& idx) [[hc]] {
     av9(idx) = idx.global[0];
     av10(idx) = idx.global[1];
     av11(idx) = idx.local[0];
@@ -222,7 +229,13 @@ bool test2D() {
 }
 
 /// test HC parallel_for_each interface
-template<size_t grid_size_0, size_t grid_size_1, size_t grid_size_2, size_t tile_size_0, size_t tile_size_1, size_t tile_size_2>
+template<
+  size_t grid_size_0,
+  size_t grid_size_1,
+  size_t grid_size_2,
+  size_t tile_size_0,
+  size_t tile_size_1,
+  size_t tile_size_2>
 bool test3D() {
 
   bool ret = true;
@@ -254,7 +267,10 @@ bool test3D() {
   hc::array_view<int, 3> av11(grid_size_0, grid_size_1, grid_size_2, table11);
   hc::array_view<int, 3> av12(grid_size_0, grid_size_1, grid_size_2, table12);
 
-  hc::parallel_for_each(hc::extent<3>(grid_size_0, grid_size_1, grid_size_2).tile(tile_size_0, tile_size_1, tile_size_2), [=](hc::tiled_index<3>& idx) [[hc]] {
+  hc::parallel_for_each(
+    hc::extent<3>(grid_size_0, grid_size_1, grid_size_2).tile(
+        tile_size_0, tile_size_1, tile_size_2),
+        [=](hc::tiled_index<3>& idx) [[hc]] {
     av1(idx) = idx.global[0];
     av2(idx) = idx.global[1];
     av3(idx) = idx.global[2];
@@ -296,7 +312,15 @@ bool test3D() {
   hc::array_view<int, 3> av23(grid_size_0, grid_size_1, grid_size_2, table23);
   hc::array_view<int, 3> av24(grid_size_0, grid_size_1, grid_size_2, table24);
 
-  hc::completion_future fut = hc::parallel_for_each(hc::tiled_extent<3>(grid_size_0, grid_size_1, grid_size_2, tile_size_0, tile_size_1, tile_size_2), [=](hc::tiled_index<3>& idx) [[hc]] {
+  hc::completion_future fut = hc::parallel_for_each(
+    hc::tiled_extent<3>(
+      grid_size_0,
+      grid_size_1,
+      grid_size_2,
+      tile_size_0,
+      tile_size_1,
+      tile_size_2),
+      [=](hc::tiled_index<3>& idx) [[hc]] {
     av13(idx) = idx.global[0];
     av14(idx) = idx.global[1];
     av15(idx) = idx.global[2];
diff --git a/tests/Unit/DynamicTileStatic/test6.cpp b/tests/Unit/DynamicTileStatic/test6.cpp
index 37b9b2dd634..29125a6e2ae 100644
--- a/tests/Unit/DynamicTileStatic/test6.cpp
+++ b/tests/Unit/DynamicTileStatic/test6.cpp
@@ -1,24 +1,20 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 
-#define __KERNEL__ __attribute__((amp))
-
 template<size_t GRID_SIZE, size_t TILE_SIZE>
 bool test() {
   using namespace hc;
 
-
   array_view<int, 1> av(GRID_SIZE);
   tiled_extent<1> ex(GRID_SIZE, TILE_SIZE);
   ex.set_dynamic_group_segment_size(0);
-  
-  completion_future fut = parallel_for_each(hc::accelerator().get_default_view(),
-                    ex,
-                    __KERNEL__ [=](tiled_index<1>& tidx) {
+
+  completion_future fut = parallel_for_each(
+    hc::accelerator().get_default_view(), ex, [=](tiled_index<1>& tidx) [[hc]] {
     tile_static int lds1[TILE_SIZE];
 
     // obtain workitem absolute index and workgroup index
diff --git a/tests/Unit/DynamicTileStatic/test7.cpp b/tests/Unit/DynamicTileStatic/test7.cpp
index eb343ebf18c..bfe0ec1053e 100644
--- a/tests/Unit/DynamicTileStatic/test7.cpp
+++ b/tests/Unit/DynamicTileStatic/test7.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 #include <random>
diff --git a/tests/Unit/DynamicTileStatic/test8.cpp b/tests/Unit/DynamicTileStatic/test8.cpp
index d41a1f7e4f0..ffa645bc808 100644
--- a/tests/Unit/DynamicTileStatic/test8.cpp
+++ b/tests/Unit/DynamicTileStatic/test8.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 
diff --git a/tests/Unit/DynamicTileStatic/test9.cpp b/tests/Unit/DynamicTileStatic/test9.cpp
index b9513233c76..d18e53add2b 100644
--- a/tests/Unit/DynamicTileStatic/test9.cpp
+++ b/tests/Unit/DynamicTileStatic/test9.cpp
@@ -1,8 +1,8 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
-#include <hc.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 
diff --git a/tests/Unit/FilePath/file path_test2.cpp b/tests/Unit/FilePath/file path_test2.cpp
index e146012e709..58e0a81ef7b 100644
--- a/tests/Unit/FilePath/file path_test2.cpp	
+++ b/tests/Unit/FilePath/file path_test2.cpp	
@@ -8,7 +8,7 @@
 #include <exception>
 
 // header file for the hc API
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #define N  (1024 * 500)
 
diff --git a/tests/Unit/FilePath/file_path_test1.cpp b/tests/Unit/FilePath/file_path_test1.cpp
index 9623e30fda8..8d8a09a4a7d 100644
--- a/tests/Unit/FilePath/file_path_test1.cpp
+++ b/tests/Unit/FilePath/file_path_test1.cpp
@@ -8,7 +8,7 @@
 #include <exception>
 
 // header file for the hc API
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #define N  (1024 * 500)
 
diff --git a/tests/Unit/FilePath/file_path_test3.cpp b/tests/Unit/FilePath/file_path_test3.cpp
index c74ac00a643..854a3baf1e7 100644
--- a/tests/Unit/FilePath/file_path_test3.cpp
+++ b/tests/Unit/FilePath/file_path_test3.cpp
@@ -5,7 +5,7 @@
 // RUN: %hc %s -L"%T/foo bar/" -lfile_path_test3 -o %t.out && %t.out
 
 #include <cstdio>
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 extern "C" int sum(hc::array_view<int,1>& input);
 
diff --git a/tests/Unit/FilePath/file_path_test4.cpp b/tests/Unit/FilePath/file_path_test4.cpp
index 0f7ff33ea9d..4b36459fb0f 100644
--- a/tests/Unit/FilePath/file_path_test4.cpp
+++ b/tests/Unit/FilePath/file_path_test4.cpp
@@ -9,7 +9,7 @@
 
 #if SHARED_LIBRARY
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 extern "C" int foo(int grid_size) {
   using namespace hc;
diff --git a/tests/Unit/HC/accelerator_get_all_views.cpp b/tests/Unit/HC/accelerator_get_all_views.cpp
index 66dcfb9081c..e10878084f3 100644
--- a/tests/Unit/HC/accelerator_get_all_views.cpp
+++ b/tests/Unit/HC/accelerator_get_all_views.cpp
@@ -1,6 +1,6 @@
 
 // RUN: %hc %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <vector>
 
diff --git a/tests/Unit/HC/accelerator_get_all_views_mt.cpp b/tests/Unit/HC/accelerator_get_all_views_mt.cpp
index b0ea1ad61e5..1e89f91e0db 100644
--- a/tests/Unit/HC/accelerator_get_all_views_mt.cpp
+++ b/tests/Unit/HC/accelerator_get_all_views_mt.cpp
@@ -1,6 +1,6 @@
 
 // RUN: %hc %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 #include <vector>
diff --git a/tests/Unit/HC/amdgcn_ds_bpermute.cpp b/tests/Unit/HC/amdgcn_ds_bpermute.cpp
index db531cb4839..1b1094c694f 100644
--- a/tests/Unit/HC/amdgcn_ds_bpermute.cpp
+++ b/tests/Unit/HC/amdgcn_ds_bpermute.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <algorithm>
 #include <cassert>
diff --git a/tests/Unit/HC/amdgcn_ds_permute.cpp b/tests/Unit/HC/amdgcn_ds_permute.cpp
index b565c807d8b..6114a77104e 100644
--- a/tests/Unit/HC/amdgcn_ds_permute.cpp
+++ b/tests/Unit/HC/amdgcn_ds_permute.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <algorithm>
 #include <cassert>
diff --git a/tests/Unit/HC/amdgcn_ds_swizzle_bitmode.cpp b/tests/Unit/HC/amdgcn_ds_swizzle_bitmode.cpp
index bbb46cd8d53..596ce9c6797 100644
--- a/tests/Unit/HC/amdgcn_ds_swizzle_bitmode.cpp
+++ b/tests/Unit/HC/amdgcn_ds_swizzle_bitmode.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <algorithm>
 #include <cassert>
diff --git a/tests/Unit/HC/amdgcn_ds_swizzle_qdmode.cpp b/tests/Unit/HC/amdgcn_ds_swizzle_qdmode.cpp
index 5ee8b9e9c00..36dfd1e8528 100644
--- a/tests/Unit/HC/amdgcn_ds_swizzle_qdmode.cpp
+++ b/tests/Unit/HC/amdgcn_ds_swizzle_qdmode.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <algorithm>
 #include <cassert>
diff --git a/tests/Unit/HC/amdgcn_wave_rl1.cpp b/tests/Unit/HC/amdgcn_wave_rl1.cpp
index 9c16ec7dc93..b053d4357df 100644
--- a/tests/Unit/HC/amdgcn_wave_rl1.cpp
+++ b/tests/Unit/HC/amdgcn_wave_rl1.cpp
@@ -8,7 +8,7 @@
 #include <vector>
 #include <algorithm>
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 //#define DEBUG 2
 
diff --git a/tests/Unit/HC/amdgcn_wave_rr1.cpp b/tests/Unit/HC/amdgcn_wave_rr1.cpp
index fab4130d9c5..83754097ddb 100644
--- a/tests/Unit/HC/amdgcn_wave_rr1.cpp
+++ b/tests/Unit/HC/amdgcn_wave_rr1.cpp
@@ -8,7 +8,7 @@
 #include <vector>
 #include <algorithm>
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 //#define DEBUG 2
 
diff --git a/tests/Unit/HC/amdgcn_wave_sl1.cpp b/tests/Unit/HC/amdgcn_wave_sl1.cpp
index 179235ec2be..f8abcb09666 100644
--- a/tests/Unit/HC/amdgcn_wave_sl1.cpp
+++ b/tests/Unit/HC/amdgcn_wave_sl1.cpp
@@ -8,7 +8,7 @@
 #include <vector>
 #include <algorithm>
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 //#define DEBUG 2
 
diff --git a/tests/Unit/HC/amdgcn_wave_sr1.cpp b/tests/Unit/HC/amdgcn_wave_sr1.cpp
index b4f8618b5f8..798377ebefd 100644
--- a/tests/Unit/HC/amdgcn_wave_sr1.cpp
+++ b/tests/Unit/HC/amdgcn_wave_sr1.cpp
@@ -8,7 +8,7 @@
 #include <vector>
 #include <algorithm>
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 //#define DEBUG 2
 
diff --git a/tests/Unit/HC/array_device_pointer.cpp b/tests/Unit/HC/array_device_pointer.cpp
deleted file mode 100644
index 9ecee5edb99..00000000000
--- a/tests/Unit/HC/array_device_pointer.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-
-// RUN: %hc %s -o %t.out && %t.out
-
-#include <hc.hpp>
-
-#include <iostream>
-#include <vector>
-
-// this test case checks:
-// - hc::array::accelerator_pointer() : obtain device memory pointer from an array instance
-// - hc::array(int, void*) : construct another hc::array from a given device memory pointer
-// - hc::array(int, int, void*) : construct another hc::array from a given device memory pointer
-// - hc::array(int, int, int, void*) : construct another hc::array from a given device memory pointer
-// - hc::array(extent&, void*) : construct another hc::array from a given device memory pointer
-
-// this is the 1D case
-template<int N>
-bool test1D() {
-  bool ret = true;
-
-  hc::array<int, 1> array1(N);
-
-  // fetch the device pointer of array1
-  void* array1_devptr = array1.accelerator_pointer();
-
-  // construct another array based on the pointer
-  hc::array<int, 1> array2(N, array1_devptr);
-
-  // execute a kernel, and use the second array
-  hc::completion_future fut = parallel_for_each(hc::extent<1>(N), [&](hc::index<1>& idx) [[hc]] {
-    array2[idx] = idx[0];
-  });
-
-  fut.wait();
-
-  // construct yet another array based on the pointer
-  hc::array<int, 1> array3(array1.get_extent(), array1_devptr);
-
-  // execute a kernel, and use the third array
-  hc::completion_future fut2 = parallel_for_each(hc::extent<1>(N), [&](hc::index<1>& idx) [[hc]] {
-    array3[idx] = -array3[idx];
-  });
-
-  // read out the value from the first array
-  std::vector<int> result1 = array1;
-
-  // read out the value from the second array
-  std::vector<int> result2 = array2;
-
-  // read out the value from the third array
-  std::vector<int> result3 = array3;
-   
-  // verify all three versions are the same
-  ret &= (result1.size() == result2.size());
-  ret &= (result1.size() == result3.size());
-  for (int i = 0; i < result1.size(); ++i) {
-    ret &= (result1[i] == -i);
-    ret &= (result1[i] == result2[i]);
-    ret &= (result1[i] == result3[i]);
-  }
-
-  return ret;
-}
-
-// this is the 2D case
-template<int N, int M>
-bool test2D() {
-  bool ret = true;
-
-  hc::array<int, 2> array1(N, M);
-
-  // fetch the device pointer of array1
-  void* array1_devptr = array1.accelerator_pointer();
-
-  // construct another array based on the pointer
-  hc::array<int, 2> array2(N, M, array1_devptr);
-
-  // execute a kernel, and use the second array
-  hc::completion_future fut = parallel_for_each(hc::extent<2>(N, M), [&](hc::index<2>& idx) [[hc]] {
-    array2[idx] = idx[0] * M + idx[1];
-  });
-
-  fut.wait();
-
-  // construct yet another array based on the pointer
-  hc::array<int, 2> array3(array1.get_extent(), array1_devptr);
-
-  // execute a kernel, and use the third array
-  hc::completion_future fut2 = parallel_for_each(hc::extent<2>(N, M), [&](hc::index<2>& idx) [[hc]] {
-    array3[idx] = -array3[idx];
-  });
-
-  // read out the value from the first array
-  std::vector<int> result1 = array1;
-
-  // read out the value from the second array
-  std::vector<int> result2 = array2;
-
-  // read out the value from the third array
-  std::vector<int> result3 = array3;
-
-  // verify all three versions are the same
-  ret &= (result1.size() == result2.size());
-  ret &= (result1.size() == result3.size());
-  for (int i = 0; i < result1.size(); ++i) {
-    ret &= (result1[i] == -i);
-    ret &= (result1[i] == result2[i]);
-    ret &= (result1[i] == result3[i]);
-  }
-
-  return ret;
-}
-
-// this is the 3D case
-template<int N, int M, int O>
-bool test3D() {
-  bool ret = true;
-
-  hc::array<int, 3> array1(N, M, O);
-
-  // fetch the device pointer of array1
-  void* array1_devptr = array1.accelerator_pointer();
-
-  // construct another array based on the pointer
-  hc::array<int, 3> array2(N, M, O, array1_devptr);
-
-  // execute a kernel, and use the second array
-  hc::completion_future fut = parallel_for_each(hc::extent<3>(N, M, O), [&](hc::index<3>& idx) [[hc]] {
-    array2[idx] = idx[0] * M * O + idx[1] * O + idx[2];
-  });
-
-  fut.wait();
-
-  // construct yet another array based on the pointer
-  hc::array<int, 3> array3(array1.get_extent(), array1_devptr);
-
-  // execute a kernel, and use the third array
-  hc::completion_future fut2 = parallel_for_each(hc::extent<3>(N, M, O), [&](hc::index<3>& idx) [[hc]] {
-    array3[idx] = -array3[idx];
-  });
-
-  // read out the value from the first array
-  std::vector<int> result1 = array1;
-
-  // read out the value from the second array
-  std::vector<int> result2 = array2;
-
-  // read out the value from the third array
-  std::vector<int> result3 = array3;
-
-  // verify all three versions are the same
-  ret &= (result1.size() == result2.size());
-  ret &= (result1.size() == result3.size());
-  for (int i = 0; i < result1.size(); ++i) {
-    ret &= (result1[i] == -i);
-    ret &= (result1[i] == result2[i]);
-    ret &= (result1[i] == result3[i]);
-  }
-
-  return ret;
-}
-
-int main() {
-  bool ret = true;
-
-  ret &= test1D<16>();
-  ret &= test1D<1024>();
-  ret &= test1D<256 * 1024>();
-
-  ret &= test2D<2, 8>();
-  ret &= test2D<16, 64>();
-  ret &= test2D<256, 1024>();
-
-  ret &= test3D<2, 4, 8>();
-  ret &= test3D<4, 8, 32>();
-  ret &= test3D<16, 64, 1024>();
-
-  return !(ret == true);
-}
-
diff --git a/tests/Unit/HC/array_of_short_types.cpp b/tests/Unit/HC/array_of_short_types.cpp
index ef4f1c94c47..e108c7074c8 100644
--- a/tests/Unit/HC/array_of_short_types.cpp
+++ b/tests/Unit/HC/array_of_short_types.cpp
@@ -2,7 +2,7 @@
 // RUN: %hc %s -o %t.out && %t.out
 
 #include <iostream>
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #define NUM_ARRAY 512
 #define ARRAY_SIZE (1 * 1024)
diff --git a/tests/Unit/HC/async_copy.cpp b/tests/Unit/HC/async_copy.cpp
index b82ec66f450..81dd66dd6d4 100644
--- a/tests/Unit/HC/async_copy.cpp
+++ b/tests/Unit/HC/async_copy.cpp
@@ -1,7 +1,7 @@
 // RUN: %hc %s -std=c++14 -o %t.out && %t.out
 
-#include <hc.hpp>
-#include <hc_am.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_am.hpp>
 #include <iostream>
 #include <cassert>
 #include <algorithm>
@@ -37,11 +37,11 @@ int main(int argc, char* argv[]) {
   constexpr int n = 1024 * 4;
   hc::accelerator acc;
 
-  hc_am_buffer<int> pinned_host_a(acc, n, amHostPinned);
+  hc_am_buffer<int> pinned_host_a(acc, n, am_host_pinned);
   hc_am_buffer<int> device_buffer_a(acc, n);
   hc_am_buffer<int> device_buffer_b(acc, n);
   hc_am_buffer<int> device_buffer_c(acc, n);
-  hc_am_buffer<int> pinned_host_b(acc, n, amHostPinned);
+  hc_am_buffer<int> pinned_host_b(acc, n, am_host_pinned);
 
   std::generate_n(pinned_host_a(), n, []() {
     static int n = 0;
diff --git a/tests/Unit/HC/auto_annotate_attribute.cpp b/tests/Unit/HC/auto_annotate_attribute.cpp
deleted file mode 100644
index 6f81faa0568..00000000000
--- a/tests/Unit/HC/auto_annotate_attribute.cpp
+++ /dev/null
@@ -1,138 +0,0 @@
-// RUN: %hc -Xclang -fauto-compile-for-accelerator %s -o %t.out && %t.out
-
-#include <hc.hpp>
-
-#include <iostream>
-#include <vector>
-
-// foo is a global function which doesn't have [[hc]] attribute
-// if compiled with -Xclang -fauto-compile-for-accelerator, [[hc]] would be
-// annotated automatically
-int foo() {
-  return 1;
-}
-
-template<int GRID_SIZE>
-bool test1() {
-  using namespace hc;
-  bool ret = true;
-  array<int, 1> table(GRID_SIZE);
-  extent<1> ex(GRID_SIZE);
-  parallel_for_each(ex, [&](index<1>& idx) [[hc]] {
-    table[idx] = foo();
-  }).wait();
-
-  std::vector<int> result = table;
-  for (int i = 0; i < GRID_SIZE; ++i) {
-    if (result[i] != 1) {
-      std::cerr << "Verify failed at index: " << i << " , expected: " << 1 << " , actual: " << result[i] << "\n";
-      ret = false;
-      break;
-    }
-  }
-  return ret;
-}
-
-// bar is a static function which doesn't have [[hc]] attribute
-// if compiled with -Xclang -fauto-compile-for-accelerator, [[hc]] would be
-// annotated automatically
-static int bar() {
-  return 1;
-}
-
-template<int GRID_SIZE>
-bool test2() {
-  using namespace hc;
-  bool ret = true;
-  array<int, 1> table(GRID_SIZE);
-  extent<1> ex(GRID_SIZE);
-  parallel_for_each(ex, [&](index<1>& idx) [[hc]] {
-    table[idx] = bar();
-  }).wait();
-
-  std::vector<int> result = table;
-  for (int i = 0; i < GRID_SIZE; ++i) {
-    if (result[i] != 1) {
-      std::cerr << "Verify failed at index: " << i << " , expected: " << 1 << " , actual: " << result[i] << "\n";
-      ret = false;
-      break;
-    }
-  }
-  return ret;
-}
-
-// baz is a class with a member function test() which doesn't have [[hc]] attribute
-// if compiled with -Xclang -fauto-compile-for-accelerator, [[hc]] would be
-// annotated automatically
-class baz {
-public:
-  int test() {
-    return 1;
-  }
-
-  static int test2() {
-    return 1;
-  }
-};
-
-template<int GRID_SIZE>
-bool test3() {
-  using namespace hc;
-  bool ret = true;
-  array<int, 1> table(GRID_SIZE);
-  extent<1> ex(GRID_SIZE);
-  baz obj;
-  parallel_for_each(ex, [&](index<1>& idx) [[hc]] {
-    table[idx] = obj.test();
-  }).wait();
-
-  std::vector<int> result = table;
-  for (int i = 0; i < GRID_SIZE; ++i) {
-    if (result[i] != 1) {
-      std::cerr << "Verify failed at index: " << i << " , expected: " << 1 << " , actual: " << result[i] << "\n";
-      ret = false;
-      break;
-    }
-  }
-  return ret;
-}
-
-template<int GRID_SIZE>
-bool test4() {
-  using namespace hc;
-  bool ret = true;
-  array<int, 1> table(GRID_SIZE);
-  extent<1> ex(GRID_SIZE);
-  parallel_for_each(ex, [&](index<1>& idx) [[hc]] {
-    table[idx] = baz::test2();
-  }).wait();
-
-  std::vector<int> result = table;
-  for (int i = 0; i < GRID_SIZE; ++i) {
-    if (result[i] != 1) {
-      std::cerr << "Verify failed at index: " << i << " , expected: " << 1 << " , actual: " << result[i] << "\n";
-      ret = false;
-      break;
-    }
-  }
-  return ret;
-}
-
-int main() {
-  bool ret = true;
-
-  // test with global function
-  ret &= test1<64>();
-
-  // test with static function
-  ret &= test2<64>();
-
-  // test with member function
-  ret &= test3<64>();
-
-  // test with static member function
-  ret &= test4<64>();
-
-  return !(ret == true);
-}
-
diff --git a/tests/Unit/HC/capture_struct_with_carray_by_copy.cpp b/tests/Unit/HC/capture_struct_with_carray_by_copy.cpp
index 02341ad0663..51333005285 100644
--- a/tests/Unit/HC/capture_struct_with_carray_by_copy.cpp
+++ b/tests/Unit/HC/capture_struct_with_carray_by_copy.cpp
@@ -1,33 +1,12 @@
-// XFAIL: *
-// RUN: %hc -lhc_am %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_am.hpp>
+// RUN: %hc  %s -o %t.out && %t.out
+#include <hc/hc.hpp>
+#include <hc/hc_am.hpp>
 
 struct Foo {
   int table[3];
-
-  Foo() = default;
-
-  __attribute__((annotate("user_deserialize")))
-  Foo(int x0, int x1, int x2) [[cpu]][[hc]] {
-    table[0] = x0;
-    table[1] = x1;
-    table[2] = x2;
-  }
-
-  __attribute__((annotate("serialize")))
-  void __cxxamp_serialize(Kalmar::Serialize& s) const {
-    s.Append(sizeof(int), &table[0]);
-    s.Append(sizeof(int), &table[1]);
-    s.Append(sizeof(int), &table[2]);
-  }
 };
 
 int main() {
-
-  // XXX the test would cause soft hang now
-  // explicitly disable the test for now
-#if 0
   using namespace hc;
 
   Foo f;
@@ -44,8 +23,8 @@ int main() {
   av.copy(data, data_d, 3 * sizeof(int));
 
   parallel_for_each(extent<1>(3), [=](index<1> idx) [[hc]] {
-                      data_d[idx[0]] = f.table[idx[0]] + 999;
-                    });
+    data_d[idx[0]] = f.table[idx[0]] + 999;
+  });
 
   av.copy(data_d, data, 3 * sizeof(int));
 
@@ -57,7 +36,4 @@ int main() {
   am_free(data_d);
 
   return !(ret == true);
-#else
-  return !(false == true);
-#endif
 }
diff --git a/tests/Unit/HC/capture_struct_with_carray_by_copy2.cpp b/tests/Unit/HC/capture_struct_with_carray_by_copy2.cpp
index 719b9d05606..03eed04dfe9 100644
--- a/tests/Unit/HC/capture_struct_with_carray_by_copy2.cpp
+++ b/tests/Unit/HC/capture_struct_with_carray_by_copy2.cpp
@@ -1,27 +1,10 @@
-// XFAIL: *
-// RUN: %hc -lhc_am %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_am.hpp>
+// RUN: %hc  %s -o %t.out && %t.out
+#include <hc/hc.hpp>
+#include <hc/hc_am.hpp>
 
 template<typename T>
 struct Foo {
   T table[3];
-
-  Foo() = default;
-
-  __attribute__((annotate("user_deserialize")))
-  Foo(T x0, T x1, T x2) [[cpu]][[hc]] {
-    table[0] = x0;
-    table[1] = x1;
-    table[2] = x2;
-  }
-
-  __attribute__((annotate("serialize")))
-  void __cxxamp_serialize(Kalmar::Serialize& s) const {
-    s.Append(sizeof(T), &table[0]);
-    s.Append(sizeof(T), &table[1]);
-    s.Append(sizeof(T), &table[2]);
-  }
 };
 
 template<typename T>
@@ -60,16 +43,10 @@ bool test() {
 int main() {
   bool ret = true;
 
-  // XXX the test would cause soft hang now
-  // explicitly disable the test for now
-#if 0
   ret &= test<int>();
   ret &= test<unsigned>();
   ret &= test<float>();
   ret &= test<double>();
 
-  return !(ret == true);
-#else
-  return !(false == true);
-#endif
+  return ret ? EXIT_SUCCESS : EXIT_FAILURE;
 }
diff --git a/tests/Unit/HC/capture_struct_with_carray_by_copy3.cpp b/tests/Unit/HC/capture_struct_with_carray_by_copy3.cpp
index f2d38707f7f..3fb9d94f387 100644
--- a/tests/Unit/HC/capture_struct_with_carray_by_copy3.cpp
+++ b/tests/Unit/HC/capture_struct_with_carray_by_copy3.cpp
@@ -1,7 +1,6 @@
-// XFAIL: *
-// RUN: %hc -lhc_am %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_am.hpp>
+// RUN: %hc  %s -o %t.out && %t.out
+#include <hc/hc.hpp>
+#include <hc/hc_am.hpp>
 
 #include <iostream>
 
@@ -16,110 +15,30 @@ struct Foo {
 template<typename T>
 struct Foo<T, 1> {
   T table[1];
-
-  Foo() = default;
-
-  __attribute__((annotate("user_deserialize")))
-  Foo(T x0) [[cpu]][[hc]] {
-    table[0] = x0;
-  }
-
-  __attribute__((annotate("serialize")))
-  void __cxxamp_serialize(Kalmar::Serialize& s) const {
-    s.Append(sizeof(T), &table[0]);
-  }
 };
 
 // partial specialization of Foo<T, 2>
 template<typename T>
 struct Foo<T, 2> {
   T table[2];
-
-  Foo() = default;
-
-  __attribute__((annotate("user_deserialize")))
-  Foo(T x0, T x1) [[cpu]][[hc]] {
-    table[0] = x0;
-    table[1] = x1;
-  }
-
-  __attribute__((annotate("serialize")))
-  void __cxxamp_serialize(Kalmar::Serialize& s) const {
-    s.Append(sizeof(T), &table[0]);
-    s.Append(sizeof(T), &table[1]);
-  }
 };
 
 // partial specialization of Foo<T, 3>
 template<typename T>
 struct Foo<T, 3> {
   T table[3];
-
-  Foo() = default;
-
-  __attribute__((annotate("user_deserialize")))
-  Foo(T x0, T x1, T x2) [[cpu]][[hc]] {
-    table[0] = x0;
-    table[1] = x1;
-    table[2] = x2;
-  }
-
-  __attribute__((annotate("serialize")))
-  void __cxxamp_serialize(Kalmar::Serialize& s) const {
-    s.Append(sizeof(T), &table[0]);
-    s.Append(sizeof(T), &table[1]);
-    s.Append(sizeof(T), &table[2]);
-  }
 };
 
 // partial specialization of Foo<T, 4>
 template<typename T>
 struct Foo<T, 4> {
   T table[4];
-
-  Foo() = default;
-
-  __attribute__((annotate("user_deserialize")))
-  Foo(T x0, T x1, T x2, T x3) [[cpu]][[hc]] {
-    table[0] = x0;
-    table[1] = x1;
-    table[2] = x2;
-    table[3] = x3;
-  }
-
-  __attribute__((annotate("serialize")))
-  void __cxxamp_serialize(Kalmar::Serialize& s) const {
-    s.Append(sizeof(T), &table[0]);
-    s.Append(sizeof(T), &table[1]);
-    s.Append(sizeof(T), &table[2]);
-    s.Append(sizeof(T), &table[3]);
-  }
 };
 
 // partial specialization of Foo<T, 5>
 template<typename T>
 struct Foo<T, 5> {
   T table[5];
-
-  Foo() = default;
-
-  __attribute__((annotate("user_deserialize")))
-  Foo(T x0, T x1, T x2, T x3, T x4) [[cpu]][[hc]] {
-    table[0] = x0;
-    table[1] = x1;
-    table[2] = x2;
-    table[3] = x3;
-    table[4] = x4;
-  }
-
-  __attribute__((annotate("serialize")))
-  void __cxxamp_serialize(Kalmar::Serialize& s) const {
-    s.Append(sizeof(T), &table[0]);
-    s.Append(sizeof(T), &table[1]);
-    s.Append(sizeof(T), &table[2]);
-    s.Append(sizeof(T), &table[3]);
-    s.Append(sizeof(T), &table[4]);
-  }
 };
 
 template<typename T, size_t N>
@@ -140,8 +59,8 @@ bool test() {
   av.copy(data, data_d, N * sizeof(T));
 
   parallel_for_each(extent<1>(N), [=](index<1> idx) [[hc]] {
-                      data_d[idx[0]] = f.table[idx[0]] + T(999);
-                    });
+    data_d[idx[0]] = f.table[idx[0]] + T(999);
+  });
 
   av.copy(data_d, data, N * sizeof(T));
 
@@ -158,9 +77,6 @@ bool test() {
 int main() {
   bool ret = true;
 
-  // XXX the test would cause soft hang now
-  // explicitly disable the test for now
-#if 0
   ret &= test<int, 1>();
   ret &= test<int, 2>();
   ret &= test<int, 3>();
@@ -185,8 +101,5 @@ int main() {
   ret &= test<double, 4>();
   ret &= test<double, 5>();
 
-  return !(ret == true);
-#else
-  return !(false == true);
-#endif
+  return ret ? EXIT_SUCCESS : EXIT_FAILURE;
 }
diff --git a/tests/Unit/HC/capture_struct_with_carray_by_copy4.cpp b/tests/Unit/HC/capture_struct_with_carray_by_copy4.cpp
index 2b04359a173..ca6ff08e55e 100644
--- a/tests/Unit/HC/capture_struct_with_carray_by_copy4.cpp
+++ b/tests/Unit/HC/capture_struct_with_carray_by_copy4.cpp
@@ -1,7 +1,6 @@
-// XFAIL: *
-// RUN: %hc -lhc_am %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_am.hpp>
+// RUN: %hc  %s -o %t.out && %t.out
+#include <hc/hc.hpp>
+#include <hc/hc_am.hpp>
 
 #include <iostream>
 #include <type_traits>
@@ -17,110 +16,30 @@ struct Foo {
 template<typename T>
 struct Foo<T, 1> {
   T table[1];
-
-  Foo() = default;
-
-  __attribute__((annotate("user_deserialize")))
-  Foo(T x0) [[cpu]][[hc]] {
-    table[0] = x0;
-  }
-
-  __attribute__((annotate("serialize")))
-  void __cxxamp_serialize(Kalmar::Serialize& s) const {
-    s.Append(sizeof(T), &table[0]);
-  }
 };
 
 // partial specialization of Foo<T, 2>
 template<typename T>
 struct Foo<T, 2> {
   T table[2];
-
-  Foo() = default;
-
-  __attribute__((annotate("user_deserialize")))
-  Foo(T x0, T x1) [[cpu]][[hc]] {
-    table[0] = x0;
-    table[1] = x1;
-  }
-
-  __attribute__((annotate("serialize")))
-  void __cxxamp_serialize(Kalmar::Serialize& s) const {
-    s.Append(sizeof(T), &table[0]);
-    s.Append(sizeof(T), &table[1]);
-  }
 };
 
 // partial specialization of Foo<T, 3>
 template<typename T>
 struct Foo<T, 3> {
   T table[3];
-
-  Foo() = default;
-
-  __attribute__((annotate("user_deserialize")))
-  Foo(T x0, T x1, T x2) [[cpu]][[hc]] {
-    table[0] = x0;
-    table[1] = x1;
-    table[2] = x2;
-  }
-
-  __attribute__((annotate("serialize")))
-  void __cxxamp_serialize(Kalmar::Serialize& s) const {
-    s.Append(sizeof(T), &table[0]);
-    s.Append(sizeof(T), &table[1]);
-    s.Append(sizeof(T), &table[2]);
-  }
 };
 
 // partial specialization of Foo<T, 4>
 template<typename T>
 struct Foo<T, 4> {
   T table[4];
-
-  Foo() = default;
-
-  __attribute__((annotate("user_deserialize")))
-  Foo(T x0, T x1, T x2, T x3) [[cpu]][[hc]] {
-    table[0] = x0;
-    table[1] = x1;
-    table[2] = x2;
-    table[3] = x3;
-  }
-
-  __attribute__((annotate("serialize")))
-  void __cxxamp_serialize(Kalmar::Serialize& s) const {
-    s.Append(sizeof(T), &table[0]);
-    s.Append(sizeof(T), &table[1]);
-    s.Append(sizeof(T), &table[2]);
-    s.Append(sizeof(T), &table[3]);
-  }
 };
 
 // partial specialization of Foo<T, 5>
 template<typename T>
 struct Foo<T, 5> {
   T table[5];
-
-  Foo() = default;
-
-  __attribute__((annotate("user_deserialize")))
-  Foo(T x0, T x1, T x2, T x3, T x4) [[cpu]][[hc]] {
-    table[0] = x0;
-    table[1] = x1;
-    table[2] = x2;
-    table[3] = x3;
-    table[4] = x4;
-  }
-
-  __attribute__((annotate("serialize")))
-  void __cxxamp_serialize(Kalmar::Serialize& s) const {
-    s.Append(sizeof(T), &table[0]);
-    s.Append(sizeof(T), &table[1]);
-    s.Append(sizeof(T), &table[2]);
-    s.Append(sizeof(T), &table[3]);
-    s.Append(sizeof(T), &table[4]);
-  }
 };
 
 // Bar extends Foo
@@ -187,8 +106,8 @@ bool test() {
   av.copy(data, data_d, N * sizeof(T));
 
   parallel_for_each(extent<1>(N), [=](index<1> idx) [[hc]] {
-                      data_d[idx[0]] = f.table[idx[0]] + T(999);
-                    });
+    data_d[idx[0]] = f.table[idx[0]] + T(999);
+  });
 
   av.copy(data_d, data, N * sizeof(T));
 
@@ -205,10 +124,7 @@ bool test() {
 int main() {
   bool ret = true;
 
-  // XXX the test would cause soft hang now
-  // explicitly disable the test for now
-#if 0
- ret &= test<int, 1, Foo<int, 1> >();
+  ret &= test<int, 1, Foo<int, 1> >();
   ret &= test<int, 2, Foo<int, 2> >();
   ret &= test<int, 3, Foo<int, 3> >();
   ret &= test<int, 4, Foo<int, 4> >();
@@ -257,7 +173,4 @@ int main() {
   ret &= test<double, 5, Bar<double, 5> >();
 
   return !(ret == true);
-#else
-  return !(false == true);
-#endif
 }
diff --git a/tests/Unit/HC/completion_future_is_ready.cpp b/tests/Unit/HC/completion_future_is_ready.cpp
index e4f11442a5b..2e427f92484 100644
--- a/tests/Unit/HC/completion_future_is_ready.cpp
+++ b/tests/Unit/HC/completion_future_is_ready.cpp
@@ -1,8 +1,11 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_am.hpp>
 
+#include <atomic>
+#include <memory>
 #include <iostream>
 #include <random>
 
@@ -30,22 +33,32 @@ bool test() {
     table_b[i] = int_dist(rd);
   }
 
+  hc::accelerator acc;
+  hc::accelerator_view av = acc.get_default_view();
+
   // launch kernel
+  std::unique_ptr<std::atomic<std::uint32_t>, decltype(hc::am_free)*> done{
+    hc::am_alloc(sizeof(std::atomic<bool>), acc, am_host_coherent),
+    hc::am_free};
+  *done = 0;
+
   hc::extent<1> e(vecSize);
   hc::completion_future fut = hc::parallel_for_each(
-    e,
-    [=](hc::index<1> idx) __HC__ {
-      for (int i = 0; i < LOOP_COUNT; ++i) 
+      av, e, [=, done = done.get()](hc::index<1> idx) [[hc]] {
+      for (int i = 0; i < LOOP_COUNT; ++i)
         table_c(idx) = table_a(idx) + table_b(idx);
+
+      while (*done == 0);
   });
 
   // create a barrier packet
-  hc::accelerator_view av = hc::accelerator().get_default_view();
   hc::completion_future fut2 = av.create_marker();
 
   ret &= (fut.is_ready() == false);
   ret &= (fut2.is_ready() == false);
 
+  *done = 1;
+
   // wait on the barrier packet
   fut2.wait();
 
@@ -74,6 +87,4 @@ int main() {
   ret &= test();
 
   return !(ret == true);
-}
-
-
+}
\ No newline at end of file
diff --git a/tests/Unit/HC/create_blocking_marker.cpp b/tests/Unit/HC/create_blocking_marker.cpp
index ac6f9f67d29..e202ef433b6 100644
--- a/tests/Unit/HC/create_blocking_marker.cpp
+++ b/tests/Unit/HC/create_blocking_marker.cpp
@@ -1,6 +1,6 @@
-// RUN: %hc %s -I/opt/rocm/hsa/include -L/opt/rocm/lib -lhsa-runtime64 -o %t.out && %t.out
+// RUN: %hc %s -lhsa-runtime64 -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 #include <random>
@@ -40,7 +40,7 @@ bool test() {
   hc::extent<1> e(vecSize);
   hc::completion_future fut = hc::parallel_for_each(
     e,
-    [=](hc::index<1> idx) __HC__ {
+    [=](hc::index<1> idx) [[hc]] {
       for (int i = 0; i < LOOP_COUNT; ++i)
         table_c(idx) = table_a(idx) + table_b(idx);
   });
@@ -51,44 +51,10 @@ bool test() {
   hc::accelerator_view av3 = hc::accelerator().create_view();
   hc::completion_future fut2 = av.create_blocking_marker(fut);
 
-  void* nativeHandle = fut.get_native_handle();
-  void* nativeHandle2 = fut2.get_native_handle();
-
-#if TEST_DEBUG
-  std::cout << nativeHandle << "\n";
-  std::cout << nativeHandle2 << "\n";
-#endif
-
-  hsa_signal_value_t signal_value;
-  hsa_signal_value_t signal_value2;
-
-  signal_value = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(nativeHandle));
-#if TEST_DEBUG
-  std::cout << "kernel signal value: " << signal_value << "\n";
-#endif
-
-  signal_value2 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(nativeHandle2));
-#if TEST_DEBUG
-  std::cout << "blocking barrier signal value: " << signal_value << "\n";
-#endif
-
   // wait on the barrier packet
   fut2.wait();
 
   // the barrier packet would ensure all previous packets were processed
-
-  signal_value = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(nativeHandle));
-#if TEST_DEBUG
-  std::cout << "kernel signal value: " << signal_value << "\n";
-#endif
-  ret &= (signal_value == 0);
-
-  signal_value2 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(nativeHandle2));
-#if TEST_DEBUG
-  std::cout << "barrier signal value: " << signal_value << "\n";
-#endif
-  ret &= (signal_value2 == 0);
-
   // verify
   int error = 0;
   for(unsigned i = 0; i < vecSize; i++) {
@@ -126,7 +92,7 @@ bool test() {
      
       cf_pfe  = hc::parallel_for_each(av,
         e,
-        [=](hc::index<1> idx) __HC__ {
+        [=](hc::index<1> idx) [[hc]] {
           for (int i = 0; i < LOOP_COUNT; ++i)
             table_c(idx) = table_a(idx) + table_b(idx);
       });
@@ -134,29 +100,17 @@ bool test() {
       cfA = av2.create_blocking_marker({nullcf[0], cf_pfe});
       cfA.wait();
 
-      hsa_signal_value_t signal_value_pfe = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(cf_pfe.get_native_handle()));
-      hsa_signal_value_t signal_value_cbm = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(cfA.get_native_handle()));
-
-      std::cout << "create_blocking_marker on single PFE verify OK\n";
-      
-
-      // Both signals should have completed.
-      assert(signal_value_pfe == 0);
-      assert(signal_value_cbm == 0);
-
-
-
       // Try a 3-way:
       // Two kernels sent to different PFE, then wait on all three
       cf_pfe  = hc::parallel_for_each(av,
         e,
-        [=](hc::index<1> idx) __HC__ {
+        [=](hc::index<1> idx) [[hc]] {
           for (int i = 0; i < LOOP_COUNT; ++i)
             table_c(idx) = table_a(idx) + table_b(idx);
       });
       cf_pfe2  = hc::parallel_for_each(av2,
         e,
-        [=](hc::index<1> idx) __HC__ {
+        [=](hc::index<1> idx) [[hc]] {
           for (int i = 0; i < LOOP_COUNT; ++i)
             table_c(idx) = table_a(idx) + table_b(idx);
       });
@@ -165,9 +119,6 @@ bool test() {
       cfA = av2.create_blocking_marker({nullcf[0], cf_pfe, nullcf[1], cf_pfe2});
       cfA.wait();
 
-      assert (hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(cf_pfe.get_native_handle())) == 0);
-      assert (hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(cf_pfe2.get_native_handle())) == 0);
-      assert (hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(cfA.get_native_handle())) == 0);
       std::cout << "create_blocking_marker on dual PFE verify OK\n";
   }
 
diff --git a/tests/Unit/HC/create_blocking_marker2.cpp b/tests/Unit/HC/create_blocking_marker2.cpp
index cea356012e6..33c3c0c80dd 100644
--- a/tests/Unit/HC/create_blocking_marker2.cpp
+++ b/tests/Unit/HC/create_blocking_marker2.cpp
@@ -1,6 +1,6 @@
-// RUN: %hc %s -I/opt/rocm/hsa/include -L/opt/rocm/lib -lhsa-runtime64 -o %t.out && %t.out
+// RUN: %hc %s -lhsa-runtime64 -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 #include <random>
@@ -44,35 +44,35 @@ bool test() {
   hc::extent<1> e(vecSize);
   hc::completion_future fut0 = hc::parallel_for_each(
     e,
-    [=](hc::index<1> idx) __HC__ {
+    [=](hc::index<1> idx) [[hc]] {
       for (int i = 0; i < LOOP_COUNT; ++i)
         table_c(idx) = table_a(idx) + table_b(idx);
   });
 
   hc::completion_future fut1 = hc::parallel_for_each(
     e,
-    [=](hc::index<1> idx) __HC__ {
+    [=](hc::index<1> idx) [[hc]] {
       for (int i = 0; i < LOOP_COUNT; ++i)
         table_d(idx) = table_a(idx) + table_b(idx);
   });
 
   hc::completion_future fut2 = hc::parallel_for_each(
     e,
-    [=](hc::index<1> idx) __HC__ {
+    [=](hc::index<1> idx) [[hc]] {
       for (int i = 0; i < LOOP_COUNT; ++i)
         table_e(idx) = table_a(idx) + table_b(idx);
   });
 
   hc::completion_future fut3 = hc::parallel_for_each(
     e,
-    [=](hc::index<1> idx) __HC__ {
+    [=](hc::index<1> idx) [[hc]] {
       for (int i = 0; i < LOOP_COUNT; ++i)
         table_f(idx) = table_a(idx) + table_b(idx);
   });
 
   hc::completion_future fut4 = hc::parallel_for_each(
     e,
-    [=](hc::index<1> idx) __HC__ {
+    [=](hc::index<1> idx) [[hc]] {
       for (int i = 0; i < LOOP_COUNT; ++i)
         table_g(idx) = table_a(idx) + table_b(idx);
   });
@@ -81,76 +81,11 @@ bool test() {
   hc::accelerator_view av = hc::accelerator().get_default_view();
   hc::completion_future fut5 = av.create_blocking_marker({fut0, fut1, fut2, fut3, fut4}, hc::system_scope);
 
-  void* nativeHandle0 = fut0.get_native_handle();
-  void* nativeHandle1 = fut1.get_native_handle();
-  void* nativeHandle2 = fut2.get_native_handle();
-  void* nativeHandle3 = fut3.get_native_handle();
-  void* nativeHandle4 = fut4.get_native_handle();
-  void* nativeHandle5 = fut5.get_native_handle();
-
-#if TEST_DEBUG
-  std::cout << nativeHandle0 << "\n";
-  std::cout << nativeHandle1 << "\n";
-  std::cout << nativeHandle2 << "\n";
-  std::cout << nativeHandle3 << "\n";
-  std::cout << nativeHandle4 << "\n";
-  std::cout << nativeHandle5 << "\n";
-#endif
-
-  hsa_signal_value_t signal_value0;
-  hsa_signal_value_t signal_value1;
-  hsa_signal_value_t signal_value2;
-  hsa_signal_value_t signal_value3;
-  hsa_signal_value_t signal_value4;
-  hsa_signal_value_t signal_value5;
-
-  signal_value0 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(nativeHandle0));
-  signal_value1 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(nativeHandle1));
-  signal_value2 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(nativeHandle2));
-  signal_value3 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(nativeHandle3));
-  signal_value4 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(nativeHandle4));
-#if TEST_DEBUG
-  std::cout << "kernel signal value: " << signal_value0 << "\n";
-  std::cout << "kernel signal value: " << signal_value1 << "\n";
-  std::cout << "kernel signal value: " << signal_value2 << "\n";
-  std::cout << "kernel signal value: " << signal_value3 << "\n";
-  std::cout << "kernel signal value: " << signal_value4 << "\n";
-#endif
-
-  signal_value5 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(nativeHandle5));
-#if TEST_DEBUG
-  std::cout << "blocking barrier signal value: " << signal_value5 << "\n";
-#endif
-
   // wait on the barrier packet
   fut5.wait();
 
   // the barrier packet would ensure all previous packets were processed
 
-  signal_value0 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(nativeHandle0));
-  signal_value1 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(nativeHandle1));
-  signal_value2 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(nativeHandle2));
-  signal_value3 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(nativeHandle3));
-  signal_value4 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(nativeHandle4));
-#if TEST_DEBUG
-  std::cout << "kernel signal value: " << signal_value0 << "\n";
-  std::cout << "kernel signal value: " << signal_value1 << "\n";
-  std::cout << "kernel signal value: " << signal_value2 << "\n";
-  std::cout << "kernel signal value: " << signal_value3 << "\n";
-  std::cout << "kernel signal value: " << signal_value4 << "\n";
-#endif
-  ret &= (signal_value0 == 0);
-  ret &= (signal_value1 == 0);
-  ret &= (signal_value2 == 0);
-  ret &= (signal_value3 == 0);
-  ret &= (signal_value4 == 0);
-
-  signal_value5 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(nativeHandle5));
-#if TEST_DEBUG
-  std::cout << "barrier signal value: " << signal_value5 << "\n";
-#endif
-  ret &= (signal_value5 == 0);
-
   // verify
   int error = 0;
   for(unsigned i = 0; i < vecSize; i++) {
diff --git a/tests/Unit/HC/create_marker.cpp b/tests/Unit/HC/create_marker.cpp
index c632b2a3d90..71697305c06 100644
--- a/tests/Unit/HC/create_marker.cpp
+++ b/tests/Unit/HC/create_marker.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -I%hsa_header_path -L%hsa_library_path -lhsa-runtime64 -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 #include <random>
@@ -41,7 +41,7 @@ bool test() {
   hc::extent<1> e(vecSize);
   hc::completion_future fut = hc::parallel_for_each(
     e,
-    [=](hc::index<1> idx) __HC__ {
+    [=](hc::index<1> idx) [[hc]] {
       for (int i = 0; i < LOOP_COUNT; ++i) 
         table_c(idx) = table_a(idx) + table_b(idx);
   });
@@ -50,44 +50,10 @@ bool test() {
   hc::accelerator_view av = hc::accelerator().get_default_view();
   hc::completion_future fut2 = av.create_marker();
 
-  void* nativeHandle = fut.get_native_handle();
-  void* nativeHandle2 = fut2.get_native_handle();
-
-#if TEST_DEBUG
-  std::cout << nativeHandle << "\n";
-  std::cout << nativeHandle2 << "\n";
-#endif
-
-  hsa_signal_value_t signal_value;
-  hsa_signal_value_t signal_value2;
-
-  signal_value = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(nativeHandle));
-#if TEST_DEBUG
-  std::cout << "kernel signal value: " << signal_value << "\n";
-#endif
-
-  signal_value2 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(nativeHandle2));
-#if TEST_DEBUG
-  std::cout << "barrier signal value: " << signal_value << "\n";
-#endif
-
   // wait on the barrier packet
   fut2.wait();
 
   // the barrier packet would ensure all previous packets were processed
-
-  signal_value = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(nativeHandle));
-#if TEST_DEBUG
-  std::cout << "kernel signal value: " << signal_value << "\n";
-#endif
-  ret &= (signal_value == 0);
-
-  signal_value2 = hsa_signal_load_scacquire(*static_cast<hsa_signal_t*>(nativeHandle2));
-#if TEST_DEBUG
-  std::cout << "barrier signal value: " << signal_value << "\n";
-#endif
-  ret &= (signal_value2 == 0);
-
   // verify
   int error = 0;
   for(unsigned i = 0; i < vecSize; i++) {
diff --git a/tests/Unit/HC/create_marker2.cpp b/tests/Unit/HC/create_marker2.cpp
index 89d94d0d707..fa51809c9a1 100644
--- a/tests/Unit/HC/create_marker2.cpp
+++ b/tests/Unit/HC/create_marker2.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 #include <random>
@@ -35,7 +35,7 @@ bool test(bool useWaitMode, hc::memory_scope releaseScope, hc::hcWaitMode mode =
   hc::extent<1> e(vecSize);
   hc::completion_future fut = hc::parallel_for_each(
     e,
-    [=](hc::index<1> idx) __HC__ {
+    [=](hc::index<1> idx) [[hc]] {
       for (int i = 0; i < LOOP_COUNT; ++i) 
         table_c(idx) = table_a(idx) + table_b(idx);
   });
@@ -48,7 +48,7 @@ bool test(bool useWaitMode, hc::memory_scope releaseScope, hc::hcWaitMode mode =
   if (!useWaitMode) {
     fut2.wait();
   } else {
-    fut2.wait(mode);
+    fut2.wait();
   }
 
   // the barrier packet would ensure all previous packets were processed
diff --git a/tests/Unit/HC/cycle.cpp b/tests/Unit/HC/cycle.cpp
index 2975e7d2334..a2ba4522c47 100644
--- a/tests/Unit/HC/cycle.cpp
+++ b/tests/Unit/HC/cycle.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #define GRID_SIZE (1024)
 
diff --git a/tests/Unit/HC/cycle2.cpp b/tests/Unit/HC/cycle2.cpp
index 0f22322e81c..b99b9d71a09 100644
--- a/tests/Unit/HC/cycle2.cpp
+++ b/tests/Unit/HC/cycle2.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <vector>
 #include <algorithm>
 
diff --git a/tests/Unit/HC/execute_order.cpp b/tests/Unit/HC/execute_order.cpp
index deb46846cbd..8c2f4554502 100644
--- a/tests/Unit/HC/execute_order.cpp
+++ b/tests/Unit/HC/execute_order.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test new enumeration in hc::accelerator_view : execute_order
 int main() {
@@ -21,13 +21,13 @@ int main() {
   accelerator_view av_any_order = acc.create_view(execute_any_order);
 
   // test dispatch a kernel to av
-  parallel_for_each(av, extent<1>(1), []() [[hc]] {});
+  parallel_for_each(av, extent<1>(1), [](index<1>) [[hc]] {});
 
   // test dispatch a kernel to av_in_order
-  parallel_for_each(av_in_order, extent<1>(1), []() [[hc]] {});
+  parallel_for_each(av_in_order, extent<1>(1), [](index<1>) [[hc]] {});
 
   // test dispatch a kernel to av_any_order
-  parallel_for_each(av_any_order, extent<1>(1), []() [[hc]] {});
+  parallel_for_each(av_any_order, extent<1>(1), [](index<1>) [[hc]] {});
 
   return 0;
 }
diff --git a/tests/Unit/HC/get_group_segment_sizes.cpp b/tests/Unit/HC/get_group_segment_sizes.cpp
index 33e519e7138..75f1da27e6d 100644
--- a/tests/Unit/HC/get_group_segment_sizes.cpp
+++ b/tests/Unit/HC/get_group_segment_sizes.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <cstdlib>
 #include <iostream>
@@ -35,7 +35,7 @@ bool test() {
 
   hc::completion_future fut = hc::parallel_for_each(
     e, 
-    [=](hc::index<1> idx) __HC__ {
+    [=](hc::index<1> idx) [[hc]] {
       // create a tile_static array
       tile_static volatile int group[groupSize];
       group[idx[0]] = 0;
@@ -44,7 +44,7 @@ bool test() {
       av_a(idx) = hc::get_group_segment_size();
 
       // av_b stores the size of static group segment
-      av_b(idx) = hc::get_static_group_segment_size() + group[idx[0]]; // use group__HC__ so it won't be optimized away
+      av_b(idx) = hc::get_static_group_segment_size() + group[idx[0]]; // use group[[hc]] so it won't be optimized away
   });
 
   // create a barrier packet
diff --git a/tests/Unit/HC/get_use_count.cpp b/tests/Unit/HC/get_use_count.cpp
index 31e09ad2b6d..15768d0ecc3 100644
--- a/tests/Unit/HC/get_use_count.cpp
+++ b/tests/Unit/HC/get_use_count.cpp
@@ -1,6 +1,6 @@
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <assert.h>
 
 void checkPassByValue (hc::completion_future cf, int expectedCount)
diff --git a/tests/Unit/HC/hc_array_cpu_access.cpp b/tests/Unit/HC/hc_array_cpu_access.cpp
index af872a65516..76c6c5bb2c8 100644
--- a/tests/Unit/HC/hc_array_cpu_access.cpp
+++ b/tests/Unit/HC/hc_array_cpu_access.cpp
@@ -2,7 +2,7 @@
 
 #include <iostream>
 #include <cstdlib>
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 int main(int argc, char* argv[]) {
   hc::array<int, 1> a(1);
diff --git a/tests/Unit/HC/hc_atomic_add_float_global.cpp b/tests/Unit/HC/hc_atomic_add_float_global.cpp
index 4f0d71def28..74114cc1e04 100644
--- a/tests/Unit/HC/hc_atomic_add_float_global.cpp
+++ b/tests/Unit/HC/hc_atomic_add_float_global.cpp
@@ -1,6 +1,6 @@
 
 // RUN: %hc %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <vector>
@@ -14,28 +14,32 @@ using namespace hc;
 #define INIT 0.5f
 
 int main(void) {
-  const int vecSize = 100;
+  #if defined(FLOAT_ATOMICS)
+    const int vecSize = 100;
 
-  // Alloc & init input data
-  std::vector<T> init(vecSize, INIT);
-  array<T, 1> count(vecSize, init.begin());
+    // Alloc & init input data
+    std::vector<T> init(vecSize, INIT);
+    array<T, 1> count(vecSize, init.begin());
 
-  parallel_for_each(count.get_extent(), [=, &count](index<1> idx) [[hc]] {
-    for(unsigned i = 0; i < vecSize; i++) {
-      atomic_fetch_add(&count[i], INIT);
-    }
-  });
+    parallel_for_each(count.get_extent(), [=, &count](index<1> idx) [[hc]] {
+      for(unsigned i = 0; i < vecSize; i++) {
+        atomic_fetch_add(&count[i], INIT);
+      }
+    });
 
-  array_view<T, 1> av(count);
+    array_view<T, 1> av(count);
 
-  bool ret = true;
-  float sum = std::accumulate(init.begin(), init.end(), 0.0f);
-  sum += INIT;
-  for(unsigned i = 0; i < vecSize; ++i) {
-      if(fabs(av[i] - sum) > TOLERANCE) {
-        ret = false;
-      }
-  }
+    bool ret = true;
+    float sum = std::accumulate(init.begin(), init.end(), 0.0f);
+    sum += INIT;
+    for(unsigned i = 0; i < vecSize; ++i) {
+        if(fabs(av[i] - sum) > TOLERANCE) {
+          ret = false;
+        }
+    }
 
-  return !(ret == true);
+    return !(ret == true);
+  #else
+    return EXIT_SUCCESS;
+  #endif
 }
diff --git a/tests/Unit/HC/hc_atomic_add_float_local.cpp b/tests/Unit/HC/hc_atomic_add_float_local.cpp
index 698926deb74..451b864c1c7 100644
--- a/tests/Unit/HC/hc_atomic_add_float_local.cpp
+++ b/tests/Unit/HC/hc_atomic_add_float_local.cpp
@@ -1,6 +1,6 @@
 
 // RUN: %hc %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <math.h>
@@ -12,43 +12,47 @@ using namespace hc;
 #define TOLERANCE 1e-5
 
 int main(void) {
-  const int vecSize = 100;
-  const int tile_size = 10;
-
-  // Alloc & init input data
-  extent<2> e_a(vecSize, vecSize);
-  std::vector<T> va(vecSize * vecSize, INIT);
-  array_view<T, 2> av_a(e_a, va); 
-
-  extent<2> compute_domain(e_a);
-  parallel_for_each(compute_domain.tile(tile_size, tile_size), [=] (tiled_index<2> tidx) [[hc]] {
-    index<2> localIdx = tidx.local;
-    index<2> globalIdx = tidx.global;
-
-    tile_static T localA[tile_size][tile_size];
-    localA[localIdx[0]][localIdx[1]] = 0;
-    tidx.barrier.wait();
-
-    for(int i = 0; i < tile_size; i++) {
-      for(int j = 0; j < tile_size; j++) {
-        atomic_fetch_add(&(localA[i][j]), INIT);
+  #if defined(FLOAT_ATOMICS)
+    const int vecSize = 100;
+    const int tile_size = 10;
+
+    // Alloc & init input data
+    extent<2> e_a(vecSize, vecSize);
+    std::vector<T> va(vecSize * vecSize, INIT);
+    array_view<T, 2> av_a(e_a, va); 
+
+    extent<2> compute_domain(e_a);
+    parallel_for_each(compute_domain.tile(tile_size, tile_size), [=] (tiled_index<2> tidx) [[hc]] {
+      index<2> localIdx = tidx.local;
+      index<2> globalIdx = tidx.global;
+
+      tile_static T localA[tile_size][tile_size];
+      localA[localIdx[0]][localIdx[1]] = 0;
+      tidx.barrier.wait();
+
+      for(int i = 0; i < tile_size; i++) {
+        for(int j = 0; j < tile_size; j++) {
+          atomic_fetch_add(&(localA[i][j]), INIT);
+        }
       }
-    }
-  tidx.barrier.wait();
-  av_a[globalIdx[0]][globalIdx[1]] = localA[localIdx[0]][localIdx[1]];
-  });
-
-  // accumlate tile_size * tile_size times
-  float sum = 0.0f;
-  for (int i = 0; i < tile_size * tile_size; ++i)
-    sum += INIT;
-  for(unsigned i = 0; i < vecSize; i++) {
-    for(unsigned j = 0; j < vecSize; j++) {
-      if(fabs(av_a(i, j) - sum) > TOLERANCE) {
-        return 1;
+    tidx.barrier.wait();
+    av_a[globalIdx[0]][globalIdx[1]] = localA[localIdx[0]][localIdx[1]];
+    });
+
+    // accumlate tile_size * tile_size times
+    float sum = 0.0f;
+    for (int i = 0; i < tile_size * tile_size; ++i)
+      sum += INIT;
+    for(unsigned i = 0; i < vecSize; i++) {
+      for(unsigned j = 0; j < vecSize; j++) {
+        if(fabs(av_a(i, j) - sum) > TOLERANCE) {
+          return 1;
+        }
       }
     }
-  }
 
-  return 0;
+    return 0;
+  #else
+    return EXIT_SUCCESS;
+  #endif
 }
diff --git a/tests/Unit/HC/hc_atomic_add_global.cpp b/tests/Unit/HC/hc_atomic_add_global.cpp
index 0243d80cad2..67f828b818c 100644
--- a/tests/Unit/HC/hc_atomic_add_global.cpp
+++ b/tests/Unit/HC/hc_atomic_add_global.cpp
@@ -1,6 +1,6 @@
 
 // RUN: %hc %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <vector>
diff --git a/tests/Unit/HC/hc_atomic_add_local.cpp b/tests/Unit/HC/hc_atomic_add_local.cpp
index 9daa3a0b325..0c4830fcbde 100644
--- a/tests/Unit/HC/hc_atomic_add_local.cpp
+++ b/tests/Unit/HC/hc_atomic_add_local.cpp
@@ -1,6 +1,6 @@
 
 // RUN: %hc %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 
diff --git a/tests/Unit/HC/hc_atomic_and_global.cpp b/tests/Unit/HC/hc_atomic_and_global.cpp
index 568ed4aa0c5..d0e16f006cd 100644
--- a/tests/Unit/HC/hc_atomic_and_global.cpp
+++ b/tests/Unit/HC/hc_atomic_and_global.cpp
@@ -1,6 +1,6 @@
 
 // RUN: %hc %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <vector>
diff --git a/tests/Unit/HC/hc_atomic_and_local.cpp b/tests/Unit/HC/hc_atomic_and_local.cpp
index 75ae8765fa1..b04b2009443 100644
--- a/tests/Unit/HC/hc_atomic_and_local.cpp
+++ b/tests/Unit/HC/hc_atomic_and_local.cpp
@@ -1,6 +1,6 @@
 
 // RUN: %hc %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 
diff --git a/tests/Unit/HC/hc_atomic_compare_exchange_global.cpp b/tests/Unit/HC/hc_atomic_compare_exchange_global.cpp
index ce667b8e523..8f2afbab76a 100644
--- a/tests/Unit/HC/hc_atomic_compare_exchange_global.cpp
+++ b/tests/Unit/HC/hc_atomic_compare_exchange_global.cpp
@@ -1,6 +1,6 @@
 
 // RUN: %hc %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <vector>
@@ -11,7 +11,7 @@ bool test() {
   const int vecSize = 100;
 
   // Alloc & init input data
-  int init[vecSize];
+  T init[vecSize];
   for (int i = 0; i < vecSize; ++i) {
     init[i] = (i % 2 == 0) ? T(0) : T(1);
   }
diff --git a/tests/Unit/HC/hc_atomic_compare_exchange_local.cpp b/tests/Unit/HC/hc_atomic_compare_exchange_local.cpp
index a8d92dc322f..182731a659b 100644
--- a/tests/Unit/HC/hc_atomic_compare_exchange_local.cpp
+++ b/tests/Unit/HC/hc_atomic_compare_exchange_local.cpp
@@ -1,6 +1,6 @@
 
 // RUN: %hc %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 
diff --git a/tests/Unit/HC/hc_atomic_dec_global.cpp b/tests/Unit/HC/hc_atomic_dec_global.cpp
index 770ba1f7914..6fd925c06e5 100644
--- a/tests/Unit/HC/hc_atomic_dec_global.cpp
+++ b/tests/Unit/HC/hc_atomic_dec_global.cpp
@@ -1,6 +1,6 @@
 
 // RUN: %hc %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <vector>
diff --git a/tests/Unit/HC/hc_atomic_dec_local.cpp b/tests/Unit/HC/hc_atomic_dec_local.cpp
index a73b6e20cc7..01e8a6acb2b 100644
--- a/tests/Unit/HC/hc_atomic_dec_local.cpp
+++ b/tests/Unit/HC/hc_atomic_dec_local.cpp
@@ -1,6 +1,6 @@
 
 // RUN: %hc %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 
diff --git a/tests/Unit/HC/hc_atomic_exchange_float_global.cpp b/tests/Unit/HC/hc_atomic_exchange_float_global.cpp
index d1df21ee3c7..8fd8d37b23f 100644
--- a/tests/Unit/HC/hc_atomic_exchange_float_global.cpp
+++ b/tests/Unit/HC/hc_atomic_exchange_float_global.cpp
@@ -1,6 +1,6 @@
 
 // RUN: %hc %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <vector>
diff --git a/tests/Unit/HC/hc_atomic_exchange_float_local.cpp b/tests/Unit/HC/hc_atomic_exchange_float_local.cpp
index c2d79a6c5ad..72c59afea75 100644
--- a/tests/Unit/HC/hc_atomic_exchange_float_local.cpp
+++ b/tests/Unit/HC/hc_atomic_exchange_float_local.cpp
@@ -1,6 +1,6 @@
 
 // RUN: %hc %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <math.h>
diff --git a/tests/Unit/HC/hc_atomic_exchange_global.cpp b/tests/Unit/HC/hc_atomic_exchange_global.cpp
index efdbee019a1..1b4f22d5ea5 100644
--- a/tests/Unit/HC/hc_atomic_exchange_global.cpp
+++ b/tests/Unit/HC/hc_atomic_exchange_global.cpp
@@ -1,6 +1,6 @@
 
 // RUN: %hc %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <vector>
diff --git a/tests/Unit/HC/hc_atomic_exchange_local.cpp b/tests/Unit/HC/hc_atomic_exchange_local.cpp
index 5edd0ed5745..6c29c505806 100644
--- a/tests/Unit/HC/hc_atomic_exchange_local.cpp
+++ b/tests/Unit/HC/hc_atomic_exchange_local.cpp
@@ -1,6 +1,6 @@
 
 // RUN: %hc %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 
diff --git a/tests/Unit/HC/hc_atomic_inc_global.cpp b/tests/Unit/HC/hc_atomic_inc_global.cpp
index ca0fd77f7ff..18df9da5cf7 100644
--- a/tests/Unit/HC/hc_atomic_inc_global.cpp
+++ b/tests/Unit/HC/hc_atomic_inc_global.cpp
@@ -1,6 +1,6 @@
 
 // RUN: %hc %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <vector>
diff --git a/tests/Unit/HC/hc_atomic_inc_local.cpp b/tests/Unit/HC/hc_atomic_inc_local.cpp
index 12375acf5bb..be305973893 100644
--- a/tests/Unit/HC/hc_atomic_inc_local.cpp
+++ b/tests/Unit/HC/hc_atomic_inc_local.cpp
@@ -1,6 +1,6 @@
 
 // RUN: %hc %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 
diff --git a/tests/Unit/HC/hc_atomic_max_global.cpp b/tests/Unit/HC/hc_atomic_max_global.cpp
index 0d8f98f4ff7..386147c5fa6 100644
--- a/tests/Unit/HC/hc_atomic_max_global.cpp
+++ b/tests/Unit/HC/hc_atomic_max_global.cpp
@@ -1,6 +1,6 @@
 
 // RUN: %hc %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <vector>
@@ -47,7 +47,9 @@ int main() {
 
   ret &= test<unsigned int>();
   ret &= test<int>();
-  ret &= test<uint64_t>();
+  #if defined(EXTENDED_ATOMICS)
+    ret &= test<uint64_t>();
+  #endif
 
   return !(ret == true);
 }
diff --git a/tests/Unit/HC/hc_atomic_max_local.cpp b/tests/Unit/HC/hc_atomic_max_local.cpp
index 6b4c8ade861..6ff1161cc61 100644
--- a/tests/Unit/HC/hc_atomic_max_local.cpp
+++ b/tests/Unit/HC/hc_atomic_max_local.cpp
@@ -1,6 +1,6 @@
 
 // RUN: %hc %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 
@@ -54,7 +54,9 @@ int main() {
 
   ret &= test<unsigned int>();
   ret &= test<int>();
-  ret &= test<uint64_t>();
+  #if defined(EXTENDED_ATOMICS)
+    ret &= test<uint64_t>();
+  #endif
 
   return !(ret == true);
 }
diff --git a/tests/Unit/HC/hc_atomic_min_global.cpp b/tests/Unit/HC/hc_atomic_min_global.cpp
index 0727f45f1ad..ff6fab3c840 100644
--- a/tests/Unit/HC/hc_atomic_min_global.cpp
+++ b/tests/Unit/HC/hc_atomic_min_global.cpp
@@ -1,6 +1,6 @@
 
 // RUN: %hc %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <vector>
@@ -47,7 +47,9 @@ int main() {
 
   ret &= test<unsigned int>();
   ret &= test<int>();
-  ret &= test<uint64_t>();
+  #if defined(EXTENDED_ATOMICS)
+    ret &= test<uint64_t>();
+  #endif
 
   return !(ret == true);
 }
diff --git a/tests/Unit/HC/hc_atomic_min_local.cpp b/tests/Unit/HC/hc_atomic_min_local.cpp
index 29fd73ff916..c120d277899 100644
--- a/tests/Unit/HC/hc_atomic_min_local.cpp
+++ b/tests/Unit/HC/hc_atomic_min_local.cpp
@@ -1,6 +1,6 @@
 
 // RUN: %hc %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 
@@ -54,7 +54,9 @@ int main() {
 
   ret &= test<unsigned int>();
   ret &= test<int>();
-  ret &= test<uint64_t>();
+  #if defined(EXTENDED_ATOMICS)
+    ret &= test<uint64_t>();
+  #endif
 
   return !(ret == true);
 }
diff --git a/tests/Unit/HC/hc_atomic_or_global.cpp b/tests/Unit/HC/hc_atomic_or_global.cpp
index a8df47e03c5..0e720dfdf9c 100644
--- a/tests/Unit/HC/hc_atomic_or_global.cpp
+++ b/tests/Unit/HC/hc_atomic_or_global.cpp
@@ -1,6 +1,6 @@
 
 // RUN: %hc %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <vector>
diff --git a/tests/Unit/HC/hc_atomic_or_local.cpp b/tests/Unit/HC/hc_atomic_or_local.cpp
index 5a59800a1c6..171e6ed1418 100644
--- a/tests/Unit/HC/hc_atomic_or_local.cpp
+++ b/tests/Unit/HC/hc_atomic_or_local.cpp
@@ -1,6 +1,6 @@
 
 // RUN: %hc %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 
diff --git a/tests/Unit/HC/hc_atomic_sub_float_global.cpp b/tests/Unit/HC/hc_atomic_sub_float_global.cpp
index 922a4524f14..be46d5d29ad 100644
--- a/tests/Unit/HC/hc_atomic_sub_float_global.cpp
+++ b/tests/Unit/HC/hc_atomic_sub_float_global.cpp
@@ -1,6 +1,6 @@
 
 // RUN: %hc %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <vector>
@@ -14,28 +14,32 @@ using namespace hc;
 #define INIT 0.5f
 
 int main(void) {
-  const int vecSize = 100;
+  #if defined(FLOAT_ATOMICS)
+    const int vecSize = 100;
 
-  // Alloc & init input data
-  std::vector<T> init(vecSize, INIT);
-  array<T, 1> count(vecSize, init.begin());
+    // Alloc & init input data
+    std::vector<T> init(vecSize, INIT);
+    array<T, 1> count(vecSize, init.begin());
 
-  parallel_for_each(count.get_extent(), [=, &count](index<1> idx) [[hc]] {
-    for(unsigned i = 0; i < vecSize; i++) {
-      atomic_fetch_sub(&count[i], INIT);
-    }
-  });
+    parallel_for_each(count.get_extent(), [=, &count](index<1> idx) [[hc]] {
+      for(unsigned i = 0; i < vecSize; i++) {
+        atomic_fetch_sub(&count[i], INIT);
+      }
+    });
 
-  array_view<T, 1> av(count);
+    array_view<T, 1> av(count);
 
-  bool ret = true;
-  float sum = -std::accumulate(init.begin(), init.end(), 0.0f);
-  sum += INIT;
-  for(unsigned i = 0; i < vecSize; ++i) {
-      if(fabs(av[i] - sum) > TOLERANCE) {
-        ret = false;
-      }
-  }
+    bool ret = true;
+    float sum = -std::accumulate(init.begin(), init.end(), 0.0f);
+    sum += INIT;
+    for(unsigned i = 0; i < vecSize; ++i) {
+        if(fabs(av[i] - sum) > TOLERANCE) {
+          ret = false;
+        }
+    }
 
-  return !(ret == true);
+    return !(ret == true);
+  #else
+    return EXIT_SUCCESS;
+  #endif
 }
diff --git a/tests/Unit/HC/hc_atomic_sub_float_local.cpp b/tests/Unit/HC/hc_atomic_sub_float_local.cpp
index 442d608c495..f4cfb1251e1 100644
--- a/tests/Unit/HC/hc_atomic_sub_float_local.cpp
+++ b/tests/Unit/HC/hc_atomic_sub_float_local.cpp
@@ -1,6 +1,6 @@
 
 // RUN: %hc %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <math.h>
@@ -12,43 +12,47 @@ using namespace hc;
 #define TOLERANCE 1e-5
 
 int main(void) {
-  const int vecSize = 100;
-  const int tile_size = 10;
-
-  // Alloc & init input data
-  extent<2> e_a(vecSize, vecSize);
-  std::vector<T> va(vecSize * vecSize, INIT);
-  array_view<T, 2> av_a(e_a, va); 
-
-  extent<2> compute_domain(e_a);
-  parallel_for_each(compute_domain.tile(tile_size, tile_size), [=] (tiled_index<2> tidx) [[hc]] {
-    index<2> localIdx = tidx.local;
-    index<2> globalIdx = tidx.global;
-
-    tile_static T localA[tile_size][tile_size];
-    localA[localIdx[0]][localIdx[1]] = 0;
-    tidx.barrier.wait();
-
-    for(int i = 0; i < tile_size; i++) {
-      for(int j = 0; j < tile_size; j++) {
-        atomic_fetch_sub(&(localA[i][j]), INIT);
+  #if defined(FLOAT_ATOMICS)
+    const int vecSize = 100;
+    const int tile_size = 10;
+
+    // Alloc & init input data
+    extent<2> e_a(vecSize, vecSize);
+    std::vector<T> va(vecSize * vecSize, INIT);
+    array_view<T, 2> av_a(e_a, va); 
+
+    extent<2> compute_domain(e_a);
+    parallel_for_each(compute_domain.tile(tile_size, tile_size), [=] (tiled_index<2> tidx) [[hc]] {
+      index<2> localIdx = tidx.local;
+      index<2> globalIdx = tidx.global;
+
+      tile_static T localA[tile_size][tile_size];
+      localA[localIdx[0]][localIdx[1]] = 0;
+      tidx.barrier.wait();
+
+      for(int i = 0; i < tile_size; i++) {
+        for(int j = 0; j < tile_size; j++) {
+          atomic_fetch_sub(&(localA[i][j]), INIT);
+        }
       }
-    }
-  tidx.barrier.wait();
-  av_a[globalIdx[0]][globalIdx[1]] = localA[localIdx[0]][localIdx[1]];
-  });
-
-  // accumlate tile_size * tile_size times
-  float sum = 0.0f;
-  for (int i = 0; i < tile_size * tile_size; ++i)
-    sum -= INIT;
-  for(unsigned i = 0; i < vecSize; i++) {
-    for(unsigned j = 0; j < vecSize; j++) {
-      if(fabs(av_a(i, j) - sum) > TOLERANCE) {
-        return 1;
+    tidx.barrier.wait();
+    av_a[globalIdx[0]][globalIdx[1]] = localA[localIdx[0]][localIdx[1]];
+    });
+
+    // accumlate tile_size * tile_size times
+    float sum = 0.0f;
+    for (int i = 0; i < tile_size * tile_size; ++i)
+      sum -= INIT;
+    for(unsigned i = 0; i < vecSize; i++) {
+      for(unsigned j = 0; j < vecSize; j++) {
+        if(fabs(av_a(i, j) - sum) > TOLERANCE) {
+          return 1;
+        }
       }
     }
-  }
 
-  return 0;
+    return 0;
+  #else
+    return EXIT_SUCCESS;
+  #endif
 }
diff --git a/tests/Unit/HC/hc_atomic_sub_global.cpp b/tests/Unit/HC/hc_atomic_sub_global.cpp
index 418c81d3ec7..7f6383c7c04 100644
--- a/tests/Unit/HC/hc_atomic_sub_global.cpp
+++ b/tests/Unit/HC/hc_atomic_sub_global.cpp
@@ -1,6 +1,6 @@
 
 // RUN: %hc %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <vector>
diff --git a/tests/Unit/HC/hc_atomic_sub_local.cpp b/tests/Unit/HC/hc_atomic_sub_local.cpp
index d45164d4375..127aeda76d7 100644
--- a/tests/Unit/HC/hc_atomic_sub_local.cpp
+++ b/tests/Unit/HC/hc_atomic_sub_local.cpp
@@ -1,6 +1,6 @@
 
 // RUN: %hc %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 
diff --git a/tests/Unit/HC/hc_atomic_wrapinc_wrapdec.cpp b/tests/Unit/HC/hc_atomic_wrapinc_wrapdec.cpp
deleted file mode 100644
index 6ff58093315..00000000000
--- a/tests/Unit/HC/hc_atomic_wrapinc_wrapdec.cpp
+++ /dev/null
@@ -1,182 +0,0 @@
-
-// RUN: %hc %s -o %t.out && %t.out
-#include <hc.hpp>
-
-#include <iostream>
-#include <vector>
-
-#define TEST_DEBUG (0)
-
-#define GRID_SIZE (1024)
-#define TILE_SIZE (64)
-#define CLAMP_VALUE_GLOBAL (GRID_SIZE / 2)
-#define CLAMP_VALUE_TILE (TILE_SIZE / 2)
-
-using namespace hc;
-
-bool test_atomic_wrapinc_global() {
-  bool ret = true;
-
-  array<unsigned int, 1> data1(GRID_SIZE);
-  array<unsigned int, 1> data2(GRID_SIZE);
-  extent<1> ex(GRID_SIZE);
-
-  parallel_for_each(ex, [&](index<1>& idx) [[hc]] {
-    // initialize value
-    data1(idx) = idx[0]; // data1 initialized as workitem index
-    data2(idx) = 0;      // data2 initialized as 0
-
-    // do atomic wrap inc
-    data2(idx) = __atomic_wrapinc(&data1(idx), CLAMP_VALUE_GLOBAL);
-  }).wait();
-
-  std::vector<unsigned int> result1 = data1;
-  std::vector<unsigned int> result2 = data2;
-
-  for (int i = 0; i < GRID_SIZE; ++i) {
-#if TEST_DEBUG
-    std::cout << result1[i] << " " << result2[i] << "\n";
-#endif
-
-    // data1 should honor rules set forth by wrapinc
-    ret &= (i < CLAMP_VALUE_GLOBAL) ? (result1[i] == i + 1) // for values smaller then CLAMP_VALUE_GLOBAL, they would be incremented
-                                    : (result1[i] == 0);    // otherwise clamped to 0
-    // data2 should hold old values from data1
-    ret &= (result2[i] == i);
-  }
-
-  return ret;
-}
-
-bool test_atomic_wrapinc_local() {
-  bool ret = true;
-
-  array<unsigned int, 1> data1(GRID_SIZE);
-  array<unsigned int, 1> data2(GRID_SIZE);
-  extent<1> ex(GRID_SIZE);
-
-  parallel_for_each(ex.tile(TILE_SIZE), [&](tiled_index<1>& tidx) [[hc]] {
-    tile_static unsigned int lds[TILE_SIZE];
-
-    int group_index = tidx.local[0];
-    int global_index = tidx.global[0];
-
-    lds[group_index] = group_index;
-
-    tidx.barrier.wait();
-
-    data2(global_index) = __atomic_wrapinc(&lds[group_index], CLAMP_VALUE_TILE);
-    data1(global_index) = lds[group_index];
-  }).wait();
-
-  std::vector<unsigned int> result1 = data1;
-  std::vector<unsigned int> result2 = data2;
-
-  for (int i = 0; i < GRID_SIZE / TILE_SIZE; ++i) {
-    for (int j = 0; j < TILE_SIZE; ++j) {
-#if TEST_DEBUG
-      std::cout << result1[i * TILE_SIZE + j] << " " << result2[i * TILE_SIZE + j] << "\n";
-#endif
-
-      // data1 should honor rules set forth by wrapinc
-      ret &= (j < CLAMP_VALUE_TILE) ? (result1[i * TILE_SIZE + j] == j + 1) // for values smaller then CLAMP_VALUE_TILE, they would be incremented
-                                    : (result1[i * TILE_SIZE + j] == 0);    // otherwise clamped to 0
-      // data2 should hold old values from lds, which are group index value
-      ret &= (result2[i * TILE_SIZE + j] == j);
-    }
-  }
-
-  return ret;
-}
-
-bool test_atomic_wrapdec_global() {
-  bool ret = true;
-
-  array<unsigned int, 1> data1(GRID_SIZE);
-  array<unsigned int, 1> data2(GRID_SIZE);
-  extent<1> ex(GRID_SIZE);
-
-  parallel_for_each(ex, [&](index<1>& idx) [[hc]] {
-    // initialize value
-    data1(idx) = idx[0]; // data1 initialized as workitem index
-    data2(idx) = 0;      // data2 initialized as 0
-
-    // do atomic wrap dec
-    data2(idx) = __atomic_wrapdec(&data1(idx), CLAMP_VALUE_GLOBAL);
-  }).wait();
-
-  std::vector<unsigned int> result1 = data1;
-  std::vector<unsigned int> result2 = data2;
-
-  for (int i = 0; i < GRID_SIZE; ++i) {
-#if TEST_DEBUG
-    std::cout << result1[i] << " " << result2[i] << "\n";
-#endif
-
-    // data1 should honor rules set forth by wrapdec
-    ret &= (i == 0) ? (result1[i] == CLAMP_VALUE_GLOBAL) // if old value is 0, it should carry the clamp value
-                    : (i > CLAMP_VALUE_GLOBAL) ? (result1[i] == CLAMP_VALUE_GLOBAL) // for old values larger than the clamp value
-                                                                                    // they would be clamped
-                                               : (result1[i] == (i - 1));           // otherwise they would be decremented by 1
-    // data2 should hold old values from data1
-    ret &= (result2[i] == i);
-  }
-
-  return ret;
-}
-
-bool test_atomic_wrapdec_local() {
-  bool ret = true;
-
-  array<unsigned int, 1> data1(GRID_SIZE);
-  array<unsigned int, 1> data2(GRID_SIZE);
-  extent<1> ex(GRID_SIZE);
-
-  parallel_for_each(ex.tile(TILE_SIZE), [&](tiled_index<1>& tidx) [[hc]] {
-    tile_static unsigned int lds[TILE_SIZE];
-
-    int group_index = tidx.local[0];
-    int global_index = tidx.global[0];
-
-    lds[group_index] = group_index;
-
-    tidx.barrier.wait();
-
-    data2(global_index) = __atomic_wrapdec(&lds[group_index], CLAMP_VALUE_TILE);
-    data1(global_index) = lds[group_index];
-  }).wait();
-
-  std::vector<unsigned int> result1 = data1;
-  std::vector<unsigned int> result2 = data2;
-
-  for (int i = 0; i < GRID_SIZE / TILE_SIZE; ++i) {
-    for (int j = 0; j < TILE_SIZE; ++j) {
-#if TEST_DEBUG
-      std::cout << result1[i * TILE_SIZE + j] << " " << result2[i * TILE_SIZE + j] << "\n";
-#endif
-
-      // data1 should honor rules set forth by wrapdec
-      ret &= (i == 0) ? (result1[i] == CLAMP_VALUE_TILE) // if old value is 0, it should carry the clamp value
-                      : (i > CLAMP_VALUE_TILE) ? (result1[i] == CLAMP_VALUE_TILE) // for old values larger than the clamp value
-                                                                                  // they would be clamped
-                                               : (result1[i] == (i - 1));         // otherwise they would be decremented by 1
-      // data2 should hold old values from lds, which are group index value
-      ret &= (result2[i] == i);
-    }
-  }
-
-  return ret;
-}
-
-int main() {
-  bool ret = true;
-
-  ret &= test_atomic_wrapinc_global();
-  ret &= test_atomic_wrapdec_global();
-
-  ret &= test_atomic_wrapinc_local();
-  ret &= test_atomic_wrapdec_local();
-
-  return !(ret == true);
-}
-
diff --git a/tests/Unit/HC/hc_atomic_xor_global.cpp b/tests/Unit/HC/hc_atomic_xor_global.cpp
index 5379ddf7786..c2985d763ac 100644
--- a/tests/Unit/HC/hc_atomic_xor_global.cpp
+++ b/tests/Unit/HC/hc_atomic_xor_global.cpp
@@ -1,6 +1,6 @@
 
 // RUN: %hc %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 #include <vector>
diff --git a/tests/Unit/HC/hc_atomic_xor_local.cpp b/tests/Unit/HC/hc_atomic_xor_local.cpp
index c5d747f2ad0..6ce321631cf 100644
--- a/tests/Unit/HC/hc_atomic_xor_local.cpp
+++ b/tests/Unit/HC/hc_atomic_xor_local.cpp
@@ -1,6 +1,6 @@
 
 // RUN: %hc %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 
diff --git a/tests/Unit/HC/hc_math.cpp b/tests/Unit/HC/hc_math.cpp
index 9ac2b6284b3..4118d25ea75 100644
--- a/tests/Unit/HC/hc_math.cpp
+++ b/tests/Unit/HC/hc_math.cpp
@@ -1,8 +1,8 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <algorithm>
 #include <iostream>
@@ -29,7 +29,7 @@ void report_delta(
     #endif
 }
 
-// a test case which uses hc_math, which overrides math functions in the global namespace
+// a test case which uses hc/hc_math.hpp, which overrides math functions in the global namespace
 template<typename T, std::size_t grid_sz, typename F, typename G>
 bool test_math_fn(const char* name, F f, G ref_f)
 {   // TODO: ideally this should be refactored to use proper approximate
@@ -38,7 +38,7 @@ bool test_math_fn(const char* name, F f, G ref_f)
 
     array_view<T> table(grid_sz);
 
-    parallel_for_each(table.get_extent(), [=](const index<1>& idx) __HC__ {
+    parallel_for_each(table.get_extent(), [=](const index<1>& idx) [[hc]] {
        table[idx] = f(static_cast<T>(idx[0] + 1));
     });
 
@@ -61,37 +61,38 @@ bool test()
 {   // TODO: ideally this should be refactored to use iteration through the
     //       collection of tested functions, as opposed to this verbose form.
     using namespace hc;
+    using namespace precise_math;
 
     return test_math_fn<T, grid_sz>(
         "sqrt",
-        [](T x) __HC__ { return sqrt(x); }, [](T x) { return std::sqrt(x); })
+        [](T x) [[hc]] { return sqrt(x); }, [](T x) { return std::sqrt(x); })
         && test_math_fn<T, grid_sz>(
         "fabs",
-        [](T x) __HC__ { return fabs(x); }, [](T x) { return std::fabs(x); })
+        [](T x) [[hc]] { return fabs(x); }, [](T x) { return std::fabs(x); })
         && test_math_fn<T, grid_sz>(
         "cbrt",
-        [](T x) __HC__ { return cbrt(x); }, [](T x) { return std::cbrt(x); })
+        [](T x) [[hc]] { return cbrt(x); }, [](T x) { return std::cbrt(x); })
         && test_math_fn<T, grid_sz>(
         "log",
-        [](T x) __HC__ { return log(x); }, [](T x) { return std::log(x); })
+        [](T x) [[hc]] { return log(x); }, [](T x) { return std::log(x); })
         && test_math_fn<T, grid_sz>(
         "ilogb",
-        [](T x) __HC__ { return ilogb(x); }, [](T x) { return std::ilogb(x); })
+        [](T x) [[hc]] { return ilogb(x); }, [](T x) { return std::ilogb(x); })
         && test_math_fn<T, grid_sz>(
         "isnormal",
-        [](T x) __HC__ { return isnormal(x); },
+        [](T x) [[hc]] { return isnormal(x); },
         [](T x) { return std::isnormal(x); })
         && test_math_fn<T, grid_sz>(
         "cospi",
-        [](T x) __HC__ { return cospi(x); },
+        [](T x) [[hc]] { return cospi(x); },
         [](T x) { return std::cos(static_cast<T>(M_PI) * x); })
         && test_math_fn<T, grid_sz>(
         "sinpi",
-        [](T x) __HC__ { return sinpi(x); },
+        [](T x) [[hc]] { return sinpi(x); },
         [](T x) { return std::sin(static_cast<T>(M_PI) * x); })
         && test_math_fn<T, grid_sz>(
         "rsqrt",
-        [](T x) __HC__ { return rsqrt(x); },
+        [](T x) [[hc]] { return rsqrt(x); },
         [](T x) { return static_cast<T>(1) / std::sqrt(x); });
 }
 
diff --git a/tests/Unit/HC/hc_math2.cpp b/tests/Unit/HC/hc_math2.cpp
index b392b960164..91997eb34da 100644
--- a/tests/Unit/HC/hc_math2.cpp
+++ b/tests/Unit/HC/hc_math2.cpp
@@ -3,13 +3,13 @@
 #if !DISABLED_PENDING_REMOVAL
   // RUN: %hc %s -o %t.out && %t.out
 
-  #include <hc.hpp>
-  #include <hc_math.hpp>
+  #include <hc/hc.hpp>
+  #include <hc/hc_math.hpp>
 
   #include <algorithm>
   #include <random>
 
-  // a test case which uses hc_math, which overrides math functions in the global namespace
+  // a test case which uses hc/hc_math.hpp, which overrides math functions in the global namespace
   // in this test case we check min / max specically
   template<size_t GRID_SIZE, typename T>
   bool test() {
@@ -34,7 +34,7 @@
 
   #define TEST(func) \
     { \
-      parallel_for_each(ex, [=](index<1>& idx) __HC__ { \
+      parallel_for_each(ex, [=](index<1>& idx) [[hc]] { \
         table3(idx) = func(table1(idx), table2(idx)); \
       }); \
       accelerator().get_default_view().wait(); \
diff --git a/tests/Unit/HC/hc_math3.cpp b/tests/Unit/HC/hc_math3.cpp
index c30809b795f..37f2491d380 100644
--- a/tests/Unit/HC/hc_math3.cpp
+++ b/tests/Unit/HC/hc_math3.cpp
@@ -1,19 +1,20 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_math.hpp>
 
 #include <algorithm>
 #include <cmath>
 #include <random>
 
 
-// a test case which uses hc_math, which overrides math functions in the global namespace
+// a test case which uses hc/hc_math.hpp, which overrides math functions in the global namespace
 // in this test case we check min / max specically
 template<size_t GRID_SIZE, typename T, typename Q, typename R>
 bool test() {
   using namespace hc;
+  using namespace precise_math;
   bool ret = true;
 
   array_view<T, 1> table1(GRID_SIZE); // input vector 1
@@ -36,7 +37,7 @@ bool test() {
 
 #define TEST(func) \
   { \
-    parallel_for_each(ex, [=](index<1>& idx) __HC__ { \
+    parallel_for_each(ex, [=](index<1>& idx) [[hc]] { \
       table3(idx) = func(table1(idx), table2(idx)); \
     }).wait(); \
     int error = 0; \
@@ -61,20 +62,20 @@ int main() {
   ret &= test<16, float,float,float>();
   ret &= test<16, int,float,float>();
   ret &= test<16, float,int,float>();
-  ret &= test<16, int,int,float>();
+  //ret &= test<16, int,int,float>();
   ret &= test<16, double,double,double>();
   ret &= test<16, int,double,double>();
   ret &= test<16, double,int,double>();
-  ret &= test<16, int,int,double>();
+  //ret &= test<16, int,int,double>();
 
   ret &= test<4096, float,float,float>();
   ret &= test<4096, int,float,float>();
   ret &= test<4096, float,int,float>();
-  ret &= test<4096, int,int,float>();
+  //ret &= test<4096, int,int,float>();
   ret &= test<4096, double,double,double>();
   ret &= test<4096, int,double,double>();
   ret &= test<4096, double,int,double>();
-  ret &= test<4096, int,int,double>();
+  //ret &= test<4096, int,int,double>();
 
   return !(ret == true);
 }
diff --git a/tests/Unit/HC/indivisible_tiled_extent.cpp b/tests/Unit/HC/indivisible_tiled_extent.cpp
index ef050cdc3b1..dae0dd16f0c 100644
--- a/tests/Unit/HC/indivisible_tiled_extent.cpp
+++ b/tests/Unit/HC/indivisible_tiled_extent.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include<hc.hpp>
+#include<hc/hc.hpp>
 
 #include<iostream>
 
@@ -19,7 +19,7 @@ bool test1D() {
 
   array_view<int, 1> table(GRID_SIZE);
 
-  completion_future fut = parallel_for_each(tiled_ex, [=](tiled_index<1>& idx) __HC__ {
+  completion_future fut = parallel_for_each(tiled_ex, [=](tiled_index<1>& idx) [[hc]] {
     table(idx) = idx.global[0];
   });
 
@@ -46,7 +46,7 @@ bool test2D() {
 
   array_view<int, 1> table(GRID_SIZE_Y * GRID_SIZE_X);
 
-  completion_future fut = parallel_for_each(tiled_ex, [=](tiled_index<2>& idx) __HC__ {
+  completion_future fut = parallel_for_each(tiled_ex, [=](tiled_index<2>& idx) [[hc]] {
     size_t index = idx.global[0] * GRID_SIZE_X + idx.global[1];
     table(index) = index;
   });
@@ -75,7 +75,7 @@ bool test3D() {
 
   array_view<int, 1> table(GRID_SIZE_Z * GRID_SIZE_Y * GRID_SIZE_X);
 
-  completion_future fut = parallel_for_each(tiled_ex, [=](tiled_index<3>& idx) __HC__ {
+  completion_future fut = parallel_for_each(tiled_ex, [=](tiled_index<3>& idx) [[hc]] {
     size_t index = idx.global[0] * GRID_SIZE_X * GRID_SIZE_Y + idx.global[1] * GRID_SIZE_X + idx.global[2];
     table(index) = index;
   });
diff --git a/tests/Unit/HC/kernarg_pool_size.cpp b/tests/Unit/HC/kernarg_pool_size.cpp
index ab1909cdeba..cbd10d92fed 100644
--- a/tests/Unit/HC/kernarg_pool_size.cpp
+++ b/tests/Unit/HC/kernarg_pool_size.cpp
@@ -1,6 +1,6 @@
 
 // RUN: %hc %s -o %t.out && %t.out
-#include <hc.hpp>                                                               
+#include <hc/hc.hpp>                                                               
                                                                                 
 // a test which deliberately dispatch multiple kernels in a number
 // which exceeds the size of builtin kernarg pool
diff --git a/tests/Unit/HC/kernel-call-undefined-func.cpp b/tests/Unit/HC/kernel-call-undefined-func.cpp
index e6beafaf799..f9dcc00989b 100644
--- a/tests/Unit/HC/kernel-call-undefined-func.cpp
+++ b/tests/Unit/HC/kernel-call-undefined-func.cpp
@@ -1,5 +1,5 @@
 // RUN: %not %hc %s -o %t.out 2>&1 | %not grep 'Segmentation fault'
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <vector>
 
 #define GRID_SIZE (1024)
diff --git a/tests/Unit/HC/mad24.cpp b/tests/Unit/HC/mad24.cpp
index c2be075fc2d..45379a66b88 100644
--- a/tests/Unit/HC/mad24.cpp
+++ b/tests/Unit/HC/mad24.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <algorithm>
 #include <cassert>
diff --git a/tests/Unit/HC/memcpy_symbol1.cpp b/tests/Unit/HC/memcpy_symbol1.cpp
deleted file mode 100644
index 4052415fdde..00000000000
--- a/tests/Unit/HC/memcpy_symbol1.cpp
+++ /dev/null
@@ -1,61 +0,0 @@
-
-// RUN: %hc %s -o %t.out && %t.out
-
-#include <hc.hpp>
-
-#include <iostream>
-
-#define GRID_SIZE (16)
-
-// globalVar would be agent-allocated global variable with program linkage
-[[hc]] int tableGlobal[GRID_SIZE];
-
-using namespace hc;
-
-bool test1() {
-
-  bool ret = true;
-
-  // array which would be copied into the global variable array
-  int tableInput[GRID_SIZE] { 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 };
-
-  // array to store the outputs from the kernel
-  array_view<int, 1> tableOutput1(GRID_SIZE);
-
-  // array to store the result copied from device memory
-  int tableOutput2[GRID_SIZE] { 0 };
-
-  // use hc::accelerator::memcpySymbol() to copy testValue to globalVar
-  // get the default accelerator
-  accelerator acc = accelerator();
-  acc.memcpy_symbol("tableGlobal", tableInput, sizeof(int) * GRID_SIZE);
-
-  // dispatch a kernel which reads from globalVar and stores result to table1
-  extent<1> ex(GRID_SIZE);
-  completion_future fut = parallel_for_each(ex, [=](index<1>& idx) __attribute__((hc)) {
-    tableOutput1(idx) = tableGlobal[idx[0]];
-  });
-
-  // wait for the kernel to be completed
-  fut.wait();
-
-  // copy data from device -> host
-  acc.memcpy_symbol("tableGlobal", tableOutput2, sizeof(int) * GRID_SIZE, 0, hcMemcpyDeviceToHost);
-
-  // read out the outputs, it should agree with testValue
-  for (int i = 0; i < GRID_SIZE; ++i) {
-    ret &= (tableInput[i] == tableOutput1[i]);
-    ret &= (tableInput[i] == tableOutput2[i]);
-  } 
-
-  return ret;
-}
-
-int main() {
-  bool ret = true;
-
-  ret &= test1();
-
-  return !(ret == true);
-}
-
diff --git a/tests/Unit/HC/memcpy_symbol2.cpp b/tests/Unit/HC/memcpy_symbol2.cpp
deleted file mode 100644
index f3701d77546..00000000000
--- a/tests/Unit/HC/memcpy_symbol2.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-
-// RUN: %hc %s -o %t.out && %t.out
-
-#include <hc.hpp>
-
-#include <iostream>
-
-#define GRID_SIZE (16)
-
-// globalVar would be agent-allocated global variable with program linkage
-[[hc]] int tableGlobal[GRID_SIZE];
-
-using namespace hc;
-
-bool test2() {
-
-  bool ret = true;
-
-  // array which would be copied into the global variable array
-  int tableInput[GRID_SIZE] { 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 };
-
-  // array to store the outputs from the kernel
-  array_view<int, 1> tableOutput1(GRID_SIZE);
-
-  // array to store the result copied from device memory
-  int tableOutput2[GRID_SIZE] { 0 };
-
-  // use get_symbol_address() and hc::accelerator::memcpySymbol() to copy testValue to globalVar
-  // get the default accelerator
-  accelerator acc = accelerator();
-  void* tableGlobalDevicePtr = GET_SYMBOL_ADDRESS(acc, tableGlobal);
-  acc.memcpy_symbol(tableGlobalDevicePtr, tableInput, sizeof(int) * GRID_SIZE);
-
-  // dispatch a kernel which reads from globalVar and stores result to table1
-  extent<1> ex(GRID_SIZE);
-  completion_future fut = parallel_for_each(ex, [=](index<1>& idx) __attribute__((hc)) {
-    tableOutput1(idx) = tableGlobal[idx[0]];
-  });
-
-  // wait for the kernel to be completed
-  fut.wait();
-
-  // copy data from device -> host
-  acc.memcpy_symbol(tableGlobalDevicePtr, tableOutput2, sizeof(int) * GRID_SIZE, 0, hcMemcpyDeviceToHost);
-
-  // read out the outputs, it should agree with testValue
-  for (int i = 0; i < GRID_SIZE; ++i) {
-    ret &= (tableInput[i] == tableOutput1[i]);
-    ret &= (tableInput[i] == tableOutput2[i]);
-  } 
-
-  return ret;
-}
-
-int main() {
-  bool ret = true;
-
-  ret &= test2();
-
-  return !(ret == true);
-}
-
diff --git a/tests/Unit/HC/memcpy_symbol3.cpp b/tests/Unit/HC/memcpy_symbol3.cpp
deleted file mode 100644
index 57ecdd85ae8..00000000000
--- a/tests/Unit/HC/memcpy_symbol3.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-
-// RUN: %hc %s -o %t.out && %t.out
-
-#include <hc.hpp>
-
-#include <iostream>
-
-#define GRID_SIZE (16)
-
-// globalVar would be agent-allocated global variable with program linkage
-// add one initial value to prevent a bug in HLC
-[[hc]] float tableGlobal[GRID_SIZE] = { 0.1 };
-
-using namespace hc;
-
-bool test1() {
-
-  bool ret = true;
-
-  // array which would be copied into the global variable array
-  float tableInput[GRID_SIZE] { 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 };
-
-  // array to store the outputs from the kernel
-  array_view<float, 1> tableOutput1(GRID_SIZE);
-
-  // array to store the result copied from device memory
-  float tableOutput2[GRID_SIZE] { 0 };
-
-  // use hc::accelerator::memcpySymbol() to copy testValue to globalVar
-  // get the default accelerator
-  accelerator acc = accelerator();
-  acc.memcpy_symbol("tableGlobal", tableInput, sizeof(float) * GRID_SIZE);
-
-  // dispatch a kernel which reads from globalVar and stores result to table1
-  extent<1> ex(GRID_SIZE);
-  completion_future fut = parallel_for_each(ex, [=](index<1>& idx) __attribute__((hc)) {
-    tableOutput1(idx) = tableGlobal[idx[0]];
-  });
-
-  // wait for the kernel to be completed
-  fut.wait();
-
-  // copy data from device -> host
-  acc.memcpy_symbol("tableGlobal", tableOutput2, sizeof(float) * GRID_SIZE, 0, hcMemcpyDeviceToHost);
-
-  // read out the outputs, it should agree with testValue
-  for (int i = 0; i < GRID_SIZE; ++i) {
-    ret &= (tableInput[i] == tableOutput1[i]);
-    ret &= (tableInput[i] == tableOutput2[i]);
-  } 
-
-  return ret;
-}
-
-int main() {
-  bool ret = true;
-
-  ret &= test1();
-
-  return !(ret == true);
-}
-
diff --git a/tests/Unit/HC/memcpy_symbol4.cpp b/tests/Unit/HC/memcpy_symbol4.cpp
deleted file mode 100644
index dcc4a286c08..00000000000
--- a/tests/Unit/HC/memcpy_symbol4.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-
-// RUN: %hc %s -o %t.out && %t.out
-
-#include <hc.hpp>
-
-#include <iostream>
-
-#define GRID_SIZE (16)
-
-// globalVar would be agent-allocated global variable with program linkage
-// add an initial value to prevent a bug in HLC
-[[hc]] float tableGlobal[GRID_SIZE] = { 0.1 };
-
-using namespace hc;
-
-bool test2() {
-
-  bool ret = true;
-
-  // array which would be copied into the global variable array
-  float tableInput[GRID_SIZE] { 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 };
-
-  // array to store the outputs from the kernel
-  array_view<float, 1> tableOutput1(GRID_SIZE);
-
-  // array to store the result copied from device memory
-  float tableOutput2[GRID_SIZE] { 0 };
-
-  // use get_symbol_address() and hc::accelerator::memcpySymbol() to copy testValue to globalVar
-  // get the default accelerator
-  accelerator acc = accelerator();
-  void* tableGlobalDevicePtr = GET_SYMBOL_ADDRESS(acc, tableGlobal);
-  acc.memcpy_symbol(tableGlobalDevicePtr, tableInput, sizeof(float) * GRID_SIZE);
-
-  // dispatch a kernel which reads from globalVar and stores result to table1
-  extent<1> ex(GRID_SIZE);
-  completion_future fut = parallel_for_each(ex, [=](index<1>& idx) __attribute__((hc)) {
-    tableOutput1(idx) = tableGlobal[idx[0]];
-  });
-
-  // wait for the kernel to be completed
-  fut.wait();
-
-  // copy data from device -> host
-  acc.memcpy_symbol(tableGlobalDevicePtr, tableOutput2, sizeof(float) * GRID_SIZE, 0, hcMemcpyDeviceToHost);
-
-  // read out the outputs, it should agree with testValue
-  for (int i = 0; i < GRID_SIZE; ++i) {
-    ret &= (tableInput[i] == tableOutput1[i]);
-    ret &= (tableInput[i] == tableOutput2[i]);
-  } 
-
-  return ret;
-}
-
-int main() {
-  bool ret = true;
-
-  ret &= test2();
-
-  return !(ret == true);
-}
-
diff --git a/tests/Unit/HC/mul24.cpp b/tests/Unit/HC/mul24.cpp
index 4f5e2cb9cc2..0ae15284133 100644
--- a/tests/Unit/HC/mul24.cpp
+++ b/tests/Unit/HC/mul24.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <algorithm>
 #include <cassert>
diff --git a/tests/Unit/HC/multi_acc.cpp b/tests/Unit/HC/multi_acc.cpp
index 014bc2d5a86..4d28d3c8521 100644
--- a/tests/Unit/HC/multi_acc.cpp
+++ b/tests/Unit/HC/multi_acc.cpp
@@ -7,7 +7,7 @@
 #include <cmath>
 
 // header file for the hc API
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 int main() {
 
@@ -29,15 +29,13 @@ int main() {
     host_result_y[i] = a * host_x[i] + host_y[i];
   }
   
-  std::vector<hc::accelerator> all_accelerators = hc::accelerator::get_all();
-  std::vector<hc::accelerator> accelerators;
-  for (auto a = all_accelerators.begin(); a != all_accelerators.end(); a++) {
-
-    // only pick accelerators supported by the HSA runtime
-    if (a->is_hsa_accelerator()) {
-      accelerators.push_back(*a);
-    }
-  }
+  std::vector<hc::accelerator> accelerators = hc::accelerator::get_all();
+  accelerators.erase(
+      std::remove_if(
+          accelerators.begin(),
+          accelerators.end(),
+          [](const hc::accelerator& acc) { return acc.get_is_emulated(); }),
+      accelerators.end());
 
   constexpr int numViewPerAcc = 2;
   int numSaxpyPerView = N/(accelerators.size() * numViewPerAcc);
diff --git a/tests/Unit/HC/multi_acc2.cpp b/tests/Unit/HC/multi_acc2.cpp
index 9999710b358..d4b8d9262f9 100644
--- a/tests/Unit/HC/multi_acc2.cpp
+++ b/tests/Unit/HC/multi_acc2.cpp
@@ -6,7 +6,7 @@
 #include <cmath>
 
 // header file for the hc API
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 int main() {
 
@@ -28,15 +28,13 @@ int main() {
     host_result_y[i] = a * host_x[i] + host_y[i];
   }
 
-  std::vector<hc::accelerator> all_accelerators = hc::accelerator::get_all();
-  std::vector<hc::accelerator> accelerators;
-  for (auto a = all_accelerators.begin(); a != all_accelerators.end(); a++) {
-
-    // only pick accelerators supported by the HSA runtime
-    if (a->is_hsa_accelerator()) {
-      accelerators.push_back(*a);
-    }
-  }
+  std::vector<hc::accelerator> accelerators = hc::accelerator::get_all();
+  accelerators.erase(
+      std::remove_if(
+          accelerators.begin(),
+          accelerators.end(),
+          [](const hc::accelerator& acc) { return acc.get_is_emulated(); }),
+      accelerators.end());
 
   constexpr int numViewPerAcc = 2;
   int numSaxpyPerView = N/(accelerators.size() * numViewPerAcc);
diff --git a/tests/Unit/HC/multi_acc_array.cpp b/tests/Unit/HC/multi_acc_array.cpp
index 44cb7529d2d..103ee8d5734 100644
--- a/tests/Unit/HC/multi_acc_array.cpp
+++ b/tests/Unit/HC/multi_acc_array.cpp
@@ -7,7 +7,7 @@
 #include <cmath>
 
 // header file for the hc API
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 int main() {
 
@@ -30,15 +30,13 @@ int main() {
     host_result_y[i] = a * host_x[i] + host_y[i];
   }
   
-  std::vector<hc::accelerator> all_accelerators = hc::accelerator::get_all();
-  std::vector<hc::accelerator> accelerators;
-  for (auto a = all_accelerators.begin(); a != all_accelerators.end(); a++) {
-
-    // only pick accelerators supported by the HSA runtime
-    if (a->is_hsa_accelerator()) {
-      accelerators.push_back(*a);
-    }
-  }
+  std::vector<hc::accelerator> accelerators = hc::accelerator::get_all();
+  accelerators.erase(
+      std::remove_if(
+          accelerators.begin(),
+          accelerators.end(),
+          [](const hc::accelerator& acc) { return acc.get_is_emulated(); }),
+      accelerators.end());
 
   constexpr int numViewPerAcc = 2;
   int numSaxpyPerView = N/(accelerators.size() * numViewPerAcc);
diff --git a/tests/Unit/HC/multi_acc_array2.cpp b/tests/Unit/HC/multi_acc_array2.cpp
index 650a46b5c86..6063667d1d8 100644
--- a/tests/Unit/HC/multi_acc_array2.cpp
+++ b/tests/Unit/HC/multi_acc_array2.cpp
@@ -6,7 +6,7 @@
 #include <cmath>
 
 // header file for the hc API
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 int main() {
 
@@ -29,15 +29,13 @@ int main() {
     host_result_y[i] = a * host_x[i] + host_y[i];
   }
 
-  std::vector<hc::accelerator> all_accelerators = hc::accelerator::get_all();
-  std::vector<hc::accelerator> accelerators;
-  for (auto a = all_accelerators.begin(); a != all_accelerators.end(); a++) {
-
-    // only pick accelerators supported by the HSA runtime
-    if (a->is_hsa_accelerator()) {
-      accelerators.push_back(*a);
-    }
-  }
+  std::vector<hc::accelerator> accelerators = hc::accelerator::get_all();
+  accelerators.erase(
+      std::remove_if(
+          accelerators.begin(),
+          accelerators.end(),
+          [](const hc::accelerator& acc) { return acc.get_is_emulated(); }),
+      accelerators.end());
 
   constexpr int numViewPerAcc = 2;
   int numSaxpyPerView = N/(accelerators.size() * numViewPerAcc);
diff --git a/tests/Unit/HC/pinned_vector.cpp b/tests/Unit/HC/pinned_vector.cpp
index b21f016b2f1..a09a5865a9c 100644
--- a/tests/Unit/HC/pinned_vector.cpp
+++ b/tests/Unit/HC/pinned_vector.cpp
@@ -1,8 +1,8 @@
-// RUN: %hc -lhc_am %s -o %t.out && %t.out
+// RUN: %hc  %s -o %t.out && %t.out
 
 #include <iostream>
-#include <hc.hpp>
-#include <hc_am.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_am.hpp>
 #include <pinned_vector.hpp>
 
 constexpr size_t small_size = 1024;
@@ -18,12 +18,12 @@ bool test_data_ptr() {
   hc::accelerator acc;
   hc::AmPointerInfo ap(nullptr, nullptr, nullptr, 0, acc);
 
-  if(am_memtracker_getinfo(&ap, v.data()) != AM_SUCCESS){
+  if(am_memtracker_get_info(&ap, v.data()) != AM_SUCCESS){
     std::cout << "pinned_vector memory not tracked by AmPointerTracker\n";
     return false;
   }
 
-  if(ap._hostPointer != ap._devicePointer
+  if(ap.host_pointer != ap.device_pointer
      or ap._isInDeviceMem
      or not ap._isAmManaged){
     std::cout << "sanity check on tracked pinned_vector memory failed\n";
diff --git a/tests/Unit/HC/placement_new.cpp b/tests/Unit/HC/placement_new.cpp
index bd32f60b383..c9afe48e17c 100644
--- a/tests/Unit/HC/placement_new.cpp
+++ b/tests/Unit/HC/placement_new.cpp
@@ -1,10 +1,10 @@
 
-// RUN: %hc %s -lhc_am -o %t.out && %t.out
+// RUN: %hc %s  -o %t.out && %t.out
 
 #include <cstdlib>
 #include <cstdio>
-#include <hc.hpp>
-#include <hc_am.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_am.hpp>
 
 class Point {
 public:
diff --git a/tests/Unit/HC/pointer_to_different_addrspaces.cpp b/tests/Unit/HC/pointer_to_different_addrspaces.cpp
index 63655e8bc8b..fc28b86e2a1 100644
--- a/tests/Unit/HC/pointer_to_different_addrspaces.cpp
+++ b/tests/Unit/HC/pointer_to_different_addrspaces.cpp
@@ -1,11 +1,11 @@
 // XFAIL: *
-// RUN: %hc -lhc_am %s -o %t.out && %t.out
+// RUN: %hc  %s -o %t.out && %t.out
 
 #include <iostream>
 #include <random>
 
-#include <hc_am.hpp>
-#include <hc.hpp>
+#include <hc/hc_am.hpp>
+#include <hc/hc.hpp>
 
 bool test() {
   // define inputs and output
diff --git a/tests/Unit/HC/reduction_hc.cpp b/tests/Unit/HC/reduction_hc.cpp
index 8d1df481f65..0df28a65efa 100644
--- a/tests/Unit/HC/reduction_hc.cpp
+++ b/tests/Unit/HC/reduction_hc.cpp
@@ -14,7 +14,7 @@
 //----------------------------------------------------------------------------
 
 #define NOMINMAX
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <iostream>
 #include <numeric>
 #include <assert.h>
@@ -72,7 +72,7 @@ float reduction_simple_1(const std::vector<float>& source)
     // back only the first element.
     array<float, 1> a(element_count, source.begin());
 
-    // Takes care of odd input elements � we could completely avoid tail sum
+    // Takes care of odd input elements; we could completely avoid tail sum
     // if we would require source to have even number of elements.
     float tail_sum = (element_count % 2) ? source[element_count - 1] : 0;
     array_view<float, 1> av_tail_sum(1, &tail_sum);
@@ -161,7 +161,7 @@ float reduction_simple_2(const std::vector<float>& source)
 }
 
 //----------------------------------------------------------------------------
-// This is an implementation of the reduction algorithm which uses tiling and 
+// This is an implementation of the reduction algorithm which uses tiling and
 // the shared memory.
 //----------------------------------------------------------------------------
 template <unsigned _tile_size>
@@ -181,7 +181,7 @@ float reduction_tiled_1(const std::vector<float>& source)
     // Using arrays as temporary memory.
     array<float, 1> arr_1(element_count, source.begin());
     array<float, 1> arr_2((element_count / _tile_size) ? (element_count / _tile_size) : 1);
-    
+
     // array_views may be swapped after each iteration.
     array_view<float, 1> av_src(arr_1);
     array_view<float, 1> av_dst(arr_2);
@@ -191,9 +191,11 @@ float reduction_tiled_1(const std::vector<float>& source)
     // is evenly divisable to the number of threads in the tile.
     while ((element_count % _tile_size) == 0)
     {
-        parallel_for_each(extent<1>(element_count).tile(_tile_size),
-                          [=] (tiled_index<1> tidx) [[hc]] [[hc_flat_workgroup_size(_tile_size)]]
-        {
+        parallel_for_each(
+            extent<1>(element_count).tile(_tile_size),
+            make_callable_with_AMDGPU_attributes<
+                Flat_workgroup_size<_tile_size, _tile_size>>(
+                    [=](tiled_index<1> tidx) [[hc]] {
             // Use tile_static as a scratchpad memory.
             tile_static float tile_data[_tile_size];
 
@@ -208,7 +210,7 @@ float reduction_tiled_1(const std::vector<float>& source)
                 {
                     tile_data[local_idx] += tile_data[local_idx + s];
                 }
-                
+
                 tidx.barrier.wait();
             }
 
@@ -217,7 +219,7 @@ float reduction_tiled_1(const std::vector<float>& source)
             {
                 av_dst[tidx.tile] = tile_data[0];
             }
-        }).wait();
+        }));
 
         // Update the sequence length, swap source with destination.
         element_count /= _tile_size;
@@ -263,9 +265,11 @@ float reduction_tiled_2(const std::vector<float>& source)
     // is evenly divisable to the number of threads in the tile.
     while ((element_count % _tile_size) == 0)
     {
-        parallel_for_each(extent<1>(element_count).tile(_tile_size),
-                          [=] (tiled_index<1> tidx) [[hc]] [[hc_flat_workgroup_size(_tile_size)]]
-        {
+        parallel_for_each(
+            extent<1>(element_count).tile(_tile_size),
+            make_callable_with_AMDGPU_attributes<
+                Flat_workgroup_size<_tile_size, _tile_size>>(
+                    [=](tiled_index<1> tidx) [[hc]] {
             // Use tile_static as a scratchpad memory.
             tile_static float tile_data[_tile_size];
 
@@ -290,7 +294,7 @@ float reduction_tiled_2(const std::vector<float>& source)
             {
                 av_dst[tidx.tile] = tile_data[0];
             }
-        }).wait();
+        }));
 
         // Update the sequence length, swap source with destination.
         element_count /= _tile_size;
@@ -334,8 +338,11 @@ float reduction_tiled_3(const std::vector<float>& source)
     // is evenly divisable to the number of threads in the tile.
     while ((element_count % _tile_size) == 0)
     {
-        parallel_for_each(extent<1>(element_count).tile(_tile_size), [=] (tiled_index<1> tidx) [[hc]]
-        {
+        parallel_for_each(
+            extent<1>(element_count).tile(_tile_size),
+            make_callable_with_AMDGPU_attributes<
+                Flat_workgroup_size<_tile_size, _tile_size>>(
+                    [=](tiled_index<1> tidx) [[hc]] {
             // Use tile_static as a scratchpad memory.
             tile_static float tile_data[_tile_size];
 
@@ -359,7 +366,7 @@ float reduction_tiled_3(const std::vector<float>& source)
             {
                 av_dst[tidx.tile] = tile_data[0];
             }
-        }).wait();
+        }));
 
         // Update the sequence length, swap source with destination.
         element_count /= _tile_size;
@@ -394,7 +401,7 @@ float reduction_tiled_4(const std::vector<float>& source)
     // Using arrays as temporary memory.
     array<float, 1> arr_1(element_count, source.begin());
     array<float, 1> arr_2((element_count / _tile_size) ? (element_count / _tile_size) : 1);
-    
+
     // array_views may be swapped after each iteration.
     array_view<float, 1> av_src(arr_1);
     array_view<float, 1> av_dst(arr_2);
@@ -406,9 +413,11 @@ float reduction_tiled_4(const std::vector<float>& source)
     while (element_count >= _tile_size
         && (element_count % (_tile_size * 2)) == 0)
     {
-        parallel_for_each(extent<1>(element_count / 2).tile(_tile_size),
-                          [=] (tiled_index<1> tidx) [[hc]] [[hc_flat_workgroup_size(_tile_size)]]
-        {
+        parallel_for_each(
+            extent<1>(element_count / 2).tile(_tile_size),
+            make_callable_with_AMDGPU_attributes<
+                Flat_workgroup_size<_tile_size, _tile_size>>(
+                    [=](tiled_index<1> tidx) [[hc]] {
             // Use tile_static as a scratchpad memory.
             tile_static float tile_data[_tile_size];
 
@@ -436,7 +445,7 @@ float reduction_tiled_4(const std::vector<float>& source)
             {
                 av_dst[tidx.tile] = tile_data[0];
             }
-        }).wait();
+        }));
 
         // Update the sequence length, swap source with destination.
         element_count /= _tile_size * 2;
@@ -483,9 +492,11 @@ float reduction_cascade(const std::vector<float>& source)
     array<float, 1> a(element_count, source.begin());
     array<float, 1> a_partial_result(_tile_count);
 
-    parallel_for_each(extent<1>(_tile_count * _tile_size).tile(_tile_size),
-                     [=, &a, &a_partial_result] (tiled_index<1> tidx) [[hc]] [[hc_flat_workgroup_size(_tile_size)]]
-    {
+    parallel_for_each(
+        extent<1>(_tile_count * _tile_size).tile(_tile_size),
+        make_callable_with_AMDGPU_attributes<
+            Flat_workgroup_size<_tile_size, _tile_size>>(
+                [=, &a, &a_partial_result](tiled_index<1> tidx) [[hc]] {
         // Use tile_static as a scratchpad memory.
         tile_static float tile_data[_tile_size];
 
@@ -496,7 +507,7 @@ float reduction_cascade(const std::vector<float>& source)
         tile_data[local_idx] = 0;
         do
         {
-            tile_data[local_idx] += a[input_idx] + a[input_idx + _tile_size]; 
+            tile_data[local_idx] += a[input_idx] + a[input_idx + _tile_size];
             input_idx += stride;
         } while (input_idx < element_count);
 
@@ -518,7 +529,7 @@ float reduction_cascade(const std::vector<float>& source)
         {
             a_partial_result[tidx.tile[0]] = tile_data[0];
         }
-    }).wait();
+    }));
 
     // Reduce results from all tiles on the CPU.
     std::vector<float> v_partial_result(_tile_count);
diff --git a/tests/Unit/HC/reduction_tile_static.cpp b/tests/Unit/HC/reduction_tile_static.cpp
index b36be686b6f..b9d753c4644 100644
--- a/tests/Unit/HC/reduction_tile_static.cpp
+++ b/tests/Unit/HC/reduction_tile_static.cpp
@@ -15,7 +15,7 @@
 
 #define NOMINMAX
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <cassert>
 #include <climits>
@@ -85,9 +85,11 @@ float reduction_tiled_1(const std::vector<float>& source)
     // is evenly divisable to the number of threads in the tile.
     while ((element_count % _tile_size) == 0)
     {
-        parallel_for_each(extent<1>(element_count).tile(_tile_size),
-                          [=] (tiled_index<1> tidx) [[hc]] [[hc_flat_workgroup_size(_tile_size)]]
-        {
+        parallel_for_each(
+            extent<1>(element_count).tile(_tile_size),
+            make_callable_with_AMDGPU_attributes<
+                Flat_workgroup_size<_tile_size, _tile_size>>(
+                    [=](tiled_index<1> tidx) [[hc]] {
             // Use tile_static as a scratchpad memory.
             tile_static float tile_data[_tile_size];
 
@@ -111,7 +113,7 @@ float reduction_tiled_1(const std::vector<float>& source)
             {
                 av_dst[tidx.tile] = tile_data[0];
             }
-        });
+        }));
 
         // Update the sequence length, swap source with destination.
         element_count /= _tile_size;
@@ -157,9 +159,11 @@ float reduction_tiled_2(const std::vector<float>& source)
     // is evenly divisable to the number of threads in the tile.
     while ((element_count % _tile_size) == 0)
     {
-        parallel_for_each(extent<1>(element_count).tile(_tile_size),
-                          [=] (tiled_index<1> tidx) [[hc]] [[hc_flat_workgroup_size(_tile_size)]]
-        {
+        parallel_for_each(
+            extent<1>(element_count).tile(_tile_size),
+            make_callable_with_AMDGPU_attributes<
+                Flat_workgroup_size<_tile_size, _tile_size>>(
+                    [=](tiled_index<1> tidx) [[hc]] {
             // Use tile_static as a scratchpad memory.
             tile_static float tile_data[_tile_size];
 
@@ -184,7 +188,7 @@ float reduction_tiled_2(const std::vector<float>& source)
             {
                 av_dst[tidx.tile] = tile_data[0];
             }
-        });
+        }));
 
         // Update the sequence length, swap source with destination.
         element_count /= _tile_size;
@@ -228,9 +232,11 @@ float reduction_tiled_3(const std::vector<float>& source)
     // is evenly divisable to the number of threads in the tile.
     while ((element_count % _tile_size) == 0)
     {
-        parallel_for_each(extent<1>(element_count).tile(_tile_size),
-                          [=] (tiled_index<1> tidx) [[hc]] [[hc_flat_workgroup_size(_tile_size)]]
-        {
+        parallel_for_each(
+            extent<1>(element_count).tile(_tile_size),
+            make_callable_with_AMDGPU_attributes<
+                Flat_workgroup_size<_tile_size, _tile_size>>(
+                    [=](tiled_index<1> tidx) [[hc]] {
             // Use tile_static as a scratchpad memory.
             tile_static float tile_data[_tile_size];
 
@@ -254,7 +260,7 @@ float reduction_tiled_3(const std::vector<float>& source)
             {
                 av_dst[tidx.tile] = tile_data[0];
             }
-        });
+        }));
 
         // Update the sequence length, swap source with destination.
         element_count /= _tile_size;
@@ -301,9 +307,11 @@ float reduction_tiled_4(const std::vector<float>& source)
     while (element_count >= _tile_size
         && (element_count % (_tile_size * 2)) == 0)
     {
-        parallel_for_each(extent<1>(element_count / 2).tile(_tile_size),
-                          [=] (tiled_index<1> tidx) [[hc]] [[hc_flat_workgroup_size(_tile_size)]]
-        {
+        parallel_for_each(
+            extent<1>(element_count / 2).tile(_tile_size),
+            make_callable_with_AMDGPU_attributes<
+                Flat_workgroup_size<_tile_size, _tile_size>>(
+                    [=](tiled_index<1> tidx) [[hc]] {
             // Use tile_static as a scratchpad memory.
             tile_static float tile_data[_tile_size];
 
@@ -331,7 +339,7 @@ float reduction_tiled_4(const std::vector<float>& source)
             {
                 av_dst[tidx.tile] = tile_data[0];
             }
-        });
+        }));
 
         // Update the sequence length, swap source with destination.
         element_count /= _tile_size * 2;
@@ -378,9 +386,11 @@ float reduction_cascade(const std::vector<float>& source)
     array<float, 1> a(element_count, source.begin());
     array<float, 1> a_partial_result(_tile_count);
 
-    parallel_for_each(extent<1>(_tile_count * _tile_size).tile(_tile_size),
-                      [=, &a, &a_partial_result] (tiled_index<1> tidx) [[hc]] [[hc_flat_workgroup_size(_tile_size)]]
-    {
+    parallel_for_each(
+        extent<1>(_tile_count * _tile_size).tile(_tile_size),
+        make_callable_with_AMDGPU_attributes<
+            Flat_workgroup_size<_tile_size, _tile_size>>(
+                [=, &a, &a_partial_result](tiled_index<1> tidx) [[hc]] {
         // Use tile_static as a scratchpad memory.
         tile_static float tile_data[_tile_size];
 
@@ -413,7 +423,7 @@ float reduction_cascade(const std::vector<float>& source)
         {
             a_partial_result[tidx.tile[0]] = tile_data[0];
         }
-    });
+    }));
 
     // Reduce results from all tiles on the CPU.
     std::vector<float> v_partial_result(_tile_count);
diff --git a/tests/Unit/HC/saxpy_array.cpp b/tests/Unit/HC/saxpy_array.cpp
index 8e5a060ba42..0ff1377f164 100644
--- a/tests/Unit/HC/saxpy_array.cpp
+++ b/tests/Unit/HC/saxpy_array.cpp
@@ -7,7 +7,7 @@
 #include <exception>
 
 // header file for the hc API
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #define N  (1024 * 500)
 
diff --git a/tests/Unit/HC/saxpy_arrayview.cpp b/tests/Unit/HC/saxpy_arrayview.cpp
index edf708c559a..2b0952cafd0 100644
--- a/tests/Unit/HC/saxpy_arrayview.cpp
+++ b/tests/Unit/HC/saxpy_arrayview.cpp
@@ -6,7 +6,7 @@
 #include <cmath>
 
 // header file for the hc API
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #define N  (1024 * 500)
 
diff --git a/tests/Unit/HC/subword_types.cpp b/tests/Unit/HC/subword_types.cpp
index f7740614025..c662facb9dd 100644
--- a/tests/Unit/HC/subword_types.cpp
+++ b/tests/Unit/HC/subword_types.cpp
@@ -1,6 +1,6 @@
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <cstddef>
 #include <cstdint>
diff --git a/tests/Unit/HC/test1.cpp b/tests/Unit/HC/test1.cpp
index 313f624956a..ed50f5af074 100644
--- a/tests/Unit/HC/test1.cpp
+++ b/tests/Unit/HC/test1.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 
@@ -14,7 +14,7 @@ bool test() {
   bool ret = true;
 
   // check if the queue is HSA
-  ret &= av.is_hsa_accelerator();
+  ret &= av.get_accelerator().is_hsa_accelerator();
 
   std::cout << ret << "\n";
 
diff --git a/tests/Unit/HC/test2.cpp b/tests/Unit/HC/test2.cpp
index 9b707f5fa98..80d7f350aba 100644
--- a/tests/Unit/HC/test2.cpp
+++ b/tests/Unit/HC/test2.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 
@@ -42,7 +42,7 @@ int main() {
     for (int i = 0; i < LOOP_COUNT; ++i)
       av3(idx) = av1(idx) + av2(idx);
   });
-  
+
   accelerator_view.create_marker();
 
   hc::parallel_for_each(hc::extent<1>(GRID_SIZE), [=](hc::index<1>& idx) [[hc]] {
@@ -62,9 +62,12 @@ int main() {
   // wait for async operations to complete
   hc::accelerator().get_default_view().wait();
 
-  // now there must be 0 pending async operations for the accelerator_view
-  ret &= (accelerator_view.get_pending_async_ops() == 0);
+  for (decltype(GRID_SIZE) i = 0; i != GRID_SIZE; ++i) {
+    if (av3[i] != 2 * i) return EXIT_FAILURE;
+    if (av4[i] != 2 * i) return EXIT_FAILURE;
+    if (av5[i] != 2 * i) return EXIT_FAILURE;
+  }
 
-  return !(ret == true);
+  return EXIT_SUCCESS;
 }
 
diff --git a/tests/Unit/HC/test3.cpp b/tests/Unit/HC/test3.cpp
index 487822e0079..e8bd2addcdc 100644
--- a/tests/Unit/HC/test3.cpp
+++ b/tests/Unit/HC/test3.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 
@@ -15,19 +15,20 @@ bool test() {
   bool ret = true;
 
   // check if the queue is HSA
-  ret &= av.is_hsa_accelerator();
+  ret &= av.get_accelerator().is_hsa_accelerator();
 
   std::cout << ret << "\n";
 
   // checks if we can get underlying native HSA agent
-  void* native_agent = av.get_hsa_agent();
+  void* native_agent = av.get_accelerator().get_hsa_agent();
   ret &= (native_agent != nullptr);
 
-  void* native_agent2 = av2.get_hsa_agent();
+  void* native_agent2 = av2.get_accelerator().get_hsa_agent();
   ret &= (native_agent2 != nullptr);
 
   // native_agent and native_agent2 should point to the same agent
-  ret &= (native_agent == native_agent2);
+  ret &= static_cast<hsa_agent_t*>(native_agent)->handle ==
+    static_cast<hsa_agent_t*>(native_agent2)->handle;
 
   std::cout << ret << "\n";
 
diff --git a/tests/Unit/HC/test4.cpp b/tests/Unit/HC/test4.cpp
index 65e6019cf5a..a51c3dfc9aa 100644
--- a/tests/Unit/HC/test4.cpp
+++ b/tests/Unit/HC/test4.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 
@@ -16,15 +16,15 @@ bool test() {
   bool ret = true;
 
   // check if the queue is HSA
-  ret &= av.is_hsa_accelerator();
+  ret &= av.get_accelerator().is_hsa_accelerator();
 
   std::cout << ret << "\n";
 
   // checks if we can get AM region
-  void* am_region = av.get_hsa_am_region();
+  void* am_region = av.get_accelerator().get_hsa_am_region();
   ret &= (am_region != nullptr);
 
-  void* am_region2 = av2.get_hsa_am_region();
+  void* am_region2 = av2.get_accelerator().get_hsa_am_region();
   ret &= (am_region2 != nullptr);
 
   // am_region and am_region2 should point to the same agent
@@ -32,18 +32,6 @@ bool test() {
 
   std::cout << ret << "\n";
 
-  // checks if we can get Kernarg region
-  void* kernarg_region = av.get_hsa_kernarg_region();
-  ret &= (kernarg_region != nullptr);
-
-  void* kernarg_region2 = av2.get_hsa_kernarg_region();
-  ret &= (kernarg_region2 != nullptr);
-
-  // kernarg_region and kernarg_region2 should point to the same agent
-  ret &= (kernarg_region == kernarg_region2);
-
-  std::cout << ret << "\n";
-
   return ret;
 }
 
diff --git a/tests/Unit/HC/test5.cpp b/tests/Unit/HC/test5.cpp
index 17d5cbb84b8..868c8f2e85b 100644
--- a/tests/Unit/HC/test5.cpp
+++ b/tests/Unit/HC/test5.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 
diff --git a/tests/Unit/HC/test7.cpp b/tests/Unit/HC/test7.cpp
index 4791ce600ff..d76bdaa421b 100644
--- a/tests/Unit/HC/test7.cpp
+++ b/tests/Unit/HC/test7.cpp
@@ -1,12 +1,11 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 
 // a test which checks accelerator::get_hsa_am_region()
-// a test which checks accelerator::get_hsa_kernarg_region()
 bool test() {
 
   hc::accelerator acc;
@@ -24,12 +23,6 @@ bool test() {
 
   std::cout << ret << "\n";
 
-  // checks if we can get Kernarg region
-  void* kernarg_region = acc.get_hsa_kernarg_region();
-  ret &= (kernarg_region != nullptr);
-
-  std::cout << ret << "\n";
-
   return ret;
 }
 
diff --git a/tests/Unit/HC/test8.cpp b/tests/Unit/HC/test8.cpp
index 9f131f1b95c..48d521e9a85 100644
--- a/tests/Unit/HC/test8.cpp
+++ b/tests/Unit/HC/test8.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 
@@ -14,7 +14,7 @@ bool test() {
   bool ret = true;
 
   // check if the queue is HSA
-  ret &= av.is_hsa_accelerator();
+  ret &= av.get_accelerator().is_hsa_accelerator();
 
   std::cout << ret << "\n";
 
@@ -22,11 +22,12 @@ bool test() {
   void* native_agent = acc.get_hsa_agent();
   ret &= (native_agent != nullptr);
 
-  void* native_agent2 = av.get_hsa_agent();
+  void* native_agent2 = av.get_accelerator().get_hsa_agent();
   ret &= (native_agent2 != nullptr);
 
   // native_agent and native_agent2 should point to the same agent
-  ret &= (native_agent == native_agent2);
+  ret &= static_cast<hsa_agent_t*>(native_agent)->handle ==
+    static_cast<hsa_agent_t*>(native_agent2)->handle;
 
   std::cout << ret << "\n";
 
diff --git a/tests/Unit/HC/test9.cpp b/tests/Unit/HC/test9.cpp
index 16ad764af8f..fadf98ec791 100644
--- a/tests/Unit/HC/test9.cpp
+++ b/tests/Unit/HC/test9.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 
@@ -16,10 +16,10 @@ bool test() {
   if (acc.is_hsa_accelerator()) {
 
     // check if we can get its profile
-    hc::hcAgentProfile profile = acc.get_profile();
+    hc::accelerator_profile profile = acc.get_profile();
 
     // an HSA agent must have its profile specified
-    ret &= (profile != hc::hcAgentProfileNone);
+    ret &= (profile != hc::accelerator_profile_none);
   }
 
   return ret;
diff --git a/tests/Unit/HC/test_fp16.cpp b/tests/Unit/HC/test_fp16.cpp
index 9ff5f42edc0..94ed1eb5fc8 100644
--- a/tests/Unit/HC/test_fp16.cpp
+++ b/tests/Unit/HC/test_fp16.cpp
@@ -3,7 +3,7 @@
 
 // a test to check FP16 type can be used in HCC
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <cstdio>
 
 typedef __fp16 hcc_fp16;
diff --git a/tests/Unit/HC/test_i16.cpp b/tests/Unit/HC/test_i16.cpp
index bc8c477fff1..39de1315593 100644
--- a/tests/Unit/HC/test_i16.cpp
+++ b/tests/Unit/HC/test_i16.cpp
@@ -3,7 +3,7 @@
 
 // a test to check I16 type can be used in HCC
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <cstdio>
 
 
diff --git a/tests/Unit/HC/tick.cpp b/tests/Unit/HC/tick.cpp
index 3fddce7f8f7..7abd5d218b7 100644
--- a/tests/Unit/HC/tick.cpp
+++ b/tests/Unit/HC/tick.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 #include <random>
@@ -35,7 +35,7 @@ bool test() {
   hc::extent<1> e(vecSize);
   hc::completion_future fut = hc::parallel_for_each(
     e,
-    [=](hc::index<1> idx) __HC__ {
+    [=](hc::index<1> idx) [[hc]] {
       for (int i = 0; i < LOOP_COUNT; ++i) 
         table_c(idx) = table_a(idx) + table_b(idx);
 
diff --git a/tests/Unit/HC/tick2.cpp b/tests/Unit/HC/tick2.cpp
index f879fd884d0..a2c2ad7640c 100644
--- a/tests/Unit/HC/tick2.cpp
+++ b/tests/Unit/HC/tick2.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 
@@ -16,7 +16,7 @@ bool test() {
   bool ret = true;
 
   // check if the queue is HSA
-  ret &= av.is_hsa_accelerator();
+  ret &= av.get_accelerator().is_hsa_accelerator();
 
   std::cout << ret << "\n";
 
diff --git a/tests/Unit/HC/tiled_index_copy_ctor.cpp b/tests/Unit/HC/tiled_index_copy_ctor.cpp
index 27eb66388a2..e10717b9c64 100644
--- a/tests/Unit/HC/tiled_index_copy_ctor.cpp
+++ b/tests/Unit/HC/tiled_index_copy_ctor.cpp
@@ -1,6 +1,6 @@
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 using namespace hc;
 
diff --git a/tests/Unit/HC/ubsan.cpp b/tests/Unit/HC/ubsan.cpp
index 1bf4320a4c0..d2b938ee824 100644
--- a/tests/Unit/HC/ubsan.cpp
+++ b/tests/Unit/HC/ubsan.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined -fno-sanitize=vptr -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <cstdlib>
 
 void fill(hc::array_view<int,1>& input, int x) {
diff --git a/tests/Unit/HC/wg_register_limit1.cpp b/tests/Unit/HC/wg_register_limit1.cpp
index b457b3b8b96..c21d6f77e9b 100644
--- a/tests/Unit/HC/wg_register_limit1.cpp
+++ b/tests/Unit/HC/wg_register_limit1.cpp
@@ -2,10 +2,10 @@
 // XFAIL: *
 // SWDEV-170201
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <string>
 #include <cmath>
-#include <hc_math.hpp>
+#include <hc/hc_math.hpp>
 
 int main() {
   bool pass = false;
@@ -48,7 +48,7 @@ int main() {
       }
     });
 
-  } catch (Kalmar::runtime_exception e) {
+  } catch (hc::detail::runtime_exception e) {
     std::string err_str = e.what();
     pass = err_str.find("The number of work items") != std::string::npos &&
     err_str.find("per work group exceeds the limit") != std::string::npos;
diff --git a/tests/Unit/HC/wg_size1.cpp b/tests/Unit/HC/wg_size1.cpp
index 61deff97e83..7ebda91540b 100644
--- a/tests/Unit/HC/wg_size1.cpp
+++ b/tests/Unit/HC/wg_size1.cpp
@@ -1,6 +1,6 @@
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <string>
 #include <iostream>
 
diff --git a/tests/Unit/HC/wg_size2.cpp b/tests/Unit/HC/wg_size2.cpp
index c7757f73104..efdb1911906 100644
--- a/tests/Unit/HC/wg_size2.cpp
+++ b/tests/Unit/HC/wg_size2.cpp
@@ -1,6 +1,6 @@
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <string>
 #include <iostream>
 
diff --git a/tests/Unit/HC/wg_size3.cpp b/tests/Unit/HC/wg_size3.cpp
index 268bbb119c2..6a9ed0a7b98 100644
--- a/tests/Unit/HC/wg_size3.cpp
+++ b/tests/Unit/HC/wg_size3.cpp
@@ -1,6 +1,6 @@
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <string>
 #include <iostream>
 
diff --git a/tests/Unit/HC/wg_size_unsupported1.cpp b/tests/Unit/HC/wg_size_unsupported1.cpp
index c58691be80a..a30dd0bc94c 100644
--- a/tests/Unit/HC/wg_size_unsupported1.cpp
+++ b/tests/Unit/HC/wg_size_unsupported1.cpp
@@ -1,6 +1,6 @@
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <string>
 #include <iostream>
 
@@ -9,7 +9,7 @@ int main() {
   try  {
     // We expect the runtime will fire an exception due to a large work group size
     hc::parallel_for_each(hc::extent<1>(8192).tile(8192), [](hc::tiled_index<1> i) [[hc]] {});
-  } catch (Kalmar::runtime_exception e) {
+  } catch (hc::detail::runtime_exception e) {
     std::string err_str = e.what();
     pass = err_str.find("The extent of the tile") != std::string::npos &&
     err_str.find("exceeds the device limit") != std::string::npos;
diff --git a/tests/Unit/HC/wg_size_unsupported2.cpp b/tests/Unit/HC/wg_size_unsupported2.cpp
index 4c7e8f032bc..a7e3ce09699 100644
--- a/tests/Unit/HC/wg_size_unsupported2.cpp
+++ b/tests/Unit/HC/wg_size_unsupported2.cpp
@@ -1,6 +1,6 @@
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <string>
 #include <iostream>
 
@@ -10,7 +10,7 @@ int main() {
   try  {
     // We expect the runtime will fire an exception due to a large work group size
     hc::parallel_for_each(hc::extent<2>(8192,1).tile(8192,1), [](hc::tiled_index<2> i) [[hc]] {});
-  } catch (Kalmar::runtime_exception e) {
+  } catch (hc::detail::runtime_exception e) {
     std::string err_str = e.what();
     pass = err_str.find("The extent of the tile") != std::string::npos &&
     err_str.find("exceeds the device limit") != std::string::npos;
@@ -19,7 +19,7 @@ int main() {
   try  {
     // We expect the runtime will fire an exception due to a large work group size
     hc::parallel_for_each(hc::extent<2>(1,8192).tile(1,8192), [](hc::tiled_index<2> i) [[hc]] {});
-  } catch (Kalmar::runtime_exception e) {
+  } catch (hc::detail::runtime_exception e) {
     std::string err_str = e.what();
     pass &= err_str.find("The extent of the tile") != std::string::npos &&
     err_str.find("exceeds the device limit") != std::string::npos;
diff --git a/tests/Unit/HC/wg_size_unsupported3.cpp b/tests/Unit/HC/wg_size_unsupported3.cpp
index 06e5420e588..6da3a5d68e7 100644
--- a/tests/Unit/HC/wg_size_unsupported3.cpp
+++ b/tests/Unit/HC/wg_size_unsupported3.cpp
@@ -1,6 +1,6 @@
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <string>
 #include <iostream>
 
@@ -10,7 +10,7 @@ int main() {
   try  {
     // We expect the runtime will fire an exception due to a large work group size
     hc::parallel_for_each(hc::extent<3>(8192,1,1).tile(8192,1,1), [](hc::tiled_index<3> i) [[hc]] {});
-  } catch (Kalmar::runtime_exception e) {
+  } catch (hc::detail::runtime_exception e) {
     std::string err_str = e.what();
     pass = err_str.find("The extent of the tile") != std::string::npos &&
     err_str.find("exceeds the device limit") != std::string::npos;
@@ -19,7 +19,7 @@ int main() {
   try  {
     // We expect the runtime will fire an exception due to a large work group size
     hc::parallel_for_each(hc::extent<3>(1,8192,1).tile(1,8192,1), [](hc::tiled_index<3> i) [[hc]] {});
-  } catch (Kalmar::runtime_exception e) {
+  } catch (hc::detail::runtime_exception e) {
     std::string err_str = e.what();
     pass = err_str.find("The extent of the tile") != std::string::npos &&
     err_str.find("exceeds the device limit") != std::string::npos;
@@ -28,7 +28,7 @@ int main() {
   try  {
     // We expect the runtime will fire an exception due to a large work group size
     hc::parallel_for_each(hc::extent<3>(1,1,8192).tile(1,1,8192), [](hc::tiled_index<3> i) [[hc]] {});
-  } catch (Kalmar::runtime_exception e) {
+  } catch (hc::detail::runtime_exception e) {
     std::string err_str = e.what();
     pass = err_str.find("The extent of the tile") != std::string::npos &&
     err_str.find("exceeds the device limit") != std::string::npos;
diff --git a/tests/Unit/HC/wg_size_unsupported4.cpp b/tests/Unit/HC/wg_size_unsupported4.cpp
index 069128f4748..98464ec7a75 100644
--- a/tests/Unit/HC/wg_size_unsupported4.cpp
+++ b/tests/Unit/HC/wg_size_unsupported4.cpp
@@ -1,6 +1,6 @@
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <string>
 #include <iostream>
 
@@ -9,7 +9,7 @@ int main() {
 
   try  {
     hc::parallel_for_each(hc::extent<3>(16,16,16).tile(32,1,1), [](hc::tiled_index<3> i) [[hc]] {});
-  } catch (Kalmar::runtime_exception e) {
+  } catch (hc::detail::runtime_exception e) {
     std::string err_str = e.what();
     pass = err_str.find("The extent of the tile") != std::string::npos &&
     err_str.find("exceeds the compute grid extent") != std::string::npos;
@@ -17,7 +17,7 @@ int main() {
 
   try  {
     hc::parallel_for_each(hc::extent<3>(16,16,16).tile(1,32,1), [](hc::tiled_index<3> i) [[hc]] {});
-  } catch (Kalmar::runtime_exception e) {
+  } catch (hc::detail::runtime_exception e) {
     std::string err_str = e.what();
     pass = err_str.find("The extent of the tile") != std::string::npos &&
     err_str.find("exceeds the compute grid extent") != std::string::npos;
@@ -25,7 +25,7 @@ int main() {
 
   try  {
     hc::parallel_for_each(hc::extent<3>(16,16,16).tile(1,1,32), [](hc::tiled_index<3> i) [[hc]] {});
-  } catch (Kalmar::runtime_exception e) {
+  } catch (hc::detail::runtime_exception e) {
     std::string err_str = e.what();
     pass = err_str.find("The extent of the tile") != std::string::npos &&
     err_str.find("exceeds the compute grid extent") != std::string::npos;
diff --git a/tests/Unit/HC/zero_extent.cpp b/tests/Unit/HC/zero_extent.cpp
index 6f40577502f..6ddb054e03c 100644
--- a/tests/Unit/HC/zero_extent.cpp
+++ b/tests/Unit/HC/zero_extent.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <algorithm>
 
@@ -22,7 +22,7 @@ bool test1D() {
 
   // 1D non-tiled
   extent<1> ex1d(0);
-  completion_future fut1 = parallel_for_each(ex1d, [&](index<1>& idx) __HC__ {
+  completion_future fut1 = parallel_for_each(ex1d, [&](index<1>& idx) [[hc]] {
     table[idx[0]] = 1;
   });
 
@@ -35,7 +35,7 @@ bool test1D() {
 
   // 1D tiled
   tiled_extent<1> tiled_ex1d = ex1d.tile(0);
-  completion_future fut2 = parallel_for_each(tiled_ex1d, [&](tiled_index<1>& idx) __HC__ {
+  completion_future fut2 = parallel_for_each(tiled_ex1d, [&](tiled_index<1>& idx) [[hc]] {
     table[idx.global[0]] = 1;
   });
 
@@ -47,7 +47,7 @@ bool test1D() {
   ret &= (std::count(std::begin(table), std::end(table), 0) == TABLE_X);
 
   // 1D non-tiled
-  completion_future fut3 = parallel_for_each(ex1d, [&](index<1>& idx) __HC__ {
+  completion_future fut3 = parallel_for_each(ex1d, [&](index<1>& idx) [[hc]] {
     table[idx[0]] = 1;
   });
 
@@ -59,7 +59,7 @@ bool test1D() {
   ret &= (std::count(std::begin(table), std::end(table), 0) == TABLE_X);
 
   // 1D tiled
-  completion_future fut4 = parallel_for_each(tiled_ex1d, [&](tiled_index<1>& idx) __HC__ {
+  completion_future fut4 = parallel_for_each(tiled_ex1d, [&](tiled_index<1>& idx) [[hc]] {
     table[idx.global[0]] = 1;
   });
 
@@ -83,7 +83,7 @@ bool test2D() {
 
   // 2D non-tiled
   extent<2> ex2d(0, 0);
-  completion_future fut1 = parallel_for_each(ex2d, [&](index<2>& idx) __HC__ {
+  completion_future fut1 = parallel_for_each(ex2d, [&](index<2>& idx) [[hc]] {
     table[idx[0] * TABLE_Y + idx[1]] = 1;
   });
 
@@ -96,7 +96,7 @@ bool test2D() {
 
   // 2D tiled
   tiled_extent<2> tiled_ex2d = ex2d.tile(0, 0);
-  completion_future fut2 = parallel_for_each(tiled_ex2d, [&](tiled_index<2>& idx) __HC__ {
+  completion_future fut2 = parallel_for_each(tiled_ex2d, [&](tiled_index<2>& idx) [[hc]] {
     table[idx.global[0] * TABLE_Y + idx.global[1]] = 1;
   });
 
@@ -108,7 +108,7 @@ bool test2D() {
   ret &= (std::count(std::begin(table), std::end(table), 0) == TABLE_X * TABLE_Y);
 
   // 2D non-tiled
-  completion_future fut3 = parallel_for_each(ex2d, [&](index<2>& idx) __HC__ {
+  completion_future fut3 = parallel_for_each(ex2d, [&](index<2>& idx) [[hc]] {
     table[idx[0] * TABLE_Y + idx[1]] = 1;
   });
 
@@ -120,7 +120,7 @@ bool test2D() {
   ret &= (std::count(std::begin(table), std::end(table), 0) == TABLE_X * TABLE_Y);
 
   // 2D tiled
-  completion_future fut4 = parallel_for_each(tiled_ex2d, [&](tiled_index<2>& idx) __HC__ {
+  completion_future fut4 = parallel_for_each(tiled_ex2d, [&](tiled_index<2>& idx) [[hc]] {
     table[idx.global[0] * TABLE_Y + idx.global[1]] = 1;
   });
 
@@ -144,7 +144,7 @@ bool test3D() {
 
   // 3D non-tiled
   extent<3> ex3d(0, 0, 0);
-  completion_future fut1 = parallel_for_each(ex3d, [&](index<3>& idx) __HC__ {
+  completion_future fut1 = parallel_for_each(ex3d, [&](index<3>& idx) [[hc]] {
     table[idx[0] * TABLE_X * TABLE_Y + idx[1] * TABLE_Y + idx[2]] = 1;
   });
 
@@ -157,7 +157,7 @@ bool test3D() {
 
   // 3D tiled
   tiled_extent<3> tiled_ex3d = ex3d.tile(0, 0, 0);
-  completion_future fut2 = parallel_for_each(tiled_ex3d, [&](tiled_index<3>& idx) __HC__ {
+  completion_future fut2 = parallel_for_each(tiled_ex3d, [&](tiled_index<3>& idx) [[hc]] {
     table[idx.global[0] * TABLE_X * TABLE_Y + idx.global[1] * TABLE_Y + idx.global[2]] = 1;
   });
 
@@ -169,7 +169,7 @@ bool test3D() {
   ret &= (std::count(std::begin(table), std::end(table), 0) == TABLE_X * TABLE_Y * TABLE_Z);
 
   // 2D non-tiled
-  completion_future fut3 = parallel_for_each(ex3d, [&](index<3>& idx) __HC__ {
+  completion_future fut3 = parallel_for_each(ex3d, [&](index<3>& idx) [[hc]] {
     table[idx[0] * TABLE_X * TABLE_Y + idx[1] * TABLE_Y + idx[2]] = 1;
   });
 
@@ -181,7 +181,7 @@ bool test3D() {
   ret &= (std::count(std::begin(table), std::end(table), 0) == TABLE_X * TABLE_Y * TABLE_Z);
 
   // 2D tiled
-  completion_future fut4 = parallel_for_each(tiled_ex3d, [&](tiled_index<3>& idx) __HC__ {
+  completion_future fut4 = parallel_for_each(tiled_ex3d, [&](tiled_index<3>& idx) [[hc]] {
     table[idx.global[0] * TABLE_X * TABLE_Y + idx.global[1] * TABLE_Y + idx.global[2]] = 1;
   });
 
diff --git a/tests/Unit/HSA/functor1.cpp b/tests/Unit/HSA/functor1.cpp
index 946801f2a16..9c65a128d02 100644
--- a/tests/Unit/HSA/functor1.cpp
+++ b/tests/Unit/HSA/functor1.cpp
@@ -1,13 +1,10 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 
-// added for checking HSA profile
-#include <hc.hpp>
-
 // test C++AMP with fine-grained SVM
 // requires HSA Full Profile to operate successfully
 
@@ -22,7 +19,7 @@ class prog {
 public:
   prog(int (&t)[SIZE]) [[cpu, hc]] : input(t) {}
 
-  void operator() (index<1>& idx) [[hc]] {
+  void operator()(index<1>& idx) const [[hc]] {
     input[idx[0]] = idx[0];
   }
 
@@ -49,7 +46,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
 
     // prepare test data
     int input[SIZE] { 0 };
diff --git a/tests/Unit/HSA/functor2.cpp b/tests/Unit/HSA/functor2.cpp
index 886c3c7061e..6fa604f6a87 100644
--- a/tests/Unit/HSA/functor2.cpp
+++ b/tests/Unit/HSA/functor2.cpp
@@ -1,12 +1,12 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with fine-grained SVM
 // requires HSA Full Profile to operate successfully
@@ -19,7 +19,7 @@ using namespace hc;
 // the class will call a separate functor
 class user_functor {
 public:
-  void operator() (index<1>& idx, int (&input)[SIZE]) [[hc]] {
+  void operator()(index<1>& idx, int (&input)[SIZE]) const [[hc]] {
     input[idx[0]] = idx[0];
   }
 };
@@ -32,7 +32,7 @@ class prog {
   prog(int (&t)[SIZE], user_functor& f) [[cpu, hc]] : input(t), kernel(f) {
   }
 
-  void operator() (index<1>& idx) [[hc]] {
+  void operator()(index<1>& idx) const [[hc]] {
     kernel(idx, input);
   }
 
@@ -59,7 +59,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
 
     // prepare test data
     int input[SIZE] { 0 };
diff --git a/tests/Unit/HSA/functor3.cpp b/tests/Unit/HSA/functor3.cpp
index 5c013ca94f5..e11a93e6ce6 100644
--- a/tests/Unit/HSA/functor3.cpp
+++ b/tests/Unit/HSA/functor3.cpp
@@ -1,12 +1,12 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with fine-grained SVM
 // requires HSA Full Profile to operate successfully
@@ -23,7 +23,7 @@ class user_functor {
 
   user_functor(int (&t)[SIZE]) [[cpu, hc]] : input(t) {}
 
-  void operator() (index<1>& idx) [[hc]] {
+  void operator() (index<1>& idx) const [[hc]] {
     input[idx[0]] = idx[0];
   }
 };
@@ -35,7 +35,7 @@ class prog {
   prog(user_functor& f) [[cpu, hc]] : kernel(f) {
   }
 
-  void operator() (index<1>& idx) [[hc]] {
+  void operator() (index<1>& idx) const [[hc]] {
     kernel(idx);
   }
 
@@ -62,7 +62,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
 
     // prepare test data
     int input[SIZE] { 0 };
diff --git a/tests/Unit/HSA/functor4.cpp b/tests/Unit/HSA/functor4.cpp
index 65552730266..1e2c136857d 100644
--- a/tests/Unit/HSA/functor4.cpp
+++ b/tests/Unit/HSA/functor4.cpp
@@ -1,12 +1,12 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with fine-grained SVM
 // requires HSA Full Profile to operate successfully
@@ -24,7 +24,7 @@ class prog {
   prog(_Tp (&t)[N]) [[cpu, hc]] : input(t) {
   }
 
-  void operator() (index<1>& idx) [[hc]] {
+  void operator() (index<1>& idx) const [[hc]] {
     input[idx[0]] = idx[0];
   }
 
@@ -51,7 +51,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
 
     // prepare test data
     int input_int[SIZE] { 0 };
diff --git a/tests/Unit/HSA/functor5.cpp b/tests/Unit/HSA/functor5.cpp
index abf189ed973..862d2f7e295 100644
--- a/tests/Unit/HSA/functor5.cpp
+++ b/tests/Unit/HSA/functor5.cpp
@@ -1,12 +1,12 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with fine-grained SVM
 // requires HSA Full Profile to operate successfully
@@ -35,7 +35,7 @@ class prog {
   prog(_Tp (&t)[N], user_functor<_Tp, N>& f) [[cpu, hc]] : input(t), kernel(f) {
   }
 
-  void operator() (index<1>& idx) [[hc]] {
+  void operator() (index<1>& idx) const [[hc]] {
     kernel(idx, input);
   }
 
@@ -62,7 +62,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
 
     // prepare test data
     int input_int[SIZE] { 0 };
diff --git a/tests/Unit/HSA/functor6.cpp b/tests/Unit/HSA/functor6.cpp
index 0bc9e4955e9..0bb5eb6933f 100644
--- a/tests/Unit/HSA/functor6.cpp
+++ b/tests/Unit/HSA/functor6.cpp
@@ -1,12 +1,12 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with fine-grained SVM
 // requires HSA Full Profile to operate successfully
@@ -25,7 +25,7 @@ class user_functor {
 
   user_functor(_Tp (&t)[N]) [[cpu, hc]] : input(t) {}
 
-  void operator() (index<1>& idx) [[hc]] {
+  void operator() (index<1>& idx) const [[hc]] {
     input[idx[0]] = idx[0];
   }
 };
@@ -38,7 +38,7 @@ class prog {
   prog(_Tp& f) [[cpu, hc]] : kernel(f) {
   }
 
-  void operator() (index<1>& idx) [[hc]] {
+  void operator() (index<1>& idx) const [[hc]] {
     kernel(idx);
   }
 
@@ -65,7 +65,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
 
     // prepare test data
     int input_int[SIZE] { 0 };
diff --git a/tests/Unit/HSA/list.cpp b/tests/Unit/HSA/list.cpp
index dab10a719b5..95ef17c024b 100644
--- a/tests/Unit/HSA/list.cpp
+++ b/tests/Unit/HSA/list.cpp
@@ -3,10 +3,10 @@
 
 #include <vector>
 #include <iostream>
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with fine-grained SVM
 // requires HSA Full Profile to operate successfully
@@ -68,7 +68,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/HSA/list2.cpp b/tests/Unit/HSA/list2.cpp
index bac288fe92b..2661caa2bae 100644
--- a/tests/Unit/HSA/list2.cpp
+++ b/tests/Unit/HSA/list2.cpp
@@ -3,12 +3,12 @@
 
 #include <vector>
 #include <iostream>
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <malloc.h>
 #include <string.h>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with fine-grained SVM
 // requires HSA Full Profile to operate successfully
@@ -56,7 +56,7 @@ class list_head {
 #define NUM_LIST_NODES (20)
 
 
-list_head *list_insert_new(list_head *llist_head, list_head *newitem, list_data *info, int idx) restrict (amp, cpu) {
+list_head *list_insert_new(list_head *llist_head, list_head *newitem, list_data *info, int idx) [[cpu, hc]] {
 
 	newitem->next=llist_head[idx].next;
 	llist_head[idx].next=newitem;
@@ -128,7 +128,7 @@ int main()
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/HSA/no_printf.cpp b/tests/Unit/HSA/no_printf.cpp
index f627a979055..76547cbd9f3 100644
--- a/tests/Unit/HSA/no_printf.cpp
+++ b/tests/Unit/HSA/no_printf.cpp
@@ -1,10 +1,10 @@
-// RUN: %hc %s -lhc_am -o %t.out && %t.out | %FileCheck -allow-empty %s
+// RUN: %hc %s  -o %t.out && %t.out | %FileCheck -allow-empty %s
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <hc_printf.hpp>
 
 int main() {
-  hc::parallel_for_each(hc::extent<1>(1), []() [[hc]] {
+  hc::parallel_for_each(hc::extent<1>(1), [](hc::index<1>) [[hc]] {
       hc::printf("Accelerator: Hello World!\n");
   }).wait();
   return 0;
diff --git a/tests/Unit/HSA/printf.cpp b/tests/Unit/HSA/printf.cpp
index 0cde182c411..dcf5823ca77 100644
--- a/tests/Unit/HSA/printf.cpp
+++ b/tests/Unit/HSA/printf.cpp
@@ -1,8 +1,8 @@
-// RUN: %hc %s -DHCC_ENABLE_ACCELERATOR_PRINTF -lhc_am -o %t.out && %t.out | %FileCheck %s
+// RUN: %hc %s -DHCC_ENABLE_ACCELERATOR_PRINTF  -o %t.out && %t.out | %FileCheck %s
 
 #include <cassert>
-#include <hc.hpp>
-#include <hc_printf.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_printf.hpp>
 
 // create 2 tiles of 64 threads
 #define TILE (64)
diff --git a/tests/Unit/HSA/printf_error_check.cpp b/tests/Unit/HSA/printf_error_check.cpp
index d689972e599..245791df4be 100644
--- a/tests/Unit/HSA/printf_error_check.cpp
+++ b/tests/Unit/HSA/printf_error_check.cpp
@@ -1,8 +1,8 @@
-// RUN: %hc %s -DHCC_ENABLE_ACCELERATOR_PRINTF -DCHECK_PRINTF_ERROR -lhc_am -o %t.out && %t.out | %FileCheck %s
+// RUN: %hc %s -DHCC_ENABLE_ACCELERATOR_PRINTF -DCHECK_PRINTF_ERROR  -o %t.out && %t.out | %FileCheck %s
 
 #include <cassert>
-#include <hc.hpp>
-#include <hc_printf.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_printf.hpp>
 
 // create 2 tiles of 64 threads
 #define TILE (64)
diff --git a/tests/Unit/HSA/printf_excess_args.cpp b/tests/Unit/HSA/printf_excess_args.cpp
index f70cdc392d1..26f29d66b5d 100644
--- a/tests/Unit/HSA/printf_excess_args.cpp
+++ b/tests/Unit/HSA/printf_excess_args.cpp
@@ -1,8 +1,8 @@
-// RUN: %hc %s -DHCC_ENABLE_ACCELERATOR_PRINTF -lhc_am -o %t.out && %t.out | %FileCheck %s
+// RUN: %hc %s -DHCC_ENABLE_ACCELERATOR_PRINTF  -o %t.out && %t.out | %FileCheck %s
 
 #include <cassert>
-#include <hc.hpp>
-#include <hc_printf.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_printf.hpp>
 
 // create 2 tiles of 64 threads
 #define TILE (64)
diff --git a/tests/Unit/HSA/printf_minimal.cpp b/tests/Unit/HSA/printf_minimal.cpp
index a06d50a61a0..2bcf6cd4371 100644
--- a/tests/Unit/HSA/printf_minimal.cpp
+++ b/tests/Unit/HSA/printf_minimal.cpp
@@ -1,10 +1,10 @@
-// RUN: %hc %s -DHCC_ENABLE_ACCELERATOR_PRINTF -lhc_am -o %t.out && %t.out | %FileCheck %s
+// RUN: %hc %s -DHCC_ENABLE_ACCELERATOR_PRINTF  -o %t.out && %t.out | %FileCheck %s
 
-#include <hc.hpp>
-#include <hc_printf.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_printf.hpp>
 
 int main() {
-  hc::parallel_for_each(hc::extent<1>(1), []() [[hc]] {
+  hc::parallel_for_each(hc::extent<1>(1), [](hc::index<1>) [[hc]] {
       hc::printf("Accelerator: Hello World!\n");
   }).wait();
   return 0;
diff --git a/tests/Unit/HSA/printf_ptr_addr.cpp b/tests/Unit/HSA/printf_ptr_addr.cpp
index 7dfc8e1aa17..4e49e9c7403 100644
--- a/tests/Unit/HSA/printf_ptr_addr.cpp
+++ b/tests/Unit/HSA/printf_ptr_addr.cpp
@@ -1,7 +1,7 @@
-// RUN: %hc %s -DHCC_ENABLE_ACCELERATOR_PRINTF -lhc_am -o %t.out && %t.out | %FileCheck %s
+// RUN: %hc %s -DHCC_ENABLE_ACCELERATOR_PRINTF  -o %t.out && %t.out | %FileCheck %s
 
-#include <hc.hpp>
-#include <hc_printf.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_printf.hpp>
 
 #include <iostream>
 
diff --git a/tests/Unit/HSA/printf_supported_types.cpp b/tests/Unit/HSA/printf_supported_types.cpp
index e2c770e73c5..f11b032bce0 100644
--- a/tests/Unit/HSA/printf_supported_types.cpp
+++ b/tests/Unit/HSA/printf_supported_types.cpp
@@ -1,8 +1,8 @@
-// RUN: %hc %s -DHCC_ENABLE_ACCELERATOR_PRINTF -lhc_am -o %t.out && %t.out | %FileCheck %s
+// RUN: %hc %s -DHCC_ENABLE_ACCELERATOR_PRINTF  -o %t.out && %t.out | %FileCheck %s
 
 #include <cassert>
-#include <hc.hpp>
-#include <hc_printf.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_printf.hpp>
 
 // create 2 tiles of 64 threads
 #define TILE (8)
diff --git a/tests/Unit/HSA/sizeof.cpp b/tests/Unit/HSA/sizeof.cpp
index 2aadd180328..123139f4060 100644
--- a/tests/Unit/HSA/sizeof.cpp
+++ b/tests/Unit/HSA/sizeof.cpp
@@ -1,12 +1,12 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with fine-grained SVM
 // requires HSA Full Profile to operate successfully
@@ -33,7 +33,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
 
     ret &= test<char>();
     ret &= test<int>();
diff --git a/tests/Unit/HSA/string.cpp b/tests/Unit/HSA/string.cpp
index 3858ee8c7f9..979f8f4cc09 100644
--- a/tests/Unit/HSA/string.cpp
+++ b/tests/Unit/HSA/string.cpp
@@ -2,10 +2,9 @@
 // RUN: %hc %s -o %t.out && %t.out
 
 #include <iostream>
-#include <hc.hpp>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with fine-grained SVM
 // requires HSA Full Profile to operate successfully
@@ -60,7 +59,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/HSA/volatile_union.cpp b/tests/Unit/HSA/volatile_union.cpp
index f71212fac87..1647f5d1fbb 100644
--- a/tests/Unit/HSA/volatile_union.cpp
+++ b/tests/Unit/HSA/volatile_union.cpp
@@ -1,10 +1,10 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with fine-grained SVM
 // requires HSA Full Profile to operate successfully
@@ -16,11 +16,11 @@
 
 #define SIZE (128)
 
-__attribute__((amp,cpu)) void p(VOLATILE float* fp) {
+[[cpu, hc]] void p(VOLATILE float* fp) {
   *fp = 100.0f;
 }
 
-__attribute__((amp,cpu)) float foo1(float a) {
+[[cpu, hc]] float foo1(float a) {
   union {
     VOLATILE float* fp;
     VOLATILE int* ip;
@@ -33,7 +33,7 @@ __attribute__((amp,cpu)) float foo1(float a) {
   return *(u.fp);
 }
 
-__attribute__((amp,cpu)) float foo2(float a) {
+[[cpu, hc]] float foo2(float a) {
 
   VOLATILE float* fp;
   VOLATILE int* ip;
@@ -90,7 +90,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/Indexing/extent.cpp b/tests/Unit/Indexing/extent.cpp
index e496c65a533..ec67fa4b9ab 100644
--- a/tests/Unit/Indexing/extent.cpp
+++ b/tests/Unit/Indexing/extent.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 #include <iostream> 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <vector>
 using namespace hc; 
 int main() 
diff --git a/tests/Unit/Indexing/index.cpp b/tests/Unit/Indexing/index.cpp
index 8a65ce5eb72..b622fac56d8 100644
--- a/tests/Unit/Indexing/index.cpp
+++ b/tests/Unit/Indexing/index.cpp
@@ -1,8 +1,8 @@
-// RUN: %amp_device -c -S -D__KALMAR_ACCELERATOR__ -emit-llvm %s -O -o -|%cppfilt|%FileCheck %s
+// RUN: %amp_device -c -S -D__HCC_ACCELERATOR__ -emit-llvm %s -O -o -|%cppfilt|%FileCheck %s
 // RUN: %gtest_amp %s -o %t && %t
 // Testing if an efficient (i.e. fully inlined version) of hc::index
-#include <hc.hpp>
-#ifndef __KALMAR_ACCELERATOR__ //Device mode compilation cannot have RTTI
+#include <hc/hc.hpp>
+#ifndef __HCC_ACCELERATOR__ //Device mode compilation cannot have RTTI
 #include <gtest/gtest.h>
 #endif
 #define N0 10
@@ -18,7 +18,7 @@ int foo(int k) [[hc]]{
 //CHECK-NOT: load
 //CHECK: }
 
-#ifndef __KALMAR_ACCELERATOR__ //Device mode compilation cannot have RTTI
+#ifndef __HCC_ACCELERATOR__ //Device mode compilation cannot have RTTI
 // Test correctness
 TEST(ClassIndex, Index1D) {
   int n0 = N0;
diff --git a/tests/Unit/Indexing/tile_index.cpp b/tests/Unit/Indexing/tile_index.cpp
index 64bb3707420..98e54a5dcad 100644
--- a/tests/Unit/Indexing/tile_index.cpp
+++ b/tests/Unit/Indexing/tile_index.cpp
@@ -1,7 +1,7 @@
 // XFAIL: *
 // RUN: %cxxamp %s -o %t.out && %t.out
 #include <iostream> 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <vector>
 using namespace hc;
 int test_1d() {
@@ -11,10 +11,9 @@ int test_1d() {
 
   extent<1> e(100);
   {
-    array_view<int, 1> av(e, vv.data()); 
-    parallel_for_each(av.get_extent().tile(5),
-      [=](tiled_index<1> idx) [[hc]] { 
-	av(idx) = 
+    array_view<int, 1> av(e, vv.data());
+    parallel_for_each(av.get_extent().tile(5), [=](tiled_index<1> idx) [[hc]] {
+	av(idx) =
           idx.tile[0] +
           idx.tile_origin[0] * 100;
       });
@@ -32,10 +31,10 @@ int test_2d()
 
   extent<2> e(10, 20);
   {
-    array_view<int, 2> av(e, vv.data()); 
-    parallel_for_each(av.get_extent().tile(5,5),
-      [=](tiled_index<2> idx) [[hc]] { 
-	av(idx) = 
+    array_view<int, 2> av(e, vv.data());
+    parallel_for_each(
+      av.get_extent().tile(5, 5), [=](tiled_index<2> idx) [[hc]] {
+	av(idx) =
           idx.tile[0] +
           idx.tile[1] * 10 +
           idx.tile_origin[0] * 100 +
@@ -69,7 +68,7 @@ int test_tiled_extent_1d(void) {
 
 int test_tiled_extent_2d(void) {
   extent<2> e(123, 456);
-  tiled_extent<2> myTileExtent(e.tile(10,30));
+  tiled_extent<2> myTileExtent(e.tile(10, 30));
   auto padded = myTileExtent.pad();
   assert(padded[0] == 130);
   assert(padded[1] == 480);
diff --git a/tests/Unit/InlineASM/inline_asm_vaddf32.cpp b/tests/Unit/InlineASM/inline_asm_vaddf32.cpp
index 5cdf18e92e1..eb23bf07804 100644
--- a/tests/Unit/InlineASM/inline_asm_vaddf32.cpp
+++ b/tests/Unit/InlineASM/inline_asm_vaddf32.cpp
@@ -6,7 +6,7 @@
 #include <cmath>
 
 // header file for the hc API
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #define N  (1024 * 500)
 
diff --git a/tests/Unit/InlineASM/inline_asm_vmacf32.cpp b/tests/Unit/InlineASM/inline_asm_vmacf32.cpp
index f77bdaf6f04..f8f3f94fee6 100644
--- a/tests/Unit/InlineASM/inline_asm_vmacf32.cpp
+++ b/tests/Unit/InlineASM/inline_asm_vmacf32.cpp
@@ -6,7 +6,7 @@
 #include <cmath>
 
 // header file for the hc API
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #define N  (1024 * 500)
 
diff --git a/tests/Unit/InvalidLambda/empty_lambda2.cpp b/tests/Unit/InvalidLambda/empty_lambda2.cpp
index 1b8ac9b7848..b40fb6bffee 100644
--- a/tests/Unit/InvalidLambda/empty_lambda2.cpp
+++ b/tests/Unit/InvalidLambda/empty_lambda2.cpp
@@ -1,5 +1,5 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 int main()
 {
     // This test outlines a subtle issue with how we obtain mangled kernel names
@@ -8,5 +8,5 @@ int main()
 	hc::array_view<int> gpu_resultsv(1, &gpu_result);
     gpu_resultsv.discard_data();
     static auto fun = [&]() [[cpu, hc]] { return 0; };
-    hc::parallel_for_each(gpu_resultsv.get_extent(), [=] (hc::index<1> idx) restrict (amp) { gpu_resultsv[idx] = fun(); });
+    hc::parallel_for_each(gpu_resultsv.get_extent(), [=] (hc::index<1> idx) [[hc]] { gpu_resultsv[idx] = fun(); });
 }
diff --git a/tests/Unit/InvalidLambda/qq.cpp b/tests/Unit/InvalidLambda/qq.cpp
index d993e144e7d..bc3a2139bf1 100644
--- a/tests/Unit/InvalidLambda/qq.cpp
+++ b/tests/Unit/InvalidLambda/qq.cpp
@@ -1,5 +1,5 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <stdlib.h>
 #include <iostream>
 int main(void){
@@ -17,7 +17,7 @@ int main(void){
   for (hc::index<1> i(0); i[0] < vecSize; i++) {
     ga[i] = 100.0f * rand() / RAND_MAX;
     gb[i] = 100.0f * rand() / RAND_MAX;
-    sum += a[i] + b[i];
+    sum += ga[i] + gb[i];
   }
 
   hc::parallel_for_each(
diff --git a/tests/Unit/Lock/lock_host_pointer.cpp b/tests/Unit/Lock/lock_host_pointer.cpp
index 9bafbc23efe..f4f4050f0db 100644
--- a/tests/Unit/Lock/lock_host_pointer.cpp
+++ b/tests/Unit/Lock/lock_host_pointer.cpp
@@ -1,8 +1,8 @@
 
-// RUN: %hc %s -lhc_am -o %t.out; %t.out
+// RUN: %hc %s  -o %t.out; %t.out
 
-#include <hc.hpp>
-#include <hc_am.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_am.hpp>
 
 int main()
 {
diff --git a/tests/Unit/Lock/unlock_host_pointer.cpp b/tests/Unit/Lock/unlock_host_pointer.cpp
index 97675dabe36..54a8f6cd593 100644
--- a/tests/Unit/Lock/unlock_host_pointer.cpp
+++ b/tests/Unit/Lock/unlock_host_pointer.cpp
@@ -1,8 +1,8 @@
 
-// RUN: %hc %s -lhc_am -o %t.out; %t.out
+// RUN: %hc %s  -o %t.out; %t.out
 
-#include <hc.hpp>
-#include <hc_am.hpp>
+#include <hc/hc.hpp>
+#include <hc/hc_am.hpp>
 
 int main()
 {
diff --git a/tests/Unit/Macro/check_hcc.cpp b/tests/Unit/Macro/check_hcc.cpp
index c5273dd47f3..53a96e2cbdc 100644
--- a/tests/Unit/Macro/check_hcc.cpp
+++ b/tests/Unit/Macro/check_hcc.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <hc_defines.h>
+#include <hc/hc_defines.hpp>
 #include <iostream>
 
 #ifndef __HCC__
diff --git a/tests/Unit/Macro/check_hcc_accelerator.cpp b/tests/Unit/Macro/check_hcc_accelerator.cpp
index 662e78c4670..54b616f6238 100644
--- a/tests/Unit/Macro/check_hcc_accelerator.cpp
+++ b/tests/Unit/Macro/check_hcc_accelerator.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 int main() {
 
diff --git a/tests/Unit/Macro/check_hcc_amp.cpp b/tests/Unit/Macro/check_hcc_amp.cpp
deleted file mode 100644
index 19109786260..00000000000
--- a/tests/Unit/Macro/check_hcc_amp.cpp
+++ /dev/null
@@ -1,15 +0,0 @@
-// RUN: %cxxamp %s -o %t.out && %t.out
-
-#ifndef __HCC_AMP__
-#error __HCC_AMP__ is not defined!
-#endif
-
-// __HCC_HC__ and __HCC_AMP__ are mutually exclusive
-#ifdef __HCC_HC__
-#error __HCC_HC__ is defined!
-#endif
-
-int main() {
-  return 0;
-}
-
diff --git a/tests/Unit/Macro/check_hcc_cpu.cpp b/tests/Unit/Macro/check_hcc_cpu.cpp
index c315b819144..12d6820b445 100644
--- a/tests/Unit/Macro/check_hcc_cpu.cpp
+++ b/tests/Unit/Macro/check_hcc_cpu.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 int main() {
 
diff --git a/tests/Unit/Macro/check_kalmar_cc.cpp b/tests/Unit/Macro/check_kalmar_cc.cpp
index 37977446212..a2c2e66ad06 100644
--- a/tests/Unit/Macro/check_kalmar_cc.cpp
+++ b/tests/Unit/Macro/check_kalmar_cc.cpp
@@ -1,7 +1,7 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#ifndef __KALMAR_CC__
-#error __KALMAR_CC__ is not defined!
+#ifndef __HC_CC__
+  #error __HC_CC__ is not defined!
 #endif
 
 int main() {
diff --git a/tests/Unit/NamespaceScopeVariables/Inputs/shared_object_needs_global.cc b/tests/Unit/NamespaceScopeVariables/Inputs/shared_object_needs_global.cc
index 7ecd7190404..8601d5bf4cb 100644
--- a/tests/Unit/NamespaceScopeVariables/Inputs/shared_object_needs_global.cc
+++ b/tests/Unit/NamespaceScopeVariables/Inputs/shared_object_needs_global.cc
@@ -1,6 +1,6 @@
 #include "test_parameters.hpp"
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <algorithm>
 
@@ -16,7 +16,7 @@ bool test_scalar()
 
     if (read_scalar[0] != global_scalar) return false;
 
-    parallel_for_each(hc::extent<1>{1}, [=](index<1>) [[hc]] {
+    parallel_for_each(hc::extent<1>{1}, [](index<1>) [[hc]] {
         ++global_scalar;
     });
 
@@ -41,7 +41,7 @@ bool test_array()
         return false;
     }
 
-    parallel_for_each(hc::extent<1>{1}, [=](index<1>) [[hc]] {
+    parallel_for_each(hc::extent<1>{1}, [](index<1>) [[hc]] {
         for (auto&& x : global_array) ++x;
     });
 
diff --git a/tests/Unit/NamespaceScopeVariables/Inputs/shared_object_needs_namespace.cc b/tests/Unit/NamespaceScopeVariables/Inputs/shared_object_needs_namespace.cc
index e7ea933c6ce..1f4c0fb78d9 100644
--- a/tests/Unit/NamespaceScopeVariables/Inputs/shared_object_needs_namespace.cc
+++ b/tests/Unit/NamespaceScopeVariables/Inputs/shared_object_needs_namespace.cc
@@ -1,6 +1,6 @@
 #include "test_parameters.hpp"
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <algorithm>
 
@@ -17,7 +17,7 @@ bool test_scalar()
 
     if (read_scalar[0] != namespace_scalar) return false;
 
-    parallel_for_each(hc::extent<1>{1}, [=](index<1>) [[hc]] {
+    parallel_for_each(hc::extent<1>{1}, [](index<1>) [[hc]] {
         ++namespace_scalar;
     });
 
@@ -46,7 +46,7 @@ bool test_array()
         return false;
     }
 
-    parallel_for_each(hc::extent<1>{1}, [=](index<1>) [[hc]] {
+    parallel_for_each(hc::extent<1>{1}, [](index<1>) [[hc]] {
         for (auto&& x : namespace_array) ++x;
     });
 
diff --git a/tests/Unit/NamespaceScopeVariables/global.cpp b/tests/Unit/NamespaceScopeVariables/global.cpp
index 203abfa99c4..57ab3a70152 100644
--- a/tests/Unit/NamespaceScopeVariables/global.cpp
+++ b/tests/Unit/NamespaceScopeVariables/global.cpp
@@ -2,7 +2,7 @@
 
 #include "Inputs/test_parameters.hpp"
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <algorithm>
 #include <cstdlib>
diff --git a/tests/Unit/NamespaceScopeVariables/global_different_translation_units.cpp b/tests/Unit/NamespaceScopeVariables/global_different_translation_units.cpp
index 42fcf9a4dad..8f2e09684db 100644
--- a/tests/Unit/NamespaceScopeVariables/global_different_translation_units.cpp
+++ b/tests/Unit/NamespaceScopeVariables/global_different_translation_units.cpp
@@ -2,7 +2,7 @@
 
 #include "Inputs/test_parameters.hpp"
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <algorithm>
 #include <cstdlib>
diff --git a/tests/Unit/NamespaceScopeVariables/global_from_shared_object.cpp b/tests/Unit/NamespaceScopeVariables/global_from_shared_object.cpp
index cbd4ff6fa42..8040b991a23 100644
--- a/tests/Unit/NamespaceScopeVariables/global_from_shared_object.cpp
+++ b/tests/Unit/NamespaceScopeVariables/global_from_shared_object.cpp
@@ -3,7 +3,7 @@
 
 #include "Inputs/test_parameters.hpp"
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <algorithm>
 #include <cstdlib>
diff --git a/tests/Unit/NamespaceScopeVariables/namespace.cpp b/tests/Unit/NamespaceScopeVariables/namespace.cpp
index 45bed64ecfc..8efc73f7707 100644
--- a/tests/Unit/NamespaceScopeVariables/namespace.cpp
+++ b/tests/Unit/NamespaceScopeVariables/namespace.cpp
@@ -2,7 +2,7 @@
 
 #include "Inputs/test_parameters.hpp"
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <algorithm>
 #include <cstdlib>
diff --git a/tests/Unit/NamespaceScopeVariables/namespace_different_translation_units.cpp b/tests/Unit/NamespaceScopeVariables/namespace_different_translation_units.cpp
index e7b14b4b93a..fa9e265b170 100644
--- a/tests/Unit/NamespaceScopeVariables/namespace_different_translation_units.cpp
+++ b/tests/Unit/NamespaceScopeVariables/namespace_different_translation_units.cpp
@@ -2,7 +2,7 @@
 
 #include "Inputs/test_parameters.hpp"
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <algorithm>
 #include <cstdlib>
diff --git a/tests/Unit/NamespaceScopeVariables/namespace_from_shared_object.cpp b/tests/Unit/NamespaceScopeVariables/namespace_from_shared_object.cpp
index ba252c39526..2c48479a1fb 100644
--- a/tests/Unit/NamespaceScopeVariables/namespace_from_shared_object.cpp
+++ b/tests/Unit/NamespaceScopeVariables/namespace_from_shared_object.cpp
@@ -3,7 +3,7 @@
 
 #include "Inputs/test_parameters.hpp"
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <algorithm>
 #include <cstdlib>
diff --git a/tests/Unit/OCKL/AsyncStreams/as_consumer.h b/tests/Unit/OCKL/AsyncStreams/as_consumer.h
index d8d09eafe38..65cb38bb8c7 100644
--- a/tests/Unit/OCKL/AsyncStreams/as_consumer.h
+++ b/tests/Unit/OCKL/AsyncStreams/as_consumer.h
@@ -1,7 +1,7 @@
 #ifndef __AS_CONSUMER_H__
 #define __AS_CONSUMER_H__
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <hsa/hsa.h>
 
 #include <iostream>
diff --git a/tests/Unit/Overload/Caller-amp-only-Callee-global-cpu-only.cpp b/tests/Unit/Overload/Caller-amp-only-Callee-global-cpu-only.cpp
index da68e56c800..54da7d58779 100644
--- a/tests/Unit/Overload/Caller-amp-only-Callee-global-cpu-only.cpp
+++ b/tests/Unit/Overload/Caller-amp-only-Callee-global-cpu-only.cpp
@@ -1,6 +1,6 @@
 // XFAIL: *
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 using namespace hc;
 
 void foo()
diff --git a/tests/Unit/Overload/Disjoint_restrict.cpp b/tests/Unit/Overload/Disjoint_restrict.cpp
index 5385529ae16..7b2fea113e5 100644
--- a/tests/Unit/Overload/Disjoint_restrict.cpp
+++ b/tests/Unit/Overload/Disjoint_restrict.cpp
@@ -1,7 +1,8 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-#include <hc_math.hpp>
+#include <hc/hc.hpp>
+
 #include <iostream>
+
 using namespace hc;
 
 int test() [[cpu, hc]]
@@ -49,10 +50,9 @@ void runall_result::verify_exit_code() [[cpu]]
 int main()
 {
 	runall_result gpu_result;
-	hc::array_view<runall_result> gpu_resultsv(1, &gpu_result);
+	array_view<runall_result> gpu_resultsv(1, &gpu_result);
 
-	hc::parallel_for_each(gpu_resultsv.get_extent(), [=](hc::index<1> idx) [[hc]]
-	{
+	parallel_for_each(gpu_resultsv.get_extent(), [=](index<1> idx) [[hc]]	{
 		gpu_resultsv[idx] = test();
 	});
-}
+}
\ No newline at end of file
diff --git a/tests/Unit/Overload/Negative/call_amp_function_in_cpu_function_or_lambda_or_pfe.cpp b/tests/Unit/Overload/Negative/call_amp_function_in_cpu_function_or_lambda_or_pfe.cpp
index b6148cc4274..39bd6970d3b 100644
--- a/tests/Unit/Overload/Negative/call_amp_function_in_cpu_function_or_lambda_or_pfe.cpp
+++ b/tests/Unit/Overload/Negative/call_amp_function_in_cpu_function_or_lambda_or_pfe.cpp
@@ -4,7 +4,7 @@
 // Do not delete or add any line; it is referred to by absolute line number in the
 // FileCheck lines below
 //////////////////////////////////////////////////////////////////////////////////
-#include <hc.hpp>
+#include <hc/hc.hpp>
 using namespace hc;
 
 void foo() [[hc]]
diff --git a/tests/Unit/Overload/Negative/call_amp_function_in_main.cpp b/tests/Unit/Overload/Negative/call_amp_function_in_main.cpp
index 615c7a13191..79c819f3743 100644
--- a/tests/Unit/Overload/Negative/call_amp_function_in_main.cpp
+++ b/tests/Unit/Overload/Negative/call_amp_function_in_main.cpp
@@ -4,7 +4,7 @@
 // Do not delete or add any line; it is referred to by absolute line number in the
 // FileCheck lines below
 //////////////////////////////////////////////////////////////////////////////
-#include <hc.hpp>
+#include <hc/hc.hpp>
 using namespace hc;
 
 int foo() [[hc]]
diff --git a/tests/Unit/Overload/Negative/call_amp_linking_error.cpp b/tests/Unit/Overload/Negative/call_amp_linking_error.cpp
index 0d52983ce1f..44908829f4d 100644
--- a/tests/Unit/Overload/Negative/call_amp_linking_error.cpp
+++ b/tests/Unit/Overload/Negative/call_amp_linking_error.cpp
@@ -4,7 +4,7 @@
 // Do not delete or add any line; it is referred to by absolute line number in the
 // FileCheck lines below
 //////////////////////////////////////////////////////////////////////////////////
-#include <hc.hpp>
+#include <hc/hc.hpp>
 using namespace hc;
 
 int f1() [[hc]] {return 1;} 
diff --git a/tests/Unit/Overload/Negative/call_cpu_funtion_in_amp_function_or_lambda_or_pfe.cpp b/tests/Unit/Overload/Negative/call_cpu_funtion_in_amp_function_or_lambda_or_pfe.cpp
index f8658645949..c65969a7371 100644
--- a/tests/Unit/Overload/Negative/call_cpu_funtion_in_amp_function_or_lambda_or_pfe.cpp
+++ b/tests/Unit/Overload/Negative/call_cpu_funtion_in_amp_function_or_lambda_or_pfe.cpp
@@ -1,10 +1,10 @@
-// RUN: %amp_device -D__KALMAR_ACCELERATOR__ %s -emit-llvm -c -S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s
+// RUN: %amp_device -D__HCC_ACCELERATOR__ %s -emit-llvm -c -S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s
 
 //////////////////////////////////////////////////////////////////////////////////
 // Do not delete or add any line; it is referred to by absolute line number in the
 // FileCheck lines below
 //////////////////////////////////////////////////////////////////////////////////
-#include <hc.hpp>
+#include <hc/hc.hpp>
 using namespace hc;
 
 void foo()
diff --git a/tests/Unit/Overload/Negative/call_distinct_from_dual_context.cpp b/tests/Unit/Overload/Negative/call_distinct_from_dual_context.cpp
index 3da02280f6c..c211b78b5eb 100644
--- a/tests/Unit/Overload/Negative/call_distinct_from_dual_context.cpp
+++ b/tests/Unit/Overload/Negative/call_distinct_from_dual_context.cpp
@@ -1,10 +1,10 @@
-// RUN: %amp_device -D__KALMAR_ACCELERATOR__ %s -emit-llvm -c -S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s
+// RUN: %amp_device -D__HCC_ACCELERATOR__ %s -emit-llvm -c -S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s
 
 //////////////////////////////////////////////////////////////////////////////////
 // Do not delete or add any line; it is referred to by absolute line number in the
 // FileCheck lines below
 //////////////////////////////////////////////////////////////////////////////////
-#include <hc.hpp>
+#include <hc/hc.hpp>
 using namespace hc;
 
 int f1() [[cpu]] {return 1;} 
diff --git a/tests/Unit/Overload/Negative/linking_error.cpp b/tests/Unit/Overload/Negative/linking_error.cpp
index 91687e22563..3de901c5146 100644
--- a/tests/Unit/Overload/Negative/linking_error.cpp
+++ b/tests/Unit/Overload/Negative/linking_error.cpp
@@ -4,7 +4,7 @@
 // Do not delete or add any line; it is referred to by absolute line number in the
 // FileCheck lines below
 //////////////////////////////////////////////////////////////////////////////////
-#include <hc.hpp>
+#include <hc/hc.hpp>
 using namespace hc;
 
 int f2() [[hc]] {return 2;}
diff --git a/tests/Unit/Overload/Test_Overload.cpp b/tests/Unit/Overload/Test_Overload.cpp
index e663aef3cbb..8b68d3d192b 100644
--- a/tests/Unit/Overload/Test_Overload.cpp
+++ b/tests/Unit/Overload/Test_Overload.cpp
@@ -1,5 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
+
 using namespace hc;
 
 #define TEST_CPU
@@ -58,7 +59,7 @@ bool AMP_Func() [[hc]]
 #ifdef TEST_BOTH
 bool BOTH_CPU_AND_AMP() [[cpu, hc]]
 {
-#if __KALMAR_ACCELERATOR__
+#if __HCC_ACCELERATOR__
     if (callee() != 1)
 #else
     if (callee() != 2)
@@ -71,7 +72,7 @@ bool BOTH_CPU_AND_AMP() [[cpu, hc]]
 }
 #endif
 
-int main(int argc, char **argv)
+int main()
 {
     int flag;
 #ifdef TEST_CPU
@@ -86,12 +87,11 @@ int main(int argc, char **argv)
     // directly called is not allowed, we use pfe
     {
       int result;
-      hc::array_view<int> gpu_resultsv(1, &result);
-      hc::parallel_for_each(gpu_resultsv.get_extent(), [=](hc::index<1> idx) [[hc]]
-      {
+      array_view<int> gpu_resultsv(1, &result);
+      parallel_for_each(gpu_resultsv.get_extent(), [=](index<1> idx) [[hc]] {
         gpu_resultsv[idx] = AMP_Func();
       });
-    
+
        if(gpu_resultsv[0] == 0) { printf("AMP_Func Error! exit!\n"); exit(1);}
      }
 #endif
@@ -99,12 +99,12 @@ int main(int argc, char **argv)
 #ifdef TEST_BOTH
     {
       int result;
-      hc::array_view<int> gpu_resultsv(1, &result);
-      hc::parallel_for_each(gpu_resultsv.get_extent(), [=](hc::index<1> idx) [[cpu, hc]]
+      array_view<int> gpu_resultsv(1, &result);
+      parallel_for_each(gpu_resultsv.get_extent(), [=](index<1> idx) [[hc]]
       {
         gpu_resultsv[idx] = BOTH_CPU_AND_AMP();
       });
-    
+
        if(gpu_resultsv[0] == 0) { printf("BOTH_CPU_AND_AMP Error! exit!\n"); exit(1);}
      }
 #endif
diff --git a/tests/Unit/Overload/amp-lambda_or_pfe_in_main.cpp b/tests/Unit/Overload/amp-lambda_or_pfe_in_main.cpp
index be1e1202176..49c78dd1028 100644
--- a/tests/Unit/Overload/amp-lambda_or_pfe_in_main.cpp
+++ b/tests/Unit/Overload/amp-lambda_or_pfe_in_main.cpp
@@ -1,5 +1,5 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 using namespace hc;
 
 
diff --git a/tests/Unit/Overload/amp_lambda_or_pfe_in_a_cpu_or_cpu_elided_function_or_lambda.cpp b/tests/Unit/Overload/amp_lambda_or_pfe_in_a_cpu_or_cpu_elided_function_or_lambda.cpp
index 27baebea175..9c8c2962014 100644
--- a/tests/Unit/Overload/amp_lambda_or_pfe_in_a_cpu_or_cpu_elided_function_or_lambda.cpp
+++ b/tests/Unit/Overload/amp_lambda_or_pfe_in_a_cpu_or_cpu_elided_function_or_lambda.cpp
@@ -1,5 +1,5 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 using namespace hc;
 
 int CPU_Func()
diff --git a/tests/Unit/Overload/cpu_caller_distinct_callees.cpp b/tests/Unit/Overload/cpu_caller_distinct_callees.cpp
index 66efaa1958c..d91953a3ad5 100644
--- a/tests/Unit/Overload/cpu_caller_distinct_callees.cpp
+++ b/tests/Unit/Overload/cpu_caller_distinct_callees.cpp
@@ -1,5 +1,5 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 using namespace hc;
 
 
diff --git a/tests/Unit/Overload/cpu_function_or_lambda_in_main.cpp b/tests/Unit/Overload/cpu_function_or_lambda_in_main.cpp
index 4e27a6f76c0..e96370bb814 100644
--- a/tests/Unit/Overload/cpu_function_or_lambda_in_main.cpp
+++ b/tests/Unit/Overload/cpu_function_or_lambda_in_main.cpp
@@ -1,5 +1,5 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 using namespace hc;
 
 
diff --git a/tests/Unit/Overload/cpu_lambda_in_amp_function.cpp b/tests/Unit/Overload/cpu_lambda_in_amp_function.cpp
index d6a8e03dc3f..4e727452196 100644
--- a/tests/Unit/Overload/cpu_lambda_in_amp_function.cpp
+++ b/tests/Unit/Overload/cpu_lambda_in_amp_function.cpp
@@ -1,5 +1,5 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 using namespace hc;
 
 
diff --git a/tests/Unit/Parse/amp_header_test.cpp b/tests/Unit/Parse/amp_header_test.cpp
index 872143eef57..f3acdf2dd16 100644
--- a/tests/Unit/Parse/amp_header_test.cpp
+++ b/tests/Unit/Parse/amp_header_test.cpp
@@ -1,4 +1,4 @@
 // RUN: %cxxamp -c %s
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 
diff --git a/tests/Unit/Parse/class_cross_referencing.cpp b/tests/Unit/Parse/class_cross_referencing.cpp
index 8f19614df41..04e976e31f0 100644
--- a/tests/Unit/Parse/class_cross_referencing.cpp
+++ b/tests/Unit/Parse/class_cross_referencing.cpp
@@ -1,5 +1,5 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 class unorm;
 class norm {
diff --git a/tests/Unit/Parse/lambda_attribute.cpp b/tests/Unit/Parse/lambda_attribute.cpp
index 9c2448fd461..cb315b51ef6 100644
--- a/tests/Unit/Parse/lambda_attribute.cpp
+++ b/tests/Unit/Parse/lambda_attribute.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 
@@ -15,7 +15,7 @@ int main() {
 
   {
     // case 1: placed between parameter list and left bracket
-    auto k1 = [=] (const index<1>& idx) __attribute__((amp)) {
+    auto k1 = [=] (const index<1>& idx) [[hc]] {
       av[idx] = idx[0];
     };
   
@@ -31,7 +31,7 @@ int main() {
 
   {
     // case 2: placed between lambda introducer and parameter list
-    auto k2 = [=] __attribute__((amp)) (const index<1>& idx) {
+    auto k2 = [=] __attribute__((hc)) (const index<1>& idx) {
       av[idx] = idx[0];
     };
   
@@ -47,7 +47,7 @@ int main() {
 
   {
     // case 3: placed in front of lambda introducer
-    auto k3 = __attribute__((amp)) [=] (const index<1>& idx) {
+    auto k3 = __attribute__((hc)) [=] (const index<1>& idx) {
       av[idx] = idx[0];
     };
   
diff --git a/tests/Unit/Parse/lambda_attribute_hc.cpp b/tests/Unit/Parse/lambda_attribute_hc.cpp
index c6d132ee01d..6063a562572 100644
--- a/tests/Unit/Parse/lambda_attribute_hc.cpp
+++ b/tests/Unit/Parse/lambda_attribute_hc.cpp
@@ -1,6 +1,6 @@
 // RUN: %cxxamp %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 #include <iostream>
 
diff --git a/tests/Unit/Parse/method_declarator.cpp b/tests/Unit/Parse/method_declarator.cpp
index 6be7bb00586..83535c8502e 100644
--- a/tests/Unit/Parse/method_declarator.cpp
+++ b/tests/Unit/Parse/method_declarator.cpp
@@ -9,8 +9,6 @@ class AClass {
   int method_1() const;               // not a problem
 
   int method_2() [[cpu, hc]];  // should accept
-
-  int method_3() restrict;            // not to be confused with C++AMP restrict.
 };
 
 int func() [[hc]] {
diff --git a/tests/Unit/PlatformAtomics/atomic_int.cpp b/tests/Unit/PlatformAtomics/atomic_int.cpp
index cbc79e7da5c..e3ed8e82596 100644
--- a/tests/Unit/PlatformAtomics/atomic_int.cpp
+++ b/tests/Unit/PlatformAtomics/atomic_int.cpp
@@ -4,10 +4,9 @@
 #include <iostream>
 #include <random>
 #include <atomic>
-#include <hc.hpp>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with platform atomics functions
 // requires HSA Full Profile to operate successfully
@@ -67,7 +66,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/PlatformAtomics/pingpong.cpp b/tests/Unit/PlatformAtomics/pingpong.cpp
index 08f12c1633b..77a09456fbf 100644
--- a/tests/Unit/PlatformAtomics/pingpong.cpp
+++ b/tests/Unit/PlatformAtomics/pingpong.cpp
@@ -7,10 +7,9 @@
 #include <atomic>
 #include <thread>
 #include <chrono>
-#include <hc.hpp>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with platform atomics functions
 // requires HSA Full Profile to operate successfully
@@ -160,7 +159,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/PlatformAtomics/sync_1way.cpp b/tests/Unit/PlatformAtomics/sync_1way.cpp
index 22ba4ad2f76..904b26b9240 100644
--- a/tests/Unit/PlatformAtomics/sync_1way.cpp
+++ b/tests/Unit/PlatformAtomics/sync_1way.cpp
@@ -7,10 +7,9 @@
 #include <atomic>
 #include <thread>
 #include <chrono>
-#include <hc.hpp>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with platform atomics functions
 // requires HSA Full Profile to operate successfully
@@ -101,7 +100,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/PlatformAtomics/sync_2way.cpp b/tests/Unit/PlatformAtomics/sync_2way.cpp
index 83f057560af..77baaf5c323 100644
--- a/tests/Unit/PlatformAtomics/sync_2way.cpp
+++ b/tests/Unit/PlatformAtomics/sync_2way.cpp
@@ -7,10 +7,9 @@
 #include <atomic>
 #include <thread>
 #include <chrono>
-#include <hc.hpp>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with platform atomics functions
 // requires HSA Full Profile to operate successfully
@@ -121,7 +120,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/PlatformAtomics/syscall.cpp b/tests/Unit/PlatformAtomics/syscall.cpp
index 399deb9b317..c101e2aeebd 100644
--- a/tests/Unit/PlatformAtomics/syscall.cpp
+++ b/tests/Unit/PlatformAtomics/syscall.cpp
@@ -6,10 +6,9 @@
 #include <iomanip>
 #include <atomic>
 #include <thread>
-#include <hc.hpp>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with platform atomics functions
 // requires HSA Full Profile to operate successfully
@@ -172,7 +171,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/Pool/accelerator_get_compute_unit_count_api.cpp b/tests/Unit/Pool/accelerator_get_compute_unit_count_api.cpp
index 2e512b9479a..787444911df 100644
--- a/tests/Unit/Pool/accelerator_get_compute_unit_count_api.cpp
+++ b/tests/Unit/Pool/accelerator_get_compute_unit_count_api.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 /**
  * Test if hc::accelerator::get_compute_unit_count() works fine.
  * Create the default accelerator and check if the tested api returns
diff --git a/tests/Unit/Pool/accelerator_get_is_peer_api.cpp b/tests/Unit/Pool/accelerator_get_is_peer_api.cpp
index 340c661d4e6..1335efbf8c9 100644
--- a/tests/Unit/Pool/accelerator_get_is_peer_api.cpp
+++ b/tests/Unit/Pool/accelerator_get_is_peer_api.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -I%hsa_header_path -L%hsa_library_path -lhsa-runtime64 -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <hsa/hsa.h>
 #include <hsa/hsa_ext_amd.h>
 
diff --git a/tests/Unit/Pool/accelerator_get_is_peer_cpu1.cpp b/tests/Unit/Pool/accelerator_get_is_peer_cpu1.cpp
index e44144cc6b8..680f9d530f7 100644
--- a/tests/Unit/Pool/accelerator_get_is_peer_cpu1.cpp
+++ b/tests/Unit/Pool/accelerator_get_is_peer_cpu1.cpp
@@ -1,6 +1,6 @@
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 /**
  * So far, CPU accelerator is not peer of any other
diff --git a/tests/Unit/Pool/accelerator_get_is_peer_cpu2.cpp b/tests/Unit/Pool/accelerator_get_is_peer_cpu2.cpp
index 3204800458d..af6512a2d88 100644
--- a/tests/Unit/Pool/accelerator_get_is_peer_cpu2.cpp
+++ b/tests/Unit/Pool/accelerator_get_is_peer_cpu2.cpp
@@ -1,11 +1,12 @@
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
+#include <cassert>
 /**
  * So far, CPU accelerator is not peer of any other
  * accelerator. This test will pass CPU accelerator
- * to default accelerator and check if 
+ * to default accelerator and check if
  * get_is_peer() will return false.
  */
 
@@ -13,19 +14,9 @@ int main()
 {
     // Get Default accelerator.
     hc::accelerator acc;
+    hc::accelerator cpu{hc::accelerator::cpu_accelerator()};
 
-    const auto& all = hc::accelerator::get_all();
-
-    hc::accelerator cpu;
-
-    for(auto iter = all.begin(); iter != all.end(); iter++)
-    {
-        if(iter->get_is_emulated())
-        {
-            cpu = *iter;
-            break;
-        }
-    }
+    assert(acc != cpu);
 
     // Check get_is_peer() return value, if it is true,
     // then, test fails, return -1.
diff --git a/tests/Unit/Pool/accelerator_get_peers.cpp b/tests/Unit/Pool/accelerator_get_peers.cpp
index 81b8e363211..dccf4412f59 100644
--- a/tests/Unit/Pool/accelerator_get_peers.cpp
+++ b/tests/Unit/Pool/accelerator_get_peers.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 /**
  * Test if hc::accelerator::get_peers() works fine.
diff --git a/tests/Unit/Pool/accelerator_view_set_cu_mask.cpp b/tests/Unit/Pool/accelerator_view_set_cu_mask.cpp
index b953242a514..f71c8848749 100644
--- a/tests/Unit/Pool/accelerator_view_set_cu_mask.cpp
+++ b/tests/Unit/Pool/accelerator_view_set_cu_mask.cpp
@@ -1,7 +1,7 @@
 
 // RUN: %hc %s -o %t.out && %t.out
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 #include <vector>
 
 /**
@@ -50,7 +50,7 @@ int main()
 
     hc::extent<1> e(vec_size);
     hc::completion_future fut = hc::parallel_for_each(acc_view, e,
-                                [=](hc::index<1> idx) __HC__ {
+                                [=](hc::index<1> idx) [[hc]] {
                                   table_c[idx[0]] = table_a[idx[0]] + table_b[idx[0]];
                                 });
 
diff --git a/tests/Unit/Pool/map_to_peers_device_ptr.cpp b/tests/Unit/Pool/map_to_peers_device_ptr.cpp
index e2c4250c48d..5db9cb20559 100644
--- a/tests/Unit/Pool/map_to_peers_device_ptr.cpp
+++ b/tests/Unit/Pool/map_to_peers_device_ptr.cpp
@@ -1,8 +1,8 @@
 
-// RUN: %hc %s -lhc_am -o %t.out && %t.out
+// RUN: %hc %s  -o %t.out && %t.out
 
-#include <hc_am.hpp>
-#include <hc.hpp>
+#include <hc/hc_am.hpp>
+#include <hc/hc.hpp>
 
 int main()
 {
diff --git a/tests/Unit/Pool/map_to_peers_host_ptr.cpp b/tests/Unit/Pool/map_to_peers_host_ptr.cpp
index 42b37abec4e..42a3388cda3 100644
--- a/tests/Unit/Pool/map_to_peers_host_ptr.cpp
+++ b/tests/Unit/Pool/map_to_peers_host_ptr.cpp
@@ -1,14 +1,14 @@
 
-// RUN: %hc %s -lhc_am -o %t.out && %t.out
+// RUN: %hc %s  -o %t.out && %t.out
 
-#include <hc_am.hpp>
-#include <hc.hpp>
+#include <hc/hc_am.hpp>
+#include <hc/hc.hpp>
 
 int main()
 {
     hc::accelerator acc;
 
-    void* host_ptr = am_alloc(1, acc, amHostPinned);
+    void* host_ptr = am_alloc(1, acc, am_host_pinned);
 
     // allocation fails if return NULL.
     if(host_ptr == NULL)
diff --git a/tests/Unit/RawGenericPointer/array_add_am.cpp b/tests/Unit/RawGenericPointer/array_add_am.cpp
index 84d6068ba98..5af08a1cb41 100644
--- a/tests/Unit/RawGenericPointer/array_add_am.cpp
+++ b/tests/Unit/RawGenericPointer/array_add_am.cpp
@@ -1,10 +1,10 @@
 
-// RUN: %hc -lhc_am %s -o %t.out && %t.out
+// RUN: %hc  %s -o %t.out && %t.out
 
 #include <iostream>
 #include <random>
-#include <hc_am.hpp>
-#include <hc.hpp>
+#include <hc/hc_am.hpp>
+#include <hc/hc.hpp>
 
 bool test() {
   // define inputs and output
diff --git a/tests/Unit/RawGenericPointer/array_add_am10.cpp b/tests/Unit/RawGenericPointer/array_add_am10.cpp
index dcf48bcf0fd..95277f8883e 100644
--- a/tests/Unit/RawGenericPointer/array_add_am10.cpp
+++ b/tests/Unit/RawGenericPointer/array_add_am10.cpp
@@ -1,11 +1,11 @@
-// RUN: %hc -lhc_am %s -o %t.out && %t.out
+// RUN: %hc  %s -o %t.out && %t.out
 
 #include <iostream>
 #include <random>
 #include <type_traits>
 
-#include <hc_am.hpp>
-#include <hc.hpp>
+#include <hc/hc_am.hpp>
+#include <hc/hc.hpp>
 
 template<typename T>
 bool test() {
diff --git a/tests/Unit/RawGenericPointer/array_add_am11.cpp b/tests/Unit/RawGenericPointer/array_add_am11.cpp
index ab039ff2f9c..47b7b19141c 100644
--- a/tests/Unit/RawGenericPointer/array_add_am11.cpp
+++ b/tests/Unit/RawGenericPointer/array_add_am11.cpp
@@ -1,10 +1,10 @@
 
-// RUN: %hc -lhc_am %s -o %t.out && %t.out
+// RUN: %hc  %s -o %t.out && %t.out
 
 #include <iostream>
 #include <random>
-#include <hc_am.hpp>
-#include <hc.hpp>
+#include <hc/hc_am.hpp>
+#include <hc/hc.hpp>
 
 template<typename TTTT>
 [[hc]] void kfunc(TTTT* a, TTTT* b, TTTT* c, int idx) {
diff --git a/tests/Unit/RawGenericPointer/array_add_am12.cpp b/tests/Unit/RawGenericPointer/array_add_am12.cpp
index a17fd6c4640..ad0c2772c4a 100644
--- a/tests/Unit/RawGenericPointer/array_add_am12.cpp
+++ b/tests/Unit/RawGenericPointer/array_add_am12.cpp
@@ -1,11 +1,11 @@
-// RUN: %hc -lhc_am %s -o %t.out && %t.out
+// RUN: %hc  %s -o %t.out && %t.out
 
 #include <iostream>
 #include <random>
 #include <type_traits>
 
-#include <hc_am.hpp>
-#include <hc.hpp>
+#include <hc/hc_am.hpp>
+#include <hc/hc.hpp>
 
 template<typename T>
 [[hc]] void kfunc(T* a, T* b, T* c, int idx) {
diff --git a/tests/Unit/RawGenericPointer/array_add_am13.cpp b/tests/Unit/RawGenericPointer/array_add_am13.cpp
index b9abc1996a5..b6bd1d1d253 100644
--- a/tests/Unit/RawGenericPointer/array_add_am13.cpp
+++ b/tests/Unit/RawGenericPointer/array_add_am13.cpp
@@ -1,11 +1,11 @@
-// RUN: %hc -lhc_am %s -o %t.out && %t.out
+// RUN: %hc  %s -o %t.out && %t.out
 
 #include <iostream>
 #include <random>
 #include <type_traits>
 
-#include <hc_am.hpp>
-#include <hc.hpp>
+#include <hc/hc_am.hpp>
+#include <hc/hc.hpp>
 
 template<typename T>
 [[hc]] void setC(T* a, T* b, T* c, int idx) {
diff --git a/tests/Unit/RawGenericPointer/array_add_am14.cpp b/tests/Unit/RawGenericPointer/array_add_am14.cpp
index 950e144fe67..ddfdffc8735 100644
--- a/tests/Unit/RawGenericPointer/array_add_am14.cpp
+++ b/tests/Unit/RawGenericPointer/array_add_am14.cpp
@@ -1,11 +1,11 @@
-// RUN: %hc -lhc_am %s -o %t.out && %t.out
+// RUN: %hc  %s -o %t.out && %t.out
 
 #include <iostream>
 #include <random>
 #include <type_traits>
 
-#include <hc_am.hpp>
-#include <hc.hpp>
+#include <hc/hc_am.hpp>
+#include <hc/hc.hpp>
 
 template<typename T>
 [[hc]] void setC(T a, T b, T* c, int idx) {
diff --git a/tests/Unit/RawGenericPointer/array_add_am15.cpp b/tests/Unit/RawGenericPointer/array_add_am15.cpp
index dbbd920a541..e5cff68a023 100644
--- a/tests/Unit/RawGenericPointer/array_add_am15.cpp
+++ b/tests/Unit/RawGenericPointer/array_add_am15.cpp
@@ -1,11 +1,11 @@
-// RUN: %hc -lhc_am %s -o %t.out && %t.out
+// RUN: %hc  %s -o %t.out && %t.out
 
 #include <iostream>
 #include <random>
 #include <type_traits>
 
-#include <hc_am.hpp>
-#include <hc.hpp>
+#include <hc/hc_am.hpp>
+#include <hc/hc.hpp>
 
 class Bar {
 public:
diff --git a/tests/Unit/RawGenericPointer/array_add_am16.cpp b/tests/Unit/RawGenericPointer/array_add_am16.cpp
index 325b8484696..01c8a4ee700 100644
--- a/tests/Unit/RawGenericPointer/array_add_am16.cpp
+++ b/tests/Unit/RawGenericPointer/array_add_am16.cpp
@@ -1,11 +1,11 @@
-// RUN: %hc -lhc_am %s -o %t.out && %t.out
+// RUN: %hc  %s -o %t.out && %t.out
 
 #include <iostream>
 #include <random>
 #include <type_traits>
 
-#include <hc_am.hpp>
-#include <hc.hpp>
+#include <hc/hc_am.hpp>
+#include <hc/hc.hpp>
 
 class Bar {
 public:
diff --git a/tests/Unit/RawGenericPointer/array_add_am17.cpp b/tests/Unit/RawGenericPointer/array_add_am17.cpp
index 000f64ee0f2..02753d0e071 100644
--- a/tests/Unit/RawGenericPointer/array_add_am17.cpp
+++ b/tests/Unit/RawGenericPointer/array_add_am17.cpp
@@ -1,10 +1,10 @@
 
-// RUN: %hc -lhc_am %s -o %t.out && %t.out
+// RUN: %hc  %s -o %t.out && %t.out
 
 #include <iostream>
 #include <random>
-#include <hc_am.hpp>
-#include <hc.hpp>
+#include <hc/hc_am.hpp>
+#include <hc/hc.hpp>
 
 // file-scope global variable
 [[hc]] int g_d = 100;
diff --git a/tests/Unit/RawGenericPointer/array_add_am2.cpp b/tests/Unit/RawGenericPointer/array_add_am2.cpp
index 29b96ab9793..3c49790d499 100644
--- a/tests/Unit/RawGenericPointer/array_add_am2.cpp
+++ b/tests/Unit/RawGenericPointer/array_add_am2.cpp
@@ -1,10 +1,10 @@
 
-// RUN: %hc -lhc_am %s -o %t.out && %t.out
+// RUN: %hc  %s -o %t.out && %t.out
 
 #include <iostream>
 #include <random>
-#include <hc_am.hpp>
-#include <hc.hpp>
+#include <hc/hc_am.hpp>
+#include <hc/hc.hpp>
 
 bool test() {
   // define inputs and output
diff --git a/tests/Unit/RawGenericPointer/array_add_am3.cpp b/tests/Unit/RawGenericPointer/array_add_am3.cpp
index 184d41ea340..9cf1af08fe1 100644
--- a/tests/Unit/RawGenericPointer/array_add_am3.cpp
+++ b/tests/Unit/RawGenericPointer/array_add_am3.cpp
@@ -1,10 +1,10 @@
 
-// RUN: %hc -lhc_am %s -o %t.out && %t.out
+// RUN: %hc  %s -o %t.out && %t.out
 
 #include <iostream>
 #include <random>
-#include <hc_am.hpp>
-#include <hc.hpp>
+#include <hc/hc_am.hpp>
+#include <hc/hc.hpp>
 
 [[hc]] void kfunc(int* a, int* b, int* c, int idx) {
   c[idx] = a[idx] + b[idx];
diff --git a/tests/Unit/RawGenericPointer/array_add_am4.cpp b/tests/Unit/RawGenericPointer/array_add_am4.cpp
index fead8c3cfa8..fda338a4618 100644
--- a/tests/Unit/RawGenericPointer/array_add_am4.cpp
+++ b/tests/Unit/RawGenericPointer/array_add_am4.cpp
@@ -1,10 +1,10 @@
 
-// RUN: %hc -lhc_am %s -o %t.out && %t.out
+// RUN: %hc  %s -o %t.out && %t.out
 
 #include <iostream>
 #include <random>
-#include <hc_am.hpp>
-#include <hc.hpp>
+#include <hc/hc_am.hpp>
+#include <hc/hc.hpp>
 
 [[hc]] void setC(int a, int b, int* c, int idx) {
   c[idx] = a + b;
diff --git a/tests/Unit/RawGenericPointer/array_add_am5.cpp b/tests/Unit/RawGenericPointer/array_add_am5.cpp
index ea80840cfbe..6c565ac6623 100644
--- a/tests/Unit/RawGenericPointer/array_add_am5.cpp
+++ b/tests/Unit/RawGenericPointer/array_add_am5.cpp
@@ -1,10 +1,10 @@
 
-// RUN: %hc -lhc_am %s -o %t.out && %t.out
+// RUN: %hc  %s -o %t.out && %t.out
 
 #include <iostream>
 #include <random>
-#include <hc_am.hpp>
-#include <hc.hpp>
+#include <hc/hc_am.hpp>
+#include <hc/hc.hpp>
 
 [[hc]] void setC(int* a, int* b, int* c, int idx) {
   c[idx] = *a + *b;
diff --git a/tests/Unit/RawGenericPointer/array_add_am6.cpp b/tests/Unit/RawGenericPointer/array_add_am6.cpp
index 3fe419ffa52..35fdbc6de34 100644
--- a/tests/Unit/RawGenericPointer/array_add_am6.cpp
+++ b/tests/Unit/RawGenericPointer/array_add_am6.cpp
@@ -1,10 +1,10 @@
 
-// RUN: %hc -lhc_am %s -o %t.out && %t.out
+// RUN: %hc  %s -o %t.out && %t.out
 
 #include <iostream>
 #include <random>
-#include <hc_am.hpp>
-#include <hc.hpp>
+#include <hc/hc_am.hpp>
+#include <hc/hc.hpp>
 
 [[hc]] void setC(int a, int b, int* c, int idx) {
   c[idx] = a + b;
diff --git a/tests/Unit/RawGenericPointer/array_add_am7.cpp b/tests/Unit/RawGenericPointer/array_add_am7.cpp
index d00d0e45f34..ea3bac6d3e0 100644
--- a/tests/Unit/RawGenericPointer/array_add_am7.cpp
+++ b/tests/Unit/RawGenericPointer/array_add_am7.cpp
@@ -1,10 +1,10 @@
 
-// RUN: %hc -lhc_am %s -o %t.out && %t.out
+// RUN: %hc  %s -o %t.out && %t.out
 
 #include <iostream>
 #include <random>
-#include <hc_am.hpp>
-#include <hc.hpp>
+#include <hc/hc_am.hpp>
+#include <hc/hc.hpp>
 
 class Bar {
 public:
diff --git a/tests/Unit/RawGenericPointer/array_add_am8.cpp b/tests/Unit/RawGenericPointer/array_add_am8.cpp
index f7114a52ccd..ac1e994e83e 100644
--- a/tests/Unit/RawGenericPointer/array_add_am8.cpp
+++ b/tests/Unit/RawGenericPointer/array_add_am8.cpp
@@ -1,10 +1,10 @@
 
-// RUN: %hc -lhc_am %s -o %t.out && %t.out
+// RUN: %hc  %s -o %t.out && %t.out
 
 #include <iostream>
 #include <random>
-#include <hc_am.hpp>
-#include <hc.hpp>
+#include <hc/hc_am.hpp>
+#include <hc/hc.hpp>
 
 class Bar {
 public:
diff --git a/tests/Unit/RawGenericPointer/array_add_am9.cpp b/tests/Unit/RawGenericPointer/array_add_am9.cpp
index 9b71b8bb76e..384cf91f820 100644
--- a/tests/Unit/RawGenericPointer/array_add_am9.cpp
+++ b/tests/Unit/RawGenericPointer/array_add_am9.cpp
@@ -1,11 +1,11 @@
-// RUN: %hc -lhc_am %s -o %t.out && %t.out
+// RUN: %hc  %s -o %t.out && %t.out
 
 #include <iostream>
 #include <random>
 #include <type_traits>
 
-#include <hc_am.hpp>
-#include <hc.hpp>
+#include <hc/hc_am.hpp>
+#include <hc/hc.hpp>
 
 template<typename T>
 bool test() {
diff --git a/tests/Unit/RawPointer/array_add.cpp b/tests/Unit/RawPointer/array_add.cpp
index 08b2ca7e803..e669178e8fb 100644
--- a/tests/Unit/RawPointer/array_add.cpp
+++ b/tests/Unit/RawPointer/array_add.cpp
@@ -3,10 +3,9 @@
 
 #include <iostream>
 #include <random>
-#include <hc.hpp>
 
 // added for checking HSA profile
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 // test C++AMP with fine-grained SVM
 // requires HSA Full Profile to operate successfully
@@ -60,7 +59,7 @@ int main() {
   // only conduct the test in case we are running on a HSA full profile stack
   hc::accelerator acc;
   if (acc.is_hsa_accelerator() &&
-      acc.get_profile() == hc::hcAgentProfileFull) {
+      acc.get_profile() == hc::accelerator_profile_full) {
     ret &= test();
   }
 
diff --git a/tests/Unit/RestrictionSpecifier/Negative/empty_restriction.cpp b/tests/Unit/RestrictionSpecifier/Negative/empty_restriction.cpp
deleted file mode 100644
index 029b80bd4d8..00000000000
--- a/tests/Unit/RestrictionSpecifier/Negative/empty_restriction.cpp
+++ /dev/null
@@ -1,21 +0,0 @@
-// RUN: %amp_device -D__KALMAR_ACCELERATOR__ %s -emit-llvm -c -S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s
-
-//////////////////////////////////////////////////////////////////////////////////
-// Do not delete or add any line; it is referred to by absolute line number in the
-// FileCheck lines below
-//////////////////////////////////////////////////////////////////////////////////
-#include <hc.hpp>
-
-int foo() restrict()
-{
-  return 1;
-}
-// CHECK: empty_restriction.cpp:[[@LINE-4]]:20: error: empty restriction sepcifier is not allowed
-// CHECK-NEXT:int foo() restrict()
-// CHECK-NEXT:                   ^
-
-int main(void)
-{
-  return 2;
-}
-
diff --git a/tests/Unit/RestrictionSpecifier/Negative/id_is_unrecognized.cpp b/tests/Unit/RestrictionSpecifier/Negative/id_is_unrecognized.cpp
deleted file mode 100644
index 6fa1c8cf76a..00000000000
--- a/tests/Unit/RestrictionSpecifier/Negative/id_is_unrecognized.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-// RUN: %amp_device -D__KALMAR_ACCELERATOR__ %s -emit-llvm -c -S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s
-
-//////////////////////////////////////////////////////////////////////////////////
-// Do not delete or add any line; it is referred to by absolute line number in the
-// FileCheck lines below
-//////////////////////////////////////////////////////////////////////////////////
-
-#include <hc.hpp>
-
-int f1() restrict(cpu,auto1)
-{
-  return 1;
-}
-// CHECK: id_is_unrecognized.cpp:[[@LINE-4]]:23: error: 'auto1' : unrecognized restriction specifier
-// CHECK-NEXT:int f1() restrict(cpu,auto1)
-// CHECK-NEXT:                      ^
-
-int f2() restrict(auto2,,,,,)
-{
-  return 2;
-}
-// CHECK: id_is_unrecognized.cpp:[[@LINE-4]]:19: error: 'auto2' : unrecognized restriction specifier
-// CHECK-NEXT:int f2() restrict(auto2,,,,,)
-// CHECK-NEXT:                  ^
-
-int f3() restrict(,,auto2,,,)
-{
-  return 2;
-}
-// CHECK: id_is_unrecognized.cpp:[[@LINE-4]]:21: error: 'auto2' : unrecognized restriction specifier
-// CHECK-NEXT:int f3() restrict(,,auto2,,,)
-// CHECK-NEXT:                    ^
-
-int f4() restrict(,,,,,auto3)
-{
-  return 2;
-}
-// CHECK: id_is_unrecognized.cpp:[[@LINE-4]]:24: error: 'auto3' : unrecognized restriction specifier
-// CHECK-NEXT:int f4() restrict(,,,,,auto3)
-// CHECK-NEXT:                       ^
-
-int main(void)
-{
-  return 0;
-}
-
diff --git a/tests/Unit/RestrictionSpecifier/Negative/non-comma_between_ids.cpp b/tests/Unit/RestrictionSpecifier/Negative/non-comma_between_ids.cpp
deleted file mode 100644
index 1c3ba5f3313..00000000000
--- a/tests/Unit/RestrictionSpecifier/Negative/non-comma_between_ids.cpp
+++ /dev/null
@@ -1,91 +0,0 @@
-// RUN: %amp_device -D__KALMAR_ACCELERATOR__ %s -emit-llvm -c -S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s
-
-//////////////////////////////////////////////////////////////////////////////////
-// Do not delete or add any line; it is referred to by absolute line number in the
-// FileCheck lines below
-//////////////////////////////////////////////////////////////////////////////////
-
-#include <hc.hpp>
-
-int foo() restrict(xx:auto1)
-{
-  return 1;
-}
-// CHECK: non-comma_between_ids.cpp:[[@LINE-4]]:20: error: 'xx' : unrecognized restriction specifier
-// CHECK-NEXT:int foo() restrict(xx:auto1)
-// CHECK-NEXT:                   ^
-// CHECK: non-comma_between_ids.cpp:[[@LINE-7]]:22: error: ':' : unrecognized restriction specifier
-// CHECK-NEXT:int foo() restrict(xx:auto1)
-// CHECK-NEXT:                     ^
-// CHECK: non-comma_between_ids.cpp:[[@LINE-10]]:23: error: 'auto1' : unrecognized restriction specifier
-// CHECK-NEXT:int foo() restrict(xx:auto1)
-// CHECK-NEXT:                      ^
-
-// Left end
-int fooxx() restrict(:auto2,,,)
-{
-  return 1;
-}
-// CHECK: non-comma_between_ids.cpp:[[@LINE-4]]:22: error: ':' : unrecognized restriction specifier
-// CHECK-NEXT:int fooxx() restrict(:auto2,,,)
-// CHECK-NEXT:                     ^
-// CHECK: non-comma_between_ids.cpp:[[@LINE-7]]:23: error: 'auto2' : unrecognized restriction specifier
-// CHECK-NEXT:int fooxx() restrict(:auto2,,,)
-// CHECK-NEXT:                      ^
-
-
-// Right end
-int fooyy() restrict(,,,::auto3)
-{
-  return 1;
-}
-// CHECK: non-comma_between_ids.cpp:[[@LINE-4]]:25: error: '::' : unrecognized restriction specifier
-// CHECK-NEXT:int fooyy() restrict(,,,::auto3)
-// CHECK-NEXT:                        ^
-// CHECK: non-comma_between_ids.cpp:[[@LINE-7]]:27: error: 'auto3' : unrecognized restriction specifier
-// CHECK-NEXT:int fooyy() restrict(,,,::auto3)
-// CHECK-NEXT:                         ^
-
-// At both ends
-int foozz() restrict(!X,,,a)
-{
-  return 1;
-}
-// CHECK: non-comma_between_ids.cpp:[[@LINE-4]]:22: error: '!' : unrecognized restriction specifier
-// CHECK-NEXT:int foozz() restrict(!X,,,a)
-// CHECK-NEXT:                     ^
-// CHECK: non-comma_between_ids.cpp:[[@LINE-7]]:23: error: 'X' : unrecognized restriction specifier
-// CHECK-NEXT:int foozz() restrict(!X,,,a)
-// CHECK-NEXT:                      ^
-// CHECK: non-comma_between_ids.cpp:[[@LINE-10]]:27: error: 'a' : unrecognized restriction specifier
-// CHECK-NEXT:int foozz() restrict(!X,,,a)
-// CHECK-NEXT:                          ^
-
-int foo1() restrict(cpu:auto1)
-{
-  return 1;
-}
-// CHECK: non-comma_between_ids.cpp:[[@LINE-4]]:24: error: ':' : unrecognized restriction specifier
-// CHECK-NEXT:int foo1() restrict(cpu:auto1)
-// CHECK-NEXT:                       ^
-// CHECK: non-comma_between_ids.cpp:[[@LINE-7]]:25: error: 'auto1' : unrecognized restriction specifier
-// CHECK-NEXT:int foo1() restrict(cpu:auto1)
-// CHECK-NEXT:                        ^
-
-
-int foo2() restrict(auto1&cpu)
-{
-  return 1;
-}
-// CHECK: non-comma_between_ids.cpp:[[@LINE-4]]:21: error: 'auto1' : unrecognized restriction specifier
-// CHECK-NEXT:int foo2() restrict(auto1&cpu)
-// CHECK-NEXT:                    ^
-// CHECK: non-comma_between_ids.cpp:[[@LINE-7]]:26: error: '&' : unrecognized restriction specifier
-// CHECK-NEXT:int foo2() restrict(auto1&cpu)
-// CHECK-NEXT:                         ^
-
-int main(void)
-{
-  return 0;
-}
-
diff --git a/tests/Unit/RestrictionSpecifier/Negative/non-id_at_two_ends.cpp b/tests/Unit/RestrictionSpecifier/Negative/non-id_at_two_ends.cpp
deleted file mode 100644
index b23959a44b6..00000000000
--- a/tests/Unit/RestrictionSpecifier/Negative/non-id_at_two_ends.cpp
+++ /dev/null
@@ -1,68 +0,0 @@
-// RUN: %amp_device -D__KALMAR_ACCELERATOR__ %s -emit-llvm -c -S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s
-
-//////////////////////////////////////////////////////////////////////////////////
-// Do not delete or add any line; it is referred to by absolute line number in the
-// FileCheck lines below
-//////////////////////////////////////////////////////////////////////////////////
-
-#include <hc.hpp>
-
-int foo() restrict(!,,,,)
-{
-  return 1;
-}
-// CHECK: non-id_at_two_ends.cpp:[[@LINE-4]]:20: error: '!' : unrecognized restriction specifier
-// CHECK-NEXT:int foo() restrict(!,,,,)
-// CHECK-NEXT:                   ^
-
-// consecutive
-int foo1() restrict(!!,,,,)
-{
-  return 1;
-}
-// CHECK: non-id_at_two_ends.cpp:[[@LINE-4]]:21: error: '!' : unrecognized restriction specifier
-// CHECK-NEXT:int foo1() restrict(!!,,,,)
-// CHECK-NEXT:                    ^
-// CHECK: non-id_at_two_ends.cpp:[[@LINE-7]]:22: error: '!' : unrecognized restriction specifier
-// CHECK-NEXT:int foo1() restrict(!!,,,,)
-// CHECK-NEXT:                     ^
-
-
-int foo2() restrict(,,,,*)
-{
-  return 1;
-}
-// CHECK: non-id_at_two_ends.cpp:[[@LINE-4]]:25: error: '*' : unrecognized restriction specifier
-// CHECK-NEXT:int foo2() restrict(,,,,*)
-// CHECK-NEXT:                        ^
-
-
-int foo3() restrict(,,,,**)
-{
-  return 1;
-}
-// CHECK: non-id_at_two_ends.cpp:[[@LINE-4]]:25: error: '*' : unrecognized restriction specifier
-// CHECK-NEXT:int foo3() restrict(,,,,**)
-// CHECK-NEXT:                        ^
-// CHECK: non-id_at_two_ends.cpp:[[@LINE-7]]:26: error: '*' : unrecognized restriction specifier
-// CHECK-NEXT:int foo3() restrict(,,,,**)
-// CHECK-NEXT:                         ^
-
-// both
-int foo4() restrict(!,,,,*)
-{
-  return 1;
-}
-// CHECK: non-id_at_two_ends.cpp:[[@LINE-4]]:21: error: '!' : unrecognized restriction specifier
-// CHECK-NEXT:int foo4() restrict(!,,,,*)
-// CHECK-NEXT:                    ^
-// CHECK: non-id_at_two_ends.cpp:[[@LINE-7]]:26: error: '*' : unrecognized restriction specifier
-// CHECK-NEXT:int foo4() restrict(!,,,,*)
-// CHECK-NEXT:                         ^
-
-
-int main(void)
-{
-  return 0;
-}
-
diff --git a/tests/Unit/RestrictionSpecifier/Negative/should_not_parse.cpp b/tests/Unit/RestrictionSpecifier/Negative/should_not_parse.cpp
deleted file mode 100644
index 7fddb497a33..00000000000
--- a/tests/Unit/RestrictionSpecifier/Negative/should_not_parse.cpp
+++ /dev/null
@@ -1,28 +0,0 @@
-// RUN: %amp_device -D__KALMAR_ACCELERATOR__ %s -emit-llvm -c -S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s
-
-//////////////////////////////////////////////////////////////////////////////////
-// Do not delete or add any line; it is referred to by absolute line number in the
-// FileCheck lines below
-//////////////////////////////////////////////////////////////////////////////////
-#include <hc.hpp>
-
-int f1() restrict(amp:,)
-{
-  return 1;
-}
-// CHECK: should_not_parse.cpp:[[@LINE-4]]:22: error: ':' : unrecognized restriction specifier
-// CHECK-NEXT:int f1() restrict(amp:,)
-// CHECK-NEXT:                     ^
-
-// 'amp' should not be attached to f1()
-int f2() [[hc]]
-{
-  f1();  // expected-error{{'f1': no overload...}}
-  return 0;
-}
-
-int main(void)
-{
-  return 0;
-}
-
diff --git a/tests/Unit/RestrictionSpecifier/Negative/space.cpp b/tests/Unit/RestrictionSpecifier/Negative/space.cpp
deleted file mode 100644
index d0effa63f6d..00000000000
--- a/tests/Unit/RestrictionSpecifier/Negative/space.cpp
+++ /dev/null
@@ -1,22 +0,0 @@
-// RUN: %amp_device -D__KALMAR_ACCELERATOR__ %s -emit-llvm -c -S -O2 -o %t.ll 2>&1 | %FileCheck --strict-whitespace %s
-
-//////////////////////////////////////////////////////////////////////////////////
-// Do not delete or add any line; it is referred to by absolute line number in the
-// FileCheck lines below
-//////////////////////////////////////////////////////////////////////////////////
-
-#include <hc.hpp>
-
-int f1() restrict(cpu,   ,auto1)  // expected-error{{'auto1': unrecognized restriction sepcifier}}
-{
-  return 1;
-}
-// CHECK: space.cpp:[[@LINE-4]]:27: error: 'auto1' : unrecognized restriction specifier
-// CHECK-NEXT:int f1() restrict(cpu,   ,auto1)
-// CHECK-NEXT:                          ^
-
-int main(void)
-{
-  return 0;
-}
-
diff --git a/tests/Unit/RestrictionSpecifier/OKCase.cpp b/tests/Unit/RestrictionSpecifier/OKCase.cpp
deleted file mode 100644
index df230c263a4..00000000000
--- a/tests/Unit/RestrictionSpecifier/OKCase.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-// RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-using namespace hc;
-
-int foo() restrict(,)  // OK
-{
-  return 0;
-}
-
-
-int foo1() restrict(amp,)  // OK
-{
-  return 0;
-}
-int fooAMP() restrict(,amp)  // OK
-{
-  foo1();  // OK
-  return 0;
-}
-
-
-int foo2() restrict(,   ,,,   ,cpu,,,,)  // OK
-{
-  return 0;
-}
-int fooCPU() [[cpu]]  // OK
-{
-  foo2();  // OK
-  return 0;
-}
-
-
-int main(void)
-{
-  parallel_for_each(extent<1>(1), [](index<1>) [[hc]]
-    {
-        fooAMP();
-    });
-}
-
diff --git a/tests/Unit/RestrictionSpecifier/Override_Qualifier.cpp b/tests/Unit/RestrictionSpecifier/Override_Qualifier.cpp
deleted file mode 100644
index 593414d9e8f..00000000000
--- a/tests/Unit/RestrictionSpecifier/Override_Qualifier.cpp
+++ /dev/null
@@ -1,15 +0,0 @@
-// RUN: %cxxamp %s -o %t.out && %t.out
-#include <hc.hpp>
-using namespace hc;
-
-#define LLVM_OVERRIDE override
-
-int64_t current_pos() LLVM_OVERRIDE { return 1; }
-
-  /// preferred_buffer_size - Determine an efficient buffer size.
-size_t preferred_buffer_size() LLVM_OVERRIDE;
-
-int main(void)
-{
-  return 0;
-}
diff --git a/tests/Unit/SharedLibrary/shared_library1.cpp b/tests/Unit/SharedLibrary/shared_library1.cpp
index 821765ecc13..b7332adaecc 100644
--- a/tests/Unit/SharedLibrary/shared_library1.cpp
+++ b/tests/Unit/SharedLibrary/shared_library1.cpp
@@ -8,7 +8,7 @@
 
 #if SHARED_LIBRARY
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 extern "C" int foo(int grid_size) {
   using namespace hc;
diff --git a/tests/Unit/SharedLibrary/shared_library2.cpp b/tests/Unit/SharedLibrary/shared_library2.cpp
index 4b271bf255a..e4ee26f35cf 100644
--- a/tests/Unit/SharedLibrary/shared_library2.cpp
+++ b/tests/Unit/SharedLibrary/shared_library2.cpp
@@ -8,7 +8,7 @@
 
 #if SHARED_LIBRARY_1
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 extern "C" int foo(int grid_size) {
   using namespace hc;
@@ -32,7 +32,7 @@ extern "C" int foo(int grid_size) {
 
 #if SHARED_LIBRARY_2
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 extern "C" int bar(int grid_size) {
   using namespace hc;
@@ -78,7 +78,7 @@ int main() {
   }
  
   if (foo_handle && bar_handle) {
-    for (int i = 0; i < 16; ++i) {
+    for (int i = 1; i != 17; ++i) {
       ret &= (foo_handle(i) == i);
       ret &= (bar_handle(i * 2) == (i * 4));
     }
diff --git a/tests/Unit/SharedLibrary/shared_library3.cpp b/tests/Unit/SharedLibrary/shared_library3.cpp
index 7b37a29e9d5..2db00b1a9bd 100644
--- a/tests/Unit/SharedLibrary/shared_library3.cpp
+++ b/tests/Unit/SharedLibrary/shared_library3.cpp
@@ -8,7 +8,7 @@
 
 #if SHARED_LIBRARY_1
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 extern "C" int foo(int grid_size) {
   using namespace hc;
@@ -32,7 +32,7 @@ extern "C" int foo(int grid_size) {
 
 #if SHARED_LIBRARY_2
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 extern "C" int bar(int grid_size) {
   using namespace hc;
@@ -60,7 +60,7 @@ extern "C" int bar(int);
 int main() {
   bool ret = true;
 
-  for (int i = 0; i < 16; ++i) {
+  for (int i = 1; i != 17; ++i) {
     ret &= (foo(i) == i);
     ret &= (bar(i * 2) == (i * 4));
   }
diff --git a/tests/Unit/SharedLibrary/shared_library4.cpp b/tests/Unit/SharedLibrary/shared_library4.cpp
index 79a9b7a5247..0a2793516c5 100644
--- a/tests/Unit/SharedLibrary/shared_library4.cpp
+++ b/tests/Unit/SharedLibrary/shared_library4.cpp
@@ -8,7 +8,7 @@
 
 #if SHARED_LIBRARY
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 extern "C" int foo(int grid_size) {
   using namespace hc;
@@ -48,7 +48,7 @@ extern "C" int bar(int grid_size) {
 
 #else
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 extern "C" int foo(int);
 extern "C" int bar(int);
diff --git a/tests/Unit/SharedLibrary/shared_library5.cpp b/tests/Unit/SharedLibrary/shared_library5.cpp
index f57cc421175..1a087d4c695 100644
--- a/tests/Unit/SharedLibrary/shared_library5.cpp
+++ b/tests/Unit/SharedLibrary/shared_library5.cpp
@@ -8,7 +8,7 @@
 
 #if SHARED_LIBRARY
 
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 extern "C" int foo(int grid_size) {
   using namespace hc;
diff --git a/tests/Unit/StaticLibrary/static_library1.cpp b/tests/Unit/StaticLibrary/static_library1.cpp
index 2b4f54b78d4..7f7fe7815e3 100644
--- a/tests/Unit/StaticLibrary/static_library1.cpp
+++ b/tests/Unit/StaticLibrary/static_library1.cpp
@@ -4,7 +4,7 @@
 // RUN: %hc %s -L%T -lstatic_library1 -o %t.out && %t.out
 
 #include <cstdio>
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 extern "C" int sum(hc::array_view<int,1>& input);
 
diff --git a/tests/Unit/StaticLibrary/static_library2.cpp b/tests/Unit/StaticLibrary/static_library2.cpp
index 245209e56f7..4d0709caed6 100644
--- a/tests/Unit/StaticLibrary/static_library2.cpp
+++ b/tests/Unit/StaticLibrary/static_library2.cpp
@@ -4,7 +4,7 @@
 // RUN: %hc %s -L./Output -lstatic_library2 -o %t.out && %t.out
 
 #include <cstdio>
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 extern "C" int sum(hc::array_view<int,1>& input);
 
diff --git a/tests/Unit/StaticLibrary/static_library3.cpp b/tests/Unit/StaticLibrary/static_library3.cpp
index c6235c39178..5bf7ccbd0d0 100644
--- a/tests/Unit/StaticLibrary/static_library3.cpp
+++ b/tests/Unit/StaticLibrary/static_library3.cpp
@@ -4,7 +4,7 @@
 // RUN: %hc %s ./Output/libstatic_library3.a -o %t.out && %t.out
 
 #include <cstdio>
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 extern "C" int sum(hc::array_view<int,1>& input);
 
diff --git a/tests/Unit/StaticLibrary/static_library4.cpp b/tests/Unit/StaticLibrary/static_library4.cpp
index a8c49c6212c..4776e8b9500 100644
--- a/tests/Unit/StaticLibrary/static_library4.cpp
+++ b/tests/Unit/StaticLibrary/static_library4.cpp
@@ -4,7 +4,7 @@
 // RUN: %hc %target_all_gpus %s -L%T -lstatic_library4 -o %t.out && %t.out
 
 #include <cstdio>
-#include <hc.hpp>
+#include <hc/hc.hpp>
 
 extern "C" int sum(hc::array_view<int,1>& input);
 
diff --git a/tests/lit.cfg b/tests/lit.cfg
index 2c6d993f5a5..d50c0483a18 100644
--- a/tests/lit.cfg
+++ b/tests/lit.cfg
@@ -105,7 +105,7 @@ gtest_link_options = ' ' + ' '.join([
 config.clang = inferClang(config.llvm_tools_dir)
 config.clang_cc1 = config.clang + "++"
 config.clang_cxx11  = config.clang_cc1 + cxx_options + "-std=c++11"
-config.clang_cxxamp = config.clang_cc1 + cxx_options + "-std=c++amp" + link_options
+config.clang_cxxamp = config.clang_cc1 + cxx_options + "-std=c++11 -hc" + link_options
 config.clang_hc = config.clang_cc1 + cxx_options + "-hc" + link_options
 config.clang_cxxamp_device = config.clang_cxxamp + " -Xclang -famp-is-device -fno-builtin "
 config.clang_gtest_amp = config.clang_cxxamp + gtest_link_options
diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt
new file mode 100644
index 00000000000..d49a47f0094
--- /dev/null
+++ b/third_party/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(elfio)
\ No newline at end of file
diff --git a/third_party/elfio/CMakeLists.txt b/third_party/elfio/CMakeLists.txt
new file mode 100644
index 00000000000..ffc187a8841
--- /dev/null
+++ b/third_party/elfio/CMakeLists.txt
@@ -0,0 +1,39 @@
+set(elfio_headers
+    elf_types.hpp
+    elfio_amdgpu.hpp
+    elfio_dump.hpp
+    elfio_dynamic.hpp
+    elfio_header.hpp
+    elfio_note.hpp
+    elfio_relocation.hpp
+    elfio_section.hpp
+    elfio_segment.hpp
+    elfio_strings.hpp
+    elfio_symbols.hpp
+    elfio_utils.hpp
+    elfio.hpp)
+set(elfio_license COPYING)
+
+# Set location for output directory
+set(output_dir "${PROJECT_BINARY_DIR}/third_party/elfio")
+set(out_files)
+foreach(f ${elfio_headers} ${elfio_license})
+    set(src ${CMAKE_CURRENT_SOURCE_DIR}/${f})
+    set(dst ${output_dir}/${f})
+    add_custom_command(
+        OUTPUT ${dst}
+        DEPENDS ${src}
+        COMMAND ${CMAKE_COMMAND} -E copy_if_different ${src} ${dst}
+        COMMENT "Copying ELFIO's ${f}...")
+    list(APPEND out_files ${dst})
+endforeach()
+
+# Create target for hcc-headers and set dependencies
+add_custom_target(elfio-headers ALL DEPENDS ${out_files})
+add_dependencies(world hc-headers hc-impl-headers)
+
+# Install command for headers
+install(
+    FILES ${elfio_headers}
+    PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ
+    DESTINATION third_party/elfio)
\ No newline at end of file
diff --git a/third_party/elfio/COPYING b/third_party/elfio/COPYING
new file mode 100644
index 00000000000..b0cd4ac1ce8
--- /dev/null
+++ b/third_party/elfio/COPYING
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (C) 2001-2011 by Serge Lamikhov-Center
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/third_party/elfio/elf_types.hpp b/third_party/elfio/elf_types.hpp
new file mode 100644
index 00000000000..1301cf4315f
--- /dev/null
+++ b/third_party/elfio/elf_types.hpp
@@ -0,0 +1,780 @@
+/*
+Copyright (C) 2001-2015 by Serge Lamikhov-Center
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef ELFTYPES_H
+#define ELFTYPES_H
+
+#ifndef ELFIO_NO_OWN_TYPES
+    #if !defined(ELFIO_NO_CSTDINT) && !defined(ELFIO_NO_INTTYPES)
+        #include <stdint.h>
+    #else
+        typedef unsigned char  uint8_t;
+        typedef signed char    int8_t;
+        typedef unsigned short uint16_t;
+        typedef signed short   int16_t;
+        #ifdef _MSC_VER
+            typedef unsigned __int32 uint32_t;
+            typedef signed   __int32 int32_t;
+            typedef unsigned __int64 uint64_t;
+            typedef signed   __int64 int64_t;
+        #else
+            typedef unsigned int       uint32_t;
+            typedef signed   int       int32_t;
+            typedef unsigned long long uint64_t;
+            typedef signed   long long int64_t;
+        #endif // _MSC_VER
+    #endif // ELFIO_NO_CSTDINT
+#endif // ELFIO_NO_OWN_TYPES
+
+namespace ELFIO {
+
+// Attention! Platform depended definitions.
+typedef uint16_t Elf_Half;
+typedef uint32_t Elf_Word;
+typedef int32_t  Elf_Sword;
+typedef uint64_t Elf_Xword;
+typedef int64_t  Elf_Sxword;
+
+typedef uint32_t Elf32_Addr;
+typedef uint32_t Elf32_Off;
+typedef uint64_t Elf64_Addr;
+typedef uint64_t Elf64_Off;
+
+#define Elf32_Half Elf_Half
+#define Elf64_Half Elf_Half
+#define Elf32_Word Elf_Word
+#define Elf64_Word Elf_Word
+#define Elf32_Sword Elf_Sword
+#define Elf64_Sword Elf_Sword
+
+///////////////////////
+// ELF Header Constants
+
+// File type
+#define ET_NONE        0
+#define ET_REL         1
+#define ET_EXEC        2
+#define ET_DYN         3
+#define ET_CORE        4
+#define ET_LOOS   0xFE00
+#define ET_HIOS   0xFEFF
+#define ET_LOPROC 0xFF00
+#define ET_HIPROC 0xFFFF
+
+
+#define EM_NONE          0   // No machine
+#define EM_M32           1   // AT&T WE 32100
+#define EM_SPARC         2   // SUN SPARC
+#define EM_386           3   // Intel 80386
+#define EM_68K           4   // Motorola m68k family
+#define EM_88K           5   // Motorola m88k family
+#define EM_486           6   // Intel 80486// Reserved for future use
+#define EM_860           7   // Intel 80860
+#define EM_MIPS          8   // MIPS R3000 (officially, big-endian only)
+#define EM_S370          9   // IBM System/370
+#define EM_MIPS_RS3_LE   10  // MIPS R3000 little-endian (Oct 4 1999 Draft) Deprecated
+#define EM_res011        11  // Reserved
+#define EM_res012        12  // Reserved
+#define EM_res013        13  // Reserved
+#define EM_res014        14  // Reserved
+#define EM_PARISC        15  // HPPA
+#define EM_res016        16  // Reserved
+#define EM_VPP550        17  // Fujitsu VPP500
+#define EM_SPARC32PLUS   18  // Sun's "v8plus"
+#define EM_960           19  // Intel 80960
+#define EM_PPC           20  // PowerPC
+#define EM_PPC64         21  // 64-bit PowerPC
+#define EM_S390          22  // IBM S/390
+#define EM_SPU           23  // Sony/Toshiba/IBM SPU
+#define EM_res024        24  // Reserved
+#define EM_res025        25  // Reserved
+#define EM_res026        26  // Reserved
+#define EM_res027        27  // Reserved
+#define EM_res028        28  // Reserved
+#define EM_res029        29  // Reserved
+#define EM_res030        30  // Reserved
+#define EM_res031        31  // Reserved
+#define EM_res032        32  // Reserved
+#define EM_res033        33  // Reserved
+#define EM_res034        34  // Reserved
+#define EM_res035        35  // Reserved
+#define EM_V800          36  // NEC V800 series
+#define EM_FR20          37  // Fujitsu FR20
+#define EM_RH32          38  // TRW RH32
+#define EM_MCORE         39  // Motorola M*Core // May also be taken by Fujitsu MMA
+#define EM_RCE           39  // Old name for MCore
+#define EM_ARM           40  // ARM
+#define EM_OLD_ALPHA     41  // Digital Alpha
+#define EM_SH            42  // Renesas (formerly Hitachi) / SuperH SH
+#define EM_SPARCV9       43  // SPARC v9 64-bit
+#define EM_TRICORE       44  // Siemens Tricore embedded processor
+#define EM_ARC           45  // ARC Cores
+#define EM_H8_300        46  // Renesas (formerly Hitachi) H8/300
+#define EM_H8_300H       47  // Renesas (formerly Hitachi) H8/300H
+#define EM_H8S           48  // Renesas (formerly Hitachi) H8S
+#define EM_H8_500        49  // Renesas (formerly Hitachi) H8/500
+#define EM_IA_64         50  // Intel IA-64 Processor
+#define EM_MIPS_X        51  // Stanford MIPS-X
+#define EM_COLDFIRE      52  // Motorola Coldfire
+#define EM_68HC12        53  // Motorola M68HC12
+#define EM_MMA           54  // Fujitsu Multimedia Accelerator
+#define EM_PCP           55  // Siemens PCP
+#define EM_NCPU          56  // Sony nCPU embedded RISC processor
+#define EM_NDR1          57  // Denso NDR1 microprocesspr
+#define EM_STARCORE      58  // Motorola Star*Core processor
+#define EM_ME16          59  // Toyota ME16 processor
+#define EM_ST100         60  // STMicroelectronics ST100 processor
+#define EM_TINYJ         61  // Advanced Logic Corp. TinyJ embedded processor
+#define EM_X86_64        62  // Advanced Micro Devices X86-64 processor
+#define EM_PDSP          63  // Sony DSP Processor
+#define EM_PDP10         64  // Digital Equipment Corp. PDP-10
+#define EM_PDP11         65  // Digital Equipment Corp. PDP-11
+#define EM_FX66          66  // Siemens FX66 microcontroller
+#define EM_ST9PLUS       67  // STMicroelectronics ST9+ 8/16 bit microcontroller
+#define EM_ST7           68  // STMicroelectronics ST7 8-bit microcontroller
+#define EM_68HC16        69  // Motorola MC68HC16 Microcontroller
+#define EM_68HC11        70  // Motorola MC68HC11 Microcontroller
+#define EM_68HC08        71  // Motorola MC68HC08 Microcontroller
+#define EM_68HC05        72  // Motorola MC68HC05 Microcontroller
+#define EM_SVX           73  // Silicon Graphics SVx
+#define EM_ST19          74  // STMicroelectronics ST19 8-bit cpu
+#define EM_VAX           75  // Digital VAX
+#define EM_CRIS          76  // Axis Communications 32-bit embedded processor
+#define EM_JAVELIN       77  // Infineon Technologies 32-bit embedded cpu
+#define EM_FIREPATH      78  // Element 14 64-bit DSP processor
+#define EM_ZSP           79  // LSI Logic's 16-bit DSP processor
+#define EM_MMIX          80  // Donald Knuth's educational 64-bit processor
+#define EM_HUANY         81  // Harvard's machine-independent format
+#define EM_PRISM         82  // SiTera Prism
+#define EM_AVR           83  // Atmel AVR 8-bit microcontroller
+#define EM_FR30          84  // Fujitsu FR30
+#define EM_D10V          85  // Mitsubishi D10V
+#define EM_D30V          86  // Mitsubishi D30V
+#define EM_V850          87  // NEC v850
+#define EM_M32R          88  // Renesas M32R (formerly Mitsubishi M32R)
+#define EM_MN10300       89  // Matsushita MN10300
+#define EM_MN10200       90  // Matsushita MN10200
+#define EM_PJ            91  // picoJava
+#define EM_OPENRISC      92  // OpenRISC 32-bit embedded processor
+#define EM_ARC_A5        93  // ARC Cores Tangent-A5
+#define EM_XTENSA        94  // Tensilica Xtensa Architecture
+#define EM_VIDEOCORE     95  // Alphamosaic VideoCore processor
+#define EM_TMM_GPP       96  // Thompson Multimedia General Purpose Processor
+#define EM_NS32K         97  // National Semiconductor 32000 series
+#define EM_TPC           98  // Tenor Network TPC processor
+#define EM_SNP1K         99  // Trebia SNP 1000 processor
+#define EM_ST200         100 // STMicroelectronics ST200 microcontroller
+#define EM_IP2K          101 // Ubicom IP2022 micro controller
+#define EM_MAX           102 // MAX Processor
+#define EM_CR            103 // National Semiconductor CompactRISC
+#define EM_F2MC16        104 // Fujitsu F2MC16
+#define EM_MSP430        105 // TI msp430 micro controller
+#define EM_BLACKFIN      106 // ADI Blackfin
+#define EM_SE_C33        107 // S1C33 Family of Seiko Epson processors
+#define EM_SEP           108 // Sharp embedded microprocessor
+#define EM_ARCA          109 // Arca RISC Microprocessor
+#define EM_UNICORE       110 // Microprocessor series from PKU-Unity Ltd. and MPRC of Peking University
+#define EM_EXCESS        111 // eXcess: 16/32/64-bit configurable embedded CPU
+#define EM_DXP           112 // Icera Semiconductor Inc. Deep Execution Processor
+#define EM_ALTERA_NIOS2  113 // Altera Nios II soft-core processor
+#define EM_CRX           114 // National Semiconductor CRX
+#define EM_XGATE         115 // Motorola XGATE embedded processor
+#define EM_C166          116 // Infineon C16x/XC16x processor
+#define EM_M16C          117 // Renesas M16C series microprocessors
+#define EM_DSPIC30F      118 // Microchip Technology dsPIC30F Digital Signal Controller
+#define EM_CE            119 // Freescale Communication Engine RISC core
+#define EM_M32C          120 // Renesas M32C series microprocessors
+#define EM_res121        121 // Reserved
+#define EM_res122        122 // Reserved
+#define EM_res123        123 // Reserved
+#define EM_res124        124 // Reserved
+#define EM_res125        125 // Reserved
+#define EM_res126        126 // Reserved
+#define EM_res127        127 // Reserved
+#define EM_res128        128 // Reserved
+#define EM_res129        129 // Reserved
+#define EM_res130        130 // Reserved
+#define EM_TSK3000       131 // Altium TSK3000 core
+#define EM_RS08          132 // Freescale RS08 embedded processor
+#define EM_res133        133 // Reserved
+#define EM_ECOG2         134 // Cyan Technology eCOG2 microprocessor
+#define EM_SCORE         135 // Sunplus Score
+#define EM_SCORE7        135 // Sunplus S+core7 RISC processor
+#define EM_DSP24         136 // New Japan Radio (NJR) 24-bit DSP Processor
+#define EM_VIDEOCORE3    137 // Broadcom VideoCore III processor
+#define EM_LATTICEMICO32 138 // RISC processor for Lattice FPGA architecture
+#define EM_SE_C17        139 // Seiko Epson C17 family
+#define EM_TI_C6000      140 // Texas Instruments TMS320C6000 DSP family
+#define EM_TI_C2000      141 // Texas Instruments TMS320C2000 DSP family
+#define EM_TI_C5500      142 // Texas Instruments TMS320C55x DSP family
+#define EM_res143        143 // Reserved
+#define EM_res144        144 // Reserved
+#define EM_res145        145 // Reserved
+#define EM_res146        146 // Reserved
+#define EM_res147        147 // Reserved
+#define EM_res148        148 // Reserved
+#define EM_res149        149 // Reserved
+#define EM_res150        150 // Reserved
+#define EM_res151        151 // Reserved
+#define EM_res152        152 // Reserved
+#define EM_res153        153 // Reserved
+#define EM_res154        154 // Reserved
+#define EM_res155        155 // Reserved
+#define EM_res156        156 // Reserved
+#define EM_res157        157 // Reserved
+#define EM_res158        158 // Reserved
+#define EM_res159        159 // Reserved
+#define EM_MMDSP_PLUS    160 // STMicroelectronics 64bit VLIW Data Signal Processor
+#define EM_CYPRESS_M8C   161 // Cypress M8C microprocessor
+#define EM_R32C          162 // Renesas R32C series microprocessors
+#define EM_TRIMEDIA      163 // NXP Semiconductors TriMedia architecture family
+#define EM_QDSP6         164 // QUALCOMM DSP6 Processor
+#define EM_8051          165 // Intel 8051 and variants
+#define EM_STXP7X        166 // STMicroelectronics STxP7x family
+#define EM_NDS32         167 // Andes Technology compact code size embedded RISC processor family
+#define EM_ECOG1         168 // Cyan Technology eCOG1X family
+#define EM_ECOG1X        168 // Cyan Technology eCOG1X family
+#define EM_MAXQ30        169 // Dallas Semiconductor MAXQ30 Core Micro-controllers
+#define EM_XIMO16        170 // New Japan Radio (NJR) 16-bit DSP Processor
+#define EM_MANIK         171 // M2000 Reconfigurable RISC Microprocessor
+#define EM_CRAYNV2       172 // Cray Inc. NV2 vector architecture
+#define EM_RX            173 // Renesas RX family
+#define EM_METAG         174 // Imagination Technologies META processor architecture
+#define EM_MCST_ELBRUS   175 // MCST Elbrus general purpose hardware architecture
+#define EM_ECOG16        176 // Cyan Technology eCOG16 family
+#define EM_CR16          177 // National Semiconductor CompactRISC 16-bit processor
+#define EM_ETPU          178 // Freescale Extended Time Processing Unit
+#define EM_SLE9X         179 // Infineon Technologies SLE9X core
+#define EM_L1OM          180 // Intel L1OM
+#define EM_INTEL181      181 // Reserved by Intel
+#define EM_INTEL182      182 // Reserved by Intel
+#define EM_res183        183 // Reserved by ARM
+#define EM_res184        184 // Reserved by ARM
+#define EM_AVR32         185 // Atmel Corporation 32-bit microprocessor family
+#define EM_STM8          186 // STMicroeletronics STM8 8-bit microcontroller
+#define EM_TILE64        187 // Tilera TILE64 multicore architecture family
+#define EM_TILEPRO       188 // Tilera TILEPro multicore architecture family
+#define EM_MICROBLAZE    189 // Xilinx MicroBlaze 32-bit RISC soft processor core
+#define EM_CUDA          190 // NVIDIA CUDA architecture 
+#define EM_TILEGX        191 // Tilera TILE-Gx multicore architecture family
+#define EM_CLOUDSHIELD   192 // CloudShield architecture family
+#define EM_COREA_1ST     193 // KIPO-KAIST Core-A 1st generation processor family
+#define EM_COREA_2ND     194 // KIPO-KAIST Core-A 2nd generation processor family
+#define EM_ARC_COMPACT2  195 // Synopsys ARCompact V2
+#define EM_OPEN8         196 // Open8 8-bit RISC soft processor core
+#define EM_RL78          197 // Renesas RL78 family
+#define EM_VIDEOCORE5    198 // Broadcom VideoCore V processor
+#define EM_78KOR         199 // Renesas 78KOR family
+#define EM_56800EX       200 // Freescale 56800EX Digital Signal Controller (DSC)
+#define EM_BA1           201 // Beyond BA1 CPU architecture
+#define EM_BA2           202 // Beyond BA2 CPU architecture
+#define EM_XCORE         203 // XMOS xCORE processor family
+#define EM_MCHP_PIC      204 // Microchip 8-bit PIC(r) family
+#define EM_INTEL205      205 // Reserved by Intel
+#define EM_INTEL206      206 // Reserved by Intel
+#define EM_INTEL207      207 // Reserved by Intel
+#define EM_INTEL208      208 // Reserved by Intel
+#define EM_INTEL209      209 // Reserved by Intel
+#define EM_KM32          210 // KM211 KM32 32-bit processor
+#define EM_KMX32         211 // KM211 KMX32 32-bit processor
+#define EM_KMX16         212 // KM211 KMX16 16-bit processor
+#define EM_KMX8          213 // KM211 KMX8 8-bit processor
+#define EM_KVARC         214 // KM211 KVARC processor
+#define EM_CDP           215 // Paneve CDP architecture family
+#define EM_COGE          216 // Cognitive Smart Memory Processor
+#define EM_COOL          217 // iCelero CoolEngine
+#define EM_NORC          218 // Nanoradio Optimized RISC
+#define EM_CSR_KALIMBA   219 // CSR Kalimba architecture family
+#define EM_Z80           220 // Zilog Z80
+#define EM_VISIUM        221 // Controls and Data Services VISIUMcore processor
+#define EM_FT32          222 // FTDI Chip FT32 high performance 32-bit RISC architecture
+#define EM_MOXIE         223 // Moxie processor family
+#define EM_AMDGPU        224 // AMD GPU architecture
+#define EM_RISCV         243 // RISC-V
+#define EM_LANAI         244 // Lanai processor
+#define EM_CEVA          245 // CEVA Processor Architecture Family
+#define EM_CEVA_X2       246 // CEVA X2 Processor Family
+#define EM_BPF           247 // Linux BPF – in-kernel virtual machine
+
+// File version
+#define EV_NONE    0
+#define EV_CURRENT 1
+
+// Identification index
+#define EI_MAG0        0
+#define EI_MAG1        1
+#define EI_MAG2        2
+#define EI_MAG3        3
+#define EI_CLASS       4
+#define EI_DATA        5
+#define EI_VERSION     6
+#define EI_OSABI       7
+#define EI_ABIVERSION  8
+#define EI_PAD         9
+#define EI_NIDENT     16
+
+// Magic number
+#define ELFMAG0 0x7F
+#define ELFMAG1  'E'
+#define ELFMAG2  'L'
+#define ELFMAG3  'F'
+
+// File class
+#define ELFCLASSNONE 0
+#define ELFCLASS32   1
+#define ELFCLASS64   2
+
+// Encoding
+#define ELFDATANONE 0
+#define ELFDATA2LSB 1
+#define ELFDATA2MSB 2
+
+// OS extensions
+#define ELFOSABI_NONE     0 // No extensions or unspecified
+#define ELFOSABI_HPUX     1 // Hewlett-Packard HP-UX
+#define ELFOSABI_NETBSD   2 // NetBSD
+#define ELFOSABI_LINUX    3 // Linux
+#define ELFOSABI_SOLARIS  6 // Sun Solaris
+#define ELFOSABI_AIX      7 // AIX
+#define ELFOSABI_IRIX     8 // IRIX
+#define ELFOSABI_FREEBSD  9 // FreeBSD
+#define ELFOSABI_TRU64   10 // Compaq TRU64 UNIX
+#define ELFOSABI_MODESTO 11 // Novell Modesto
+#define ELFOSABI_OPENBSD 12 // Open BSD
+#define ELFOSABI_OPENVMS 13 // Open VMS
+#define ELFOSABI_NSK     14 // Hewlett-Packard Non-Stop Kernel
+#define ELFOSABI_AROS    15 // Amiga Research OS
+#define ELFOSABI_FENIXOS 16 // The FenixOS highly scalable multi-core OS
+//                       64-255 Architecture-specific value range
+
+
+
+/////////////////////
+// Sections constants
+
+// Section indexes
+#define SHN_UNDEF          0
+#define SHN_LORESERVE 0xFF00
+#define SHN_LOPROC    0xFF00
+#define SHN_HIPROC    0xFF1F
+#define SHN_LOOS      0xFF20
+#define SHN_HIOS      0xFF3F
+#define SHN_ABS       0xFFF1
+#define SHN_COMMON    0xFFF2
+#define SHN_XINDEX    0xFFFF
+#define SHN_HIRESERVE 0xFFFF
+
+// Section types
+#define SHT_NULL                   0
+#define SHT_PROGBITS               1
+#define SHT_SYMTAB                 2
+#define SHT_STRTAB                 3
+#define SHT_RELA                   4
+#define SHT_HASH                   5
+#define SHT_DYNAMIC                6
+#define SHT_NOTE                   7
+#define SHT_NOBITS                 8
+#define SHT_REL                    9
+#define SHT_SHLIB                 10
+#define SHT_DYNSYM                11
+#define SHT_INIT_ARRAY            14
+#define SHT_FINI_ARRAY            15
+#define SHT_PREINIT_ARRAY         16
+#define SHT_GROUP                 17
+#define SHT_SYMTAB_SHNDX          18
+#define SHT_LOOS          0x60000000
+#define SHT_HIOS          0x6fffffff
+#define SHT_LOPROC        0x70000000
+#define SHT_HIPROC        0x7FFFFFFF
+#define SHT_LOUSER        0x80000000
+#define SHT_HIUSER        0xFFFFFFFF
+
+// Section attribute flags
+#define SHF_WRITE                   0x1
+#define SHF_ALLOC                   0x2
+#define SHF_EXECINSTR               0x4
+#define SHF_MERGE                  0x10
+#define SHF_STRINGS                0x20
+#define SHF_INFO_LINK              0x40
+#define SHF_LINK_ORDER             0x80
+#define SHF_OS_NONCONFORMING      0x100
+#define SHF_GROUP                 0x200
+#define SHF_TLS                   0x400
+#define SHF_MASKOS           0x0ff00000
+#define SHF_MASKPROC         0xF0000000
+
+// Section group flags
+#define GRP_COMDAT          0x1
+#define GRP_MASKOS   0x0ff00000
+#define GRP_MASKPROC 0xf0000000
+
+// Symbol binding
+#define STB_LOCAL     0
+#define STB_GLOBAL    1
+#define STB_WEAK      2
+#define STB_LOOS     10
+#define STB_HIOS     12
+#define STB_MULTIDEF 13
+#define STB_LOPROC   13
+#define STB_HIPROC   15
+
+// Symbol types
+#define STT_NOTYPE   0
+#define STT_OBJECT   1
+#define STT_FUNC     2
+#define STT_SECTION  3
+#define STT_FILE     4
+#define STT_COMMON   5
+#define STT_TLS      6
+#define STT_LOOS    10
+#define STT_HIOS    12
+#define STT_LOPROC  13
+#define STT_HIPROC  15
+
+// Symbol visibility
+#define STV_DEFAULT   0
+#define STV_INTERNAL  1
+#define STV_HIDDEN    2
+#define STV_PROTECTED 3
+
+// Undefined name
+#define STN_UNDEF 0
+
+// Relocation types
+#define R_386_NONE             0
+#define R_X86_64_NONE          0
+#define R_386_32               1
+#define R_X86_64_64            1
+#define R_386_PC32             2
+#define R_X86_64_PC32          2
+#define R_386_GOT32            3
+#define R_X86_64_GOT32         3
+#define R_386_PLT32            4
+#define R_X86_64_PLT32         4
+#define R_386_COPY             5
+#define R_X86_64_COPY          5
+#define R_386_GLOB_DAT         6
+#define R_X86_64_GLOB_DAT      6
+#define R_386_JMP_SLOT         7
+#define R_X86_64_JUMP_SLOT     7
+#define R_386_RELATIVE         8
+#define R_X86_64_RELATIVE      8
+#define R_386_GOTOFF           9
+#define R_X86_64_GOTPCREL      9
+#define R_386_GOTPC           10
+#define R_X86_64_32           10
+#define R_386_32PLT           11
+#define R_X86_64_32S          11
+#define R_X86_64_16           12
+#define R_X86_64_PC16         13
+#define R_386_TLS_TPOFF       14
+#define R_X86_64_8            14
+#define R_386_TLS_IE          15
+#define R_X86_64_PC8          15
+#define R_386_TLS_GOTIE       16
+#define R_X86_64_DTPMOD64     16
+#define R_386_TLS_LE          17
+#define R_X86_64_DTPOFF64     17
+#define R_386_TLS_GD          18
+#define R_X86_64_TPOFF64      18
+#define R_386_TLS_LDM         19
+#define R_X86_64_TLSGD        19
+#define R_386_16              20
+#define R_X86_64_TLSLD        20
+#define R_386_PC16            21
+#define R_X86_64_DTPOFF32     21
+#define R_386_8               22
+#define R_X86_64_GOTTPOFF     22
+#define R_386_PC8             23
+#define R_X86_64_TPOFF32      23
+#define R_386_TLS_GD_32       24
+#define R_X86_64_PC64         24
+#define R_386_TLS_GD_PUSH     25
+#define R_X86_64_GOTOFF64     25
+#define R_386_TLS_GD_CALL     26
+#define R_X86_64_GOTPC32      26
+#define R_386_TLS_GD_POP      27
+#define R_X86_64_GOT64        27
+#define R_386_TLS_LDM_32      28
+#define R_X86_64_GOTPCREL64   28
+#define R_386_TLS_LDM_PUSH    29
+#define R_X86_64_GOTPC64      29
+#define R_386_TLS_LDM_CALL    30
+#define R_X86_64_GOTPLT64     30
+#define R_386_TLS_LDM_POP     31
+#define R_X86_64_PLTOFF64     31
+#define R_386_TLS_LDO_32      32
+#define R_386_TLS_IE_32       33
+#define R_386_TLS_LE_32       34
+#define R_X86_64_GOTPC32_TLSDESC  34
+#define R_386_TLS_DTPMOD32    35
+#define R_X86_64_TLSDESC_CALL 35
+#define R_386_TLS_DTPOFF32    36
+#define R_X86_64_TLSDESC      36
+#define R_386_TLS_TPOFF32     37
+#define R_X86_64_IRELATIVE    37
+#define R_386_SIZE32          38
+#define R_386_TLS_GOTDESC     39
+#define R_386_TLS_DESC_CALL   40
+#define R_386_TLS_DESC        41
+#define R_386_IRELATIVE       42
+#define R_386_GOT32X          43
+#define R_X86_64_GNU_VTINHERIT  250
+#define R_X86_64_GNU_VTENTRY    251
+
+// Segment types
+#define PT_NULL             0
+#define PT_LOAD             1
+#define PT_DYNAMIC          2
+#define PT_INTERP           3
+#define PT_NOTE             4
+#define PT_SHLIB            5
+#define PT_PHDR             6
+#define PT_TLS              7
+#define PT_LOOS    0x60000000
+#define PT_HIOS    0x6fffffff
+#define PT_LOPROC  0x70000000
+#define PT_HIPROC  0x7FFFFFFF
+
+// Segment flags
+#define PF_X                 1 // Execute
+#define PF_W                 2 // Write
+#define PF_R                 4 // Read
+#define PF_MASKOS   0x0ff00000 // Unspecified
+#define PF_MASKPROC 0xf0000000 // Unspecified
+
+// Dynamic Array Tags
+#define DT_NULL              0
+#define DT_NEEDED            1
+#define DT_PLTRELSZ          2
+#define DT_PLTGOT            3
+#define DT_HASH              4
+#define DT_STRTAB            5
+#define DT_SYMTAB            6
+#define DT_RELA              7
+#define DT_RELASZ            8
+#define DT_RELAENT           9
+#define DT_STRSZ            10
+#define DT_SYMENT           11
+#define DT_INIT             12
+#define DT_FINI             13
+#define DT_SONAME           14
+#define DT_RPATH            15
+#define DT_SYMBOLIC         16
+#define DT_REL              17
+#define DT_RELSZ            18
+#define DT_RELENT           19
+#define DT_PLTREL           20
+#define DT_DEBUG            21
+#define DT_TEXTREL          22
+#define DT_JMPREL           23
+#define DT_BIND_NOW         24
+#define DT_INIT_ARRAY       25
+#define DT_FINI_ARRAY       26
+#define DT_INIT_ARRAYSZ     27
+#define DT_FINI_ARRAYSZ     28
+#define DT_RUNPATH          29
+#define DT_FLAGS            30
+#define DT_ENCODING         32
+#define DT_PREINIT_ARRAY    32
+#define DT_PREINIT_ARRAYSZ  33
+#define DT_MAXPOSTAGS       34
+#define DT_LOOS     0x6000000D
+#define DT_HIOS     0x6ffff000
+#define DT_LOPROC   0x70000000
+#define DT_HIPROC   0x7FFFFFFF
+
+// DT_FLAGS values
+#define DF_ORIGIN     0x1
+#define DF_SYMBOLIC   0x2
+#define DF_TEXTREL    0x4
+#define DF_BIND_NOW   0x8
+#define DF_STATIC_TLS 0x10
+
+
+// ELF file header
+struct Elf32_Ehdr {
+    unsigned char e_ident[EI_NIDENT];
+    Elf_Half    e_type;
+    Elf_Half    e_machine;
+    Elf_Word    e_version;
+    Elf32_Addr  e_entry;
+    Elf32_Off   e_phoff;
+    Elf32_Off   e_shoff;
+    Elf_Word    e_flags;
+    Elf_Half    e_ehsize;
+    Elf_Half    e_phentsize;
+    Elf_Half    e_phnum;
+    Elf_Half    e_shentsize;
+    Elf_Half    e_shnum;
+    Elf_Half    e_shstrndx;
+};
+
+struct Elf64_Ehdr {
+    unsigned char e_ident[EI_NIDENT];
+    Elf_Half    e_type;
+    Elf_Half    e_machine;
+    Elf_Word    e_version;
+    Elf64_Addr  e_entry;
+    Elf64_Off   e_phoff;
+    Elf64_Off   e_shoff;
+    Elf_Word    e_flags;
+    Elf_Half    e_ehsize;
+    Elf_Half    e_phentsize;
+    Elf_Half    e_phnum;
+    Elf_Half    e_shentsize;
+    Elf_Half    e_shnum;
+    Elf_Half    e_shstrndx;
+};
+
+
+// Section header
+struct Elf32_Shdr {
+    Elf_Word   sh_name;
+    Elf_Word   sh_type;
+    Elf_Word   sh_flags;
+    Elf32_Addr sh_addr;
+    Elf32_Off  sh_offset;
+    Elf_Word   sh_size;
+    Elf_Word   sh_link;
+    Elf_Word   sh_info;
+    Elf_Word   sh_addralign;
+    Elf_Word   sh_entsize;
+};
+
+struct Elf64_Shdr {
+    Elf_Word   sh_name;
+    Elf_Word   sh_type;
+    Elf_Xword  sh_flags;
+    Elf64_Addr sh_addr;
+    Elf64_Off  sh_offset;
+    Elf_Xword  sh_size;
+    Elf_Word   sh_link;
+    Elf_Word   sh_info;
+    Elf_Xword  sh_addralign;
+    Elf_Xword  sh_entsize;
+};
+
+
+// Segment header
+struct Elf32_Phdr {
+    Elf_Word   p_type;
+    Elf32_Off  p_offset;
+    Elf32_Addr p_vaddr;
+    Elf32_Addr p_paddr;
+    Elf_Word   p_filesz;
+    Elf_Word   p_memsz;
+    Elf_Word   p_flags;
+    Elf_Word   p_align;
+};
+
+struct Elf64_Phdr {
+    Elf_Word   p_type;
+    Elf_Word   p_flags;
+    Elf64_Off  p_offset;
+    Elf64_Addr p_vaddr;
+    Elf64_Addr p_paddr;
+    Elf_Xword  p_filesz;
+    Elf_Xword  p_memsz;
+    Elf_Xword  p_align;
+};
+
+
+// Symbol table entry
+struct Elf32_Sym {
+    Elf_Word      st_name;
+    Elf32_Addr    st_value;
+    Elf_Word      st_size;
+    unsigned char st_info;
+    unsigned char st_other;
+    Elf_Half      st_shndx;
+};
+
+struct Elf64_Sym {
+    Elf_Word      st_name;
+    unsigned char st_info;
+    unsigned char st_other;
+    Elf_Half      st_shndx;
+    Elf64_Addr    st_value;
+    Elf_Xword     st_size;
+};
+
+
+#define ELF_ST_BIND(i)   ((i)>>4)
+#define ELF_ST_TYPE(i)   ((i)&0xf)
+#define ELF_ST_INFO(b,t) (((b)<<4)+((t)&0xf))
+
+#define ELF_ST_VISIBILITY(o) ((o)&0x3)
+
+
+// Relocation entries
+struct Elf32_Rel {
+    Elf32_Addr r_offset;
+    Elf_Word   r_info;
+};
+
+struct Elf32_Rela {
+    Elf32_Addr r_offset;
+    Elf_Word   r_info;
+    Elf_Sword  r_addend;
+};
+
+struct Elf64_Rel {
+    Elf64_Addr r_offset;
+    Elf_Xword  r_info;
+};
+
+struct Elf64_Rela {
+    Elf64_Addr r_offset;
+    Elf_Xword  r_info;
+    Elf_Sxword r_addend;
+};
+
+
+#define ELF32_R_SYM(i)    ((i)>>8)
+#define ELF32_R_TYPE(i)   ((unsigned char)(i))
+#define ELF32_R_INFO(s,t) (((s)<<8 )+(unsigned char)(t))
+
+#define ELF64_R_SYM(i)    ((i)>>32)
+#define ELF64_R_TYPE(i)   ((i)&0xffffffffL)
+#define ELF64_R_INFO(s,t) ((((int64_t)s)<<32)+((t)&0xffffffffL))
+
+// Dynamic structure
+struct Elf32_Dyn {
+    Elf_Sword d_tag;
+    union {
+        Elf_Word   d_val;
+        Elf32_Addr d_ptr;
+    } d_un;
+};
+
+struct Elf64_Dyn {
+    Elf_Sxword d_tag;
+    union {
+        Elf_Xword  d_val;
+        Elf64_Addr d_ptr;
+    } d_un;
+};
+
+} // namespace ELFIO
+
+#endif // ELFTYPES_H
diff --git a/third_party/elfio/elfio.hpp b/third_party/elfio/elfio.hpp
new file mode 100644
index 00000000000..a4052bd0620
--- /dev/null
+++ b/third_party/elfio/elfio.hpp
@@ -0,0 +1,947 @@
+/*
+Copyright (C) 2001-2015 by Serge Lamikhov-Center
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef ELFIO_HPP
+#define ELFIO_HPP
+
+#ifdef _MSC_VER
+#pragma warning ( push )
+#pragma warning(disable:4996)
+#pragma warning(disable:4355)
+#pragma warning(disable:4244)
+#endif
+
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <algorithm>
+#include <vector>
+#include <deque>
+#include <iterator>
+#include <typeinfo>
+
+#include <elfio/elf_types.hpp>
+#include <elfio/elfio_utils.hpp>
+#include <elfio/elfio_header.hpp>
+#include <elfio/elfio_section.hpp>
+#include <elfio/elfio_segment.hpp>
+#include <elfio/elfio_strings.hpp>
+
+#define ELFIO_HEADER_ACCESS_GET( TYPE, FNAME ) \
+TYPE                                           \
+get_##FNAME() const                            \
+{                                              \
+  return header? header->get_##FNAME() : 0;    \
+}
+
+#define ELFIO_HEADER_ACCESS_GET_SET( TYPE, FNAME ) \
+TYPE                                               \
+get_##FNAME() const                                \
+{                                                  \
+  return header? header->get_##FNAME() : 0;        \
+}                                                  \
+void                                               \
+set_##FNAME( TYPE val )                            \
+{ 						   \
+  if (header) { 			    	   \
+      header->set_##FNAME( val );                  \
+  } 						   \
+}                                                  \
+
+namespace ELFIO {
+
+//------------------------------------------------------------------------------
+class elfio
+{
+  public:
+//------------------------------------------------------------------------------
+    elfio() : sections( this ), segments( this )
+    {
+        header           = 0;
+        current_file_pos = 0;
+        create( ELFCLASS32, ELFDATA2LSB );
+    }
+
+//------------------------------------------------------------------------------
+    ~elfio()
+    {
+        clean();
+    }
+
+//------------------------------------------------------------------------------
+    void create( unsigned char file_class, unsigned char encoding )
+    {
+        clean();
+        convertor.setup( encoding );
+        header = create_header( file_class, encoding );
+        create_mandatory_sections();
+    }
+
+//------------------------------------------------------------------------------
+    bool load( const std::string& file_name )
+    {
+        std::ifstream stream;
+        stream.open( file_name.c_str(), std::ios::in | std::ios::binary );
+        if ( !stream ) {
+            return false;
+        }
+
+        return load(stream);
+    }
+
+//------------------------------------------------------------------------------
+    bool load( std::istream &stream )
+    {
+        clean();
+
+	unsigned char e_ident[EI_NIDENT];
+	// Read ELF file signature
+	stream.read( reinterpret_cast<char*>( &e_ident ), sizeof( e_ident ) );
+
+        // Is it ELF file?
+        if ( stream.gcount() != sizeof( e_ident ) ||
+             e_ident[EI_MAG0] != ELFMAG0    ||
+             e_ident[EI_MAG1] != ELFMAG1    ||
+             e_ident[EI_MAG2] != ELFMAG2    ||
+             e_ident[EI_MAG3] != ELFMAG3 ) {
+            return false;
+        }
+
+        if ( ( e_ident[EI_CLASS] != ELFCLASS64 ) &&
+             ( e_ident[EI_CLASS] != ELFCLASS32 )) {
+            return false;
+        }
+
+        convertor.setup( e_ident[EI_DATA] );
+        header = create_header( e_ident[EI_CLASS], e_ident[EI_DATA] );
+        if ( 0 == header ) {
+            return false;
+        }
+        if ( !header->load( stream ) ) {
+            return false;
+        }
+
+        load_sections( stream );
+        bool is_still_good = load_segments( stream );
+        return is_still_good;
+    }
+
+//------------------------------------------------------------------------------
+    bool save( const std::string& file_name )
+    {
+        std::ofstream f( file_name.c_str(), std::ios::out | std::ios::binary );
+
+        if ( !f || !header) {
+            return false;
+        }
+
+        bool is_still_good = true;
+        // Define layout specific header fields
+        // The position of the segment table is fixed after the header.
+        // The position of the section table is variable and needs to be fixed
+        // before saving.
+        header->set_segments_num( segments.size() );
+        header->set_segments_offset( segments.size() ? header->get_header_size() : 0 );
+        header->set_sections_num( sections.size() );
+        header->set_sections_offset( 0 );
+
+        // Layout the first section right after the segment table
+        current_file_pos = header->get_header_size() +
+                    header->get_segment_entry_size() * header->get_segments_num();
+
+        calc_segment_alignment();
+
+        is_still_good = layout_segments_and_their_sections();
+        is_still_good = is_still_good && layout_sections_without_segments();
+        is_still_good = is_still_good && layout_section_table();
+
+        is_still_good = is_still_good && save_header( f );
+        is_still_good = is_still_good && save_sections( f );
+        is_still_good = is_still_good && save_segments( f );
+
+        f.close();
+
+        return is_still_good;
+    }
+
+//------------------------------------------------------------------------------
+    // ELF header access functions
+    ELFIO_HEADER_ACCESS_GET( unsigned char, class              );
+    ELFIO_HEADER_ACCESS_GET( unsigned char, elf_version        );
+    ELFIO_HEADER_ACCESS_GET( unsigned char, encoding           );
+    ELFIO_HEADER_ACCESS_GET( Elf_Word,      version            );
+    ELFIO_HEADER_ACCESS_GET( Elf_Half,      header_size        );
+    ELFIO_HEADER_ACCESS_GET( Elf_Half,      section_entry_size );
+    ELFIO_HEADER_ACCESS_GET( Elf_Half,      segment_entry_size );
+
+    ELFIO_HEADER_ACCESS_GET_SET( unsigned char, os_abi                 );
+    ELFIO_HEADER_ACCESS_GET_SET( unsigned char, abi_version            );
+    ELFIO_HEADER_ACCESS_GET_SET( Elf_Half,      type                   );
+    ELFIO_HEADER_ACCESS_GET_SET( Elf_Half,      machine                );
+    ELFIO_HEADER_ACCESS_GET_SET( Elf_Word,      flags                  );
+    ELFIO_HEADER_ACCESS_GET_SET( Elf64_Addr,    entry                  );
+    ELFIO_HEADER_ACCESS_GET_SET( Elf64_Off,     sections_offset        );
+    ELFIO_HEADER_ACCESS_GET_SET( Elf64_Off,     segments_offset        );
+    ELFIO_HEADER_ACCESS_GET_SET( Elf_Half,      section_name_str_index );
+
+//------------------------------------------------------------------------------
+    const endianess_convertor& get_convertor() const
+    {
+        return convertor;
+    }
+
+//------------------------------------------------------------------------------
+    Elf_Xword get_default_entry_size( Elf_Word section_type ) const
+    {
+        switch( section_type ) {
+        case SHT_RELA:
+            if ( header->get_class() == ELFCLASS64 ) {
+                return sizeof( Elf64_Rela );
+            }
+            else {
+                return sizeof( Elf32_Rela );
+            }
+        case SHT_REL:
+            if ( header->get_class() == ELFCLASS64 ) {
+                return sizeof( Elf64_Rel );
+            }
+            else {
+                return sizeof( Elf32_Rel );
+            }
+        case SHT_SYMTAB:
+            if ( header->get_class() == ELFCLASS64 ) {
+                return sizeof( Elf64_Sym );
+            }
+            else {
+                return sizeof( Elf32_Sym );
+            }
+        case SHT_DYNAMIC:
+            if ( header->get_class() == ELFCLASS64 ) {
+                return sizeof( Elf64_Dyn );
+            }
+            else {
+                return sizeof( Elf32_Dyn );
+            }
+        default:
+            return 0;
+        }
+    }
+
+//------------------------------------------------------------------------------
+  private:
+      bool is_offset_in_section( Elf64_Off offset, const section* sec ) const {
+          return offset >= sec->get_offset() && offset < sec->get_offset()+sec->get_size();
+      }
+
+//------------------------------------------------------------------------------
+  public:
+
+      //! returns an empty string if no problems are detected,
+      //! or a string containing an error message if problems are found
+      std::string validate() const {
+
+          // check for overlapping sections in the file
+          for ( int i = 0; i < sections.size(); ++i) {
+              for ( int j = i+1; j < sections.size(); ++j ) {
+                  const section* a = sections[i];
+                  const section* b = sections[j];
+                  if (   !(a->get_type() & SHT_NOBITS)
+                      && !(b->get_type() & SHT_NOBITS)
+                      && (a->get_size() > 0)
+                      && (b->get_size() > 0)
+                      && (a->get_offset() > 0)
+                      && (b->get_offset() > 0)) {
+                      if (   is_offset_in_section( a->get_offset(), b )
+                          || is_offset_in_section( a->get_offset()+a->get_size()-1, b )
+                          || is_offset_in_section( b->get_offset(), a )
+                          || is_offset_in_section( b->get_offset()+b->get_size()-1, a )) {
+                          return "Sections " + a->get_name() + " and " + b->get_name() + " overlap in file";
+                      }
+                  }
+              }
+          }
+
+          // more checks to be added here...
+
+          return "";
+      }
+
+//------------------------------------------------------------------------------
+  private:
+//------------------------------------------------------------------------------
+    void clean()
+    {
+        delete header;
+        header = 0;
+
+        std::vector<section*>::const_iterator it;
+        for ( it = sections_.begin(); it != sections_.end(); ++it ) {
+            delete *it;
+        }
+        sections_.clear();
+
+        std::vector<segment*>::const_iterator it1;
+        for ( it1 = segments_.begin(); it1 != segments_.end(); ++it1 ) {
+            delete *it1;
+        }
+        segments_.clear();
+    }
+
+//------------------------------------------------------------------------------
+    elf_header* create_header( unsigned char file_class, unsigned char encoding )
+    {
+        elf_header* new_header = 0;
+
+        if ( file_class == ELFCLASS64 ) {
+            new_header = new elf_header_impl< Elf64_Ehdr >( &convertor,
+                                                            encoding );
+        }
+        else if ( file_class == ELFCLASS32 ) {
+            new_header = new elf_header_impl< Elf32_Ehdr >( &convertor,
+                                                            encoding );
+        }
+        else {
+            return 0;
+        }
+
+        return new_header;
+    }
+
+//------------------------------------------------------------------------------
+    section* create_section()
+    {
+        section*      new_section;
+        unsigned char file_class = get_class();
+
+        if ( file_class == ELFCLASS64 ) {
+            new_section = new section_impl<Elf64_Shdr>( &convertor );
+        }
+        else if ( file_class == ELFCLASS32 ) {
+            new_section = new section_impl<Elf32_Shdr>( &convertor );
+        }
+        else {
+            return 0;
+        }
+
+        new_section->set_index( (Elf_Half)sections_.size() );
+        sections_.push_back( new_section );
+
+        return new_section;
+    }
+
+
+//------------------------------------------------------------------------------
+    segment* create_segment()
+    {
+        segment*      new_segment;
+        unsigned char file_class = header->get_class();
+
+        if ( file_class == ELFCLASS64 ) {
+            new_segment = new segment_impl<Elf64_Phdr>( &convertor );
+        }
+        else if ( file_class == ELFCLASS32 ) {
+            new_segment = new segment_impl<Elf32_Phdr>( &convertor );
+        }
+        else {
+            return 0;
+        }
+
+        new_segment->set_index( (Elf_Half)segments_.size() );
+        segments_.push_back( new_segment );
+
+        return new_segment;
+    }
+
+//------------------------------------------------------------------------------
+    void create_mandatory_sections()
+    {
+        // Create null section without calling to 'add_section' as no string
+        // section containing section names exists yet
+        section* sec0 = create_section();
+        sec0->set_index( 0 );
+        sec0->set_name( "" );
+        sec0->set_name_string_offset( 0 );
+
+        set_section_name_str_index( 1 );
+        section* shstrtab = sections.add( ".shstrtab" );
+        shstrtab->set_type( SHT_STRTAB );
+        shstrtab->set_addr_align( 1 );
+    }
+
+//------------------------------------------------------------------------------
+    Elf_Half load_sections( std::istream& stream )
+    {
+        Elf_Half  entry_size = header->get_section_entry_size();
+        Elf_Half  num        = header->get_sections_num();
+        Elf64_Off offset     = header->get_sections_offset();
+
+        for ( Elf_Half i = 0; i < num; ++i ) {
+            section* sec = create_section();
+            sec->load( stream, (std::streamoff)offset + i * entry_size );
+            sec->set_index( i );
+            // To mark that the section is not permitted to reassign address
+            // during layout calculation
+            sec->set_address( sec->get_address() );
+        }
+
+        Elf_Half shstrndx = get_section_name_str_index();
+
+        if ( SHN_UNDEF != shstrndx ) {
+            string_section_accessor str_reader( sections[shstrndx] );
+            for ( Elf_Half i = 0; i < num; ++i ) {
+                Elf_Word section_offset = sections[i]->get_name_string_offset();
+                const char* p = str_reader.get_string( section_offset );
+                if ( p != 0 ) {
+                    sections[i]->set_name( p );
+                }
+            }
+        }
+
+        return num;
+    }
+
+//------------------------------------------------------------------------------
+    //! Checks whether the addresses of the section entirely fall within the given segment.
+    //! It doesn't matter if the addresses are memory addresses, or file offsets,
+    //!  they just need to be in the same address space
+    bool is_sect_in_seg ( Elf64_Off sect_begin, Elf_Xword sect_size, Elf64_Off seg_begin, Elf64_Off seg_end ) {
+        return seg_begin <= sect_begin
+                && sect_begin + sect_size <= seg_end
+                && sect_begin < seg_end;  // this is important criteria when sect_size == 0
+                                          // Example:  seg_begin=10, seg_end=12 (-> covering the bytes 10 and 11)
+                                          //           sect_begin=12, sect_size=0  -> shall return false!
+    }
+
+//------------------------------------------------------------------------------
+    bool load_segments( std::istream& stream )
+    {
+        Elf_Half  entry_size = header->get_segment_entry_size();
+        Elf_Half  num        = header->get_segments_num();
+        Elf64_Off offset     = header->get_segments_offset();
+
+        for ( Elf_Half i = 0; i < num; ++i ) {
+            segment* seg;
+            unsigned char file_class = header->get_class();
+
+            if ( file_class == ELFCLASS64 ) {
+                seg = new segment_impl<Elf64_Phdr>( &convertor );
+            }
+            else if ( file_class == ELFCLASS32 ) {
+                seg = new segment_impl<Elf32_Phdr>( &convertor );
+            }
+            else {
+                return false;
+            }
+
+            seg->load( stream, (std::streamoff)offset + i * entry_size );
+            seg->set_index( i );
+
+            // Add sections to the segments (similar to readelfs algorithm)
+            Elf64_Off segBaseOffset = seg->get_offset();
+            Elf64_Off segEndOffset  = segBaseOffset + seg->get_file_size();
+            Elf64_Off segVBaseAddr = seg->get_virtual_address();
+            Elf64_Off segVEndAddr  = segVBaseAddr + seg->get_memory_size();
+            for( Elf_Half j = 0; j < sections.size(); ++j ) {
+                const section* psec = sections[j];
+
+                // SHF_ALLOC sections are matched based on the virtual address
+                // otherwise the file offset is matched
+                if( psec->get_flags() & SHF_ALLOC
+                      ? is_sect_in_seg( psec->get_address(), psec->get_size(), segVBaseAddr,  segVEndAddr )
+                      : is_sect_in_seg( psec->get_offset(),  psec->get_size(), segBaseOffset, segEndOffset )) {
+                      // Alignment of segment shall not be updated, to preserve original value
+                      // It will be re-calculated on saving.
+                      seg->add_section_index( psec->get_index(), 0 );
+                }
+            }
+
+            // Add section into the segments' container
+            segments_.push_back( seg );
+        }
+
+        return true;
+    }
+
+//------------------------------------------------------------------------------
+    bool save_header( std::ofstream& f )
+    {
+        return header->save( f );
+    }
+
+//------------------------------------------------------------------------------
+    bool save_sections( std::ofstream& f )
+    {
+        for ( unsigned int i = 0; i < sections_.size(); ++i ) {
+            section *sec = sections_.at(i);
+
+            std::streampos headerPosition =
+                (std::streamoff)header->get_sections_offset() +
+                header->get_section_entry_size() * sec->get_index();
+
+            sec->save(f,headerPosition,sec->get_offset());
+        }
+        return true;
+    }
+
+//------------------------------------------------------------------------------
+    bool save_segments( std::ofstream& f )
+    {
+        for ( unsigned int i = 0; i < segments_.size(); ++i ) {
+            segment *seg = segments_.at(i);
+
+            std::streampos headerPosition = header->get_segments_offset()  +
+                header->get_segment_entry_size()*seg->get_index();
+
+            seg->save( f, headerPosition, seg->get_offset() );
+        }
+        return true;
+    }
+
+//------------------------------------------------------------------------------
+    bool is_section_without_segment( unsigned int section_index )
+    {
+        bool found = false;
+
+        for ( unsigned int j = 0; !found && ( j < segments.size() ); ++j ) {
+            for ( unsigned int k = 0;
+                  !found && ( k < segments[j]->get_sections_num() );
+                  ++k ) {
+                found = segments[j]->get_section_index_at( k ) == section_index;
+            }
+        }
+
+        return !found;
+    }
+
+//------------------------------------------------------------------------------
+    bool is_subsequence_of( segment* seg1, segment* seg2 )
+    {
+        // Return 'true' if sections of seg1 are a subset of sections in seg2
+        const std::vector<Elf_Half>& sections1 = seg1->get_sections();
+        const std::vector<Elf_Half>& sections2 = seg2->get_sections();
+
+        bool found = false;
+        if ( sections1.size() <  sections2.size() ) {
+            found = std::includes( sections2.begin(), sections2.end(),
+                                   sections1.begin(), sections1.end() );
+        }
+
+        return found;
+    }
+
+//------------------------------------------------------------------------------
+    std::vector<segment*> get_ordered_segments( )
+    {
+        std::vector<segment*> res;
+        std::deque<segment*>  worklist;
+
+        res.reserve(segments.size());
+        std::copy( segments_.begin(), segments_.end(),
+                   std::back_inserter( worklist )) ;
+
+        // Bring the segments which start at address 0 to the front
+        size_t nextSlot = 0;
+        for( size_t i = 0; i < worklist.size(); ++i ) {
+            if( i != nextSlot && worklist[i]->is_offset_initialized()
+                && worklist[i]->get_offset() == 0 ) {
+                if (worklist[nextSlot]->get_offset() == 0) {
+                    ++nextSlot;
+                }
+                std::swap(worklist[i],worklist[nextSlot]);
+                ++nextSlot;
+            }
+        }
+
+        while ( !worklist.empty() ) {
+            segment *seg = worklist.front();
+            worklist.pop_front();
+
+            size_t i = 0;
+            for ( ; i < worklist.size(); ++i ) {
+                if ( is_subsequence_of( seg, worklist[i] ) ) {
+                    break;
+                }
+            }
+
+            if ( i < worklist.size() )
+                worklist.push_back(seg);
+            else
+                res.push_back(seg);
+        }
+
+        return res;
+    }
+
+
+//------------------------------------------------------------------------------
+    bool layout_sections_without_segments( )
+    {
+        for ( unsigned int i = 0; i < sections_.size(); ++i ) {
+            if ( is_section_without_segment( i ) ) {
+                section *sec = sections_[i];
+
+                Elf_Xword section_align = sec->get_addr_align();
+                if ( section_align > 1 && current_file_pos % section_align != 0 ) {
+                    current_file_pos += section_align -
+                                            current_file_pos % section_align;
+                }
+
+                if ( 0 != sec->get_index() )
+                  sec->set_offset(current_file_pos);
+
+                if ( SHT_NOBITS != sec->get_type() &&
+                     SHT_NULL   != sec->get_type() ) {
+                    current_file_pos += sec->get_size();
+                }
+            }
+        }
+
+        return true;
+    }
+
+
+//------------------------------------------------------------------------------
+    void calc_segment_alignment( )
+    {
+        for( std::vector<segment*>::iterator s = segments_.begin(); s != segments_.end(); ++s ) {
+            segment* seg = *s;
+            for ( int i = 0; i < seg->get_sections_num(); ++i ) {
+                section* sect = sections_[ seg->get_section_index_at(i) ];
+                if ( sect->get_addr_align() > seg->get_align() ) {
+                    seg->set_align( sect->get_addr_align() );
+                }
+            }
+        }
+    }
+
+//------------------------------------------------------------------------------
+    bool layout_segments_and_their_sections( )
+    {
+        std::vector<segment*>  worklist;
+        std::vector<bool>      section_generated(sections.size(),false);
+
+        // Get segments in a order in where segments which contain a
+        // sub sequence of other segments are located at the end
+        worklist = get_ordered_segments();
+
+        for ( unsigned int i = 0; i < worklist.size(); ++i ) {
+            Elf_Xword segment_memory   = 0;
+            Elf_Xword segment_filesize = 0;
+            Elf_Xword seg_start_pos    = current_file_pos;
+            segment* seg               = worklist[i];
+
+            // Special case: PHDR segment
+            // This segment contains the program headers but no sections
+            if ( seg->get_type() == PT_PHDR && seg->get_sections_num() == 0 ) {
+                seg_start_pos = header->get_segments_offset();
+                segment_memory = segment_filesize =
+                    header->get_segment_entry_size() * header->get_segments_num();
+            }
+            // Special case:
+            // Segments which start with the NULL section and have further sections
+            else if ( seg->get_sections_num() > 1
+                      && sections[seg->get_section_index_at( 0 )]->get_type() == SHT_NULL ) {
+                seg_start_pos = 0;
+                if ( seg->get_sections_num() ) {
+                    segment_memory = segment_filesize = current_file_pos;
+                }
+            }
+            // New segments with not generated sections
+            // have to be aligned
+            else if ( seg->get_sections_num()
+                     && !section_generated[seg->get_section_index_at( 0 )] ) {
+                Elf_Xword align = seg->get_align() > 0 ? seg->get_align() : 1;
+                Elf64_Off cur_page_alignment = current_file_pos % align;
+                Elf64_Off req_page_alignment = seg->get_virtual_address() % align;
+                Elf64_Off error              = req_page_alignment - cur_page_alignment;
+
+                current_file_pos += ( seg->get_align() + error ) % align;
+                seg_start_pos = current_file_pos;
+            }
+            else if ( seg->get_sections_num() ) {
+                seg_start_pos = sections[seg->get_section_index_at( 0 )]->get_offset();
+            }
+
+            // Write segment's data
+            for ( unsigned int j = 0; j < seg->get_sections_num(); ++j ) {
+                Elf_Half index = seg->get_section_index_at( j );
+
+                section* sec = sections[ index ];
+
+                // The NULL section is always generated
+                if ( SHT_NULL == sec->get_type()) {
+                    section_generated[index] = true;
+                    continue;
+                }
+
+                Elf_Xword secAlign = 0;
+                // Fix up the alignment
+                if ( !section_generated[index] && sec->is_address_initialized()
+                    && SHT_NOBITS != sec->get_type()
+                    && SHT_NULL != sec->get_type()
+                    && 0 != sec->get_size() ) {
+                    // Align the sections based on the virtual addresses
+                    // when possible (this is what matters for execution)
+                    Elf64_Off req_offset = sec->get_address() - seg->get_virtual_address();
+                    Elf64_Off cur_offset = current_file_pos - seg_start_pos;
+                    if ( req_offset < cur_offset) {
+                         // something has gone awfully wrong, abort!
+                         // secAlign would turn out negative, seeking backwards and overwriting previous data
+                         return false;
+                    }
+                    secAlign             = req_offset - cur_offset;
+                }
+                else if (!section_generated[index] && !sec->is_address_initialized() ) {
+                    // If no address has been specified then only the section
+                    // alignment constraint has to be matched
+					Elf_Xword align = sec->get_addr_align();
+					if (align == 0) {
+						align = 1;
+					}
+                    Elf64_Off error = current_file_pos % align;
+                    secAlign = ( align - error ) % align;
+                }
+                else if (section_generated[index] ) {
+                    // Alignment for already generated sections
+                    secAlign = sec->get_offset() - seg_start_pos - segment_filesize;
+                }
+
+                // Determine the segment file and memory sizes
+                // Special case .tbss section (NOBITS) in non TLS segment
+                if ( (sec->get_flags() & SHF_ALLOC)
+                    && !( (sec->get_flags() & SHF_TLS) && (seg->get_type() != PT_TLS)
+                          && ( SHT_NOBITS == sec->get_type())) )
+                    segment_memory += sec->get_size() + secAlign;
+                if ( SHT_NOBITS != sec->get_type() && SHT_NULL != sec->get_type() )
+                    segment_filesize += sec->get_size() + secAlign;
+
+                // Nothing to be done when generating nested segments
+                if(section_generated[index]) {
+                    continue;
+                }
+
+                current_file_pos += secAlign;
+
+                // Set the section addresses when missing
+                if ( !sec->is_address_initialized() )
+                    sec->set_address( seg->get_virtual_address()
+                                      + current_file_pos - seg_start_pos);
+
+                if ( 0 != sec->get_index() )
+                  sec->set_offset(current_file_pos);
+
+                if ( SHT_NOBITS != sec->get_type() && SHT_NULL != sec->get_type() )
+                  current_file_pos += sec->get_size();
+                section_generated[index] = true;
+            }
+
+            seg->set_file_size( segment_filesize );
+
+            // If we already have a memory size from loading an elf file (value > 0),
+            // it must not shrink!
+            // Memory size may be bigger than file size and it is the loader's job to do something
+            // with the surplus bytes in memory, like initializing them with a defined value.
+            if ( seg->get_memory_size() < segment_memory ) {
+                seg->set_memory_size( segment_memory );
+            }
+
+            seg->set_offset(seg_start_pos);
+        }
+
+        return true;
+    }
+
+//------------------------------------------------------------------------------
+    bool layout_section_table()
+    {
+        // Simply place the section table at the end for now
+        Elf64_Off alignmentError = current_file_pos % 4;
+        current_file_pos += ( 4 - alignmentError ) % 4;
+        header->set_sections_offset(current_file_pos);
+        return true;
+    }
+
+
+//------------------------------------------------------------------------------
+  public:
+    friend class Sections;
+    class Sections {
+      public:
+//------------------------------------------------------------------------------
+        Sections( elfio* parent_ ) :
+            parent( parent_ )
+        {
+        }
+
+//------------------------------------------------------------------------------
+        Elf_Half size() const
+        {
+            return (Elf_Half)parent->sections_.size();
+        }
+
+//------------------------------------------------------------------------------
+        section* operator[]( unsigned int index ) const
+        {
+            section* sec = 0;
+
+            if ( index < parent->sections_.size() ) {
+                sec = parent->sections_[index];
+            }
+
+            return sec;
+        }
+
+//------------------------------------------------------------------------------
+        section* operator[]( const std::string& name ) const
+        {
+            section* sec = 0;
+
+            std::vector<section*>::const_iterator it;
+            for ( it = parent->sections_.begin();
+                  it != parent->sections_.end();
+                  ++it ) {
+                if ( (*it)->get_name() == name ) {
+                    sec = *it;
+                    break;
+                }
+            }
+
+            return sec;
+        }
+
+//------------------------------------------------------------------------------
+        section* add( const std::string& name )
+        {
+            section* new_section = parent->create_section();
+            new_section->set_name( name );
+
+            Elf_Half str_index = parent->get_section_name_str_index();
+            section* string_table( parent->sections_[str_index] );
+            string_section_accessor str_writer( string_table );
+            Elf_Word pos = str_writer.add_string( name );
+            new_section->set_name_string_offset( pos );
+
+            return new_section;
+        }
+
+//------------------------------------------------------------------------------
+        std::vector<section*>::iterator begin() {
+            return parent->sections_.begin();
+        }
+
+//------------------------------------------------------------------------------
+        std::vector<section*>::iterator end() {
+            return parent->sections_.end();
+        }
+
+//------------------------------------------------------------------------------
+        std::vector<section*>::const_iterator begin() const {
+            return parent->sections_.cbegin();
+        }
+
+//------------------------------------------------------------------------------
+        std::vector<section*>::const_iterator end() const {
+            return parent->sections_.cend();
+        }
+
+//------------------------------------------------------------------------------
+      private:
+        elfio* parent;
+    } sections;
+
+//------------------------------------------------------------------------------
+  public:
+    friend class Segments;
+    class Segments {
+      public:
+//------------------------------------------------------------------------------
+        Segments( elfio* parent_ ) :
+            parent( parent_ )
+        {
+        }
+
+//------------------------------------------------------------------------------
+        Elf_Half size() const
+        {
+            return (Elf_Half)parent->segments_.size();
+        }
+
+//------------------------------------------------------------------------------
+        segment* operator[]( unsigned int index ) const
+        {
+            return parent->segments_[index];
+        }
+
+
+//------------------------------------------------------------------------------
+        segment* add()
+        {
+            return parent->create_segment();
+        }
+
+//------------------------------------------------------------------------------
+        std::vector<segment*>::iterator begin() {
+            return parent->segments_.begin();
+        }
+
+//------------------------------------------------------------------------------
+        std::vector<segment*>::iterator end() {
+            return parent->segments_.end();
+        }
+
+//------------------------------------------------------------------------------
+        std::vector<segment*>::const_iterator begin() const {
+            return parent->segments_.cbegin();
+        }
+
+//------------------------------------------------------------------------------
+        std::vector<segment*>::const_iterator end() const {
+            return parent->segments_.cend();
+        }
+
+//------------------------------------------------------------------------------
+      private:
+        elfio* parent;
+    } segments;
+
+//------------------------------------------------------------------------------
+  private:
+    elf_header*           header;
+    std::vector<section*> sections_;
+    std::vector<segment*> segments_;
+    endianess_convertor   convertor;
+
+    Elf_Xword current_file_pos;
+};
+
+} // namespace ELFIO
+
+#include <elfio/elfio_symbols.hpp>
+#include <elfio/elfio_note.hpp>
+#include <elfio/elfio_relocation.hpp>
+#include <elfio/elfio_dynamic.hpp>
+
+#ifdef _MSC_VER
+#pragma warning ( pop )
+#endif
+
+#endif // ELFIO_HPP
diff --git a/third_party/elfio/elfio_amdgpu.hpp b/third_party/elfio/elfio_amdgpu.hpp
new file mode 100644
index 00000000000..318ecdf144a
--- /dev/null
+++ b/third_party/elfio/elfio_amdgpu.hpp
@@ -0,0 +1,91 @@
+//===----------------------------------------------------------------------===//
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ELFIO_AMDGPU_HPP
+#define ELFIO_AMDGPU_HPP
+
+namespace hc {
+
+// AMDGPU e_flags mirroring from llvm/include/llvm/BinaryFormat/ELF.h
+
+// AMDGPU specific e_flags.
+enum : unsigned {
+  // Processor selection mask for EF_AMDGPU_MACH_* values.
+  EF_AMDGPU_MACH = 0x0ff,
+
+  // Not specified processor.
+  EF_AMDGPU_MACH_NONE = 0x000,
+
+  // R600-based processors.
+
+  // Radeon HD 2000/3000 Series (R600).
+  EF_AMDGPU_MACH_R600_R600 = 0x001,
+  EF_AMDGPU_MACH_R600_R630 = 0x002,
+  EF_AMDGPU_MACH_R600_RS880 = 0x003,
+  EF_AMDGPU_MACH_R600_RV670 = 0x004,
+  // Radeon HD 4000 Series (R700).
+  EF_AMDGPU_MACH_R600_RV710 = 0x005,
+  EF_AMDGPU_MACH_R600_RV730 = 0x006,
+  EF_AMDGPU_MACH_R600_RV770 = 0x007,
+  // Radeon HD 5000 Series (Evergreen).
+  EF_AMDGPU_MACH_R600_CEDAR = 0x008,
+  EF_AMDGPU_MACH_R600_CYPRESS = 0x009,
+  EF_AMDGPU_MACH_R600_JUNIPER = 0x00a,
+  EF_AMDGPU_MACH_R600_REDWOOD = 0x00b,
+  EF_AMDGPU_MACH_R600_SUMO = 0x00c,
+  // Radeon HD 6000 Series (Northern Islands).
+  EF_AMDGPU_MACH_R600_BARTS = 0x00d,
+  EF_AMDGPU_MACH_R600_CAICOS = 0x00e,
+  EF_AMDGPU_MACH_R600_CAYMAN = 0x00f,
+  EF_AMDGPU_MACH_R600_TURKS = 0x010,
+
+  // Reserved for R600-based processors.
+  EF_AMDGPU_MACH_R600_RESERVED_FIRST = 0x011,
+  EF_AMDGPU_MACH_R600_RESERVED_LAST = 0x01f,
+
+  // First/last R600-based processors.
+  EF_AMDGPU_MACH_R600_FIRST = EF_AMDGPU_MACH_R600_R600,
+  EF_AMDGPU_MACH_R600_LAST = EF_AMDGPU_MACH_R600_TURKS,
+
+  // AMDGCN-based processors.
+
+  // AMDGCN GFX6.
+  EF_AMDGPU_MACH_AMDGCN_GFX600 = 0x020,
+  EF_AMDGPU_MACH_AMDGCN_GFX601 = 0x021,
+  // AMDGCN GFX7.
+  EF_AMDGPU_MACH_AMDGCN_GFX700 = 0x022,
+  EF_AMDGPU_MACH_AMDGCN_GFX701 = 0x023,
+  EF_AMDGPU_MACH_AMDGCN_GFX702 = 0x024,
+  EF_AMDGPU_MACH_AMDGCN_GFX703 = 0x025,
+  EF_AMDGPU_MACH_AMDGCN_GFX704 = 0x026,
+  // AMDGCN GFX8.
+  EF_AMDGPU_MACH_AMDGCN_GFX801 = 0x028,
+  EF_AMDGPU_MACH_AMDGCN_GFX802 = 0x029,
+  EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
+  EF_AMDGPU_MACH_AMDGCN_GFX810 = 0x02b,
+  // AMDGCN GFX9.
+  EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
+  EF_AMDGPU_MACH_AMDGCN_GFX902 = 0x02d,
+  EF_AMDGPU_MACH_AMDGCN_GFX904 = 0x02e,
+  EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
+
+  // Reserved for AMDGCN-based processors.
+  EF_AMDGPU_MACH_AMDGCN_RESERVED0 = 0x027,
+  EF_AMDGPU_MACH_AMDGCN_RESERVED1 = 0x030,
+
+  // First/last AMDGCN-based processors.
+  EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600,
+  EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX906,
+
+  // Indicates if the xnack target feature is enabled for all code contained in
+  // the object.
+  EF_AMDGPU_XNACK = 0x100,
+};
+
+} // namespace ELFIO
+
+#endif // ELFIO_AMDGPU_HPP
diff --git a/third_party/elfio/elfio_dump.hpp b/third_party/elfio/elfio_dump.hpp
new file mode 100644
index 00000000000..913011ecde0
--- /dev/null
+++ b/third_party/elfio/elfio_dump.hpp
@@ -0,0 +1,976 @@
+/*
+Copyright (C) 2001-2015 by Serge Lamikhov-Center
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef ELFIO_DUMP_HPP
+#define ELFIO_DUMP_HPP
+
+#include <algorithm>
+#include <string>
+#include <ostream>
+#include <sstream>
+#include <iomanip>
+#include <elfio/elfio.hpp>
+
+namespace ELFIO {
+
+
+static struct class_table_t {
+    const char  key;
+    const char* str;
+} class_table [] = 
+{
+    { ELFCLASS32, "ELF32" },
+    { ELFCLASS64, "ELF64" },
+};
+
+
+static struct endian_table_t {
+    const char  key;
+    const char* str;
+} endian_table [] = 
+{
+    { ELFDATANONE, "None"          },
+    { ELFDATA2LSB, "Little endian" },
+    { ELFDATA2MSB, "Big endian"    },
+};
+
+
+static struct version_table_t {
+    const Elf64_Word key;
+    const char*      str;
+} version_table [] = 
+{
+    { EV_NONE   , "None"    },
+    { EV_CURRENT, "Current" },
+};
+
+
+static struct type_table_t {
+    const Elf32_Half key;
+    const char*      str;
+} type_table [] = 
+{
+    { ET_NONE, "No file type"       },
+    { ET_REL , "Relocatable file"   },
+    { ET_EXEC, "Executable file"    },
+    { ET_DYN , "Shared object file" },
+    { ET_CORE, "Core file"          },
+};
+
+
+static struct machine_table_t {
+    const Elf64_Half key;
+    const char*      str;
+} machine_table [] = 
+{
+    { EM_NONE         , "No machine"                                                              },
+    { EM_M32          , "AT&T WE 32100"                                                           },
+    { EM_SPARC        , "SUN SPARC"                                                               },
+    { EM_386          , "Intel 80386"                                                             },
+    { EM_68K          , "Motorola m68k family"                                                    },
+    { EM_88K          , "Motorola m88k family"                                                    },
+    { EM_486          , "Intel 80486// Reserved for future use"                                   },
+    { EM_860          , "Intel 80860"                                                             },
+    { EM_MIPS         , "MIPS R3000 (officially, big-endian only)"                                },
+    { EM_S370         , "IBM System/370"                                                          },
+    { EM_MIPS_RS3_LE  , "MIPS R3000 little-endian (Oct 4 1999 Draft) Deprecated"                  },
+    { EM_res011       , "Reserved"                                                                },
+    { EM_res012       , "Reserved"                                                                },
+    { EM_res013       , "Reserved"                                                                },
+    { EM_res014       , "Reserved"                                                                },
+    { EM_PARISC       , "HPPA"                                                                    },
+    { EM_res016       , "Reserved"                                                                },
+    { EM_VPP550       , "Fujitsu VPP500"                                                          },
+    { EM_SPARC32PLUS  , "Sun's v8plus"                                                            },
+    { EM_960          , "Intel 80960"                                                             },
+    { EM_PPC          , "PowerPC"                                                                 },
+    { EM_PPC64        , "64-bit PowerPC"                                                          },
+    { EM_S390         , "IBM S/390"                                                               },
+    { EM_SPU          , "Sony/Toshiba/IBM SPU"                                                    },
+    { EM_res024       , "Reserved"                                                                },
+    { EM_res025       , "Reserved"                                                                },
+    { EM_res026       , "Reserved"                                                                },
+    { EM_res027       , "Reserved"                                                                },
+    { EM_res028       , "Reserved"                                                                },
+    { EM_res029       , "Reserved"                                                                },
+    { EM_res030       , "Reserved"                                                                },
+    { EM_res031       , "Reserved"                                                                },
+    { EM_res032       , "Reserved"                                                                },
+    { EM_res033       , "Reserved"                                                                },
+    { EM_res034       , "Reserved"                                                                },
+    { EM_res035       , "Reserved"                                                                },
+    { EM_V800         , "NEC V800 series"                                                         },
+    { EM_FR20         , "Fujitsu FR20"                                                            },
+    { EM_RH32         , "TRW RH32"                                                                },
+    { EM_MCORE        , "Motorola M*Core // May also be taken by Fujitsu MMA"                     },
+    { EM_RCE          , "Old name for MCore"                                                      },
+    { EM_ARM          , "ARM"                                                                     },
+    { EM_OLD_ALPHA    , "Digital Alpha"                                                           },
+    { EM_SH           , "Renesas (formerly Hitachi) / SuperH SH"                                  },
+    { EM_SPARCV9      , "SPARC v9 64-bit"                                                         },
+    { EM_TRICORE      , "Siemens Tricore embedded processor"                                      },
+    { EM_ARC          , "ARC Cores"                                                               },
+    { EM_H8_300       , "Renesas (formerly Hitachi) H8/300"                                       },
+    { EM_H8_300H      , "Renesas (formerly Hitachi) H8/300H"                                      },
+    { EM_H8S          , "Renesas (formerly Hitachi) H8S"                                          },
+    { EM_H8_500       , "Renesas (formerly Hitachi) H8/500"                                       },
+    { EM_IA_64        , "Intel IA-64 Processor"                                                   },
+    { EM_MIPS_X       , "Stanford MIPS-X"                                                         },
+    { EM_COLDFIRE     , "Motorola Coldfire"                                                       },
+    { EM_68HC12       , "Motorola M68HC12"                                                        },
+    { EM_MMA          , "Fujitsu Multimedia Accelerator"                                          },
+    { EM_PCP          , "Siemens PCP"                                                             },
+    { EM_NCPU         , "Sony nCPU embedded RISC processor"                                       },
+    { EM_NDR1         , "Denso NDR1 microprocesspr"                                               },
+    { EM_STARCORE     , "Motorola Star*Core processor"                                            },
+    { EM_ME16         , "Toyota ME16 processor"                                                   },
+    { EM_ST100        , "STMicroelectronics ST100 processor"                                      },
+    { EM_TINYJ        , "Advanced Logic Corp. TinyJ embedded processor"                           },
+    { EM_X86_64       , "Advanced Micro Devices X86-64 processor"                                 },
+    { EM_PDSP         , "Sony DSP Processor"                                                      },
+    { EM_PDP10        , "Digital Equipment Corp. PDP-10"                                          },
+    { EM_PDP11        , "Digital Equipment Corp. PDP-11"                                          },
+    { EM_FX66         , "Siemens FX66 microcontroller"                                            },
+    { EM_ST9PLUS      , "STMicroelectronics ST9+ 8/16 bit microcontroller"                        },
+    { EM_ST7          , "STMicroelectronics ST7 8-bit microcontroller"                            },
+    { EM_68HC16       , "Motorola MC68HC16 Microcontroller"                                       },
+    { EM_68HC11       , "Motorola MC68HC11 Microcontroller"                                       },
+    { EM_68HC08       , "Motorola MC68HC08 Microcontroller"                                       },
+    { EM_68HC05       , "Motorola MC68HC05 Microcontroller"                                       },
+    { EM_SVX          , "Silicon Graphics SVx"                                                    },
+    { EM_ST19         , "STMicroelectronics ST19 8-bit cpu"                                       },
+    { EM_VAX          , "Digital VAX"                                                             },
+    { EM_CRIS         , "Axis Communications 32-bit embedded processor"                           },
+    { EM_JAVELIN      , "Infineon Technologies 32-bit embedded cpu"                               },
+    { EM_FIREPATH     , "Element 14 64-bit DSP processor"                                         },
+    { EM_ZSP          , "LSI Logic's 16-bit DSP processor"                                        },
+    { EM_MMIX         , "Donald Knuth's educational 64-bit processor"                             },
+    { EM_HUANY        , "Harvard's machine-independent format"                                    },
+    { EM_PRISM        , "SiTera Prism"                                                            },
+    { EM_AVR          , "Atmel AVR 8-bit microcontroller"                                         },
+    { EM_FR30         , "Fujitsu FR30"                                                            },
+    { EM_D10V         , "Mitsubishi D10V"                                                         },
+    { EM_D30V         , "Mitsubishi D30V"                                                         },
+    { EM_V850         , "NEC v850"                                                                },
+    { EM_M32R         , "Renesas M32R (formerly Mitsubishi M32R)"                                 },
+    { EM_MN10300      , "Matsushita MN10300"                                                      },
+    { EM_MN10200      , "Matsushita MN10200"                                                      },
+    { EM_PJ           , "picoJava"                                                                },
+    { EM_OPENRISC     , "OpenRISC 32-bit embedded processor"                                      },
+    { EM_ARC_A5       , "ARC Cores Tangent-A5"                                                    },
+    { EM_XTENSA       , "Tensilica Xtensa Architecture"                                           },
+    { EM_VIDEOCORE    , "Alphamosaic VideoCore processor"                                         },
+    { EM_TMM_GPP      , "Thompson Multimedia General Purpose Processor"                           },
+    { EM_NS32K        , "National Semiconductor 32000 series"                                     },
+    { EM_TPC          , "Tenor Network TPC processor"                                             },
+    { EM_SNP1K        , "Trebia SNP 1000 processor"                                               },
+    { EM_ST200        , "STMicroelectronics ST200 microcontroller"                                },
+    { EM_IP2K         , "Ubicom IP2022 micro controller"                                          },
+    { EM_MAX          , "MAX Processor"                                                           },
+    { EM_CR           , "National Semiconductor CompactRISC"                                      },
+    { EM_F2MC16       , "Fujitsu F2MC16"                                                          },
+    { EM_MSP430       , "TI msp430 micro controller"                                              },
+    { EM_BLACKFIN     , "ADI Blackfin"                                                            },
+    { EM_SE_C33       , "S1C33 Family of Seiko Epson processors"                                  },
+    { EM_SEP          , "Sharp embedded microprocessor"                                           },
+    { EM_ARCA         , "Arca RISC Microprocessor"                                                },
+    { EM_UNICORE      , "Microprocessor series from PKU-Unity Ltd. and MPRC of Peking University" },
+    { EM_EXCESS       , "eXcess: 16/32/64-bit configurable embedded CPU"                          },
+    { EM_DXP          , "Icera Semiconductor Inc. Deep Execution Processor"                       },
+    { EM_ALTERA_NIOS2 , "Altera Nios II soft-core processor"                                      },
+    { EM_CRX          , "National Semiconductor CRX"                                              },
+    { EM_XGATE        , "Motorola XGATE embedded processor"                                       },
+    { EM_C166         , "Infineon C16x/XC16x processor"                                           },
+    { EM_M16C         , "Renesas M16C series microprocessors"                                     },
+    { EM_DSPIC30F     , "Microchip Technology dsPIC30F Digital Signal Controller"                 },
+    { EM_CE           , "Freescale Communication Engine RISC core"                                },
+    { EM_M32C         , "Renesas M32C series microprocessors"                                     },
+    { EM_res121       , "Reserved"                                                                },
+    { EM_res122       , "Reserved"                                                                },
+    { EM_res123       , "Reserved"                                                                },
+    { EM_res124       , "Reserved"                                                                },
+    { EM_res125       , "Reserved"                                                                },
+    { EM_res126       , "Reserved"                                                                },
+    { EM_res127       , "Reserved"                                                                },
+    { EM_res128       , "Reserved"                                                                },
+    { EM_res129       , "Reserved"                                                                },
+    { EM_res130       , "Reserved"                                                                },
+    { EM_TSK3000      , "Altium TSK3000 core"                                                     },
+    { EM_RS08         , "Freescale RS08 embedded processor"                                       },
+    { EM_res133       , "Reserved"                                                                },
+    { EM_ECOG2        , "Cyan Technology eCOG2 microprocessor"                                    },
+    { EM_SCORE        , "Sunplus Score"                                                           },
+    { EM_SCORE7       , "Sunplus S+core7 RISC processor"                                          },
+    { EM_DSP24        , "New Japan Radio (NJR) 24-bit DSP Processor"                              },
+    { EM_VIDEOCORE3   , "Broadcom VideoCore III processor"                                        },
+    { EM_LATTICEMICO32, "RISC processor for Lattice FPGA architecture"                            },
+    { EM_SE_C17       , "Seiko Epson C17 family"                                                  },
+    { EM_TI_C6000     , "Texas Instruments TMS320C6000 DSP family"                                },
+    { EM_TI_C2000     , "Texas Instruments TMS320C2000 DSP family"                                },
+    { EM_TI_C5500     , "Texas Instruments TMS320C55x DSP family"                                 },
+    { EM_res143       , "Reserved"                                                                },
+    { EM_res144       , "Reserved"                                                                },
+    { EM_res145       , "Reserved"                                                                },
+    { EM_res146       , "Reserved"                                                                },
+    { EM_res147       , "Reserved"                                                                },
+    { EM_res148       , "Reserved"                                                                },
+    { EM_res149       , "Reserved"                                                                },
+    { EM_res150       , "Reserved"                                                                },
+    { EM_res151       , "Reserved"                                                                },
+    { EM_res152       , "Reserved"                                                                },
+    { EM_res153       , "Reserved"                                                                },
+    { EM_res154       , "Reserved"                                                                },
+    { EM_res155       , "Reserved"                                                                },
+    { EM_res156       , "Reserved"                                                                },
+    { EM_res157       , "Reserved"                                                                },
+    { EM_res158       , "Reserved"                                                                },
+    { EM_res159       , "Reserved"                                                                },
+    { EM_MMDSP_PLUS   , "STMicroelectronics 64bit VLIW Data Signal Processor"                     },
+    { EM_CYPRESS_M8C  , "Cypress M8C microprocessor"                                              },
+    { EM_R32C         , "Renesas R32C series microprocessors"                                     },
+    { EM_TRIMEDIA     , "NXP Semiconductors TriMedia architecture family"                         },
+    { EM_QDSP6        , "QUALCOMM DSP6 Processor"                                                 },
+    { EM_8051         , "Intel 8051 and variants"                                                 },
+    { EM_STXP7X       , "STMicroelectronics STxP7x family"                                        },
+    { EM_NDS32        , "Andes Technology compact code size embedded RISC processor family"       },
+    { EM_ECOG1        , "Cyan Technology eCOG1X family"                                           },
+    { EM_ECOG1X       , "Cyan Technology eCOG1X family"                                           },
+    { EM_MAXQ30       , "Dallas Semiconductor MAXQ30 Core Micro-controllers"                      },
+    { EM_XIMO16       , "New Japan Radio (NJR) 16-bit DSP Processor"                              },
+    { EM_MANIK        , "M2000 Reconfigurable RISC Microprocessor"                                },
+    { EM_CRAYNV2      , "Cray Inc. NV2 vector architecture"                                       },
+    { EM_RX           , "Renesas RX family"                                                       },
+    { EM_METAG        , "Imagination Technologies META processor architecture"                    },
+    { EM_MCST_ELBRUS  , "MCST Elbrus general purpose hardware architecture"                       },
+    { EM_ECOG16       , "Cyan Technology eCOG16 family"                                           },
+    { EM_CR16         , "National Semiconductor CompactRISC 16-bit processor"                     },
+    { EM_ETPU         , "Freescale Extended Time Processing Unit"                                 },
+    { EM_SLE9X        , "Infineon Technologies SLE9X core"                                        },
+    { EM_L1OM         , "Intel L1OM"                                                              },
+    { EM_INTEL181     , "Reserved by Intel"                                                       },
+    { EM_INTEL182     , "Reserved by Intel"                                                       },
+    { EM_res183       , "Reserved by ARM"                                                         },
+    { EM_res184       , "Reserved by ARM"                                                         },
+    { EM_AVR32        , "Atmel Corporation 32-bit microprocessor family"                          },
+    { EM_STM8         , "STMicroeletronics STM8 8-bit microcontroller"                            },
+    { EM_TILE64       , "Tilera TILE64 multicore architecture family"                             },
+    { EM_TILEPRO      , "Tilera TILEPro multicore architecture family"                            },
+    { EM_MICROBLAZE   , "Xilinx MicroBlaze 32-bit RISC soft processor core"                       },
+    { EM_CUDA         , "NVIDIA CUDA architecture "                                               },
+};
+
+
+static struct section_type_table_t {
+    const Elf64_Half key;
+    const char*      str;
+} section_type_table [] = 
+{
+    { SHT_NULL         , "NULL"          },
+    { SHT_PROGBITS     , "PROGBITS"      },
+    { SHT_SYMTAB       , "SYMTAB"        },
+    { SHT_STRTAB       , "STRTAB"        },
+    { SHT_RELA         , "RELA"          },
+    { SHT_HASH         , "HASH"          },
+    { SHT_DYNAMIC      , "DYNAMIC"       },
+    { SHT_NOTE         , "NOTE"          },
+    { SHT_NOBITS       , "NOBITS"        },
+    { SHT_REL          , "REL"           },
+    { SHT_SHLIB        , "SHLIB"         },
+    { SHT_DYNSYM       , "DYNSYM"        },
+    { SHT_INIT_ARRAY   , "INIT_ARRAY"    },
+    { SHT_FINI_ARRAY   , "FINI_ARRAY"    },
+    { SHT_PREINIT_ARRAY, "PREINIT_ARRAY" },
+    { SHT_GROUP        , "GROUP"         },
+    { SHT_SYMTAB_SHNDX , "SYMTAB_SHNDX " },
+};
+
+
+static struct segment_type_table_t {
+    const Elf_Word key;
+    const char*    str;
+} segment_type_table [] = 
+{
+    { PT_NULL   , "NULL"    },
+    { PT_LOAD   , "LOAD"    },
+    { PT_DYNAMIC, "DYNAMIC" },
+    { PT_INTERP , "INTERP"  },
+    { PT_NOTE   , "NOTE"    },
+    { PT_SHLIB  , "SHLIB"   },
+    { PT_PHDR   , "PHDR"    },
+    { PT_TLS    , "TLS"     },
+};
+
+
+static struct segment_flag_table_t {
+    const Elf_Word key;
+    const char*    str;
+} segment_flag_table [] = 
+{
+    { 0, ""    },
+    { 1, "X"   },
+    { 2, "W"   },
+    { 3, "WX"  },
+    { 4, "R"   },
+    { 5, "RX"  },
+    { 6, "RW"  },
+    { 7, "RWX" },
+};
+
+
+static struct symbol_bind_t {
+    const Elf_Word key;
+    const char*    str;
+} symbol_bind_table [] = 
+{
+    { STB_LOCAL   , "LOCAL"    },
+    { STB_GLOBAL  , "GLOBAL"   },
+    { STB_WEAK    , "WEAK"     },
+    { STB_LOOS    , "LOOS"     },
+    { STB_HIOS    , "HIOS"     },
+    { STB_MULTIDEF, "MULTIDEF" },
+    { STB_LOPROC  , "LOPROC"   },
+    { STB_HIPROC  , "HIPROC"   },
+};
+
+
+static struct symbol_type_t {
+    const Elf_Word key;
+    const char*    str;
+} symbol_type_table [] = 
+{
+    { STT_NOTYPE , "NOTYPE"  },
+    { STT_OBJECT , "OBJECT"  },
+    { STT_FUNC   , "FUNC"    },
+    { STT_SECTION, "SECTION" },
+    { STT_FILE   , "FILE"    },
+    { STT_COMMON , "COMMON"  },
+    { STT_TLS    , "TLS"     },
+    { STT_LOOS   , "LOOS"    },
+    { STT_HIOS   , "HIOS"    },
+    { STT_LOPROC , "LOPROC"  },
+    { STT_HIPROC , "HIPROC"  },
+};
+
+
+static struct dynamic_tag_t {
+    const Elf_Word key;
+    const char*    str;
+} dynamic_tag_table [] = 
+{
+    { DT_NULL           , "NULL"            },
+    { DT_NEEDED         , "NEEDED"          },
+    { DT_PLTRELSZ       , "PLTRELSZ"        },
+    { DT_PLTGOT         , "PLTGOT"          },
+    { DT_HASH           , "HASH"            },
+    { DT_STRTAB         , "STRTAB"          },
+    { DT_SYMTAB         , "SYMTAB"          },
+    { DT_RELA           , "RELA"            },
+    { DT_RELASZ         , "RELASZ"          },
+    { DT_RELAENT        , "RELAENT"         },
+    { DT_STRSZ          , "STRSZ"           },
+    { DT_SYMENT         , "SYMENT"          },
+    { DT_INIT           , "INIT"            },
+    { DT_FINI           , "FINI"            },
+    { DT_SONAME         , "SONAME"          },
+    { DT_RPATH          , "RPATH"           },
+    { DT_SYMBOLIC       , "SYMBOLIC"        },
+    { DT_REL            , "REL"             },
+    { DT_RELSZ          , "RELSZ"           },
+    { DT_RELENT         , "RELENT"          },
+    { DT_PLTREL         , "PLTREL"          },
+    { DT_DEBUG          , "DEBUG"           },
+    { DT_TEXTREL        , "TEXTREL"         },
+    { DT_JMPREL         , "JMPREL"          },
+    { DT_BIND_NOW       , "BIND_NOW"        },
+    { DT_INIT_ARRAY     , "INIT_ARRAY"      },
+    { DT_FINI_ARRAY     , "FINI_ARRAY"      },
+    { DT_INIT_ARRAYSZ   , "INIT_ARRAYSZ"    },
+    { DT_FINI_ARRAYSZ   , "FINI_ARRAYSZ"    },
+    { DT_RUNPATH        , "RUNPATH"         },
+    { DT_FLAGS          , "FLAGS"           },
+    { DT_ENCODING       , "ENCODING"        },
+    { DT_PREINIT_ARRAY  , "PREINIT_ARRAY"   },
+    { DT_PREINIT_ARRAYSZ, "PREINIT_ARRAYSZ" },
+    { DT_MAXPOSTAGS     , "MAXPOSTAGS"      },
+};
+
+static const ELFIO::Elf_Xword MAX_DATA_ENTRIES = 64;
+
+//------------------------------------------------------------------------------
+class dump
+{
+#define DUMP_DEC_FORMAT( width ) std::setw(width) << std::setfill( ' ' ) << \
+                                 std::dec << std::right
+#define DUMP_HEX_FORMAT( width ) std::setw(width) << std::setfill( '0' ) << \
+                                 std::hex << std::right
+#define DUMP_STR_FORMAT( width ) std::setw(width) << std::setfill( ' ' ) << \
+                                 std::hex << std::left
+
+  public:
+//------------------------------------------------------------------------------
+    static void
+    header( std::ostream& out, const elfio& reader )
+     {
+	if (!reader.get_header_size()) 
+	 {
+	    return;
+	 }
+	out << "ELF Header"   << std::endl                               << std::endl
+	  << "  Class:      " << str_class( reader.get_class() )         << std::endl
+	  << "  Encoding:   " << str_endian( reader.get_encoding() )     << std::endl
+	  << "  ELFVersion: " << str_version( reader.get_elf_version() ) << std::endl
+	  << "  Type:       " << str_type( reader.get_type() )           << std::endl
+	  << "  Machine:    " << str_machine( reader.get_machine() )     << std::endl
+	  << "  Version:    " << str_version( reader.get_version() )     << std::endl
+	  << "  Entry:      " << "0x" << std::hex << reader.get_entry()  << std::endl
+	  << "  Flags:      " << "0x" << std::hex << reader.get_flags()  << std::endl
+	  << std::endl;
+     }
+
+//------------------------------------------------------------------------------
+    static void
+    section_headers( std::ostream& out, const elfio& reader )
+    {
+        Elf_Half n = reader.sections.size();
+
+        if ( n == 0 ) {
+            return;
+        }
+
+        out << "Section Headers:" << std::endl;
+        if ( reader.get_class() == ELFCLASS32 ) { // Output for 32-bit
+            out << "[  Nr ] Type              Addr     Size     ES Flg Lk Inf Al Name" << std::endl;
+        }
+        else {                                    // Output for 64-bit
+            out << "[  Nr ] Type              Addr             Size             ES   Flg" << std::endl
+                << "        Lk   Inf  Al      Name" << std::endl;
+        }
+            
+        for ( Elf_Half i = 0; i < n; ++i ) { // For all sections
+            section* sec = reader.sections[i];
+            section_header( out, i, sec, reader.get_class() );
+        }
+        
+        out << "Key to Flags: W (write), A (alloc), X (execute)\n\n"
+            << std::endl;
+    }
+
+//------------------------------------------------------------------------------
+    static void
+    section_header( std::ostream& out, Elf_Half no, const section* sec,
+                    unsigned char elf_class )
+    {
+        std::ios_base::fmtflags original_flags = out.flags();
+
+        if ( elf_class == ELFCLASS32 ) { // Output for 32-bit
+            out << "[" 
+                << DUMP_DEC_FORMAT(  5 ) << no
+                << "] "
+                << DUMP_STR_FORMAT( 17 ) << str_section_type( sec->get_type() ) << " "
+                << DUMP_HEX_FORMAT(  8 ) << sec->get_address()                  << " "
+                << DUMP_HEX_FORMAT(  8 ) << sec->get_size()                     << " "
+                << DUMP_HEX_FORMAT(  2 ) << sec->get_entry_size()               << " "
+                << DUMP_STR_FORMAT(  3 ) << section_flags( sec->get_flags() )   << " "
+                << DUMP_HEX_FORMAT(  2 ) << sec->get_link()                     << " "
+                << DUMP_HEX_FORMAT(  3 ) << sec->get_info()                     << " "
+                << DUMP_HEX_FORMAT(  2 ) << sec->get_addr_align()               << " "
+                << DUMP_STR_FORMAT( 17 ) << sec->get_name()                     << " "
+                << std::endl;
+        }
+        else {                           // Output for 64-bit
+            out << "[" 
+                << DUMP_DEC_FORMAT(  5 ) << no
+                << "] "
+                << DUMP_STR_FORMAT( 17 ) << str_section_type( sec->get_type() ) << " "
+                << DUMP_HEX_FORMAT( 16 ) << sec->get_address()                  << " "
+                << DUMP_HEX_FORMAT( 16 ) << sec->get_size()                     << " "
+                << DUMP_HEX_FORMAT(  4 ) << sec->get_entry_size()               << " "
+                << DUMP_STR_FORMAT(  3 ) << section_flags( sec->get_flags() )   << " "
+                << std::endl
+                << "        "
+                << DUMP_HEX_FORMAT(  4 ) << sec->get_link()                     << " "
+                << DUMP_HEX_FORMAT(  4 ) << sec->get_info()                     << " "
+                << DUMP_HEX_FORMAT(  4 ) << sec->get_addr_align()               << "    "
+                << DUMP_STR_FORMAT( 17 ) << sec->get_name()                     << " "
+                << std::endl;
+        }
+
+        out.flags(original_flags);
+
+        return; 
+    }
+
+//------------------------------------------------------------------------------
+    static void
+    segment_headers( std::ostream& out, const elfio& reader )
+    {
+        Elf_Half n = reader.segments.size();
+        if ( n == 0 ) {
+            return;
+        }
+
+        out << "Segment headers:" << std::endl;
+        if ( reader.get_class() == ELFCLASS32 ) { // Output for 32-bit
+            out << "[  Nr ] Type           VirtAddr PhysAddr FileSize Mem.Size Flags    Align"
+                << std::endl;
+        }
+        else {                                    // Output for 64-bit
+            out << "[  Nr ] Type           VirtAddr         PhysAddr         Flags" << std::endl
+                << "                       FileSize         Mem.Size         Align"
+                << std::endl;
+        }
+            
+        for ( Elf_Half i = 0; i < n; ++i ) {
+            segment* seg = reader.segments[i];
+            segment_header( out, i, seg, reader.get_class() );
+        }
+            
+        out << std::endl;
+    }
+
+//------------------------------------------------------------------------------
+    static void
+    segment_header( std::ostream& out, Elf_Half no, const segment* seg,
+                    unsigned int elf_class )
+    {
+        std::ios_base::fmtflags original_flags = out.flags();
+
+        if ( elf_class == ELFCLASS32 ) { // Output for 32-bit
+            out << "[" 
+                << DUMP_DEC_FORMAT(  5 ) << no
+                << "] "
+                << DUMP_STR_FORMAT( 14 ) << str_segment_type( seg->get_type() )  << " "
+                << DUMP_HEX_FORMAT(  8 ) << seg->get_virtual_address()           << " "
+                << DUMP_HEX_FORMAT(  8 ) << seg->get_physical_address()          << " "
+                << DUMP_HEX_FORMAT(  8 ) << seg->get_file_size()                 << " "
+                << DUMP_HEX_FORMAT(  8 ) << seg->get_memory_size()               << " "
+                << DUMP_STR_FORMAT(  8 ) << str_segment_flag( seg->get_flags() ) << " "
+                << DUMP_HEX_FORMAT(  8 ) << seg->get_align()                     << " "
+                << std::endl;
+        }
+        else {                           // Output for 64-bit
+            out << "[" 
+                << DUMP_DEC_FORMAT(  5 ) << no
+                << "] "
+                << DUMP_STR_FORMAT( 14 ) << str_segment_type( seg->get_type() )  << " "
+                << DUMP_HEX_FORMAT( 16 ) << seg->get_virtual_address()           << " "
+                << DUMP_HEX_FORMAT( 16 ) << seg->get_physical_address()          << " "
+                << DUMP_STR_FORMAT( 16 ) << str_segment_flag( seg->get_flags() ) << " "
+                << std::endl
+                << "                       "
+                << DUMP_HEX_FORMAT( 16 ) << seg->get_file_size()                 << " "
+                << DUMP_HEX_FORMAT( 16 ) << seg->get_memory_size()               << " "
+                << DUMP_HEX_FORMAT( 16 ) << seg->get_align()                     << " "
+                << std::endl;
+        }
+
+        out.flags(original_flags);
+    }
+    
+//------------------------------------------------------------------------------
+    static void
+    symbol_tables( std::ostream& out, const elfio& reader )
+    {
+        Elf_Half n = reader.sections.size();
+        for ( Elf_Half i = 0; i < n; ++i ) {    // For all sections
+            section* sec = reader.sections[i];
+            if ( SHT_SYMTAB == sec->get_type() || SHT_DYNSYM == sec->get_type() ) {
+                symbol_section_accessor symbols( reader, sec );
+
+                Elf_Xword     sym_no = symbols.get_symbols_num();
+                if ( sym_no > 0 ) {
+                    out << "Symbol table (" << sec->get_name() << ")" << std::endl;
+                    if ( reader.get_class() == ELFCLASS32 ) { // Output for 32-bit
+                        out << "[  Nr ] Value    Size     Type    Bind      Sect Name"
+                            << std::endl;
+                    }
+                    else {                                    // Output for 64-bit
+                        out << "[  Nr ] Value            Size             Type    Bind      Sect" << std::endl
+                            << "        Name"
+                            << std::endl;
+                    }
+                    for ( Elf_Half i = 0; i < sym_no; ++i ) {
+                        std::string   name;
+                        Elf64_Addr    value   = 0;
+                        Elf_Xword     size    = 0;
+                        unsigned char bind    = 0;
+                        unsigned char type    = 0;
+                        Elf_Half      section = 0;
+                        unsigned char other   = 0;
+                        symbols.get_symbol( i, name, value, size, bind, type, section, other );
+                        symbol_table( out, i, name, value, size, bind, type, section, reader.get_class() );
+                    }
+
+                    out << std::endl;
+                }
+            }
+        }
+    }
+    
+//------------------------------------------------------------------------------
+    static void
+    symbol_table( std::ostream& out,
+                  Elf_Half      no,
+                  std::string&  name,
+                  Elf64_Addr    value,
+                  Elf_Xword     size,
+                  unsigned char bind,
+                  unsigned char type,
+                  Elf_Half      section,
+                  unsigned int  elf_class )
+    {
+        std::ios_base::fmtflags original_flags = out.flags();
+
+        if ( elf_class == ELFCLASS32 ) { // Output for 32-bit
+            out << "[" 
+                << DUMP_DEC_FORMAT(  5 ) << no
+                << "] "
+                << DUMP_HEX_FORMAT(  8 ) << value                   << " "
+                << DUMP_HEX_FORMAT(  8 ) << size                    << " "
+                << DUMP_STR_FORMAT(  7 ) << str_symbol_type( type ) << " "
+                << DUMP_STR_FORMAT(  8 ) << str_symbol_bind( bind ) << " "
+                << DUMP_DEC_FORMAT(  5 ) << section                 << " "
+                << DUMP_STR_FORMAT(  1 ) << name                    << " "
+                << std::endl;
+        }
+        else {                           // Output for 64-bit
+            out << "[" 
+                << DUMP_DEC_FORMAT(  5 ) << no
+                << "] "
+                << DUMP_HEX_FORMAT( 16 ) << value                   << " "
+                << DUMP_HEX_FORMAT( 16 ) << size                    << " "
+                << DUMP_STR_FORMAT(  7 ) << str_symbol_type( type ) << " "
+                << DUMP_STR_FORMAT(  8 ) << str_symbol_bind( bind ) << " "
+                << DUMP_DEC_FORMAT(  5 ) << section                 << " "
+                << std::endl
+                << "        "
+                << DUMP_STR_FORMAT(  1 ) << name                    << " "
+                << std::endl;
+        }
+
+        out.flags(original_flags);
+    }
+    
+//------------------------------------------------------------------------------
+    static void
+    notes( std::ostream& out, const elfio& reader )
+    {
+        Elf_Half no = reader.sections.size();
+        for ( Elf_Half i = 0; i < no; ++i ) {                 // For all sections
+            section* sec = reader.sections[i];
+            if ( SHT_NOTE == sec->get_type() ) {              // Look at notes
+                note_section_accessor notes( reader, sec );
+                int no_notes = notes.get_notes_num();
+                if ( no > 0 ) {
+                    out << "Note section (" << sec->get_name() << ")" << std::endl
+                        << "    No Type     Name"
+                        << std::endl;
+                    for ( int j = 0; j < no_notes; ++j ) {    // For all notes
+                        Elf_Word    type;
+                        std::string name;
+                        void*       desc;
+                        Elf_Word    descsz;
+                    
+                        if ( notes.get_note(j, type, name, desc, descsz) ) {
+                            // 'name' usually contains \0 at the end. Try to fix it
+                            name = name.c_str();
+                            note( out, j, type, name );
+                        }
+                    }
+                    
+                    out << std::endl;
+                }
+            }
+        }
+    }
+
+//------------------------------------------------------------------------------
+    static void
+    note( std::ostream&      out,
+          int                no,
+          Elf_Word           type,
+          const std::string& name )
+    {
+        out << "  [" 
+            << DUMP_DEC_FORMAT( 2 ) << no
+            << "] "
+            << DUMP_HEX_FORMAT( 8 ) << type << " "
+            << DUMP_STR_FORMAT( 1 ) << name
+            << std::endl;
+    }
+    
+//------------------------------------------------------------------------------
+    static void
+    dynamic_tags( std::ostream& out, const elfio& reader )
+    {
+        Elf_Half n = reader.sections.size();
+        for ( Elf_Half i = 0; i < n; ++i ) {    // For all sections
+            section* sec = reader.sections[i];
+            if ( SHT_DYNAMIC == sec->get_type() ) {
+                dynamic_section_accessor dynamic( reader, sec );
+
+                Elf_Xword dyn_no = dynamic.get_entries_num();
+                if ( dyn_no > 0 ) {
+                    out << "Dynamic section (" << sec->get_name() << ")" << std::endl;
+                    out << "[  Nr ] Tag              Name/Value" << std::endl;
+                    for ( Elf_Xword i = 0; i < dyn_no; ++i ) {
+                        Elf_Xword   tag   = 0;
+                        Elf_Xword   value = 0;
+                        std::string str;
+                        dynamic.get_entry( i, tag, value, str );
+                        dynamic_tag( out, i, tag, value, str, reader.get_class() );
+                        if ( DT_NULL == tag ) {
+                            break;
+                        }
+                    }
+
+                    out << std::endl;
+                }
+            }
+        }
+    }
+    
+//------------------------------------------------------------------------------
+    static void
+    dynamic_tag( std::ostream& out,
+                 int           no,
+                 Elf_Xword     tag,
+                 Elf_Xword     value,
+                 std::string   str,
+                 unsigned int  /*elf_class*/ )
+    {
+            out << "[" 
+                << DUMP_DEC_FORMAT(  5 ) << no
+                << "] "
+                << DUMP_STR_FORMAT( 16 ) << str_dynamic_tag( tag ) << " ";
+            if ( str.empty() ) {
+                out << DUMP_HEX_FORMAT( 16 ) << value                  << " ";
+            }
+            else {
+                out << DUMP_STR_FORMAT( 32 ) << str                    << " ";
+            }
+            out << std::endl;
+    }
+
+//------------------------------------------------------------------------------
+    static void
+    section_data( std::ostream& out, const section* sec )
+    {
+        std::ios_base::fmtflags original_flags = out.flags();
+
+        out << sec->get_name() << std::endl;
+        const char* pdata = sec->get_data();
+        if ( pdata ){
+            ELFIO::Elf_Xword i;
+            for ( i = 0; i < std::min( sec->get_size(), MAX_DATA_ENTRIES ); ++i ) {
+                if ( i % 16 == 0 ) {
+                    out << "[" <<  DUMP_HEX_FORMAT( 8 ) << i << "]";
+                }
+
+                out << " " << DUMP_HEX_FORMAT( 2 ) << ( pdata[i] & 0x000000FF );
+
+                if ( i % 16 == 15 ) {
+                    out << std::endl;
+                }
+            }
+            if ( i % 16 != 0 ) {
+                out << std::endl;
+            }
+
+            out.flags(original_flags);
+        }
+
+        return; 
+    }
+
+//------------------------------------------------------------------------------
+    static void
+    section_datas( std::ostream& out, const elfio& reader )
+    {
+        Elf_Half n = reader.sections.size();
+
+        if ( n == 0 ) {
+            return;
+        }
+
+        out << "Section Data:" << std::endl;
+
+        for ( Elf_Half i = 1; i < n; ++i ) { // For all sections
+            section* sec = reader.sections[i];
+            if ( sec->get_type() == SHT_NOBITS ) {
+                continue;
+            }
+            section_data( out, sec );
+        }
+
+        out << std::endl;
+    }
+
+//------------------------------------------------------------------------------
+    static void
+    segment_data( std::ostream& out, Elf_Half no, const segment* seg )
+    {
+        std::ios_base::fmtflags original_flags = out.flags();
+
+        out << "Segment # " << no << std::endl;
+        const char* pdata = seg->get_data();
+        if ( pdata ) {
+            ELFIO::Elf_Xword i;
+            for ( i = 0; i < std::min( seg->get_file_size(), MAX_DATA_ENTRIES ); ++i ) {
+                if ( i % 16 == 0 ) {
+                    out << "[" <<  DUMP_HEX_FORMAT( 8 ) << i << "]";
+                }
+
+                out << " " << DUMP_HEX_FORMAT( 2 ) << ( pdata[i] & 0x000000FF );
+
+                if ( i % 16 == 15 ) {
+                    out << std::endl;
+                }
+            }
+            if ( i % 16 != 0 ) {
+                out << std::endl;
+            }
+
+            out.flags(original_flags);
+        }
+
+        return; 
+    }
+
+//------------------------------------------------------------------------------
+    static void
+    segment_datas( std::ostream& out, const elfio& reader )
+    {
+        Elf_Half n = reader.segments.size();
+
+        if ( n == 0 ) {
+            return;
+        }
+
+        out << "Segment Data:" << std::endl;
+
+        for ( Elf_Half i = 0; i < n; ++i ) { // For all sections
+            segment* seg = reader.segments[i];
+            segment_data( out, i, seg );
+        }
+
+        out << std::endl;
+    }
+    
+  private:
+//------------------------------------------------------------------------------
+    template< typename T, typename K >
+    std::string
+    static
+    find_value_in_table( const T& table, const K& key )
+    {
+        std::string res = "?";
+        for ( unsigned int i = 0; i < sizeof( table )/sizeof( table[0] ); ++i ) {
+            if ( table[i].key == key ) {
+                res = table[i].str;
+                break;
+            }
+        }
+
+        return res;
+    }
+
+
+//------------------------------------------------------------------------------
+    template< typename T, typename K >
+    static
+    std::string
+    format_assoc( const T& table, const K& key )
+    {
+        std::string str = find_value_in_table( table, key );
+        if ( str == "?" ) {
+            std::ostringstream oss;
+            oss << str << " (0x" << std::hex << key << ")";
+            str = oss.str();
+        }
+
+        return str;
+    }
+
+
+//------------------------------------------------------------------------------
+    template< typename T >
+    static
+    std::string
+    format_assoc( const T& table, const char key )
+    {
+        return format_assoc( table, (const int)key );
+    }
+
+    
+//------------------------------------------------------------------------------
+    static
+    std::string
+    section_flags( Elf_Xword flags )
+    {
+        std::string ret = "";
+        if ( flags & SHF_WRITE ) {
+            ret += "W";
+        }
+        if ( flags & SHF_ALLOC ) {
+            ret += "A";
+        }
+        if ( flags & SHF_EXECINSTR ) {
+            ret += "X";
+        }
+
+        return ret;
+    }
+
+
+//------------------------------------------------------------------------------
+#define STR_FUNC_TABLE( name )                    \
+    template< typename T >                        \
+    static                                        \
+    std::string                                   \
+    str_##name( const T key )                     \
+    {                                             \
+        return format_assoc( name##_table, key ); \
+    }
+
+    STR_FUNC_TABLE( class )
+    STR_FUNC_TABLE( endian )
+    STR_FUNC_TABLE( version )
+    STR_FUNC_TABLE( type )
+    STR_FUNC_TABLE( machine )
+    STR_FUNC_TABLE( section_type )
+    STR_FUNC_TABLE( segment_type )
+    STR_FUNC_TABLE( segment_flag )
+    STR_FUNC_TABLE( symbol_bind )
+    STR_FUNC_TABLE( symbol_type )
+    STR_FUNC_TABLE( dynamic_tag )
+
+#undef STR_FUNC_TABLE
+#undef DUMP_DEC_FORMAT
+#undef DUMP_HEX_FORMAT
+#undef DUMP_STR_FORMAT
+}; // class dump
+    
+
+}; // namespace ELFIO
+
+#endif // ELFIO_DUMP_HPP
diff --git a/third_party/elfio/elfio_dynamic.hpp b/third_party/elfio/elfio_dynamic.hpp
new file mode 100644
index 00000000000..64f13b9ce7a
--- /dev/null
+++ b/third_party/elfio/elfio_dynamic.hpp
@@ -0,0 +1,257 @@
+/*
+Copyright (C) 2001-2015 by Serge Lamikhov-Center
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef ELFIO_DYNAMIC_HPP
+#define ELFIO_DYNAMIC_HPP
+
+namespace ELFIO {
+
+//------------------------------------------------------------------------------
+template< class S >
+class dynamic_section_accessor_template
+{
+  public:
+//------------------------------------------------------------------------------
+    dynamic_section_accessor_template( const elfio& elf_file_, S* section_ ) :
+                                       elf_file( elf_file_ ),
+                                       dynamic_section( section_ )
+    {
+    }
+
+//------------------------------------------------------------------------------
+    Elf_Xword
+    get_entries_num() const
+    {
+        Elf_Xword nRet = 0;
+
+        if ( 0 != dynamic_section->get_entry_size() ) {
+            nRet = dynamic_section->get_size() / dynamic_section->get_entry_size();
+        }
+
+        return nRet;
+    }
+
+//------------------------------------------------------------------------------
+    bool
+    get_entry( Elf_Xword    index,
+               Elf_Xword&   tag,
+               Elf_Xword&   value,
+               std::string& str ) const
+    {
+        if ( index >= get_entries_num() ) {    // Is index valid
+            return false;
+        }
+
+        if ( elf_file.get_class() == ELFCLASS32 ) {
+            generic_get_entry_dyn< Elf32_Dyn >( index, tag, value );
+        }
+        else {
+            generic_get_entry_dyn< Elf64_Dyn >( index, tag, value );
+        }
+
+        // If the tag may have a string table reference, prepare the string
+        if ( tag == DT_NEEDED ||
+             tag == DT_SONAME ||
+             tag == DT_RPATH  ||
+             tag == DT_RUNPATH ) {
+            string_section_accessor strsec =
+                elf_file.sections[ get_string_table_index() ];
+            const char* result = strsec.get_string( value );
+            if ( 0 == result ) {
+                str.clear();
+                return false;
+            }
+            str = result;
+        }
+        else {
+            str.clear();
+        }
+
+        return true;
+    }
+
+//------------------------------------------------------------------------------
+    void
+    add_entry( Elf_Xword& tag,
+               Elf_Xword& value )
+    {
+        if ( elf_file.get_class() == ELFCLASS32 ) {
+            generic_add_entry< Elf32_Dyn >( tag, value );
+        }
+        else {
+            generic_add_entry< Elf64_Dyn >( tag, value );
+        }
+    }
+
+//------------------------------------------------------------------------------
+    void
+    add_entry( Elf_Xword&   tag,
+               std::string& str )
+    {
+        string_section_accessor strsec =
+            elf_file.sections[ get_string_table_index() ];
+        Elf_Xword value = strsec.add_string( str );
+        add_entry( tag, value );
+    }
+
+//------------------------------------------------------------------------------
+  private:
+//------------------------------------------------------------------------------
+    Elf_Half
+    get_string_table_index() const
+    {
+        return (Elf_Half)dynamic_section->get_link();
+    }
+
+//------------------------------------------------------------------------------
+    template< class T >
+    void
+    generic_get_entry_dyn( Elf_Xword  index,
+                           Elf_Xword& tag,
+                           Elf_Xword& value ) const
+    {
+        const endianess_convertor& convertor = elf_file.get_convertor();
+
+        // Check unusual case when dynamic section has no data
+        if( dynamic_section->get_data() == 0 ||
+            ( index + 1 ) * dynamic_section->get_entry_size() > dynamic_section->get_size() ) {
+            tag   = DT_NULL;
+            value = 0;
+            return;
+        }
+
+        const T* pEntry = reinterpret_cast<const T*>(
+                dynamic_section->get_data() +
+                index * dynamic_section->get_entry_size() );
+        tag = convertor( pEntry->d_tag );
+        switch ( tag ) {
+        case DT_NULL:
+        case DT_SYMBOLIC:
+        case DT_TEXTREL:
+        case DT_BIND_NOW:
+            value = 0;
+            break;
+        case DT_NEEDED:
+        case DT_PLTRELSZ:
+        case DT_RELASZ:
+        case DT_RELAENT:
+        case DT_STRSZ:
+        case DT_SYMENT:
+        case DT_SONAME:
+        case DT_RPATH:
+        case DT_RELSZ:
+        case DT_RELENT:
+        case DT_PLTREL:
+        case DT_INIT_ARRAYSZ:
+        case DT_FINI_ARRAYSZ:
+        case DT_RUNPATH:
+        case DT_FLAGS:
+        case DT_PREINIT_ARRAYSZ:
+            value = convertor( pEntry->d_un.d_val );
+            break;
+        case DT_PLTGOT:
+        case DT_HASH:
+        case DT_STRTAB:
+        case DT_SYMTAB:
+        case DT_RELA:
+        case DT_INIT:
+        case DT_FINI:
+        case DT_REL:
+        case DT_DEBUG:
+        case DT_JMPREL:
+        case DT_INIT_ARRAY:
+        case DT_FINI_ARRAY:
+        case DT_PREINIT_ARRAY:
+        default:
+            value = convertor( pEntry->d_un.d_ptr );
+            break;
+        }
+    }
+
+//------------------------------------------------------------------------------
+    template< class T >
+    void
+    generic_add_entry( Elf_Xword tag, Elf_Xword value )
+    {
+        const endianess_convertor& convertor = elf_file.get_convertor();
+
+        T entry;
+
+        switch ( tag ) {
+        case DT_NULL:
+        case DT_SYMBOLIC:
+        case DT_TEXTREL:
+        case DT_BIND_NOW:
+            value = 0;
+        case DT_NEEDED:
+        case DT_PLTRELSZ:
+        case DT_RELASZ:
+        case DT_RELAENT:
+        case DT_STRSZ:
+        case DT_SYMENT:
+        case DT_SONAME:
+        case DT_RPATH:
+        case DT_RELSZ:
+        case DT_RELENT:
+        case DT_PLTREL:
+        case DT_INIT_ARRAYSZ:
+        case DT_FINI_ARRAYSZ:
+        case DT_RUNPATH:
+        case DT_FLAGS:
+        case DT_PREINIT_ARRAYSZ:
+            entry.d_un.d_val = convertor( value );
+            break;
+        case DT_PLTGOT:
+        case DT_HASH:
+        case DT_STRTAB:
+        case DT_SYMTAB:
+        case DT_RELA:
+        case DT_INIT:
+        case DT_FINI:
+        case DT_REL:
+        case DT_DEBUG:
+        case DT_JMPREL:
+        case DT_INIT_ARRAY:
+        case DT_FINI_ARRAY:
+        case DT_PREINIT_ARRAY:
+        default:
+            entry.d_un.d_ptr = convertor( value );
+            break;
+        }
+
+        entry.d_tag = convertor( tag );
+
+        dynamic_section->append_data( reinterpret_cast<char*>( &entry ), sizeof( entry ) );
+    }
+
+//------------------------------------------------------------------------------
+  private:
+    const elfio& elf_file;
+    S*           dynamic_section;
+};
+
+using dynamic_section_accessor = dynamic_section_accessor_template<section>;
+using const_dynamic_section_accessor = dynamic_section_accessor_template<const section>;
+
+} // namespace ELFIO
+
+#endif // ELFIO_DYNAMIC_HPP
diff --git a/third_party/elfio/elfio_header.hpp b/third_party/elfio/elfio_header.hpp
new file mode 100644
index 00000000000..e8713cd7894
--- /dev/null
+++ b/third_party/elfio/elfio_header.hpp
@@ -0,0 +1,146 @@
+/*
+Copyright (C) 2001-2015 by Serge Lamikhov-Center
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef ELF_HEADER_HPP
+#define ELF_HEADER_HPP
+
+#include <iostream>
+
+namespace ELFIO {
+
+class elf_header
+{
+  public:
+    virtual ~elf_header() {};
+    virtual bool load( std::istream& stream )       = 0;
+    virtual bool save( std::ostream& stream ) const = 0;
+
+    // ELF header functions
+    ELFIO_GET_ACCESS_DECL( unsigned char, class              );
+    ELFIO_GET_ACCESS_DECL( unsigned char, elf_version        );
+    ELFIO_GET_ACCESS_DECL( unsigned char, encoding           );
+    ELFIO_GET_ACCESS_DECL( Elf_Half,      header_size        );
+    ELFIO_GET_ACCESS_DECL( Elf_Half,      section_entry_size );
+    ELFIO_GET_ACCESS_DECL( Elf_Half,      segment_entry_size );
+
+    ELFIO_GET_SET_ACCESS_DECL( Elf_Word,      version         );
+    ELFIO_GET_SET_ACCESS_DECL( unsigned char, os_abi          );
+    ELFIO_GET_SET_ACCESS_DECL( unsigned char, abi_version     );
+    ELFIO_GET_SET_ACCESS_DECL( Elf_Half,      type            );
+    ELFIO_GET_SET_ACCESS_DECL( Elf_Half,      machine         );
+    ELFIO_GET_SET_ACCESS_DECL( Elf_Word,      flags           );
+    ELFIO_GET_SET_ACCESS_DECL( Elf64_Addr,    entry           );
+    ELFIO_GET_SET_ACCESS_DECL( Elf_Half,      sections_num    );
+    ELFIO_GET_SET_ACCESS_DECL( Elf64_Off,     sections_offset );
+    ELFIO_GET_SET_ACCESS_DECL( Elf_Half,      segments_num    );
+    ELFIO_GET_SET_ACCESS_DECL( Elf64_Off,     segments_offset );
+    ELFIO_GET_SET_ACCESS_DECL( Elf_Half,      section_name_str_index );
+};
+
+
+template< class T > struct elf_header_impl_types;
+template<> struct elf_header_impl_types<Elf32_Ehdr> {
+    typedef Elf32_Phdr Phdr_type;
+    typedef Elf32_Shdr Shdr_type;
+    static const unsigned char file_class = ELFCLASS32;
+};
+template<> struct elf_header_impl_types<Elf64_Ehdr> {
+    typedef Elf64_Phdr Phdr_type;
+    typedef Elf64_Shdr Shdr_type;
+    static const unsigned char file_class = ELFCLASS64;
+};
+
+template< class T > class elf_header_impl : public elf_header
+{
+  public:
+    elf_header_impl( endianess_convertor* convertor_,
+                     unsigned char encoding )
+    {
+        convertor = convertor_;
+
+        std::fill_n( reinterpret_cast<char*>( &header ), sizeof( header ), '\0' );
+
+        header.e_ident[EI_MAG0]    = ELFMAG0;
+        header.e_ident[EI_MAG1]    = ELFMAG1;
+        header.e_ident[EI_MAG2]    = ELFMAG2;
+        header.e_ident[EI_MAG3]    = ELFMAG3;
+        header.e_ident[EI_CLASS]   = elf_header_impl_types<T>::file_class;
+        header.e_ident[EI_DATA]    = encoding;
+        header.e_ident[EI_VERSION] = EV_CURRENT;
+        header.e_ehsize            = ( sizeof( header ) );
+        header.e_ehsize            = (*convertor)( header.e_ehsize );
+        header.e_shstrndx          = (*convertor)( (Elf_Half)1 );
+        header.e_phentsize         = sizeof( typename elf_header_impl_types<T>::Phdr_type );
+        header.e_shentsize         = sizeof( typename elf_header_impl_types<T>::Shdr_type );
+        header.e_phentsize         = (*convertor)( header.e_phentsize );
+        header.e_shentsize         = (*convertor)( header.e_shentsize );
+
+		set_version( EV_CURRENT );
+    }
+
+    bool
+    load( std::istream& stream )
+    {
+        stream.seekg( 0 );
+        stream.read( reinterpret_cast<char*>( &header ), sizeof( header ) );
+
+        return (stream.gcount() == sizeof( header ) );
+    }
+
+    bool
+    save( std::ostream& stream ) const
+    {
+        stream.seekp( 0 );
+        stream.write( reinterpret_cast<const char*>( &header ), sizeof( header ) );
+
+        return stream.good();
+    }
+
+    // ELF header functions
+    ELFIO_GET_ACCESS( unsigned char, class,              header.e_ident[EI_CLASS] );
+    ELFIO_GET_ACCESS( unsigned char, elf_version,        header.e_ident[EI_VERSION] );
+    ELFIO_GET_ACCESS( unsigned char, encoding,           header.e_ident[EI_DATA] );
+    ELFIO_GET_ACCESS( Elf_Half,      header_size,        header.e_ehsize );
+    ELFIO_GET_ACCESS( Elf_Half,      section_entry_size, header.e_shentsize );
+    ELFIO_GET_ACCESS( Elf_Half,      segment_entry_size, header.e_phentsize );
+
+    ELFIO_GET_SET_ACCESS( Elf_Word,      version,         header.e_version);
+    ELFIO_GET_SET_ACCESS( unsigned char, os_abi,          header.e_ident[EI_OSABI] );
+    ELFIO_GET_SET_ACCESS( unsigned char, abi_version,     header.e_ident[EI_ABIVERSION] );
+    ELFIO_GET_SET_ACCESS( Elf_Half,      type,            header.e_type );
+    ELFIO_GET_SET_ACCESS( Elf_Half,      machine,         header.e_machine );
+    ELFIO_GET_SET_ACCESS( Elf_Word,      flags,           header.e_flags );
+    ELFIO_GET_SET_ACCESS( Elf_Half,      section_name_str_index, header.e_shstrndx );
+    ELFIO_GET_SET_ACCESS( Elf64_Addr,    entry,           header.e_entry );
+    ELFIO_GET_SET_ACCESS( Elf_Half,      sections_num,    header.e_shnum );
+    ELFIO_GET_SET_ACCESS( Elf64_Off,     sections_offset, header.e_shoff );
+    ELFIO_GET_SET_ACCESS( Elf_Half,      segments_num,    header.e_phnum );
+    ELFIO_GET_SET_ACCESS( Elf64_Off,     segments_offset, header.e_phoff );
+
+  private:
+    T header;
+    endianess_convertor* convertor;
+};
+
+} // namespace ELFIO
+
+#endif // ELF_HEADER_HPP
diff --git a/third_party/elfio/elfio_note.hpp b/third_party/elfio/elfio_note.hpp
new file mode 100644
index 00000000000..8619c7385db
--- /dev/null
+++ b/third_party/elfio/elfio_note.hpp
@@ -0,0 +1,170 @@
+/*
+Copyright (C) 2001-2015 by Serge Lamikhov-Center
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef ELFIO_NOTE_HPP
+#define ELFIO_NOTE_HPP
+
+namespace ELFIO {
+
+//------------------------------------------------------------------------------
+// There are discrepancies in documentations. SCO documentation
+// (http://www.sco.com/developers/gabi/latest/ch5.pheader.html#note_section)
+// requires 8 byte entries alignment for 64-bit ELF file,
+// but Oracle's definition uses the same structure
+// for 32-bit and 64-bit formats.
+// (https://docs.oracle.com/cd/E23824_01/html/819-0690/chapter6-18048.html)
+//
+// It looks like EM_X86_64 Linux implementation is similar to Oracle's
+// definition. Therefore, the same alignment works for both formats
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+template< class S >
+class note_section_accessor_template
+{
+  public:
+//------------------------------------------------------------------------------
+    note_section_accessor_template( const elfio& elf_file_, S* section_ ) :
+                                    elf_file( elf_file_ ), note_section( section_ )
+    {
+        process_section();
+    }
+
+//------------------------------------------------------------------------------
+    Elf_Word
+    get_notes_num() const
+    {
+        return (Elf_Word)note_start_positions.size();
+    }
+
+//------------------------------------------------------------------------------
+    bool
+    get_note( Elf_Word     index,
+              Elf_Word&    type,
+              std::string& name,
+              void*&       desc,
+              Elf_Word&    descSize ) const
+    {
+        if ( index >= note_section->get_size() ) {
+            return false;
+        }
+
+        const char* pData = note_section->get_data() + note_start_positions[index];
+        int align = sizeof( Elf_Word );
+
+        const endianess_convertor& convertor = elf_file.get_convertor();
+        type = convertor( *(const Elf_Word*)( pData + 2*align ) );
+        Elf_Word namesz = convertor( *(const Elf_Word*)( pData ) );
+        descSize = convertor( *(const Elf_Word*)( pData + sizeof( namesz ) ) );
+        Elf_Xword max_name_size = note_section->get_size() - note_start_positions[index];
+        if ( namesz            > max_name_size ||
+             namesz + descSize > max_name_size ) {
+            return false;
+        }
+        name.assign( pData + 3*align, namesz - 1);
+        if ( 0 == descSize ) {
+            desc = 0;
+        }
+        else {
+            desc = const_cast<char*> ( pData + 3*align +
+                                       ( ( namesz + align - 1 )/align )*align );
+        }
+
+        return true;
+    }
+
+//------------------------------------------------------------------------------
+    void add_note( Elf_Word           type,
+                   const std::string& name,
+                   const void*        desc,
+                   Elf_Word           descSize )
+    {
+        const endianess_convertor& convertor = elf_file.get_convertor();
+
+        int align            = sizeof( Elf_Word );
+        Elf_Word nameLen     = (Elf_Word)name.size() + 1;
+        Elf_Word nameLenConv = convertor( nameLen );
+        std::string buffer( reinterpret_cast<char*>( &nameLenConv ), align );
+        Elf_Word descSizeConv = convertor( descSize );
+        buffer.append( reinterpret_cast<char*>( &descSizeConv ), align );
+        type = convertor( type );
+        buffer.append( reinterpret_cast<char*>( &type ), align );
+        buffer.append( name );
+        buffer.append( 1, '\x00' );
+        const char pad[] = { '\0', '\0', '\0', '\0' };
+        if ( nameLen % align != 0 ) {
+            buffer.append( pad, align - nameLen % align );
+        }
+        if ( desc != 0 && descSize != 0 ) {
+            buffer.append( reinterpret_cast<const char*>( desc ), descSize );
+            if ( descSize % align != 0 ) {
+                buffer.append( pad, align - descSize % align );
+            }
+        }
+
+        note_start_positions.push_back( note_section->get_size() );
+        note_section->append_data( buffer );
+    }
+
+  private:
+//------------------------------------------------------------------------------
+    void process_section()
+    {
+        const endianess_convertor& convertor = elf_file.get_convertor();
+        const char* data                     = note_section->get_data();
+        Elf_Xword   size                     = note_section->get_size();
+        Elf_Xword   current                  = 0;
+
+        note_start_positions.clear();
+
+        // Is it empty?
+        if ( 0 == data || 0 == size ) {
+            return;
+        }
+
+        int align = sizeof( Elf_Word );
+        while ( current + 3*align <= size ) {
+            note_start_positions.push_back( current );
+            Elf_Word namesz = convertor(
+                            *(const Elf_Word*)( data + current ) );
+            Elf_Word descsz = convertor(
+                            *(const Elf_Word*)( data + current + sizeof( namesz ) ) );
+
+            current += 3*sizeof( Elf_Word ) +
+                       ( ( namesz + align - 1 ) / align ) * align +
+                       ( ( descsz + align - 1 ) / align ) * align;
+        }
+    }
+
+//------------------------------------------------------------------------------
+  private:
+    const elfio&           elf_file;
+    S*                     note_section;
+    std::vector<Elf_Xword> note_start_positions;
+};
+
+using note_section_accessor = note_section_accessor_template<section>;
+using const_note_section_accessor = note_section_accessor_template<const section>;
+
+} // namespace ELFIO
+
+#endif // ELFIO_NOTE_HPP
diff --git a/third_party/elfio/elfio_relocation.hpp b/third_party/elfio/elfio_relocation.hpp
new file mode 100644
index 00000000000..238598e97ba
--- /dev/null
+++ b/third_party/elfio/elfio_relocation.hpp
@@ -0,0 +1,373 @@
+/*
+Copyright (C) 2001-2015 by Serge Lamikhov-Center
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef ELFIO_RELOCATION_HPP
+#define ELFIO_RELOCATION_HPP
+
+namespace ELFIO {
+
+template<typename T> struct get_sym_and_type;
+template<> struct get_sym_and_type< Elf32_Rel >
+{
+    static int get_r_sym( Elf_Xword info )
+    {
+        return ELF32_R_SYM( (Elf_Word)info );
+    }
+    static int get_r_type( Elf_Xword info )
+    {
+        return ELF32_R_TYPE( (Elf_Word)info );
+    }
+};
+template<> struct get_sym_and_type< Elf32_Rela >
+{
+    static int get_r_sym( Elf_Xword info )
+    {
+        return ELF32_R_SYM( (Elf_Word)info );
+    }
+    static int get_r_type( Elf_Xword info )
+    {
+        return ELF32_R_TYPE( (Elf_Word)info );
+    }
+};
+template<> struct get_sym_and_type< Elf64_Rel >
+{
+    static int get_r_sym( Elf_Xword info )
+    {
+        return ELF64_R_SYM( info );
+    }
+    static int get_r_type( Elf_Xword info )
+    {
+        return ELF64_R_TYPE( info );
+    }
+};
+template<> struct get_sym_and_type< Elf64_Rela >
+{
+    static int get_r_sym( Elf_Xword info )
+    {
+        return ELF64_R_SYM( info );
+    }
+    static int get_r_type( Elf_Xword info )
+    {
+        return ELF64_R_TYPE( info );
+    }
+};
+
+
+//------------------------------------------------------------------------------
+template< class S >
+class relocation_section_accessor_template
+{
+  public:
+//------------------------------------------------------------------------------
+    relocation_section_accessor_template( const elfio& elf_file_, S* section_ ) :
+                                          elf_file( elf_file_ ),
+                                          relocation_section( section_ )
+    {
+    }
+
+//------------------------------------------------------------------------------
+    Elf_Xword
+    get_entries_num() const
+    {
+        Elf_Xword nRet = 0;
+
+        if ( 0 != relocation_section->get_entry_size() ) {
+            nRet = relocation_section->get_size() / relocation_section->get_entry_size();
+        }
+
+        return nRet;
+    }
+
+//------------------------------------------------------------------------------
+    bool
+    get_entry( Elf_Xword   index,
+               Elf64_Addr& offset,
+               Elf_Word&   symbol,
+               Elf_Word&   type,
+               Elf_Sxword& addend ) const
+    {
+        if ( index >= get_entries_num() ) {    // Is index valid
+            return false;
+        }
+
+        if ( elf_file.get_class() == ELFCLASS32 ) {
+            if ( SHT_REL == relocation_section->get_type() ) {
+                generic_get_entry_rel< Elf32_Rel >( index, offset, symbol,
+                                                    type,  addend );
+            }
+            else if ( SHT_RELA == relocation_section->get_type() ) {
+                generic_get_entry_rela< Elf32_Rela >( index, offset, symbol,
+                                                      type,  addend );
+            }
+        }
+        else {
+            if ( SHT_REL == relocation_section->get_type() ) {
+                generic_get_entry_rel< Elf64_Rel >( index, offset, symbol,
+                                                    type,  addend );
+            }
+            else if ( SHT_RELA == relocation_section->get_type() ) {
+                generic_get_entry_rela< Elf64_Rela >( index, offset, symbol,
+                                                      type,  addend );
+            }
+        }
+
+        return true;
+    }
+
+//------------------------------------------------------------------------------
+    bool
+    get_entry( Elf_Xword    index,
+               Elf64_Addr&  offset,
+               Elf64_Addr&  symbolValue,
+               std::string& symbolName,
+               Elf_Word&    type,
+               Elf_Sxword&  addend,
+               Elf_Sxword&  calcValue ) const
+    {
+        // Do regular job
+        Elf_Word symbol;
+        bool ret = get_entry( index, offset, symbol, type, addend );
+
+        // Find the symbol
+        Elf_Xword     size;
+        unsigned char bind;
+        unsigned char symbolType;
+        Elf_Half      section;
+        unsigned char other;
+
+        symbol_section_accessor symbols( elf_file, elf_file.sections[get_symbol_table_index()] );
+        ret = ret && symbols.get_symbol( symbol, symbolName, symbolValue,
+                                         size, bind, symbolType, section, other );
+
+        if ( ret ) { // Was it successful?
+            switch ( type ) {
+            case R_386_NONE:        // none
+                calcValue = 0;
+                break;
+            case R_386_32:          // S + A
+                calcValue = symbolValue + addend;
+                break;
+            case R_386_PC32:        // S + A - P
+                calcValue = symbolValue + addend - offset;
+                break;
+            case R_386_GOT32:       // G + A - P
+                calcValue = 0;
+                break;
+            case R_386_PLT32:       // L + A - P
+                calcValue = 0;
+                break;
+            case R_386_COPY:        // none
+                calcValue = 0;
+                break;
+            case R_386_GLOB_DAT:    // S
+            case R_386_JMP_SLOT:    // S
+                calcValue = symbolValue;
+                break;
+            case R_386_RELATIVE:    // B + A
+                calcValue = addend;
+                break;
+            case R_386_GOTOFF:      // S + A - GOT
+                calcValue = 0;
+                break;
+            case R_386_GOTPC:       // GOT + A - P
+                calcValue = 0;
+                break;
+            default:                // Not recognized symbol!
+                calcValue = 0;
+                break;
+            }
+        }
+
+        return ret;
+    }
+
+//------------------------------------------------------------------------------
+    void
+    add_entry( Elf64_Addr offset, Elf_Xword info )
+    {
+        if ( elf_file.get_class() == ELFCLASS32 ) {
+            generic_add_entry< Elf32_Rel >( offset, info );
+        }
+        else {
+            generic_add_entry< Elf64_Rel >( offset, info );
+        }
+    }
+
+//------------------------------------------------------------------------------
+    void
+    add_entry( Elf64_Addr offset, Elf_Word symbol, unsigned char type )
+    {
+        Elf_Xword info;
+        if ( elf_file.get_class() == ELFCLASS32 ) {
+            info = ELF32_R_INFO( (Elf_Xword)symbol, type );
+        }
+        else {
+            info = ELF64_R_INFO((Elf_Xword)symbol, type );
+        }
+
+        add_entry( offset, info );
+    }
+
+//------------------------------------------------------------------------------
+    void
+    add_entry( Elf64_Addr offset, Elf_Xword info, Elf_Sxword addend )
+    {
+        if ( elf_file.get_class() == ELFCLASS32 ) {
+            generic_add_entry< Elf32_Rela >( offset, info, addend );
+        }
+        else {
+            generic_add_entry< Elf64_Rela >( offset, info, addend );
+        }
+    }
+
+//------------------------------------------------------------------------------
+    void
+    add_entry( Elf64_Addr offset, Elf_Word symbol, unsigned char type,
+               Elf_Sxword addend )
+    {
+        Elf_Xword info;
+        if ( elf_file.get_class() == ELFCLASS32 ) {
+            info = ELF32_R_INFO( (Elf_Xword)symbol, type );
+        }
+        else {
+            info = ELF64_R_INFO( (Elf_Xword)symbol, type );
+        }
+
+        add_entry( offset, info, addend );
+    }
+
+//------------------------------------------------------------------------------
+    void
+    add_entry( string_section_accessor str_writer,
+               const char* str,
+               symbol_section_accessor sym_writer,
+               Elf64_Addr value,
+               Elf_Word size,
+               unsigned char sym_info,
+               unsigned char other,
+               Elf_Half shndx,
+               Elf64_Addr offset,
+               unsigned char type )
+    {
+        Elf_Word str_index = str_writer.add_string( str );
+        Elf_Word sym_index = sym_writer.add_symbol( str_index, value, size,
+                                                   sym_info, other, shndx );
+        add_entry( offset, sym_index, type );
+    }
+
+//------------------------------------------------------------------------------
+  private:
+//------------------------------------------------------------------------------
+    Elf_Half
+    get_symbol_table_index() const
+    {
+        return (Elf_Half)relocation_section->get_link();
+    }
+
+//------------------------------------------------------------------------------
+    template< class T >
+    void
+    generic_get_entry_rel( Elf_Xword   index,
+                           Elf64_Addr& offset,
+                           Elf_Word&   symbol,
+                           Elf_Word&   type,
+                           Elf_Sxword& addend ) const
+    {
+        const endianess_convertor& convertor = elf_file.get_convertor();
+
+        const T* pEntry = reinterpret_cast<const T*>(
+                relocation_section->get_data() +
+                index * relocation_section->get_entry_size() );
+        offset        = convertor( pEntry->r_offset );
+        Elf_Xword tmp = convertor( pEntry->r_info );
+        symbol        = get_sym_and_type<T>::get_r_sym( tmp );
+        type          = get_sym_and_type<T>::get_r_type( tmp );
+        addend        = 0;
+    }
+
+//------------------------------------------------------------------------------
+    template< class T >
+    void
+    generic_get_entry_rela( Elf_Xword   index,
+                            Elf64_Addr& offset,
+                            Elf_Word&   symbol,
+                            Elf_Word&   type,
+                            Elf_Sxword& addend ) const
+    {
+        const endianess_convertor& convertor = elf_file.get_convertor();
+
+        const T* pEntry = reinterpret_cast<const T*>(
+                relocation_section->get_data() +
+                index * relocation_section->get_entry_size() );
+        offset        = convertor( pEntry->r_offset );
+        Elf_Xword tmp = convertor( pEntry->r_info );
+        symbol        = get_sym_and_type<T>::get_r_sym( tmp );
+        type          = get_sym_and_type<T>::get_r_type( tmp );
+        addend        = convertor( pEntry->r_addend );
+    }
+
+//------------------------------------------------------------------------------
+    template< class T >
+    void
+    generic_add_entry( Elf64_Addr offset, Elf_Xword info )
+    {
+        const endianess_convertor& convertor = elf_file.get_convertor();
+
+        T entry;
+        entry.r_offset = offset;
+        entry.r_info   = info;
+        entry.r_offset = convertor( entry.r_offset );
+        entry.r_info   = convertor( entry.r_info );
+
+        relocation_section->append_data( reinterpret_cast<char*>( &entry ), sizeof( entry ) );
+    }
+
+//------------------------------------------------------------------------------
+    template< class T >
+    void
+    generic_add_entry( Elf64_Addr offset, Elf_Xword info, Elf_Sxword addend )
+    {
+        const endianess_convertor& convertor = elf_file.get_convertor();
+
+        T entry;
+        entry.r_offset = offset;
+        entry.r_info   = info;
+        entry.r_addend = addend;
+        entry.r_offset = convertor( entry.r_offset );
+        entry.r_info   = convertor( entry.r_info );
+        entry.r_addend = convertor( entry.r_addend );
+
+        relocation_section->append_data( reinterpret_cast<char*>( &entry ), sizeof( entry ) );
+    }
+
+//------------------------------------------------------------------------------
+  private:
+    const elfio& elf_file;
+    S*           relocation_section;
+};
+
+using relocation_section_accessor = relocation_section_accessor_template<section>;
+using const_relocation_section_accessor = relocation_section_accessor_template<const section>;
+
+} // namespace ELFIO
+
+#endif // ELFIO_RELOCATION_HPP
diff --git a/third_party/elfio/elfio_section.hpp b/third_party/elfio/elfio_section.hpp
new file mode 100644
index 00000000000..cb188c14d08
--- /dev/null
+++ b/third_party/elfio/elfio_section.hpp
@@ -0,0 +1,313 @@
+/*
+Copyright (C) 2001-2015 by Serge Lamikhov-Center
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef ELFIO_SECTION_HPP
+#define ELFIO_SECTION_HPP
+
+#include <string>
+#include <iostream>
+
+namespace ELFIO {
+
+class section
+{
+    friend class elfio;
+  public:
+    virtual ~section() {};
+
+    ELFIO_GET_ACCESS_DECL    ( Elf_Half,    index              );
+    ELFIO_GET_SET_ACCESS_DECL( std::string, name               );
+    ELFIO_GET_SET_ACCESS_DECL( Elf_Word,    type               );
+    ELFIO_GET_SET_ACCESS_DECL( Elf_Xword,   flags              );
+    ELFIO_GET_SET_ACCESS_DECL( Elf_Word,    info               );
+    ELFIO_GET_SET_ACCESS_DECL( Elf_Word,    link               );
+    ELFIO_GET_SET_ACCESS_DECL( Elf_Xword,   addr_align         );
+    ELFIO_GET_SET_ACCESS_DECL( Elf_Xword,   entry_size         );
+    ELFIO_GET_SET_ACCESS_DECL( Elf64_Addr,  address            );
+    ELFIO_GET_SET_ACCESS_DECL( Elf_Xword,   size               );
+    ELFIO_GET_SET_ACCESS_DECL( Elf_Word,    name_string_offset );
+    ELFIO_GET_ACCESS_DECL    ( Elf64_Off,   offset             );
+    size_t stream_size;
+    size_t get_stream_size() const
+     {
+	return stream_size;
+     }
+
+    void set_stream_size(size_t value)
+     {
+	stream_size = value;
+     }
+
+    virtual const char* get_data() const                                = 0;
+    virtual void        set_data( const char* pData, Elf_Word size )    = 0;
+    virtual void        set_data( const std::string& data )             = 0;
+    virtual void        append_data( const char* pData, Elf_Word size ) = 0;
+    virtual void        append_data( const std::string& data )          = 0;
+
+  protected:
+    ELFIO_SET_ACCESS_DECL( Elf64_Off, offset );
+    ELFIO_SET_ACCESS_DECL( Elf_Half,  index  );
+    
+    virtual void load( std::istream&  f,
+                       std::streampos header_offset ) = 0;
+    virtual void save( std::ostream&  f,
+                       std::streampos header_offset,
+                       std::streampos data_offset )   = 0;
+    virtual bool is_address_initialized() const       = 0;
+};
+
+
+template< class T >
+class section_impl : public section
+{
+  public:
+//------------------------------------------------------------------------------
+    section_impl( const endianess_convertor* convertor_ ) : convertor( convertor_ )
+    {
+        std::fill_n( reinterpret_cast<char*>( &header ), sizeof( header ), '\0' );
+        is_address_set = false;
+        data           = 0;
+        data_size      = 0;
+    }
+
+//------------------------------------------------------------------------------
+    ~section_impl()
+    {
+        delete [] data;
+    }
+
+//------------------------------------------------------------------------------
+    // Section info functions
+    ELFIO_GET_SET_ACCESS( Elf_Word,   type,               header.sh_type      );
+    ELFIO_GET_SET_ACCESS( Elf_Xword,  flags,              header.sh_flags     );
+    ELFIO_GET_SET_ACCESS( Elf_Xword,  size,               header.sh_size      );
+    ELFIO_GET_SET_ACCESS( Elf_Word,   link,               header.sh_link      );
+    ELFIO_GET_SET_ACCESS( Elf_Word,   info,               header.sh_info      );
+    ELFIO_GET_SET_ACCESS( Elf_Xword,  addr_align,         header.sh_addralign );
+    ELFIO_GET_SET_ACCESS( Elf_Xword,  entry_size,         header.sh_entsize   );
+    ELFIO_GET_SET_ACCESS( Elf_Word,   name_string_offset, header.sh_name      );
+    ELFIO_GET_ACCESS    ( Elf64_Addr, address,            header.sh_addr      );
+
+//------------------------------------------------------------------------------
+    Elf_Half
+    get_index() const
+    {
+        return index;
+    }
+
+
+//------------------------------------------------------------------------------
+    std::string
+    get_name() const
+    {
+        return name;
+    }
+
+//------------------------------------------------------------------------------
+    void
+    set_name( std::string name_ )
+    {
+        name = name_;
+    }
+
+//------------------------------------------------------------------------------
+    void
+    set_address( Elf64_Addr value )
+    {
+        header.sh_addr = value;
+        header.sh_addr = (*convertor)( header.sh_addr );
+        is_address_set = true;
+    }
+
+//------------------------------------------------------------------------------
+    bool
+    is_address_initialized() const
+    {
+        return is_address_set;
+    }
+
+//------------------------------------------------------------------------------
+    const char*
+    get_data() const
+    {
+        return data;
+    }
+
+//------------------------------------------------------------------------------
+    void
+    set_data( const char* raw_data, Elf_Word size )
+    {
+        if ( get_type() != SHT_NOBITS ) {
+            delete [] data;
+            try {
+                data = new char[size];
+            } catch (const std::bad_alloc&) {
+                data      = 0;
+                data_size = 0;
+                size      = 0;
+            }
+            if ( 0 != data && 0 != raw_data ) {
+                data_size = size;
+                std::copy( raw_data, raw_data + size, data );
+            }
+        }
+
+        set_size( size );
+    }
+
+//------------------------------------------------------------------------------
+    void
+    set_data( const std::string& str_data )
+    {
+        return set_data( str_data.c_str(), (Elf_Word)str_data.size() );
+    }
+
+//------------------------------------------------------------------------------
+    void
+    append_data( const char* raw_data, Elf_Word size )
+    {
+        if ( get_type() != SHT_NOBITS ) {
+            if ( get_size() + size < data_size ) {
+                std::copy( raw_data, raw_data + size, data + get_size() );
+            }
+            else {
+                data_size = 2*( data_size + size);
+                char* new_data;
+                try {
+                    new_data = new char[data_size];
+                } catch (const std::bad_alloc&) {
+                    new_data = 0;
+                    size     = 0;
+                }
+                if ( 0 != new_data ) {
+                    std::copy( data, data + get_size(), new_data );
+                    std::copy( raw_data, raw_data + size, new_data + get_size() );
+                    delete [] data;
+                    data = new_data;
+                }
+            }
+            set_size( get_size() + size );
+        }
+    }
+
+//------------------------------------------------------------------------------
+    void
+    append_data( const std::string& str_data )
+    {
+        return append_data( str_data.c_str(), (Elf_Word)str_data.size() );
+    }
+
+//------------------------------------------------------------------------------
+  protected:
+//------------------------------------------------------------------------------
+    ELFIO_GET_SET_ACCESS( Elf64_Off, offset, header.sh_offset );
+
+//------------------------------------------------------------------------------
+    void
+    set_index( Elf_Half value )
+    {
+        index = value;
+    }
+
+//------------------------------------------------------------------------------
+    void
+    load( std::istream&  stream,
+          std::streampos header_offset )
+    {
+        std::fill_n( reinterpret_cast<char*>( &header ), sizeof( header ), '\0' );
+
+	stream.seekg ( 0, stream.end );
+	set_stream_size ( stream.tellg() );
+
+        stream.seekg( header_offset );
+        stream.read( reinterpret_cast<char*>( &header ), sizeof( header ) );
+
+
+        Elf_Xword size = get_size();
+	if ( 0 == data && SHT_NULL != get_type() && SHT_NOBITS != get_type() && size < get_stream_size()) {
+	    try {
+		data = new char[size + 1];
+	    } catch (const std::bad_alloc&) {
+		data      = 0;
+		data_size = 0;
+	    }
+	    if ( 0 != size ) {
+		stream.seekg( (*convertor)( header.sh_offset ) );
+		stream.read( data, size );
+		data[size] = 0; //ensure data is ended with 0 to avoid oob read
+		data_size = size;
+	    }
+	}
+    }
+
+//------------------------------------------------------------------------------
+    void
+    save( std::ostream&  f,
+          std::streampos header_offset,
+          std::streampos data_offset )
+    {
+        if ( 0 != get_index() ) {
+            header.sh_offset = data_offset;
+            header.sh_offset = (*convertor)( header.sh_offset );
+        }
+
+        save_header( f, header_offset );
+        if ( get_type() != SHT_NOBITS && get_type() != SHT_NULL &&
+             get_size() != 0 && data != 0 ) {
+            save_data( f, data_offset );
+        }
+    }
+
+//------------------------------------------------------------------------------
+  private:
+//------------------------------------------------------------------------------
+    void
+    save_header( std::ostream&  f,
+                 std::streampos header_offset ) const
+    {
+        f.seekp( header_offset );
+        f.write( reinterpret_cast<const char*>( &header ), sizeof( header ) );
+    }
+
+//------------------------------------------------------------------------------
+    void
+    save_data( std::ostream&  f,
+               std::streampos data_offset ) const
+    {
+        f.seekp( data_offset );
+        f.write( get_data(), get_size() );
+    }
+
+//------------------------------------------------------------------------------
+  private:
+    T                          header;
+    Elf_Half                   index;
+    std::string                name;
+    char*                      data;
+    Elf_Word                   data_size;
+    const endianess_convertor* convertor;
+    bool                       is_address_set;
+};
+
+} // namespace ELFIO
+
+#endif // ELFIO_SECTION_HPP
diff --git a/third_party/elfio/elfio_segment.hpp b/third_party/elfio/elfio_segment.hpp
new file mode 100644
index 00000000000..249c6f3eca6
--- /dev/null
+++ b/third_party/elfio/elfio_segment.hpp
@@ -0,0 +1,244 @@
+/*
+Copyright (C) 2001-2015 by Serge Lamikhov-Center
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef ELFIO_SEGMENT_HPP
+#define ELFIO_SEGMENT_HPP
+
+#include <iostream>
+#include <vector>
+
+namespace ELFIO {
+
+class segment
+{
+    friend class elfio;
+  public:
+    virtual ~segment() {};
+
+    ELFIO_GET_ACCESS_DECL    ( Elf_Half,   index            );
+    ELFIO_GET_SET_ACCESS_DECL( Elf_Word,   type             );
+    ELFIO_GET_SET_ACCESS_DECL( Elf_Word,   flags            );
+    ELFIO_GET_SET_ACCESS_DECL( Elf_Xword,  align            );
+    ELFIO_GET_SET_ACCESS_DECL( Elf64_Addr, virtual_address  );
+    ELFIO_GET_SET_ACCESS_DECL( Elf64_Addr, physical_address );
+    ELFIO_GET_SET_ACCESS_DECL( Elf_Xword,  file_size        );
+    ELFIO_GET_SET_ACCESS_DECL( Elf_Xword,  memory_size      );
+    ELFIO_GET_ACCESS_DECL( Elf64_Off, offset );
+
+    virtual const char* get_data() const = 0;
+
+    virtual Elf_Half add_section_index( Elf_Half index, Elf_Xword addr_align ) = 0;
+    virtual Elf_Half get_sections_num()                                  const = 0;
+    virtual Elf_Half get_section_index_at( Elf_Half num )                const = 0;
+    virtual bool is_offset_initialized()                                 const = 0;
+
+  protected:
+    ELFIO_SET_ACCESS_DECL( Elf64_Off, offset );
+    ELFIO_SET_ACCESS_DECL( Elf_Half,  index  );
+    
+    virtual const std::vector<Elf_Half>& get_sections() const               = 0;
+    virtual void load( std::istream& stream, std::streampos header_offset ) = 0;
+    virtual void save( std::ostream& f,      std::streampos header_offset,
+                                             std::streampos data_offset )   = 0;
+};
+
+
+//------------------------------------------------------------------------------
+template< class T >
+class segment_impl : public segment
+{
+  public:
+//------------------------------------------------------------------------------
+    segment_impl( endianess_convertor* convertor_ ) :
+        convertor( convertor_ )
+    {
+        is_offset_set = false;
+        std::fill_n( reinterpret_cast<char*>( &ph ), sizeof( ph ), '\0' );
+        data = 0;
+    }
+
+//------------------------------------------------------------------------------
+    virtual ~segment_impl()
+    {
+        delete [] data;
+    }
+
+//------------------------------------------------------------------------------
+    // Section info functions
+    ELFIO_GET_SET_ACCESS( Elf_Word,   type,             ph.p_type   );
+    ELFIO_GET_SET_ACCESS( Elf_Word,   flags,            ph.p_flags  );
+    ELFIO_GET_SET_ACCESS( Elf_Xword,  align,            ph.p_align  );
+    ELFIO_GET_SET_ACCESS( Elf64_Addr, virtual_address,  ph.p_vaddr  );
+    ELFIO_GET_SET_ACCESS( Elf64_Addr, physical_address, ph.p_paddr  );
+    ELFIO_GET_SET_ACCESS( Elf_Xword,  file_size,        ph.p_filesz );
+    ELFIO_GET_SET_ACCESS( Elf_Xword,  memory_size,      ph.p_memsz  );
+    ELFIO_GET_ACCESS( Elf64_Off, offset, ph.p_offset );
+    size_t stream_size;
+
+//------------------------------------------------------------------------------
+    size_t
+    get_stream_size() const
+    {
+       return stream_size;
+    }
+
+//------------------------------------------------------------------------------
+    void 
+    set_stream_size(size_t value)
+    {
+       stream_size = value;
+    }
+
+//------------------------------------------------------------------------------
+    Elf_Half
+    get_index() const
+    {
+        return index;
+    }
+
+//------------------------------------------------------------------------------
+    const char*
+    get_data() const
+    {
+        return data;
+    }
+
+//------------------------------------------------------------------------------
+    Elf_Half
+    add_section_index( Elf_Half sec_index, Elf_Xword addr_align )
+    {
+        sections.push_back( sec_index );
+        if ( addr_align > get_align() ) {
+            set_align( addr_align );
+        }
+
+        return (Elf_Half)sections.size();
+    }
+
+//------------------------------------------------------------------------------
+    Elf_Half
+    get_sections_num() const
+    {
+        return (Elf_Half)sections.size();
+    }
+
+//------------------------------------------------------------------------------
+    Elf_Half
+    get_section_index_at( Elf_Half num ) const
+    {
+        if ( num < sections.size() ) {
+            return sections[num];
+        }
+
+        return Elf_Half(-1);
+    }
+
+//------------------------------------------------------------------------------
+  protected:
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+    void
+    set_offset( Elf64_Off value )
+    {
+        ph.p_offset = value;
+        ph.p_offset = (*convertor)( ph.p_offset );
+        is_offset_set = true;
+    }
+
+//------------------------------------------------------------------------------
+    bool
+    is_offset_initialized() const
+    {
+        return is_offset_set;
+    }
+
+//------------------------------------------------------------------------------
+    const std::vector<Elf_Half>&
+    get_sections() const
+    {
+        return sections;
+    }
+    
+//------------------------------------------------------------------------------
+    void
+    set_index( Elf_Half value )
+    {
+        index = value;
+    }
+
+//------------------------------------------------------------------------------
+    void
+    load( std::istream&  stream,
+          std::streampos header_offset )
+    {
+
+	stream.seekg ( 0, stream.end );
+	set_stream_size ( stream.tellg() );
+
+        stream.seekg( header_offset );
+        stream.read( reinterpret_cast<char*>( &ph ), sizeof( ph ) );
+        is_offset_set = true;
+
+        if ( PT_NULL != get_type() && 0 != get_file_size() ) {
+            stream.seekg( (*convertor)( ph.p_offset ) );
+            Elf_Xword size = get_file_size();
+	    if ( size > get_stream_size() ) {
+		data = 0;
+	    } else {
+		try {
+		    data = new char[size + 1];
+		} catch (const std::bad_alloc&) {
+		    data = 0;
+		}
+		if ( 0 != data ) {
+		    stream.read( data, size );
+		    data[size] = 0;
+		}
+	    }
+        }
+    }
+
+//------------------------------------------------------------------------------
+    void save( std::ostream&  f,
+               std::streampos header_offset,
+               std::streampos data_offset )
+    {
+        ph.p_offset = data_offset;
+        ph.p_offset = (*convertor)(ph.p_offset);
+        f.seekp( header_offset );
+        f.write( reinterpret_cast<const char*>( &ph ), sizeof( ph ) );
+    }
+
+//------------------------------------------------------------------------------
+  private:
+    T                     ph;
+    Elf_Half              index;
+    char*                 data;
+    std::vector<Elf_Half> sections;
+    endianess_convertor*  convertor;
+    bool                  is_offset_set;
+};
+
+} // namespace ELFIO
+
+#endif // ELFIO_SEGMENT_HPP
diff --git a/third_party/elfio/elfio_strings.hpp b/third_party/elfio/elfio_strings.hpp
new file mode 100644
index 00000000000..552f000294f
--- /dev/null
+++ b/third_party/elfio/elfio_strings.hpp
@@ -0,0 +1,100 @@
+/*
+Copyright (C) 2001-2015 by Serge Lamikhov-Center
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef ELFIO_STRINGS_HPP
+#define ELFIO_STRINGS_HPP
+
+#include <cstdlib>
+#include <cstring>
+#include <string>
+
+namespace ELFIO {
+
+//------------------------------------------------------------------------------
+template< class S >
+class string_section_accessor_template
+{
+  public:
+//------------------------------------------------------------------------------
+    string_section_accessor_template( S* section_ ) :
+                                      string_section( section_ )
+    {
+    }
+
+
+//------------------------------------------------------------------------------
+    const char*
+    get_string( Elf_Word index ) const
+    {
+        if ( string_section ) {
+            if ( index < string_section->get_size() ) {
+                const char* data = string_section->get_data();
+                if ( 0 != data ) {
+                    return data + index;
+                }
+            }
+        }
+
+        return 0;
+    }
+
+
+//------------------------------------------------------------------------------
+    Elf_Word
+    add_string( const char* str )
+    {
+        Elf_Word current_position = 0;
+        
+        if (string_section) {
+            // Strings are addeded to the end of the current section data
+            current_position = (Elf_Word)string_section->get_size();
+
+            if ( current_position == 0 ) {
+                char empty_string = '\0';
+                string_section->append_data( &empty_string, 1 );
+                current_position++;
+            }
+            string_section->append_data( str, (Elf_Word)std::strlen( str ) + 1 );
+        }
+
+        return current_position;
+    }
+
+
+//------------------------------------------------------------------------------
+    Elf_Word
+    add_string( const std::string& str )
+    {
+        return add_string( str.c_str() );
+    }
+
+//------------------------------------------------------------------------------
+  private:
+    S* string_section;
+};
+
+using string_section_accessor = string_section_accessor_template<section>;
+using const_string_section_accessor = string_section_accessor_template<const section>;
+
+} // namespace ELFIO
+
+#endif // ELFIO_STRINGS_HPP
diff --git a/third_party/elfio/elfio_symbols.hpp b/third_party/elfio/elfio_symbols.hpp
new file mode 100644
index 00000000000..d18756a9af9
--- /dev/null
+++ b/third_party/elfio/elfio_symbols.hpp
@@ -0,0 +1,282 @@
+/*
+Copyright (C) 2001-2015 by Serge Lamikhov-Center
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef ELFIO_SYMBOLS_HPP
+#define ELFIO_SYMBOLS_HPP
+
+namespace ELFIO {
+
+//------------------------------------------------------------------------------
+template< class S >
+class symbol_section_accessor_template
+{
+  public:
+//------------------------------------------------------------------------------
+    symbol_section_accessor_template( const elfio& elf_file_, S* symbol_section_ ) :
+                                      elf_file( elf_file_ ),
+                                      symbol_section( symbol_section_ )
+    {
+        find_hash_section();
+    }
+
+//------------------------------------------------------------------------------
+    Elf_Xword
+    get_symbols_num() const
+    {
+        Elf_Xword nRet = 0;
+        if ( 0 != symbol_section->get_entry_size() ) {
+            nRet = symbol_section->get_size() / symbol_section->get_entry_size();
+        }
+
+        return nRet;
+    }
+
+//------------------------------------------------------------------------------
+    bool
+    get_symbol( Elf_Xword      index,
+                std::string&   name,
+                Elf64_Addr&    value,
+                Elf_Xword&     size,
+                unsigned char& bind,
+                unsigned char& type,
+                Elf_Half&      section_index,
+                unsigned char& other ) const
+    {
+        bool ret = false;
+
+        if ( elf_file.get_class() == ELFCLASS32 ) {
+            ret = generic_get_symbol<Elf32_Sym>( index, name, value, size, bind,
+                                                 type, section_index, other );
+        }
+        else {
+            ret = generic_get_symbol<Elf64_Sym>( index, name, value, size, bind,
+                                                 type, section_index, other );
+        }
+
+        return ret;
+    }
+
+//------------------------------------------------------------------------------
+    bool
+    get_symbol( const std::string& name,
+                Elf64_Addr&        value,
+                Elf_Xword&         size,
+                unsigned char&     bind,
+                unsigned char&     type,
+                Elf_Half&          section_index,
+                unsigned char&     other ) const
+    {
+        bool ret = false;
+
+        if ( 0 != get_hash_table_index() ) {
+            Elf_Word nbucket = *(const Elf_Word*)hash_section->get_data();
+            Elf_Word nchain  = *(const Elf_Word*)( hash_section->get_data() +
+                                   sizeof( Elf_Word ) );
+            Elf_Word val     = elf_hash( (const unsigned char*)name.c_str() );
+
+            Elf_Word y   = *(const Elf_Word*)( hash_section->get_data() +
+                               ( 2 + val % nbucket ) * sizeof( Elf_Word ) );
+            std::string   str;
+            get_symbol( y, str, value, size, bind, type, section_index, other );
+            while ( str != name && STN_UNDEF != y && y < nchain ) {
+                y = *(const Elf_Word*)( hash_section->get_data() +
+                        ( 2 + nbucket + y ) * sizeof( Elf_Word ) );
+                get_symbol( y, str, value, size, bind, type, section_index, other );
+            }
+            if (  str == name ) {
+                ret = true;
+            }
+        }
+
+        return ret;
+    }
+
+//------------------------------------------------------------------------------
+    Elf_Word
+    add_symbol( Elf_Word name, Elf64_Addr value, Elf_Xword size,
+                unsigned char info, unsigned char other,
+                Elf_Half shndx )
+    {
+        Elf_Word nRet;
+
+        if ( symbol_section->get_size() == 0 ) {
+            if ( elf_file.get_class() == ELFCLASS32 ) {
+                nRet = generic_add_symbol<Elf32_Sym>( 0, 0, 0, 0, 0, 0 );
+            }
+            else {
+                nRet = generic_add_symbol<Elf64_Sym>( 0, 0, 0, 0, 0, 0 );
+            }
+        }
+
+        if ( elf_file.get_class() == ELFCLASS32 ) {
+            nRet = generic_add_symbol<Elf32_Sym>( name, value, size, info, other,
+                                                  shndx );
+        }
+        else {
+            nRet = generic_add_symbol<Elf64_Sym>( name, value, size, info, other,
+                                                  shndx );
+        }
+
+        return nRet;
+    }
+
+//------------------------------------------------------------------------------
+    Elf_Word
+    add_symbol( Elf_Word name, Elf64_Addr value, Elf_Xword size,
+                unsigned char bind, unsigned char type, unsigned char other,
+                Elf_Half shndx )
+    {
+        return add_symbol( name, value, size, ELF_ST_INFO( bind, type ), other, shndx );
+    }
+
+//------------------------------------------------------------------------------
+    Elf_Word
+    add_symbol( string_section_accessor& pStrWriter, const char* str,
+                Elf64_Addr value, Elf_Xword size,
+                unsigned char info, unsigned char other,
+                Elf_Half shndx )
+    {
+        Elf_Word index = pStrWriter.add_string( str );
+        return add_symbol( index, value, size, info, other, shndx );
+    }
+
+//------------------------------------------------------------------------------
+    Elf_Word
+    add_symbol( string_section_accessor& pStrWriter, const char* str,
+                Elf64_Addr value, Elf_Xword size,
+                unsigned char bind, unsigned char type, unsigned char other,
+                Elf_Half shndx )
+    {
+        return add_symbol( pStrWriter, str, value, size, ELF_ST_INFO( bind, type ), other, shndx );
+    }
+
+//------------------------------------------------------------------------------
+  private:
+//------------------------------------------------------------------------------
+    void
+    find_hash_section()
+    {
+        hash_section       = 0;
+        hash_section_index = 0;
+        Elf_Half nSecNo = elf_file.sections.size();
+        for ( Elf_Half i = 0; i < nSecNo && 0 == hash_section_index; ++i ) {
+            const section* sec = elf_file.sections[i];
+            if ( sec->get_link() == symbol_section->get_index() ) {
+                hash_section       = sec;
+                hash_section_index = i;
+            }
+        }
+    }
+
+//------------------------------------------------------------------------------
+    Elf_Half
+    get_string_table_index() const
+    {
+        return (Elf_Half)symbol_section->get_link();
+    }
+
+//------------------------------------------------------------------------------
+    Elf_Half
+    get_hash_table_index() const
+    {
+        return hash_section_index;
+    }
+
+//------------------------------------------------------------------------------
+    template< class T >
+    bool
+    generic_get_symbol( Elf_Xword index,
+                        std::string& name, Elf64_Addr& value,
+                        Elf_Xword& size,
+                        unsigned char& bind, unsigned char& type,
+                        Elf_Half& section_index,
+                        unsigned char& other ) const
+    {
+        bool ret = false;
+
+        if ( index < get_symbols_num() ) {
+            const T* pSym = reinterpret_cast<const T*>(
+                symbol_section->get_data() +
+                    index * symbol_section->get_entry_size() );
+
+            const endianess_convertor& convertor = elf_file.get_convertor();
+
+            section* string_section = elf_file.sections[get_string_table_index()];
+            string_section_accessor str_reader( string_section );
+            const char* pStr = str_reader.get_string( convertor( pSym->st_name ) );
+            if ( 0 != pStr ) {
+                name = pStr;
+            }
+            value   = convertor( pSym->st_value );
+            size    = convertor( pSym->st_size );
+            bind    = ELF_ST_BIND( pSym->st_info );
+            type    = ELF_ST_TYPE( pSym->st_info );
+            section_index = convertor( pSym->st_shndx );
+            other   = pSym->st_other;
+
+            ret = true;
+        }
+
+        return ret;
+    }
+
+//------------------------------------------------------------------------------
+    template< class T >
+    Elf_Word
+    generic_add_symbol( Elf_Word name, Elf64_Addr value, Elf_Xword size,
+                        unsigned char info, unsigned char other,
+                        Elf_Half shndx )
+    {
+        const endianess_convertor& convertor = elf_file.get_convertor();
+
+        T entry;
+        entry.st_name  = convertor( name );
+        entry.st_value = value;
+        entry.st_value = convertor( entry.st_value );
+        entry.st_size  = size;
+        entry.st_size  = convertor( entry.st_size );
+        entry.st_info  = convertor( info );
+        entry.st_other = convertor( other );
+        entry.st_shndx = convertor( shndx );
+
+        symbol_section->append_data( reinterpret_cast<char*>( &entry ),
+                                     sizeof( entry ) );
+
+        Elf_Word nRet = symbol_section->get_size() / sizeof( entry ) - 1;
+
+        return nRet;
+    }
+
+//------------------------------------------------------------------------------
+  private:
+    const elfio&   elf_file;
+    S*             symbol_section;
+    Elf_Half       hash_section_index;
+    const section* hash_section;
+};
+
+using symbol_section_accessor = symbol_section_accessor_template<section>;
+using const_symbol_section_accessor = symbol_section_accessor_template<const section>;
+
+} // namespace ELFIO
+
+#endif // ELFIO_SYMBOLS_HPP
diff --git a/third_party/elfio/elfio_utils.hpp b/third_party/elfio/elfio_utils.hpp
new file mode 100644
index 00000000000..2baf5a77ccb
--- /dev/null
+++ b/third_party/elfio/elfio_utils.hpp
@@ -0,0 +1,209 @@
+/*
+Copyright (C) 2001-2015 by Serge Lamikhov-Center
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef ELFIO_UTILS_HPP
+#define ELFIO_UTILS_HPP
+
+#define ELFIO_GET_ACCESS( TYPE, NAME, FIELD ) \
+    TYPE get_##NAME() const                   \
+    {                                         \
+        return (*convertor)( FIELD );         \
+    }
+#define ELFIO_SET_ACCESS( TYPE, NAME, FIELD ) \
+    void set_##NAME( TYPE value )             \
+    {                                         \
+        FIELD = value;                        \
+        FIELD = (*convertor)( FIELD );        \
+    }
+#define ELFIO_GET_SET_ACCESS( TYPE, NAME, FIELD ) \
+    TYPE get_##NAME() const                       \
+    {                                             \
+        return (*convertor)( FIELD );             \
+    }                                             \
+    void set_##NAME( TYPE value )                 \
+    {                                             \
+        FIELD = value;                            \
+        FIELD = (*convertor)( FIELD );            \
+    }
+
+#define ELFIO_GET_ACCESS_DECL( TYPE, NAME ) \
+    virtual TYPE get_##NAME() const = 0
+
+#define ELFIO_SET_ACCESS_DECL( TYPE, NAME ) \
+    virtual void set_##NAME( TYPE value ) = 0
+
+#define ELFIO_GET_SET_ACCESS_DECL( TYPE, NAME ) \
+    virtual TYPE get_##NAME() const = 0;        \
+    virtual void set_##NAME( TYPE value ) = 0
+
+namespace ELFIO {
+
+//------------------------------------------------------------------------------
+class endianess_convertor {
+  public:
+//------------------------------------------------------------------------------
+    endianess_convertor()
+    {
+        need_conversion = false;
+    }
+
+//------------------------------------------------------------------------------
+    void
+    setup( unsigned char elf_file_encoding )
+    {
+        need_conversion = ( elf_file_encoding != get_host_encoding() );
+    }
+
+//------------------------------------------------------------------------------
+    uint64_t
+    operator()( uint64_t value ) const
+    {
+        if ( !need_conversion ) {
+            return value;
+        }
+        value =
+            ( ( value & 0x00000000000000FFull ) << 56 ) |
+            ( ( value & 0x000000000000FF00ull ) << 40 ) |
+            ( ( value & 0x0000000000FF0000ull ) << 24 ) |
+            ( ( value & 0x00000000FF000000ull ) <<  8 ) |
+            ( ( value & 0x000000FF00000000ull ) >>  8 ) |
+            ( ( value & 0x0000FF0000000000ull ) >> 24 ) |
+            ( ( value & 0x00FF000000000000ull ) >> 40 ) |
+            ( ( value & 0xFF00000000000000ull ) >> 56 );
+
+        return value;
+    }
+
+//------------------------------------------------------------------------------
+    int64_t
+    operator()( int64_t value ) const
+    {
+        if ( !need_conversion ) {
+            return value;
+        }
+        return (int64_t)(*this)( (uint64_t)value );
+    }
+
+//------------------------------------------------------------------------------
+    uint32_t
+    operator()( uint32_t value ) const
+    {
+        if ( !need_conversion ) {
+            return value;
+        }
+        value =
+            ( ( value & 0x000000FF ) << 24 ) |
+            ( ( value & 0x0000FF00 ) <<  8 ) |
+            ( ( value & 0x00FF0000 ) >>  8 ) |
+            ( ( value & 0xFF000000 ) >> 24 );
+
+        return value;
+    }
+
+//------------------------------------------------------------------------------
+    int32_t
+    operator()( int32_t value ) const
+    {
+        if ( !need_conversion ) {
+            return value;
+        }
+        return (int32_t)(*this)( (uint32_t)value );
+    }
+
+//------------------------------------------------------------------------------
+    uint16_t
+    operator()( uint16_t value ) const
+    {
+        if ( !need_conversion ) {
+            return value;
+        }
+        value =
+            ( ( value & 0x00FF ) <<  8 ) |
+            ( ( value & 0xFF00 ) >>  8 );
+
+        return value;
+    }
+
+//------------------------------------------------------------------------------
+    int16_t
+    operator()( int16_t value ) const
+    {
+        if ( !need_conversion ) {
+            return value;
+        }
+        return (int16_t)(*this)( (uint16_t)value );
+    }
+
+//------------------------------------------------------------------------------
+    int8_t
+    operator()( int8_t value ) const
+    {
+        return value;
+    }
+
+//------------------------------------------------------------------------------
+    uint8_t
+    operator()( uint8_t value ) const
+    {
+        return value;
+    }
+
+//------------------------------------------------------------------------------
+  private:
+//------------------------------------------------------------------------------
+    unsigned char
+    get_host_encoding() const
+    {
+        static const int tmp = 1;
+        if ( 1 == *(const char*)&tmp ) {
+            return ELFDATA2LSB;
+        }
+        else {
+            return ELFDATA2MSB;
+        }
+    }
+
+//------------------------------------------------------------------------------
+  private:
+    bool need_conversion;
+};
+
+
+//------------------------------------------------------------------------------
+inline
+uint32_t
+elf_hash( const unsigned char *name )
+{
+    uint32_t h = 0, g;
+    while ( *name ) {
+        h = (h << 4) + *name++;
+        g = h & 0xf0000000;
+        if ( g != 0 )
+            h ^= g >> 24;
+        h &= ~g;
+    }
+    return h;
+}
+
+} // namespace ELFIO
+
+#endif // ELFIO_UTILS_HPP